{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 107695, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00046427410743302844, "grad_norm": 124.61332702636719, "learning_rate": 2.999832861321324e-07, "logits/chosen": -18.516504287719727, "logits/rejected": -17.817121505737305, "logps/chosen": -445.40460205078125, "logps/rejected": -415.01776123046875, "loss": 0.69, "rewards/accuracies": 0.5, "rewards/chosen": 0.0033424377907067537, "rewards/margins": 0.0064355479553341866, "rewards/rejected": -0.003093109233304858, "step": 10 }, { "epoch": 0.0009285482148660569, "grad_norm": 107.68170928955078, "learning_rate": 2.9995542968568644e-07, "logits/chosen": -18.345867156982422, "logits/rejected": -16.276630401611328, "logps/chosen": -534.8252563476562, "logps/rejected": -272.29302978515625, "loss": 0.6965, "rewards/accuracies": 0.5, "rewards/chosen": -0.003727264702320099, "rewards/margins": -0.006568240933120251, "rewards/rejected": 0.0028409764636307955, "step": 20 }, { "epoch": 0.0013928223222990854, "grad_norm": 39.0323600769043, "learning_rate": 2.9992757323924043e-07, "logits/chosen": -18.070751190185547, "logits/rejected": -17.93671226501465, "logps/chosen": -367.52423095703125, "logps/rejected": -338.6728210449219, "loss": 0.6928, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0031161499209702015, "rewards/margins": 0.0006535340216942132, "rewards/rejected": 0.0024626157246530056, "step": 30 }, { "epoch": 0.0018570964297321137, "grad_norm": 109.64766693115234, "learning_rate": 2.998997167927944e-07, "logits/chosen": -18.798688888549805, "logits/rejected": -18.041812896728516, "logps/chosen": -339.1958923339844, "logps/rejected": -371.5032043457031, "loss": 0.6943, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.005602397955954075, "rewards/margins": -0.002326908288523555, "rewards/rejected": 
-0.0032754899002611637, "step": 40 }, { "epoch": 0.0023213705371651423, "grad_norm": 72.47455596923828, "learning_rate": 2.9987186034634846e-07, "logits/chosen": -17.376811981201172, "logits/rejected": -17.02350616455078, "logps/chosen": -374.75042724609375, "logps/rejected": -285.984619140625, "loss": 0.6937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0018678283086046576, "rewards/margins": -0.0010569955920800567, "rewards/rejected": 0.0029248236678540707, "step": 50 }, { "epoch": 0.0027856446445981707, "grad_norm": 47.02146911621094, "learning_rate": 2.998440038999025e-07, "logits/chosen": -18.28298568725586, "logits/rejected": -17.140811920166016, "logps/chosen": -457.6845703125, "logps/rejected": -278.9885559082031, "loss": 0.6896, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.006516723893582821, "rewards/margins": 0.0071869282983243465, "rewards/rejected": -0.000670204171910882, "step": 60 }, { "epoch": 0.003249918752031199, "grad_norm": 132.49191284179688, "learning_rate": 2.998161474534565e-07, "logits/chosen": -17.531217575073242, "logits/rejected": -17.095962524414062, "logps/chosen": -470.1263122558594, "logps/rejected": -417.475341796875, "loss": 0.6948, "rewards/accuracies": 0.5, "rewards/chosen": 0.0045166015625, "rewards/margins": -0.003314361674711108, "rewards/rejected": 0.007830962538719177, "step": 70 }, { "epoch": 0.0037141928594642275, "grad_norm": 144.476318359375, "learning_rate": 2.9978829100701053e-07, "logits/chosen": -18.04976463317871, "logits/rejected": -17.793651580810547, "logps/chosen": -412.2423400878906, "logps/rejected": -313.37310791015625, "loss": 0.6944, "rewards/accuracies": 0.5, "rewards/chosen": 0.0043769837357103825, "rewards/margins": -0.0024969482328742743, "rewards/rejected": 0.006873931735754013, "step": 80 }, { "epoch": 0.004178466966897256, "grad_norm": 16.772865295410156, "learning_rate": 2.9976043456056457e-07, "logits/chosen": -17.66376495361328, "logits/rejected": 
-17.256549835205078, "logps/chosen": -416.95086669921875, "logps/rejected": -358.5758056640625, "loss": 0.6952, "rewards/accuracies": 0.5, "rewards/chosen": 0.009490509517490864, "rewards/margins": -0.004141197074204683, "rewards/rejected": 0.01363170612603426, "step": 90 }, { "epoch": 0.004642741074330285, "grad_norm": 50.893314361572266, "learning_rate": 2.9973257811411856e-07, "logits/chosen": -17.5812931060791, "logits/rejected": -17.17832374572754, "logps/chosen": -419.19775390625, "logps/rejected": -295.79345703125, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.008758926764130592, "rewards/margins": 0.0003203391970600933, "rewards/rejected": 0.008438587188720703, "step": 100 }, { "epoch": 0.005107015181763313, "grad_norm": 88.90615844726562, "learning_rate": 2.9970472166767255e-07, "logits/chosen": -17.955537796020508, "logits/rejected": -17.228273391723633, "logps/chosen": -352.43609619140625, "logps/rejected": -321.91558837890625, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 0.014391021803021431, "rewards/margins": 0.0012944601476192474, "rewards/rejected": 0.013096561655402184, "step": 110 }, { "epoch": 0.0055712892891963415, "grad_norm": 122.4727554321289, "learning_rate": 2.9967965086587117e-07, "logits/chosen": -17.70781898498535, "logits/rejected": -17.3111572265625, "logps/chosen": -506.93377685546875, "logps/rejected": -416.7353515625, "loss": 0.6898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.022905882447957993, "rewards/margins": 0.00677261408418417, "rewards/rejected": 0.016133271157741547, "step": 120 }, { "epoch": 0.00603556339662937, "grad_norm": 113.77164459228516, "learning_rate": 2.996517944194252e-07, "logits/chosen": -18.747058868408203, "logits/rejected": -17.63005828857422, "logps/chosen": -584.73583984375, "logps/rejected": -370.0539245605469, "loss": 0.6927, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.014714051969349384, "rewards/margins": 
0.0008625035989098251, "rewards/rejected": 0.013851547613739967, "step": 130 }, { "epoch": 0.006499837504062398, "grad_norm": 125.54212188720703, "learning_rate": 2.9962393797297925e-07, "logits/chosen": -18.499052047729492, "logits/rejected": -17.265247344970703, "logps/chosen": -420.01605224609375, "logps/rejected": -327.77691650390625, "loss": 0.6937, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.014460906386375427, "rewards/margins": -0.0009240717627108097, "rewards/rejected": 0.015384979546070099, "step": 140 }, { "epoch": 0.006964111611495427, "grad_norm": 42.49266052246094, "learning_rate": 2.9959608152653324e-07, "logits/chosen": -17.775524139404297, "logits/rejected": -16.99835205078125, "logps/chosen": -366.99346923828125, "logps/rejected": -280.3469543457031, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.013161907903850079, "rewards/margins": 0.0007877920870669186, "rewards/rejected": 0.012374116107821465, "step": 150 }, { "epoch": 0.007428385718928455, "grad_norm": 125.74044799804688, "learning_rate": 2.995682250800873e-07, "logits/chosen": -18.573213577270508, "logits/rejected": -18.078336715698242, "logps/chosen": -465.82049560546875, "logps/rejected": -413.57501220703125, "loss": 0.6945, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.021803054958581924, "rewards/margins": -0.0026668556965887547, "rewards/rejected": 0.02446991205215454, "step": 160 }, { "epoch": 0.007892659826361484, "grad_norm": 68.36880493164062, "learning_rate": 2.9954036863364127e-07, "logits/chosen": -17.78134536743164, "logits/rejected": -16.72995948791504, "logps/chosen": -462.5884704589844, "logps/rejected": -360.9506530761719, "loss": 0.69, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.026696395128965378, "rewards/margins": 0.006340332329273224, "rewards/rejected": 0.020356062799692154, "step": 170 }, { "epoch": 0.008356933933794513, "grad_norm": 65.17937469482422, "learning_rate": 2.995125121871953e-07, 
"logits/chosen": -17.909406661987305, "logits/rejected": -16.855159759521484, "logps/chosen": -359.82757568359375, "logps/rejected": -266.0791015625, "loss": 0.691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.020154420286417007, "rewards/margins": 0.004425964318215847, "rewards/rejected": 0.015728455036878586, "step": 180 }, { "epoch": 0.008821208041227541, "grad_norm": 71.08460235595703, "learning_rate": 2.9948465574074935e-07, "logits/chosen": -18.689373016357422, "logits/rejected": -18.638090133666992, "logps/chosen": -395.7193908691406, "logps/rejected": -323.3312072753906, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.019093839451670647, "rewards/margins": 0.0022332281805574894, "rewards/rejected": 0.016860608011484146, "step": 190 }, { "epoch": 0.00928548214866057, "grad_norm": 118.3180923461914, "learning_rate": 2.9945679929430334e-07, "logits/chosen": -18.335163116455078, "logits/rejected": -18.216075897216797, "logps/chosen": -480.4960021972656, "logps/rejected": -466.251708984375, "loss": 0.6898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.027765503153204918, "rewards/margins": 0.006825828459113836, "rewards/rejected": 0.020939674228429794, "step": 200 }, { "epoch": 0.009749756256093598, "grad_norm": 87.74407196044922, "learning_rate": 2.994289428478574e-07, "logits/chosen": -17.466915130615234, "logits/rejected": -17.691272735595703, "logps/chosen": -344.15264892578125, "logps/rejected": -364.1688537597656, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.024986574426293373, "rewards/margins": 0.0006168557447381318, "rewards/rejected": 0.02436971664428711, "step": 210 }, { "epoch": 0.010214030363526626, "grad_norm": 17.48921012878418, "learning_rate": 2.9940108640141137e-07, "logits/chosen": -17.882333755493164, "logits/rejected": -16.649333953857422, "logps/chosen": -356.9591064453125, "logps/rejected": -295.86932373046875, "loss": 0.6892, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 0.025779912248253822, "rewards/margins": 0.007972965016961098, "rewards/rejected": 0.017806950956583023, "step": 220 }, { "epoch": 0.010678304470959655, "grad_norm": 48.08696365356445, "learning_rate": 2.993732299549654e-07, "logits/chosen": -18.55428123474121, "logits/rejected": -17.854938507080078, "logps/chosen": -317.8580322265625, "logps/rejected": -287.0545654296875, "loss": 0.6895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.023449402302503586, "rewards/margins": 0.007257157005369663, "rewards/rejected": 0.016192246228456497, "step": 230 }, { "epoch": 0.011142578578392683, "grad_norm": 42.050296783447266, "learning_rate": 2.993453735085194e-07, "logits/chosen": -17.342771530151367, "logits/rejected": -16.550457000732422, "logps/chosen": -323.3433532714844, "logps/rejected": -280.668701171875, "loss": 0.689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02253013849258423, "rewards/margins": 0.008381424471735954, "rewards/rejected": 0.014148712158203125, "step": 240 }, { "epoch": 0.011606852685825711, "grad_norm": 65.760498046875, "learning_rate": 2.9931751706207344e-07, "logits/chosen": -17.718605041503906, "logits/rejected": -16.84149742126465, "logps/chosen": -319.7337951660156, "logps/rejected": -238.4444122314453, "loss": 0.6915, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.02471763640642166, "rewards/margins": 0.0034769438207149506, "rewards/rejected": 0.02124069258570671, "step": 250 }, { "epoch": 0.01207112679325874, "grad_norm": 89.24271392822266, "learning_rate": 2.992896606156275e-07, "logits/chosen": -18.852455139160156, "logits/rejected": -17.975814819335938, "logps/chosen": -375.7437438964844, "logps/rejected": -285.25177001953125, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.02279144525527954, "rewards/margins": 0.0015187837416306138, "rewards/rejected": 0.02127266116440296, "step": 260 }, { "epoch": 0.012535400900691768, "grad_norm": 70.10780334472656, 
"learning_rate": 2.9926180416918146e-07, "logits/chosen": -17.781845092773438, "logits/rejected": -16.461864471435547, "logps/chosen": -460.16888427734375, "logps/rejected": -301.3876953125, "loss": 0.6905, "rewards/accuracies": 0.5, "rewards/chosen": 0.03393203765153885, "rewards/margins": 0.005333099979907274, "rewards/rejected": 0.028598938137292862, "step": 270 }, { "epoch": 0.012999675008124796, "grad_norm": 119.49842071533203, "learning_rate": 2.992339477227355e-07, "logits/chosen": -18.10146141052246, "logits/rejected": -17.58162498474121, "logps/chosen": -494.388671875, "logps/rejected": -458.9991149902344, "loss": 0.69, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.03721512109041214, "rewards/margins": 0.006527024321258068, "rewards/rejected": 0.030688095837831497, "step": 280 }, { "epoch": 0.013463949115557825, "grad_norm": 134.66197204589844, "learning_rate": 2.992060912762895e-07, "logits/chosen": -17.624534606933594, "logits/rejected": -17.121746063232422, "logps/chosen": -432.2972717285156, "logps/rejected": -321.5616149902344, "loss": 0.6915, "rewards/accuracies": 0.5, "rewards/chosen": 0.033826179802417755, "rewards/margins": 0.0035026934929192066, "rewards/rejected": 0.030323484912514687, "step": 290 }, { "epoch": 0.013928223222990853, "grad_norm": 83.20530700683594, "learning_rate": 2.9917823482984353e-07, "logits/chosen": -17.74274444580078, "logits/rejected": -17.108789443969727, "logps/chosen": -426.3805236816406, "logps/rejected": -335.43695068359375, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": 0.0380372628569603, "rewards/margins": 0.0026830672286450863, "rewards/rejected": 0.035354193300008774, "step": 300 }, { "epoch": 0.014392497330423882, "grad_norm": 81.40511322021484, "learning_rate": 2.991503783833975e-07, "logits/chosen": -18.42173194885254, "logits/rejected": -17.99380874633789, "logps/chosen": -405.02532958984375, "logps/rejected": -349.31732177734375, "loss": 0.6874, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 0.04568786919116974, "rewards/margins": 0.011531447060406208, "rewards/rejected": 0.03415641933679581, "step": 310 }, { "epoch": 0.01485677143785691, "grad_norm": 65.87089538574219, "learning_rate": 2.9912252193695156e-07, "logits/chosen": -16.98788070678711, "logits/rejected": -16.468351364135742, "logps/chosen": -326.56256103515625, "logps/rejected": -304.9242858886719, "loss": 0.6924, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.034358978271484375, "rewards/margins": 0.0015859983395785093, "rewards/rejected": 0.0327729806303978, "step": 320 }, { "epoch": 0.015321045545289938, "grad_norm": 109.31501770019531, "learning_rate": 2.990946654905056e-07, "logits/chosen": -17.185468673706055, "logits/rejected": -17.300960540771484, "logps/chosen": -422.2196350097656, "logps/rejected": -418.39801025390625, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": 0.04229980707168579, "rewards/margins": -0.0020274347625672817, "rewards/rejected": 0.04432723671197891, "step": 330 }, { "epoch": 0.01578531965272297, "grad_norm": 95.58434295654297, "learning_rate": 2.990668090440596e-07, "logits/chosen": -17.219715118408203, "logits/rejected": -17.561904907226562, "logps/chosen": -379.39434814453125, "logps/rejected": -387.96514892578125, "loss": 0.6959, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03867912292480469, "rewards/margins": -0.005166930146515369, "rewards/rejected": 0.04384605213999748, "step": 340 }, { "epoch": 0.016249593760155997, "grad_norm": 120.189697265625, "learning_rate": 2.990389525976136e-07, "logits/chosen": -17.778987884521484, "logits/rejected": -18.65744400024414, "logps/chosen": -449.38787841796875, "logps/rejected": -486.13885498046875, "loss": 0.6922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.039754945784807205, "rewards/margins": 0.0021942134480923414, "rewards/rejected": 0.03756072744727135, "step": 350 }, { "epoch": 0.016713867867589025, "grad_norm": 
177.09593200683594, "learning_rate": 2.990110961511676e-07, "logits/chosen": -18.803382873535156, "logits/rejected": -17.41292381286621, "logps/chosen": -575.483154296875, "logps/rejected": -447.22564697265625, "loss": 0.6843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.054217226803302765, "rewards/margins": 0.01788238435983658, "rewards/rejected": 0.03633483871817589, "step": 360 }, { "epoch": 0.017178141975022054, "grad_norm": 87.61143493652344, "learning_rate": 2.9898323970472166e-07, "logits/chosen": -19.256202697753906, "logits/rejected": -18.71249771118164, "logps/chosen": -397.73516845703125, "logps/rejected": -409.95758056640625, "loss": 0.6933, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.04375045746564865, "rewards/margins": -0.00016647316806484014, "rewards/rejected": 0.04391693323850632, "step": 370 }, { "epoch": 0.017642416082455082, "grad_norm": 99.84996795654297, "learning_rate": 2.989553832582757e-07, "logits/chosen": -18.377967834472656, "logits/rejected": -17.04580307006836, "logps/chosen": -427.22705078125, "logps/rejected": -387.9067687988281, "loss": 0.6947, "rewards/accuracies": 0.5, "rewards/chosen": 0.04531005769968033, "rewards/margins": -0.0029121204279363155, "rewards/rejected": 0.04822217673063278, "step": 380 }, { "epoch": 0.01810669018988811, "grad_norm": 42.05730056762695, "learning_rate": 2.989275268118297e-07, "logits/chosen": -17.40450096130371, "logits/rejected": -17.40810775756836, "logps/chosen": -411.6087951660156, "logps/rejected": -400.7891540527344, "loss": 0.6883, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04472732171416283, "rewards/margins": 0.009823608212172985, "rewards/rejected": 0.034903716295957565, "step": 390 }, { "epoch": 0.01857096429732114, "grad_norm": 109.35877227783203, "learning_rate": 2.988996703653837e-07, "logits/chosen": -17.563575744628906, "logits/rejected": -17.08517837524414, "logps/chosen": -431.9017028808594, "logps/rejected": -368.9837341308594, 
"loss": 0.6881, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.038930509239435196, "rewards/margins": 0.010338745079934597, "rewards/rejected": 0.028591766953468323, "step": 400 }, { "epoch": 0.019035238404754167, "grad_norm": 55.20402908325195, "learning_rate": 2.988718139189377e-07, "logits/chosen": -17.241544723510742, "logits/rejected": -17.231807708740234, "logps/chosen": -306.6127014160156, "logps/rejected": -252.3168487548828, "loss": 0.6857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03766327351331711, "rewards/margins": 0.015113946981728077, "rewards/rejected": 0.022549323737621307, "step": 410 }, { "epoch": 0.019499512512187196, "grad_norm": 117.13570404052734, "learning_rate": 2.9884395747249176e-07, "logits/chosen": -18.592845916748047, "logits/rejected": -17.535778045654297, "logps/chosen": -519.3404541015625, "logps/rejected": -370.3611755371094, "loss": 0.6823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.06618209183216095, "rewards/margins": 0.022108763456344604, "rewards/rejected": 0.04407333582639694, "step": 420 }, { "epoch": 0.019963786619620224, "grad_norm": 84.03096008300781, "learning_rate": 2.9881610102604575e-07, "logits/chosen": -18.21696662902832, "logits/rejected": -16.678462982177734, "logps/chosen": -494.6432189941406, "logps/rejected": -305.4082946777344, "loss": 0.6846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0572633370757103, "rewards/margins": 0.017333373427391052, "rewards/rejected": 0.039929963648319244, "step": 430 }, { "epoch": 0.020428060727053252, "grad_norm": 70.84638214111328, "learning_rate": 2.987882445795998e-07, "logits/chosen": -18.246429443359375, "logits/rejected": -18.41632652282715, "logps/chosen": -551.2737426757812, "logps/rejected": -424.6492614746094, "loss": 0.6887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06012740731239319, "rewards/margins": 0.009219968691468239, "rewards/rejected": 0.0509074442088604, "step": 440 }, { 
"epoch": 0.02089233483448628, "grad_norm": 135.73585510253906, "learning_rate": 2.9876038813315383e-07, "logits/chosen": -18.49070167541504, "logits/rejected": -17.53814125061035, "logps/chosen": -431.03302001953125, "logps/rejected": -328.21514892578125, "loss": 0.6827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06920745968818665, "rewards/margins": 0.021412048488855362, "rewards/rejected": 0.047795407474040985, "step": 450 }, { "epoch": 0.02135660894191931, "grad_norm": 95.3013687133789, "learning_rate": 2.987325316867078e-07, "logits/chosen": -17.08924674987793, "logits/rejected": -17.310237884521484, "logps/chosen": -396.2865295410156, "logps/rejected": -423.1182556152344, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": 0.05750786513090134, "rewards/margins": 0.002650223672389984, "rewards/rejected": 0.054857634007930756, "step": 460 }, { "epoch": 0.021820883049352337, "grad_norm": 76.79278564453125, "learning_rate": 2.987046752402618e-07, "logits/chosen": -17.542781829833984, "logits/rejected": -16.691341400146484, "logps/chosen": -513.9970703125, "logps/rejected": -357.9404296875, "loss": 0.6829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.07996048033237457, "rewards/margins": 0.02121143415570259, "rewards/rejected": 0.05874905735254288, "step": 470 }, { "epoch": 0.022285157156785366, "grad_norm": 104.99002838134766, "learning_rate": 2.9867681879381585e-07, "logits/chosen": -17.582366943359375, "logits/rejected": -16.678476333618164, "logps/chosen": -337.56768798828125, "logps/rejected": -272.5372314453125, "loss": 0.6874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.041543617844581604, "rewards/margins": 0.011720657348632812, "rewards/rejected": 0.02982296049594879, "step": 480 }, { "epoch": 0.022749431264218394, "grad_norm": 57.8966178894043, "learning_rate": 2.986489623473699e-07, "logits/chosen": -18.7929630279541, "logits/rejected": -17.205846786499023, "logps/chosen": -444.21209716796875, 
"logps/rejected": -227.37661743164062, "loss": 0.6751, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.09324105083942413, "rewards/margins": 0.03684886917471886, "rewards/rejected": 0.05639217048883438, "step": 490 }, { "epoch": 0.023213705371651423, "grad_norm": 120.13676452636719, "learning_rate": 2.986211059009239e-07, "logits/chosen": -18.105403900146484, "logits/rejected": -17.387897491455078, "logps/chosen": -532.545166015625, "logps/rejected": -425.22613525390625, "loss": 0.6828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06588444113731384, "rewards/margins": 0.021702157333493233, "rewards/rejected": 0.04418227821588516, "step": 500 }, { "epoch": 0.02367797947908445, "grad_norm": 144.4639129638672, "learning_rate": 2.985932494544779e-07, "logits/chosen": -17.98801040649414, "logits/rejected": -17.249164581298828, "logps/chosen": -395.82623291015625, "logps/rejected": -342.9493713378906, "loss": 0.6845, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.06526154279708862, "rewards/margins": 0.017620468512177467, "rewards/rejected": 0.04764106497168541, "step": 510 }, { "epoch": 0.02414225358651748, "grad_norm": 51.11764144897461, "learning_rate": 2.985653930080319e-07, "logits/chosen": -17.981704711914062, "logits/rejected": -17.129226684570312, "logps/chosen": -392.3443908691406, "logps/rejected": -267.657470703125, "loss": 0.6885, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06003951281309128, "rewards/margins": 0.009465677663683891, "rewards/rejected": 0.05057384446263313, "step": 520 }, { "epoch": 0.024606527693950508, "grad_norm": 47.346553802490234, "learning_rate": 2.9853753656158595e-07, "logits/chosen": -17.827648162841797, "logits/rejected": -17.095666885375977, "logps/chosen": -398.34600830078125, "logps/rejected": -389.9524230957031, "loss": 0.6878, "rewards/accuracies": 0.5, "rewards/chosen": 0.08073677122592926, "rewards/margins": 0.010941848158836365, "rewards/rejected": 
0.0697949230670929, "step": 530 }, { "epoch": 0.025070801801383536, "grad_norm": 99.12947082519531, "learning_rate": 2.9850968011513994e-07, "logits/chosen": -17.847476959228516, "logits/rejected": -17.466787338256836, "logps/chosen": -366.14312744140625, "logps/rejected": -289.3060607910156, "loss": 0.6876, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05615774914622307, "rewards/margins": 0.011325368657708168, "rewards/rejected": 0.04483237862586975, "step": 540 }, { "epoch": 0.025535075908816564, "grad_norm": 18.63617515563965, "learning_rate": 2.98481823668694e-07, "logits/chosen": -17.758743286132812, "logits/rejected": -16.51211166381836, "logps/chosen": -484.84063720703125, "logps/rejected": -305.4541015625, "loss": 0.6785, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09139227122068405, "rewards/margins": 0.030284976586699486, "rewards/rejected": 0.061107296496629715, "step": 550 }, { "epoch": 0.025999350016249593, "grad_norm": 58.161041259765625, "learning_rate": 2.98453967222248e-07, "logits/chosen": -17.570110321044922, "logits/rejected": -17.44794464111328, "logps/chosen": -414.88568115234375, "logps/rejected": -392.4132080078125, "loss": 0.6917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.08919983357191086, "rewards/margins": 0.0036396202631294727, "rewards/rejected": 0.08556020259857178, "step": 560 }, { "epoch": 0.02646362412368262, "grad_norm": 54.86820602416992, "learning_rate": 2.98426110775802e-07, "logits/chosen": -18.595806121826172, "logits/rejected": -16.954082489013672, "logps/chosen": -452.0233459472656, "logps/rejected": -393.1688537597656, "loss": 0.6861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08154648542404175, "rewards/margins": 0.014354705810546875, "rewards/rejected": 0.06719177216291428, "step": 570 }, { "epoch": 0.02692789823111565, "grad_norm": 121.77066802978516, "learning_rate": 2.9839825432935605e-07, "logits/chosen": -19.63254165649414, "logits/rejected": 
-18.61478042602539, "logps/chosen": -424.5240173339844, "logps/rejected": -365.94622802734375, "loss": 0.6857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08616288006305695, "rewards/margins": 0.015460053458809853, "rewards/rejected": 0.07070282101631165, "step": 580 }, { "epoch": 0.027392172338548678, "grad_norm": 112.26095581054688, "learning_rate": 2.9837039788291004e-07, "logits/chosen": -18.826322555541992, "logits/rejected": -17.578516006469727, "logps/chosen": -482.11102294921875, "logps/rejected": -373.8777770996094, "loss": 0.6836, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.09015054255723953, "rewards/margins": 0.020094845443964005, "rewards/rejected": 0.07005570083856583, "step": 590 }, { "epoch": 0.027856446445981706, "grad_norm": 108.53756713867188, "learning_rate": 2.983425414364641e-07, "logits/chosen": -18.05025291442871, "logits/rejected": -17.6160888671875, "logps/chosen": -313.51971435546875, "logps/rejected": -284.8910827636719, "loss": 0.6855, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07335831224918365, "rewards/margins": 0.015951231122016907, "rewards/rejected": 0.05740707367658615, "step": 600 }, { "epoch": 0.028320720553414735, "grad_norm": 107.15342712402344, "learning_rate": 2.983146849900181e-07, "logits/chosen": -18.49478530883789, "logits/rejected": -17.719247817993164, "logps/chosen": -411.17266845703125, "logps/rejected": -404.41949462890625, "loss": 0.7019, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.07912842184305191, "rewards/margins": -0.01716487668454647, "rewards/rejected": 0.09629329293966293, "step": 610 }, { "epoch": 0.028784994660847763, "grad_norm": 76.99655151367188, "learning_rate": 2.982868285435721e-07, "logits/chosen": -19.260570526123047, "logits/rejected": -18.745769500732422, "logps/chosen": -415.90618896484375, "logps/rejected": -367.6451416015625, "loss": 0.6907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0680653303861618, 
"rewards/margins": 0.005227126181125641, "rewards/rejected": 0.06283821165561676, "step": 620 }, { "epoch": 0.02924926876828079, "grad_norm": 106.57319641113281, "learning_rate": 2.9825897209712615e-07, "logits/chosen": -18.240692138671875, "logits/rejected": -18.012174606323242, "logps/chosen": -523.5814208984375, "logps/rejected": -504.8575134277344, "loss": 0.6855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.11050200462341309, "rewards/margins": 0.016633350402116776, "rewards/rejected": 0.0938686653971672, "step": 630 }, { "epoch": 0.02971354287571382, "grad_norm": 136.36122131347656, "learning_rate": 2.9823111565068013e-07, "logits/chosen": -18.71304702758789, "logits/rejected": -17.763994216918945, "logps/chosen": -614.6131591796875, "logps/rejected": -409.2359619140625, "loss": 0.6714, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.14175841212272644, "rewards/margins": 0.044191665947437286, "rewards/rejected": 0.09756675362586975, "step": 640 }, { "epoch": 0.03017781698314685, "grad_norm": 77.14366149902344, "learning_rate": 2.982032592042342e-07, "logits/chosen": -16.59531593322754, "logits/rejected": -16.542633056640625, "logps/chosen": -337.1812438964844, "logps/rejected": -296.46466064453125, "loss": 0.6904, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.09518066048622131, "rewards/margins": 0.006967899855226278, "rewards/rejected": 0.0882127657532692, "step": 650 }, { "epoch": 0.030642091090579877, "grad_norm": 148.93441772460938, "learning_rate": 2.9817540275778816e-07, "logits/chosen": -18.209897994995117, "logits/rejected": -16.6964054107666, "logps/chosen": -470.6485290527344, "logps/rejected": -303.3775329589844, "loss": 0.6723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.10958298295736313, "rewards/margins": 0.04278365895152092, "rewards/rejected": 0.06679932028055191, "step": 660 }, { "epoch": 0.031106365198012905, "grad_norm": 54.62526321411133, "learning_rate": 
2.981475463113422e-07, "logits/chosen": -17.501384735107422, "logits/rejected": -17.209596633911133, "logps/chosen": -313.8746337890625, "logps/rejected": -338.17059326171875, "loss": 0.6952, "rewards/accuracies": 0.5, "rewards/chosen": 0.08187500387430191, "rewards/margins": -0.003313828259706497, "rewards/rejected": 0.08518882840871811, "step": 670 }, { "epoch": 0.03157063930544594, "grad_norm": 121.80809020996094, "learning_rate": 2.9811968986489625e-07, "logits/chosen": -17.670272827148438, "logits/rejected": -16.89398956298828, "logps/chosen": -425.89801025390625, "logps/rejected": -318.17724609375, "loss": 0.6709, "rewards/accuracies": 1.0, "rewards/chosen": 0.12024048715829849, "rewards/margins": 0.04557327181100845, "rewards/rejected": 0.07466720789670944, "step": 680 }, { "epoch": 0.032034913412878965, "grad_norm": 102.96610260009766, "learning_rate": 2.9809183341845023e-07, "logits/chosen": -18.688030242919922, "logits/rejected": -17.344806671142578, "logps/chosen": -385.0511779785156, "logps/rejected": -329.9939880371094, "loss": 0.6813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12096847593784332, "rewards/margins": 0.025400612503290176, "rewards/rejected": 0.09556785970926285, "step": 690 }, { "epoch": 0.032499187520311994, "grad_norm": 173.201416015625, "learning_rate": 2.980639769720043e-07, "logits/chosen": -17.42366600036621, "logits/rejected": -16.7069091796875, "logps/chosen": -512.1301879882812, "logps/rejected": -481.40673828125, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.08986781537532806, "rewards/margins": 0.0013242706190794706, "rewards/rejected": 0.08854354918003082, "step": 700 }, { "epoch": 0.03296346162774502, "grad_norm": 111.50202178955078, "learning_rate": 2.9803612052555826e-07, "logits/chosen": -18.07979393005371, "logits/rejected": -17.08871841430664, "logps/chosen": -452.6851501464844, "logps/rejected": -353.2229919433594, "loss": 0.692, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 0.12595167756080627, "rewards/margins": 0.0030280284117907286, "rewards/rejected": 0.12292365729808807, "step": 710 }, { "epoch": 0.03342773573517805, "grad_norm": 89.10546875, "learning_rate": 2.980082640791123e-07, "logits/chosen": -18.38022804260254, "logits/rejected": -16.43529510498047, "logps/chosen": -596.95947265625, "logps/rejected": -350.989990234375, "loss": 0.658, "rewards/accuracies": 1.0, "rewards/chosen": 0.17318740487098694, "rewards/margins": 0.07207443565130234, "rewards/rejected": 0.1011129841208458, "step": 720 }, { "epoch": 0.03389200984261108, "grad_norm": 74.91948699951172, "learning_rate": 2.979804076326663e-07, "logits/chosen": -17.94866371154785, "logits/rejected": -17.07236671447754, "logps/chosen": -410.2129821777344, "logps/rejected": -283.5935363769531, "loss": 0.681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11742796748876572, "rewards/margins": 0.025156021118164062, "rewards/rejected": 0.09227196127176285, "step": 730 }, { "epoch": 0.03435628395004411, "grad_norm": 115.79324340820312, "learning_rate": 2.9795255118622033e-07, "logits/chosen": -17.38595199584961, "logits/rejected": -16.962459564208984, "logps/chosen": -409.33831787109375, "logps/rejected": -292.81707763671875, "loss": 0.6783, "rewards/accuracies": 0.5, "rewards/chosen": 0.1408666968345642, "rewards/margins": 0.03142397105693817, "rewards/rejected": 0.10944273322820663, "step": 740 }, { "epoch": 0.034820558057477136, "grad_norm": 70.10552978515625, "learning_rate": 2.979246947397744e-07, "logits/chosen": -17.29495620727539, "logits/rejected": -17.802270889282227, "logps/chosen": -347.4102478027344, "logps/rejected": -371.68408203125, "loss": 0.6936, "rewards/accuracies": 0.5, "rewards/chosen": 0.10566208511590958, "rewards/margins": -0.00012756287469528615, "rewards/rejected": 0.10578964650630951, "step": 750 }, { "epoch": 0.035284832164910164, "grad_norm": 122.97635650634766, "learning_rate": 2.9789683829332836e-07, "logits/chosen": 
-17.792619705200195, "logits/rejected": -18.196809768676758, "logps/chosen": -355.88641357421875, "logps/rejected": -431.7183532714844, "loss": 0.6942, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12683668732643127, "rewards/margins": -0.0005688481032848358, "rewards/rejected": 0.1274055540561676, "step": 760 }, { "epoch": 0.03574910627234319, "grad_norm": 107.86244201660156, "learning_rate": 2.9786898184688235e-07, "logits/chosen": -18.677295684814453, "logits/rejected": -16.855838775634766, "logps/chosen": -508.09368896484375, "logps/rejected": -309.49029541015625, "loss": 0.6752, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14351119101047516, "rewards/margins": 0.03724197670817375, "rewards/rejected": 0.1062692254781723, "step": 770 }, { "epoch": 0.03621338037977622, "grad_norm": 82.47341918945312, "learning_rate": 2.978411254004364e-07, "logits/chosen": -17.611772537231445, "logits/rejected": -18.066421508789062, "logps/chosen": -299.6412353515625, "logps/rejected": -312.9779968261719, "loss": 0.697, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.10839817672967911, "rewards/margins": -0.00697576766833663, "rewards/rejected": 0.11537393182516098, "step": 780 }, { "epoch": 0.03667765448720925, "grad_norm": 76.8634033203125, "learning_rate": 2.9781326895399043e-07, "logits/chosen": -18.220317840576172, "logits/rejected": -17.830278396606445, "logps/chosen": -450.646728515625, "logps/rejected": -384.5395812988281, "loss": 0.6839, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.11094300448894501, "rewards/margins": 0.019030077382922173, "rewards/rejected": 0.09191292524337769, "step": 790 }, { "epoch": 0.03714192859464228, "grad_norm": 121.58221435546875, "learning_rate": 2.9778541250754447e-07, "logits/chosen": -19.06214714050293, "logits/rejected": -17.622976303100586, "logps/chosen": -544.9043579101562, "logps/rejected": -385.69989013671875, "loss": 0.6591, "rewards/accuracies": 1.0, "rewards/chosen": 
0.1518273800611496, "rewards/margins": 0.06980965286493301, "rewards/rejected": 0.08201774954795837, "step": 800 }, { "epoch": 0.037606202702075306, "grad_norm": 76.89077758789062, "learning_rate": 2.9775755606109846e-07, "logits/chosen": -17.894977569580078, "logits/rejected": -16.649614334106445, "logps/chosen": -418.11712646484375, "logps/rejected": -283.9081115722656, "loss": 0.6673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15060913562774658, "rewards/margins": 0.05585899204015732, "rewards/rejected": 0.09475012868642807, "step": 810 }, { "epoch": 0.038070476809508334, "grad_norm": 66.7522964477539, "learning_rate": 2.9772969961465245e-07, "logits/chosen": -18.654937744140625, "logits/rejected": -17.678577423095703, "logps/chosen": -444.49786376953125, "logps/rejected": -339.84130859375, "loss": 0.684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1386624276638031, "rewards/margins": 0.018837356939911842, "rewards/rejected": 0.11982505023479462, "step": 820 }, { "epoch": 0.03853475091694136, "grad_norm": 119.23123931884766, "learning_rate": 2.977018431682065e-07, "logits/chosen": -17.79199981689453, "logits/rejected": -16.791519165039062, "logps/chosen": -528.9990234375, "logps/rejected": -402.9093322753906, "loss": 0.6811, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.15305283665657043, "rewards/margins": 0.026228487491607666, "rewards/rejected": 0.12682434916496277, "step": 830 }, { "epoch": 0.03899902502437439, "grad_norm": 136.82264709472656, "learning_rate": 2.9767398672176053e-07, "logits/chosen": -17.027812957763672, "logits/rejected": -16.50485610961914, "logps/chosen": -386.57086181640625, "logps/rejected": -343.696044921875, "loss": 0.6666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16229431331157684, "rewards/margins": 0.05580589920282364, "rewards/rejected": 0.1064884215593338, "step": 840 }, { "epoch": 0.03946329913180742, "grad_norm": 174.5501251220703, "learning_rate": 
2.976461302753145e-07, "logits/chosen": -17.524715423583984, "logits/rejected": -17.200437545776367, "logps/chosen": -382.87115478515625, "logps/rejected": -389.26226806640625, "loss": 0.6908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.13409201800823212, "rewards/margins": 0.006408006884157658, "rewards/rejected": 0.12768401205539703, "step": 850 }, { "epoch": 0.03992757323924045, "grad_norm": 128.1075897216797, "learning_rate": 2.9761827382886856e-07, "logits/chosen": -17.743818283081055, "logits/rejected": -16.674209594726562, "logps/chosen": -482.36920166015625, "logps/rejected": -442.26300048828125, "loss": 0.6885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.15339340269565582, "rewards/margins": 0.011176148429512978, "rewards/rejected": 0.1422172635793686, "step": 860 }, { "epoch": 0.040391847346673476, "grad_norm": 101.22434997558594, "learning_rate": 2.975904173824226e-07, "logits/chosen": -18.028928756713867, "logits/rejected": -16.96602439880371, "logps/chosen": -472.7303771972656, "logps/rejected": -442.55841064453125, "loss": 0.6771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1780276596546173, "rewards/margins": 0.033013515174388885, "rewards/rejected": 0.14501412212848663, "step": 870 }, { "epoch": 0.040856121454106505, "grad_norm": 89.69721221923828, "learning_rate": 2.975625609359766e-07, "logits/chosen": -17.70157814025879, "logits/rejected": -17.272235870361328, "logps/chosen": -305.7450866699219, "logps/rejected": -298.1076965332031, "loss": 0.6853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.10511615127325058, "rewards/margins": 0.016138877719640732, "rewards/rejected": 0.08897727727890015, "step": 880 }, { "epoch": 0.04132039556153953, "grad_norm": 122.48617553710938, "learning_rate": 2.975347044895306e-07, "logits/chosen": -17.954517364501953, "logits/rejected": -17.123027801513672, "logps/chosen": -440.302490234375, "logps/rejected": -303.412109375, "loss": 0.6598, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.17523670196533203, "rewards/margins": 0.06930689513683319, "rewards/rejected": 0.10592980682849884, "step": 890 }, { "epoch": 0.04178466966897256, "grad_norm": 134.5247344970703, "learning_rate": 2.975068480430846e-07, "logits/chosen": -17.945924758911133, "logits/rejected": -17.77553939819336, "logps/chosen": -451.05499267578125, "logps/rejected": -412.8329162597656, "loss": 0.7013, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.17616957426071167, "rewards/margins": -0.01425399910658598, "rewards/rejected": 0.19042357802391052, "step": 900 }, { "epoch": 0.04224894377640559, "grad_norm": 105.05684661865234, "learning_rate": 2.9747899159663866e-07, "logits/chosen": -17.736831665039062, "logits/rejected": -17.917095184326172, "logps/chosen": -446.48016357421875, "logps/rejected": -372.7218322753906, "loss": 0.6937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.13406707346439362, "rewards/margins": 0.0022669993340969086, "rewards/rejected": 0.1318000853061676, "step": 910 }, { "epoch": 0.04271321788383862, "grad_norm": 87.48090362548828, "learning_rate": 2.9745113515019265e-07, "logits/chosen": -17.78852653503418, "logits/rejected": -17.595027923583984, "logps/chosen": -491.94403076171875, "logps/rejected": -429.1636657714844, "loss": 0.6875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1822669953107834, "rewards/margins": 0.011952459812164307, "rewards/rejected": 0.17031455039978027, "step": 920 }, { "epoch": 0.043177491991271646, "grad_norm": 108.83039093017578, "learning_rate": 2.974232787037467e-07, "logits/chosen": -18.2768497467041, "logits/rejected": -17.477191925048828, "logps/chosen": -446.0062561035156, "logps/rejected": -350.3463439941406, "loss": 0.6891, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.15171553194522858, "rewards/margins": 0.009239329025149345, "rewards/rejected": 0.14247620105743408, "step": 930 }, { "epoch": 
0.043641766098704675, "grad_norm": 70.75343322753906, "learning_rate": 2.973954222573007e-07, "logits/chosen": -18.608959197998047, "logits/rejected": -17.99661636352539, "logps/chosen": -308.4905700683594, "logps/rejected": -237.32321166992188, "loss": 0.6754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.12948058545589447, "rewards/margins": 0.03680863231420517, "rewards/rejected": 0.0926719680428505, "step": 940 }, { "epoch": 0.0441060402061377, "grad_norm": 136.06820678710938, "learning_rate": 2.973675658108547e-07, "logits/chosen": -17.04922103881836, "logits/rejected": -16.675168991088867, "logps/chosen": -354.3727111816406, "logps/rejected": -286.6377868652344, "loss": 0.6702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13985465466976166, "rewards/margins": 0.04832877963781357, "rewards/rejected": 0.09152588248252869, "step": 950 }, { "epoch": 0.04457031431357073, "grad_norm": 73.29203033447266, "learning_rate": 2.973397093644087e-07, "logits/chosen": -18.707347869873047, "logits/rejected": -18.276607513427734, "logps/chosen": -442.7015686035156, "logps/rejected": -336.67919921875, "loss": 0.67, "rewards/accuracies": 0.5, "rewards/chosen": 0.19681641459465027, "rewards/margins": 0.049315858632326126, "rewards/rejected": 0.14750055968761444, "step": 960 }, { "epoch": 0.04503458842100376, "grad_norm": 103.45884704589844, "learning_rate": 2.9731185291796275e-07, "logits/chosen": -18.277217864990234, "logits/rejected": -17.633703231811523, "logps/chosen": -479.26934814453125, "logps/rejected": -430.37396240234375, "loss": 0.6789, "rewards/accuracies": 0.5, "rewards/chosen": 0.19952364265918732, "rewards/margins": 0.030475080013275146, "rewards/rejected": 0.16904854774475098, "step": 970 }, { "epoch": 0.04549886252843679, "grad_norm": 85.48160552978516, "learning_rate": 2.972839964715168e-07, "logits/chosen": -18.141826629638672, "logits/rejected": -17.193790435791016, "logps/chosen": -368.81439208984375, "logps/rejected": 
-240.6526336669922, "loss": 0.6683, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.16396942734718323, "rewards/margins": 0.05085624381899834, "rewards/rejected": 0.11311318725347519, "step": 980 }, { "epoch": 0.04596313663586982, "grad_norm": 62.61702346801758, "learning_rate": 2.972561400250708e-07, "logits/chosen": -18.189369201660156, "logits/rejected": -18.724390029907227, "logps/chosen": -443.25177001953125, "logps/rejected": -367.3170471191406, "loss": 0.694, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.14721977710723877, "rewards/margins": -0.0010771967936307192, "rewards/rejected": 0.14829698204994202, "step": 990 }, { "epoch": 0.046427410743302845, "grad_norm": 105.59542846679688, "learning_rate": 2.972282835786248e-07, "logits/chosen": -18.559120178222656, "logits/rejected": -17.8480224609375, "logps/chosen": -346.59100341796875, "logps/rejected": -320.1004333496094, "loss": 0.684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.19506791234016418, "rewards/margins": 0.0202484130859375, "rewards/rejected": 0.1748194843530655, "step": 1000 }, { "epoch": 0.046891684850735874, "grad_norm": 58.94041061401367, "learning_rate": 2.972004271321788e-07, "logits/chosen": -17.1857967376709, "logits/rejected": -16.005355834960938, "logps/chosen": -388.451416015625, "logps/rejected": -219.3909454345703, "loss": 0.6474, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.20936532318592072, "rewards/margins": 0.0961318239569664, "rewards/rejected": 0.11323349177837372, "step": 1010 }, { "epoch": 0.0473559589581689, "grad_norm": 102.4660873413086, "learning_rate": 2.9717257068573285e-07, "logits/chosen": -17.23604965209961, "logits/rejected": -16.63636016845703, "logps/chosen": -343.729248046875, "logps/rejected": -284.6219787597656, "loss": 0.6679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.17661729454994202, "rewards/margins": 0.05352791026234627, "rewards/rejected": 0.12308937311172485, "step": 
1020 }, { "epoch": 0.04782023306560193, "grad_norm": 121.79590606689453, "learning_rate": 2.971447142392869e-07, "logits/chosen": -16.33792495727539, "logits/rejected": -16.905895233154297, "logps/chosen": -426.37054443359375, "logps/rejected": -525.1221923828125, "loss": 0.7293, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.1654476672410965, "rewards/margins": -0.06919205188751221, "rewards/rejected": 0.2346397191286087, "step": 1030 }, { "epoch": 0.04828450717303496, "grad_norm": 100.99112701416016, "learning_rate": 2.971168577928409e-07, "logits/chosen": -18.788366317749023, "logits/rejected": -17.187847137451172, "logps/chosen": -450.5957946777344, "logps/rejected": -347.82879638671875, "loss": 0.6616, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.20942549407482147, "rewards/margins": 0.06687416136264801, "rewards/rejected": 0.14255134761333466, "step": 1040 }, { "epoch": 0.04874878128046799, "grad_norm": 56.00457000732422, "learning_rate": 2.970890013463949e-07, "logits/chosen": -16.708026885986328, "logits/rejected": -16.191896438598633, "logps/chosen": -336.4464416503906, "logps/rejected": -306.0708312988281, "loss": 0.6672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.19874854385852814, "rewards/margins": 0.05386342853307724, "rewards/rejected": 0.1448851078748703, "step": 1050 }, { "epoch": 0.049213055387901015, "grad_norm": 60.72960662841797, "learning_rate": 2.970611448999489e-07, "logits/chosen": -18.141963958740234, "logits/rejected": -17.711402893066406, "logps/chosen": -420.052734375, "logps/rejected": -397.8394775390625, "loss": 0.6913, "rewards/accuracies": 0.5, "rewards/chosen": 0.20649957656860352, "rewards/margins": 0.005667650606483221, "rewards/rejected": 0.20083189010620117, "step": 1060 }, { "epoch": 0.049677329495334044, "grad_norm": 111.14381408691406, "learning_rate": 2.9703328845350294e-07, "logits/chosen": -18.65705108642578, "logits/rejected": -16.854639053344727, "logps/chosen": 
-463.20068359375, "logps/rejected": -269.64617919921875, "loss": 0.6419, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.21932518482208252, "rewards/margins": 0.10721901804208755, "rewards/rejected": 0.11210618168115616, "step": 1070 }, { "epoch": 0.05014160360276707, "grad_norm": 68.92208862304688, "learning_rate": 2.9700543200705693e-07, "logits/chosen": -18.489280700683594, "logits/rejected": -17.518062591552734, "logps/chosen": -519.7286376953125, "logps/rejected": -404.3791198730469, "loss": 0.6757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22286561131477356, "rewards/margins": 0.03671317547559738, "rewards/rejected": 0.18615242838859558, "step": 1080 }, { "epoch": 0.0506058777102001, "grad_norm": 82.6618881225586, "learning_rate": 2.96977575560611e-07, "logits/chosen": -19.06897735595703, "logits/rejected": -18.79486656188965, "logps/chosen": -382.1256408691406, "logps/rejected": -322.4118347167969, "loss": 0.7063, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.14745429158210754, "rewards/margins": -0.024118268862366676, "rewards/rejected": 0.17157256603240967, "step": 1090 }, { "epoch": 0.05107015181763313, "grad_norm": 120.99327850341797, "learning_rate": 2.96949719114165e-07, "logits/chosen": -17.369266510009766, "logits/rejected": -17.31143569946289, "logps/chosen": -404.13922119140625, "logps/rejected": -379.3133239746094, "loss": 0.6823, "rewards/accuracies": 0.5, "rewards/chosen": 0.23183898627758026, "rewards/margins": 0.028752172365784645, "rewards/rejected": 0.20308682322502136, "step": 1100 }, { "epoch": 0.05153442592506616, "grad_norm": 129.8936004638672, "learning_rate": 2.96921862667719e-07, "logits/chosen": -17.060270309448242, "logits/rejected": -17.492219924926758, "logps/chosen": -398.7113342285156, "logps/rejected": -408.0740966796875, "loss": 0.6877, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.19003717601299286, "rewards/margins": 0.01269889622926712, "rewards/rejected": 
0.17733827233314514, "step": 1110 }, { "epoch": 0.051998700032499186, "grad_norm": 85.94730377197266, "learning_rate": 2.9689400622127304e-07, "logits/chosen": -17.418989181518555, "logits/rejected": -17.163400650024414, "logps/chosen": -302.2437438964844, "logps/rejected": -323.78656005859375, "loss": 0.7214, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.12407181411981583, "rewards/margins": -0.054987143725156784, "rewards/rejected": 0.1790589541196823, "step": 1120 }, { "epoch": 0.052462974139932214, "grad_norm": 91.43364715576172, "learning_rate": 2.9686614977482703e-07, "logits/chosen": -18.16490936279297, "logits/rejected": -17.970539093017578, "logps/chosen": -412.13751220703125, "logps/rejected": -355.5553283691406, "loss": 0.6653, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.24489793181419373, "rewards/margins": 0.05771453306078911, "rewards/rejected": 0.18718339502811432, "step": 1130 }, { "epoch": 0.05292724824736524, "grad_norm": 103.89889526367188, "learning_rate": 2.9683829332838107e-07, "logits/chosen": -17.07699203491211, "logits/rejected": -18.2871036529541, "logps/chosen": -307.9039306640625, "logps/rejected": -481.35015869140625, "loss": 0.7373, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": 0.18083375692367554, "rewards/margins": -0.08535781502723694, "rewards/rejected": 0.2661915421485901, "step": 1140 }, { "epoch": 0.05339152235479827, "grad_norm": 81.92337036132812, "learning_rate": 2.9681043688193506e-07, "logits/chosen": -17.994760513305664, "logits/rejected": -16.663034439086914, "logps/chosen": -399.294921875, "logps/rejected": -261.5648193359375, "loss": 0.6652, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.192474827170372, "rewards/margins": 0.06005844473838806, "rewards/rejected": 0.13241638243198395, "step": 1150 }, { "epoch": 0.0538557964622313, "grad_norm": 112.7637710571289, "learning_rate": 2.967825804354891e-07, "logits/chosen": -17.5280818939209, "logits/rejected": 
-16.653533935546875, "logps/chosen": -459.9208984375, "logps/rejected": -363.00897216796875, "loss": 0.6749, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21134671568870544, "rewards/margins": 0.04269593209028244, "rewards/rejected": 0.1686508059501648, "step": 1160 }, { "epoch": 0.05432007056966433, "grad_norm": 57.52288055419922, "learning_rate": 2.9675472398904314e-07, "logits/chosen": -18.2012882232666, "logits/rejected": -17.59156608581543, "logps/chosen": -361.62701416015625, "logps/rejected": -394.3235778808594, "loss": 0.6815, "rewards/accuracies": 0.5, "rewards/chosen": 0.19071580469608307, "rewards/margins": 0.02442474663257599, "rewards/rejected": 0.1662910431623459, "step": 1170 }, { "epoch": 0.054784344677097356, "grad_norm": 104.89786529541016, "learning_rate": 2.9672686754259713e-07, "logits/chosen": -17.529260635375977, "logits/rejected": -17.5631103515625, "logps/chosen": -389.60650634765625, "logps/rejected": -381.63214111328125, "loss": 0.6979, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.22088806331157684, "rewards/margins": -0.008218673057854176, "rewards/rejected": 0.22910673916339874, "step": 1180 }, { "epoch": 0.055248618784530384, "grad_norm": 143.39129638671875, "learning_rate": 2.966990110961511e-07, "logits/chosen": -17.59573745727539, "logits/rejected": -17.012855529785156, "logps/chosen": -411.33941650390625, "logps/rejected": -339.04473876953125, "loss": 0.6718, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.20568940043449402, "rewards/margins": 0.0472152940928936, "rewards/rejected": 0.15847408771514893, "step": 1190 }, { "epoch": 0.05571289289196341, "grad_norm": 79.02962493896484, "learning_rate": 2.9667115464970516e-07, "logits/chosen": -17.016803741455078, "logits/rejected": -17.14663314819336, "logps/chosen": -489.45965576171875, "logps/rejected": -530.7103881835938, "loss": 0.7, "rewards/accuracies": 0.5, "rewards/chosen": 0.22083115577697754, "rewards/margins": 
-0.010452575981616974, "rewards/rejected": 0.2312837392091751, "step": 1200 }, { "epoch": 0.05617716699939644, "grad_norm": 103.0136489868164, "learning_rate": 2.966432982032592e-07, "logits/chosen": -18.193010330200195, "logits/rejected": -18.04822540283203, "logps/chosen": -451.868896484375, "logps/rejected": -444.48046875, "loss": 0.6801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2267257422208786, "rewards/margins": 0.030494648963212967, "rewards/rejected": 0.19623108208179474, "step": 1210 }, { "epoch": 0.05664144110682947, "grad_norm": 109.85965728759766, "learning_rate": 2.9661544175681324e-07, "logits/chosen": -17.516429901123047, "logits/rejected": -17.0471134185791, "logps/chosen": -527.2366943359375, "logps/rejected": -376.91015625, "loss": 0.6807, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2407761514186859, "rewards/margins": 0.026178056374192238, "rewards/rejected": 0.21459808945655823, "step": 1220 }, { "epoch": 0.0571057152142625, "grad_norm": 165.8756103515625, "learning_rate": 2.9658758531036723e-07, "logits/chosen": -17.411670684814453, "logits/rejected": -17.39031982421875, "logps/chosen": -387.2791748046875, "logps/rejected": -439.36279296875, "loss": 0.6975, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.1601010113954544, "rewards/margins": -0.007943419739603996, "rewards/rejected": 0.16804443299770355, "step": 1230 }, { "epoch": 0.057569989321695526, "grad_norm": 44.02281951904297, "learning_rate": 2.965597288639212e-07, "logits/chosen": -17.701679229736328, "logits/rejected": -17.693439483642578, "logps/chosen": -348.42694091796875, "logps/rejected": -336.71075439453125, "loss": 0.6992, "rewards/accuracies": 0.5, "rewards/chosen": 0.16322921216487885, "rewards/margins": -0.01145782321691513, "rewards/rejected": 0.17468704283237457, "step": 1240 }, { "epoch": 0.058034263429128555, "grad_norm": 82.11949920654297, "learning_rate": 2.9653187241747526e-07, "logits/chosen": -18.235143661499023, 
"logits/rejected": -18.415525436401367, "logps/chosen": -269.1345520019531, "logps/rejected": -210.3825225830078, "loss": 0.6783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.14357788860797882, "rewards/margins": 0.030806925147771835, "rewards/rejected": 0.11277097463607788, "step": 1250 }, { "epoch": 0.05849853753656158, "grad_norm": 116.98623657226562, "learning_rate": 2.965040159710293e-07, "logits/chosen": -17.2365779876709, "logits/rejected": -16.29494857788086, "logps/chosen": -379.82110595703125, "logps/rejected": -256.6835021972656, "loss": 0.665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2007705718278885, "rewards/margins": 0.060283392667770386, "rewards/rejected": 0.1404871642589569, "step": 1260 }, { "epoch": 0.05896281164399461, "grad_norm": 135.61221313476562, "learning_rate": 2.964761595245833e-07, "logits/chosen": -17.63548469543457, "logits/rejected": -17.241241455078125, "logps/chosen": -348.69134521484375, "logps/rejected": -335.0475158691406, "loss": 0.7058, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.15050749480724335, "rewards/margins": -0.024095535278320312, "rewards/rejected": 0.17460303008556366, "step": 1270 }, { "epoch": 0.05942708575142764, "grad_norm": 57.235931396484375, "learning_rate": 2.9644830307813733e-07, "logits/chosen": -17.650056838989258, "logits/rejected": -17.472877502441406, "logps/chosen": -437.4979553222656, "logps/rejected": -380.3660583496094, "loss": 0.6861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2097894251346588, "rewards/margins": 0.017417756840586662, "rewards/rejected": 0.1923716813325882, "step": 1280 }, { "epoch": 0.05989135985886067, "grad_norm": 126.99041748046875, "learning_rate": 2.9642044663169137e-07, "logits/chosen": -18.386699676513672, "logits/rejected": -17.60431480407715, "logps/chosen": -547.536865234375, "logps/rejected": -415.24139404296875, "loss": 0.6637, "rewards/accuracies": 0.5, "rewards/chosen": 0.24799928069114685, 
"rewards/margins": 0.06397918611764908, "rewards/rejected": 0.18402007222175598, "step": 1290 }, { "epoch": 0.0603556339662937, "grad_norm": 64.16259002685547, "learning_rate": 2.9639259018524536e-07, "logits/chosen": -18.17657470703125, "logits/rejected": -18.242734909057617, "logps/chosen": -444.4970703125, "logps/rejected": -427.454833984375, "loss": 0.6755, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2393846958875656, "rewards/margins": 0.04026786610484123, "rewards/rejected": 0.1991168111562729, "step": 1300 }, { "epoch": 0.060819908073726725, "grad_norm": 155.82225036621094, "learning_rate": 2.9636473373879935e-07, "logits/chosen": -18.15133285522461, "logits/rejected": -18.092716217041016, "logps/chosen": -423.65753173828125, "logps/rejected": -403.27606201171875, "loss": 0.6888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21819503605365753, "rewards/margins": 0.009183046407997608, "rewards/rejected": 0.2090119868516922, "step": 1310 }, { "epoch": 0.06128418218115975, "grad_norm": 123.82539367675781, "learning_rate": 2.963368772923534e-07, "logits/chosen": -18.188419342041016, "logits/rejected": -17.86198616027832, "logps/chosen": -556.1478271484375, "logps/rejected": -484.42620849609375, "loss": 0.6438, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3443293869495392, "rewards/margins": 0.10449150949716568, "rewards/rejected": 0.2398378551006317, "step": 1320 }, { "epoch": 0.06174845628859278, "grad_norm": 88.44532775878906, "learning_rate": 2.9630902084590743e-07, "logits/chosen": -17.25745964050293, "logits/rejected": -16.890302658081055, "logps/chosen": -365.606201171875, "logps/rejected": -324.27593994140625, "loss": 0.695, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.16925635933876038, "rewards/margins": 0.0002482585550751537, "rewards/rejected": 0.16900810599327087, "step": 1330 }, { "epoch": 0.06221273039602581, "grad_norm": 68.71299743652344, "learning_rate": 2.962811643994614e-07, 
"logits/chosen": -18.1600284576416, "logits/rejected": -17.13959503173828, "logps/chosen": -544.1475830078125, "logps/rejected": -414.10552978515625, "loss": 0.6602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3233782649040222, "rewards/margins": 0.07386212050914764, "rewards/rejected": 0.24951617419719696, "step": 1340 }, { "epoch": 0.06267700450345884, "grad_norm": 65.65338134765625, "learning_rate": 2.9625330795301546e-07, "logits/chosen": -18.769926071166992, "logits/rejected": -17.57953453063965, "logps/chosen": -332.1579895019531, "logps/rejected": -238.71823120117188, "loss": 0.6694, "rewards/accuracies": 0.5, "rewards/chosen": 0.18683472275733948, "rewards/margins": 0.05136864259839058, "rewards/rejected": 0.1354660838842392, "step": 1350 }, { "epoch": 0.06314127861089187, "grad_norm": 122.66963958740234, "learning_rate": 2.9622545150656945e-07, "logits/chosen": -17.194175720214844, "logits/rejected": -16.95052719116211, "logps/chosen": -351.8629150390625, "logps/rejected": -322.71038818359375, "loss": 0.6746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.19328933954238892, "rewards/margins": 0.04067115858197212, "rewards/rejected": 0.1526181846857071, "step": 1360 }, { "epoch": 0.0636055527183249, "grad_norm": 68.90863800048828, "learning_rate": 2.961975950601235e-07, "logits/chosen": -18.080442428588867, "logits/rejected": -17.435176849365234, "logps/chosen": -469.0667419433594, "logps/rejected": -369.7581481933594, "loss": 0.6682, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.24695558845996857, "rewards/margins": 0.051895368844270706, "rewards/rejected": 0.19506019353866577, "step": 1370 }, { "epoch": 0.06406982682575793, "grad_norm": 138.02589416503906, "learning_rate": 2.961697386136775e-07, "logits/chosen": -18.341815948486328, "logits/rejected": -17.698001861572266, "logps/chosen": -439.005126953125, "logps/rejected": -355.33551025390625, "loss": 0.6639, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 0.22731153666973114, "rewards/margins": 0.06385256350040436, "rewards/rejected": 0.16345898807048798, "step": 1380 }, { "epoch": 0.06453410093319095, "grad_norm": 86.47201538085938, "learning_rate": 2.961418821672315e-07, "logits/chosen": -18.544069290161133, "logits/rejected": -17.89132308959961, "logps/chosen": -390.1712951660156, "logps/rejected": -343.37628173828125, "loss": 0.6597, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.24607177078723907, "rewards/margins": 0.07288757711648941, "rewards/rejected": 0.17318421602249146, "step": 1390 }, { "epoch": 0.06499837504062399, "grad_norm": 120.09246826171875, "learning_rate": 2.9611402572078556e-07, "logits/chosen": -18.63973617553711, "logits/rejected": -18.00241470336914, "logps/chosen": -427.408935546875, "logps/rejected": -370.5323181152344, "loss": 0.6676, "rewards/accuracies": 0.5, "rewards/chosen": 0.3250645697116852, "rewards/margins": 0.05694229155778885, "rewards/rejected": 0.26812225580215454, "step": 1400 }, { "epoch": 0.06546264914805701, "grad_norm": 94.91133117675781, "learning_rate": 2.9608616927433955e-07, "logits/chosen": -17.329063415527344, "logits/rejected": -16.943321228027344, "logps/chosen": -454.7281799316406, "logps/rejected": -379.51776123046875, "loss": 0.6796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.24396567046642303, "rewards/margins": 0.03317451477050781, "rewards/rejected": 0.21079115569591522, "step": 1410 }, { "epoch": 0.06592692325549004, "grad_norm": 56.47814178466797, "learning_rate": 2.960583128278936e-07, "logits/chosen": -17.863727569580078, "logits/rejected": -17.43106460571289, "logps/chosen": -394.6963195800781, "logps/rejected": -432.55059814453125, "loss": 0.7108, "rewards/accuracies": 0.5, "rewards/chosen": 0.22787415981292725, "rewards/margins": -0.029802698642015457, "rewards/rejected": 0.2576768696308136, "step": 1420 }, { "epoch": 0.06639119736292307, "grad_norm": 170.97874450683594, "learning_rate": 
2.960304563814476e-07, "logits/chosen": -18.579364776611328, "logits/rejected": -16.807357788085938, "logps/chosen": -430.6122131347656, "logps/rejected": -365.070068359375, "loss": 0.6513, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.26182129979133606, "rewards/margins": 0.08985106647014618, "rewards/rejected": 0.1719702184200287, "step": 1430 }, { "epoch": 0.0668554714703561, "grad_norm": 72.05404663085938, "learning_rate": 2.960025999350016e-07, "logits/chosen": -18.861080169677734, "logits/rejected": -17.977941513061523, "logps/chosen": -427.39776611328125, "logps/rejected": -346.6516418457031, "loss": 0.694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2740944027900696, "rewards/margins": 0.0007615730282850564, "rewards/rejected": 0.2733328342437744, "step": 1440 }, { "epoch": 0.06731974557778912, "grad_norm": 25.391902923583984, "learning_rate": 2.9597474348855566e-07, "logits/chosen": -18.617023468017578, "logits/rejected": -17.592220306396484, "logps/chosen": -351.51959228515625, "logps/rejected": -275.83367919921875, "loss": 0.6574, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2052338421344757, "rewards/margins": 0.076024129986763, "rewards/rejected": 0.1292097121477127, "step": 1450 }, { "epoch": 0.06778401968522216, "grad_norm": 122.06368255615234, "learning_rate": 2.9594688704210964e-07, "logits/chosen": -17.582014083862305, "logits/rejected": -16.670730590820312, "logps/chosen": -463.6724548339844, "logps/rejected": -369.2563781738281, "loss": 0.6541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3108446002006531, "rewards/margins": 0.08355052769184113, "rewards/rejected": 0.22729404270648956, "step": 1460 }, { "epoch": 0.06824829379265518, "grad_norm": 76.89612579345703, "learning_rate": 2.959190305956637e-07, "logits/chosen": -17.966915130615234, "logits/rejected": -17.57821273803711, "logps/chosen": -377.91717529296875, "logps/rejected": -303.2528381347656, "loss": 0.6748, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2694706320762634, "rewards/margins": 0.04006371647119522, "rewards/rejected": 0.22940687835216522, "step": 1470 }, { "epoch": 0.06871256790008821, "grad_norm": 63.03036880493164, "learning_rate": 2.9589117414921767e-07, "logits/chosen": -17.371906280517578, "logits/rejected": -16.64200210571289, "logps/chosen": -421.1785583496094, "logps/rejected": -328.6374816894531, "loss": 0.6584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2814578413963318, "rewards/margins": 0.07359834015369415, "rewards/rejected": 0.20785948634147644, "step": 1480 }, { "epoch": 0.06917684200752124, "grad_norm": 79.60414123535156, "learning_rate": 2.9586331770277166e-07, "logits/chosen": -17.531641006469727, "logits/rejected": -17.450157165527344, "logps/chosen": -225.77774047851562, "logps/rejected": -282.10321044921875, "loss": 0.7034, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.14977256953716278, "rewards/margins": -0.01871238648891449, "rewards/rejected": 0.16848495602607727, "step": 1490 }, { "epoch": 0.06964111611495427, "grad_norm": 138.10789489746094, "learning_rate": 2.958354612563257e-07, "logits/chosen": -18.430891036987305, "logits/rejected": -18.642751693725586, "logps/chosen": -349.95233154296875, "logps/rejected": -425.6109924316406, "loss": 0.7359, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.21025939285755157, "rewards/margins": -0.07925689965486526, "rewards/rejected": 0.28951629996299744, "step": 1500 }, { "epoch": 0.07010539022238729, "grad_norm": 41.199134826660156, "learning_rate": 2.9580760480987974e-07, "logits/chosen": -17.255443572998047, "logits/rejected": -17.339914321899414, "logps/chosen": -291.0590515136719, "logps/rejected": -291.1785583496094, "loss": 0.674, "rewards/accuracies": 0.5, "rewards/chosen": 0.2336605042219162, "rewards/margins": 0.04121135547757149, "rewards/rejected": 0.192449152469635, "step": 1510 }, { "epoch": 0.07056966432982033, 
"grad_norm": 147.87503051757812, "learning_rate": 2.957797483634338e-07, "logits/chosen": -18.301166534423828, "logits/rejected": -17.280574798583984, "logps/chosen": -455.39617919921875, "logps/rejected": -395.06072998046875, "loss": 0.6723, "rewards/accuracies": 0.5, "rewards/chosen": 0.2759346663951874, "rewards/margins": 0.04895597696304321, "rewards/rejected": 0.22697868943214417, "step": 1520 }, { "epoch": 0.07103393843725335, "grad_norm": 92.84616088867188, "learning_rate": 2.9575189191698777e-07, "logits/chosen": -18.21215057373047, "logits/rejected": -16.804771423339844, "logps/chosen": -402.2911376953125, "logps/rejected": -251.46682739257812, "loss": 0.6328, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.28987857699394226, "rewards/margins": 0.12855106592178345, "rewards/rejected": 0.1613275110721588, "step": 1530 }, { "epoch": 0.07149821254468638, "grad_norm": 117.16847229003906, "learning_rate": 2.957240354705418e-07, "logits/chosen": -18.329967498779297, "logits/rejected": -17.63744354248047, "logps/chosen": -479.14190673828125, "logps/rejected": -382.2281494140625, "loss": 0.6671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.34433990716934204, "rewards/margins": 0.05516525357961655, "rewards/rejected": 0.2891746163368225, "step": 1540 }, { "epoch": 0.0719624866521194, "grad_norm": 82.34053802490234, "learning_rate": 2.956961790240958e-07, "logits/chosen": -18.577693939208984, "logits/rejected": -17.4785099029541, "logps/chosen": -447.0570373535156, "logps/rejected": -279.7308044433594, "loss": 0.6527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2774028182029724, "rewards/margins": 0.08912818878889084, "rewards/rejected": 0.1882745921611786, "step": 1550 }, { "epoch": 0.07242676075955244, "grad_norm": 116.21884155273438, "learning_rate": 2.9566832257764984e-07, "logits/chosen": -18.6710205078125, "logits/rejected": -17.677778244018555, "logps/chosen": -463.37841796875, "logps/rejected": 
-430.3443908691406, "loss": 0.6811, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.27378594875335693, "rewards/margins": 0.026717226952314377, "rewards/rejected": 0.24706871807575226, "step": 1560 }, { "epoch": 0.07289103486698546, "grad_norm": 69.26189422607422, "learning_rate": 2.9564046613120383e-07, "logits/chosen": -17.811771392822266, "logits/rejected": -17.367891311645508, "logps/chosen": -386.8872375488281, "logps/rejected": -340.6778259277344, "loss": 0.6577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.26902228593826294, "rewards/margins": 0.07518830895423889, "rewards/rejected": 0.19383399188518524, "step": 1570 }, { "epoch": 0.0733553089744185, "grad_norm": 130.81468200683594, "learning_rate": 2.9561260968475787e-07, "logits/chosen": -17.747974395751953, "logits/rejected": -16.983747482299805, "logps/chosen": -454.4137268066406, "logps/rejected": -366.71588134765625, "loss": 0.6944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.23521943390369415, "rewards/margins": 0.002087248954921961, "rewards/rejected": 0.23313216865062714, "step": 1580 }, { "epoch": 0.07381958308185152, "grad_norm": 50.78691482543945, "learning_rate": 2.955847532383119e-07, "logits/chosen": -17.684223175048828, "logits/rejected": -16.953359603881836, "logps/chosen": -426.2616271972656, "logps/rejected": -342.98870849609375, "loss": 0.6671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2946968376636505, "rewards/margins": 0.054551392793655396, "rewards/rejected": 0.24014541506767273, "step": 1590 }, { "epoch": 0.07428385718928456, "grad_norm": 52.61949920654297, "learning_rate": 2.955568967918659e-07, "logits/chosen": -18.84414291381836, "logits/rejected": -18.481990814208984, "logps/chosen": -520.4970703125, "logps/rejected": -487.82647705078125, "loss": 0.6854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3403019607067108, "rewards/margins": 0.021631160750985146, "rewards/rejected": 0.31867077946662903, 
"step": 1600 }, { "epoch": 0.07474813129671758, "grad_norm": 69.78211975097656, "learning_rate": 2.955290403454199e-07, "logits/chosen": -18.286287307739258, "logits/rejected": -18.367502212524414, "logps/chosen": -296.19830322265625, "logps/rejected": -369.11175537109375, "loss": 0.7338, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": 0.24937209486961365, "rewards/margins": -0.07676223665475845, "rewards/rejected": 0.3261343538761139, "step": 1610 }, { "epoch": 0.07521240540415061, "grad_norm": 65.52604675292969, "learning_rate": 2.9550118389897393e-07, "logits/chosen": -17.58051109313965, "logits/rejected": -16.486520767211914, "logps/chosen": -445.2451171875, "logps/rejected": -327.4349670410156, "loss": 0.6446, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.31197357177734375, "rewards/margins": 0.10653762519359589, "rewards/rejected": 0.20543594658374786, "step": 1620 }, { "epoch": 0.07567667951158363, "grad_norm": 88.44215393066406, "learning_rate": 2.9547332745252797e-07, "logits/chosen": -18.68056297302246, "logits/rejected": -18.098529815673828, "logps/chosen": -398.8509826660156, "logps/rejected": -322.92626953125, "loss": 0.6599, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2851310670375824, "rewards/margins": 0.06936291605234146, "rewards/rejected": 0.21576817333698273, "step": 1630 }, { "epoch": 0.07614095361901667, "grad_norm": 95.64379119873047, "learning_rate": 2.95445471006082e-07, "logits/chosen": -19.25704574584961, "logits/rejected": -18.296142578125, "logps/chosen": -451.1255798339844, "logps/rejected": -375.1462707519531, "loss": 0.6831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2640101909637451, "rewards/margins": 0.02482648752629757, "rewards/rejected": 0.23918373882770538, "step": 1640 }, { "epoch": 0.07660522772644969, "grad_norm": 110.47467041015625, "learning_rate": 2.95417614559636e-07, "logits/chosen": -18.06210708618164, "logits/rejected": -17.526470184326172, 
"logps/chosen": -470.74041748046875, "logps/rejected": -359.7370910644531, "loss": 0.676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2929403781890869, "rewards/margins": 0.03951217979192734, "rewards/rejected": 0.253428190946579, "step": 1650 }, { "epoch": 0.07706950183388273, "grad_norm": 53.713897705078125, "learning_rate": 2.9538975811319e-07, "logits/chosen": -17.419973373413086, "logits/rejected": -16.81730842590332, "logps/chosen": -425.08624267578125, "logps/rejected": -291.68316650390625, "loss": 0.6794, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2742272913455963, "rewards/margins": 0.030835265293717384, "rewards/rejected": 0.24339203536510468, "step": 1660 }, { "epoch": 0.07753377594131575, "grad_norm": 67.92051696777344, "learning_rate": 2.9536190166674403e-07, "logits/chosen": -17.06180191040039, "logits/rejected": -16.47391700744629, "logps/chosen": -348.7020263671875, "logps/rejected": -254.62496948242188, "loss": 0.6572, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.29039156436920166, "rewards/margins": 0.08660377562046051, "rewards/rejected": 0.20378777384757996, "step": 1670 }, { "epoch": 0.07799805004874878, "grad_norm": 101.8538818359375, "learning_rate": 2.9533404522029807e-07, "logits/chosen": -17.53837013244629, "logits/rejected": -17.3294620513916, "logps/chosen": -290.7929382324219, "logps/rejected": -303.3674011230469, "loss": 0.6889, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.19964228570461273, "rewards/margins": 0.013834868557751179, "rewards/rejected": 0.18580742180347443, "step": 1680 }, { "epoch": 0.0784623241561818, "grad_norm": 58.88754653930664, "learning_rate": 2.9530618877385206e-07, "logits/chosen": -17.59099769592285, "logits/rejected": -17.52077865600586, "logps/chosen": -320.83697509765625, "logps/rejected": -379.67724609375, "loss": 0.7195, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.2356618344783783, "rewards/margins": 
-0.04828331619501114, "rewards/rejected": 0.2839451730251312, "step": 1690 }, { "epoch": 0.07892659826361484, "grad_norm": 81.68673706054688, "learning_rate": 2.952783323274061e-07, "logits/chosen": -18.222274780273438, "logits/rejected": -17.60292625427246, "logps/chosen": -398.8013916015625, "logps/rejected": -338.1429443359375, "loss": 0.6808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.24600057303905487, "rewards/margins": 0.028657596558332443, "rewards/rejected": 0.21734294295310974, "step": 1700 }, { "epoch": 0.07939087237104786, "grad_norm": 69.01876068115234, "learning_rate": 2.9525047588096014e-07, "logits/chosen": -18.30507469177246, "logits/rejected": -16.64376449584961, "logps/chosen": -485.21221923828125, "logps/rejected": -369.6275329589844, "loss": 0.6527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.37766212224960327, "rewards/margins": 0.08779828995466232, "rewards/rejected": 0.28986385464668274, "step": 1710 }, { "epoch": 0.0798551464784809, "grad_norm": 90.11952209472656, "learning_rate": 2.9522261943451413e-07, "logits/chosen": -17.900047302246094, "logits/rejected": -16.616708755493164, "logps/chosen": -412.971435546875, "logps/rejected": -284.94232177734375, "loss": 0.6527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.319225013256073, "rewards/margins": 0.09052605926990509, "rewards/rejected": 0.2286989390850067, "step": 1720 }, { "epoch": 0.08031942058591392, "grad_norm": 83.77873229980469, "learning_rate": 2.951947629880681e-07, "logits/chosen": -18.146297454833984, "logits/rejected": -16.866992950439453, "logps/chosen": -494.085693359375, "logps/rejected": -339.06610107421875, "loss": 0.6444, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.35798174142837524, "rewards/margins": 0.10804592072963715, "rewards/rejected": 0.2499357908964157, "step": 1730 }, { "epoch": 0.08078369469334695, "grad_norm": 54.86823654174805, "learning_rate": 2.9516690654162216e-07, "logits/chosen": 
-17.148601531982422, "logits/rejected": -16.387418746948242, "logps/chosen": -314.70086669921875, "logps/rejected": -227.3102264404297, "loss": 0.66, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.26043370366096497, "rewards/margins": 0.07342662662267685, "rewards/rejected": 0.18700706958770752, "step": 1740 }, { "epoch": 0.08124796880077997, "grad_norm": 87.82168579101562, "learning_rate": 2.951390500951762e-07, "logits/chosen": -18.6053524017334, "logits/rejected": -17.564138412475586, "logps/chosen": -315.27581787109375, "logps/rejected": -284.5935363769531, "loss": 0.6777, "rewards/accuracies": 0.5, "rewards/chosen": 0.22750870883464813, "rewards/margins": 0.03533000499010086, "rewards/rejected": 0.19217871129512787, "step": 1750 }, { "epoch": 0.08171224290821301, "grad_norm": 128.81153869628906, "learning_rate": 2.951111936487302e-07, "logits/chosen": -18.114614486694336, "logits/rejected": -18.095788955688477, "logps/chosen": -486.08843994140625, "logps/rejected": -475.76751708984375, "loss": 0.6978, "rewards/accuracies": 0.5, "rewards/chosen": 0.298004150390625, "rewards/margins": -0.00457271421328187, "rewards/rejected": 0.30257683992385864, "step": 1760 }, { "epoch": 0.08217651701564603, "grad_norm": 85.04387664794922, "learning_rate": 2.9508333720228423e-07, "logits/chosen": -18.337827682495117, "logits/rejected": -17.10872459411621, "logps/chosen": -404.2527770996094, "logps/rejected": -264.06292724609375, "loss": 0.6407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2789645791053772, "rewards/margins": 0.11551021039485931, "rewards/rejected": 0.1634543240070343, "step": 1770 }, { "epoch": 0.08264079112307907, "grad_norm": 94.0112533569336, "learning_rate": 2.950554807558382e-07, "logits/chosen": -17.982784271240234, "logits/rejected": -18.19283676147461, "logps/chosen": -456.1482849121094, "logps/rejected": -445.5244140625, "loss": 0.7019, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.33873170614242554, 
"rewards/margins": -0.013219604268670082, "rewards/rejected": 0.3519513010978699, "step": 1780 }, { "epoch": 0.0831050652305121, "grad_norm": 77.39507293701172, "learning_rate": 2.9502762430939226e-07, "logits/chosen": -17.598209381103516, "logits/rejected": -17.151180267333984, "logps/chosen": -238.36782836914062, "logps/rejected": -204.67733764648438, "loss": 0.6738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.19015321135520935, "rewards/margins": 0.04255220293998718, "rewards/rejected": 0.14760097861289978, "step": 1790 }, { "epoch": 0.08356933933794512, "grad_norm": 77.82684326171875, "learning_rate": 2.9499976786294624e-07, "logits/chosen": -19.679330825805664, "logits/rejected": -17.197538375854492, "logps/chosen": -453.01397705078125, "logps/rejected": -297.3131408691406, "loss": 0.6416, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.30796098709106445, "rewards/margins": 0.10943357646465302, "rewards/rejected": 0.19852738082408905, "step": 1800 }, { "epoch": 0.08403361344537816, "grad_norm": 58.89065933227539, "learning_rate": 2.949719114165003e-07, "logits/chosen": -18.113372802734375, "logits/rejected": -16.90030860900879, "logps/chosen": -436.18206787109375, "logps/rejected": -300.0413818359375, "loss": 0.6413, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.32552382349967957, "rewards/margins": 0.11409439146518707, "rewards/rejected": 0.2114294469356537, "step": 1810 }, { "epoch": 0.08449788755281118, "grad_norm": 23.4796142578125, "learning_rate": 2.949440549700543e-07, "logits/chosen": -17.549999237060547, "logits/rejected": -16.409780502319336, "logps/chosen": -347.64691162109375, "logps/rejected": -243.3507843017578, "loss": 0.6443, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3277524709701538, "rewards/margins": 0.10611480474472046, "rewards/rejected": 0.22163765132427216, "step": 1820 }, { "epoch": 0.08496216166024421, "grad_norm": 37.27141571044922, "learning_rate": 
2.949161985236083e-07, "logits/chosen": -17.32101821899414, "logits/rejected": -16.920724868774414, "logps/chosen": -343.51043701171875, "logps/rejected": -258.83758544921875, "loss": 0.674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.25727295875549316, "rewards/margins": 0.04157089442014694, "rewards/rejected": 0.21570205688476562, "step": 1830 }, { "epoch": 0.08542643576767724, "grad_norm": 77.43058013916016, "learning_rate": 2.9488834207716236e-07, "logits/chosen": -17.74839973449707, "logits/rejected": -16.838571548461914, "logps/chosen": -342.946533203125, "logps/rejected": -191.63047790527344, "loss": 0.6474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2858882248401642, "rewards/margins": 0.10254044830799103, "rewards/rejected": 0.18334776163101196, "step": 1840 }, { "epoch": 0.08589070987511027, "grad_norm": 83.73999786376953, "learning_rate": 2.9486048563071634e-07, "logits/chosen": -17.640491485595703, "logits/rejected": -17.10025405883789, "logps/chosen": -383.1759033203125, "logps/rejected": -250.5626983642578, "loss": 0.6511, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3266657888889313, "rewards/margins": 0.08912360668182373, "rewards/rejected": 0.23754219710826874, "step": 1850 }, { "epoch": 0.08635498398254329, "grad_norm": 112.73995971679688, "learning_rate": 2.948326291842704e-07, "logits/chosen": -17.725229263305664, "logits/rejected": -16.817411422729492, "logps/chosen": -413.218017578125, "logps/rejected": -307.2593688964844, "loss": 0.6552, "rewards/accuracies": 0.5, "rewards/chosen": 0.34685245156288147, "rewards/margins": 0.089100182056427, "rewards/rejected": 0.2577522397041321, "step": 1860 }, { "epoch": 0.08681925808997633, "grad_norm": 111.9901123046875, "learning_rate": 2.948047727378244e-07, "logits/chosen": -18.062671661376953, "logits/rejected": -17.09844398498535, "logps/chosen": -456.16949462890625, "logps/rejected": -320.43817138671875, "loss": 0.643, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 0.3537665009498596, "rewards/margins": 0.11756362020969391, "rewards/rejected": 0.23620295524597168, "step": 1870 }, { "epoch": 0.08728353219740935, "grad_norm": 43.312931060791016, "learning_rate": 2.947769162913784e-07, "logits/chosen": -17.421812057495117, "logits/rejected": -16.90735626220703, "logps/chosen": -379.23565673828125, "logps/rejected": -338.575927734375, "loss": 0.6717, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3205927312374115, "rewards/margins": 0.05218454450368881, "rewards/rejected": 0.2684082090854645, "step": 1880 }, { "epoch": 0.08774780630484239, "grad_norm": 92.6767578125, "learning_rate": 2.9474905984493245e-07, "logits/chosen": -16.667333602905273, "logits/rejected": -17.246383666992188, "logps/chosen": -274.8551025390625, "logps/rejected": -323.23406982421875, "loss": 0.7085, "rewards/accuracies": 0.5, "rewards/chosen": 0.25995591282844543, "rewards/margins": -0.026975005865097046, "rewards/rejected": 0.2869309186935425, "step": 1890 }, { "epoch": 0.0882120804122754, "grad_norm": 53.80314254760742, "learning_rate": 2.9472120339848644e-07, "logits/chosen": -18.64406967163086, "logits/rejected": -17.001144409179688, "logps/chosen": -462.20880126953125, "logps/rejected": -304.071044921875, "loss": 0.6399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.39226672053337097, "rewards/margins": 0.12100158631801605, "rewards/rejected": 0.27126508951187134, "step": 1900 }, { "epoch": 0.08867635451970844, "grad_norm": 62.51168441772461, "learning_rate": 2.9469334695204043e-07, "logits/chosen": -17.757856369018555, "logits/rejected": -17.55521583557129, "logps/chosen": -407.22955322265625, "logps/rejected": -410.311767578125, "loss": 0.6754, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.38167113065719604, "rewards/margins": 0.04181916266679764, "rewards/rejected": 0.3398520052433014, "step": 1910 }, { "epoch": 0.08914062862714146, "grad_norm": 66.99473571777344, 
"learning_rate": 2.9466549050559447e-07, "logits/chosen": -17.580692291259766, "logits/rejected": -18.017696380615234, "logps/chosen": -252.04013061523438, "logps/rejected": -297.11285400390625, "loss": 0.7481, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.21407651901245117, "rewards/margins": -0.09434747695922852, "rewards/rejected": 0.3084239661693573, "step": 1920 }, { "epoch": 0.0896049027345745, "grad_norm": 121.03783416748047, "learning_rate": 2.946376340591485e-07, "logits/chosen": -18.109601974487305, "logits/rejected": -16.173192977905273, "logps/chosen": -476.1002502441406, "logps/rejected": -210.63717651367188, "loss": 0.6136, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.37622225284576416, "rewards/margins": 0.17483563721179962, "rewards/rejected": 0.20138660073280334, "step": 1930 }, { "epoch": 0.09006917684200752, "grad_norm": 40.421512603759766, "learning_rate": 2.9460977761270255e-07, "logits/chosen": -18.529254913330078, "logits/rejected": -17.69350242614746, "logps/chosen": -468.94854736328125, "logps/rejected": -364.31536865234375, "loss": 0.6361, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.40684622526168823, "rewards/margins": 0.12393783032894135, "rewards/rejected": 0.2829083800315857, "step": 1940 }, { "epoch": 0.09053345094944056, "grad_norm": 64.6324462890625, "learning_rate": 2.9458192116625654e-07, "logits/chosen": -17.753314971923828, "logits/rejected": -17.367172241210938, "logps/chosen": -477.2611389160156, "logps/rejected": -448.302001953125, "loss": 0.6608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4157653748989105, "rewards/margins": 0.0715496763586998, "rewards/rejected": 0.3442157208919525, "step": 1950 }, { "epoch": 0.09099772505687358, "grad_norm": 178.97914123535156, "learning_rate": 2.945540647198106e-07, "logits/chosen": -18.842479705810547, "logits/rejected": -17.822784423828125, "logps/chosen": -475.43408203125, "logps/rejected": -398.84228515625, "loss": 
0.6998, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.3526425063610077, "rewards/margins": -0.011339189484715462, "rewards/rejected": 0.3639816641807556, "step": 1960 }, { "epoch": 0.09146199916430661, "grad_norm": 65.7156753540039, "learning_rate": 2.9452620827336457e-07, "logits/chosen": -18.451229095458984, "logits/rejected": -17.14171600341797, "logps/chosen": -506.7117614746094, "logps/rejected": -338.67401123046875, "loss": 0.6208, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5018307566642761, "rewards/margins": 0.16007542610168457, "rewards/rejected": 0.34175530076026917, "step": 1970 }, { "epoch": 0.09192627327173963, "grad_norm": 128.06076049804688, "learning_rate": 2.944983518269186e-07, "logits/chosen": -17.600223541259766, "logits/rejected": -17.774324417114258, "logps/chosen": -323.9027404785156, "logps/rejected": -305.7762145996094, "loss": 0.6861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.29015082120895386, "rewards/margins": 0.018644407391548157, "rewards/rejected": 0.2715064287185669, "step": 1980 }, { "epoch": 0.09239054737917267, "grad_norm": 133.08505249023438, "learning_rate": 2.944704953804726e-07, "logits/chosen": -17.986141204833984, "logits/rejected": -17.865726470947266, "logps/chosen": -417.61663818359375, "logps/rejected": -380.53082275390625, "loss": 0.6738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.34666532278060913, "rewards/margins": 0.043629761785268784, "rewards/rejected": 0.30303558707237244, "step": 1990 }, { "epoch": 0.09285482148660569, "grad_norm": 49.47740936279297, "learning_rate": 2.9444263893402664e-07, "logits/chosen": -16.895156860351562, "logits/rejected": -17.09023094177246, "logps/chosen": -316.484375, "logps/rejected": -324.3534240722656, "loss": 0.6967, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.31160250306129456, "rewards/margins": 0.002780616283416748, "rewards/rejected": 0.3088218569755554, "step": 2000 }, { "epoch": 
0.09331909559403873, "grad_norm": 62.135597229003906, "learning_rate": 2.944147824875807e-07, "logits/chosen": -17.60721778869629, "logits/rejected": -16.964815139770508, "logps/chosen": -293.5675354003906, "logps/rejected": -292.64239501953125, "loss": 0.6749, "rewards/accuracies": 0.5, "rewards/chosen": 0.2955215573310852, "rewards/margins": 0.04540389031171799, "rewards/rejected": 0.250117689371109, "step": 2010 }, { "epoch": 0.09378336970147175, "grad_norm": 75.73300170898438, "learning_rate": 2.9438692604113467e-07, "logits/chosen": -17.916255950927734, "logits/rejected": -15.90905475616455, "logps/chosen": -373.5625305175781, "logps/rejected": -225.8934783935547, "loss": 0.5732, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4566861689090729, "rewards/margins": 0.27309247851371765, "rewards/rejected": 0.18359358608722687, "step": 2020 }, { "epoch": 0.09424764380890478, "grad_norm": 63.27756881713867, "learning_rate": 2.9435906959468866e-07, "logits/chosen": -17.730310440063477, "logits/rejected": -16.971393585205078, "logps/chosen": -304.8402099609375, "logps/rejected": -254.68582153320312, "loss": 0.6684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3623339533805847, "rewards/margins": 0.05965324491262436, "rewards/rejected": 0.30268073081970215, "step": 2030 }, { "epoch": 0.0947119179163378, "grad_norm": 136.10911560058594, "learning_rate": 2.943312131482427e-07, "logits/chosen": -17.516050338745117, "logits/rejected": -16.675701141357422, "logps/chosen": -509.4755859375, "logps/rejected": -403.1003723144531, "loss": 0.6306, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.43638506531715393, "rewards/margins": 0.13427618145942688, "rewards/rejected": 0.30210885405540466, "step": 2040 }, { "epoch": 0.09517619202377084, "grad_norm": 106.38084411621094, "learning_rate": 2.9430335670179674e-07, "logits/chosen": -18.957332611083984, "logits/rejected": -18.985198974609375, "logps/chosen": -412.0924377441406, 
"logps/rejected": -372.0802917480469, "loss": 0.7069, "rewards/accuracies": 0.5, "rewards/chosen": 0.32225286960601807, "rewards/margins": -0.017870822921395302, "rewards/rejected": 0.34012371301651, "step": 2050 }, { "epoch": 0.09564046613120386, "grad_norm": 65.2176513671875, "learning_rate": 2.942755002553508e-07, "logits/chosen": -18.18659210205078, "logits/rejected": -17.84860610961914, "logps/chosen": -389.71441650390625, "logps/rejected": -331.40576171875, "loss": 0.6783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3759508728981018, "rewards/margins": 0.037816692143678665, "rewards/rejected": 0.33813413977622986, "step": 2060 }, { "epoch": 0.0961047402386369, "grad_norm": 140.17494201660156, "learning_rate": 2.9424764380890477e-07, "logits/chosen": -17.556320190429688, "logits/rejected": -18.11427879333496, "logps/chosen": -399.61907958984375, "logps/rejected": -538.6744995117188, "loss": 0.746, "rewards/accuracies": 0.5, "rewards/chosen": 0.31863340735435486, "rewards/margins": -0.09383420646190643, "rewards/rejected": 0.4124676585197449, "step": 2070 }, { "epoch": 0.09656901434606992, "grad_norm": 72.5481185913086, "learning_rate": 2.9421978736245876e-07, "logits/chosen": -19.414405822753906, "logits/rejected": -18.20473861694336, "logps/chosen": -327.293212890625, "logps/rejected": -269.0317077636719, "loss": 0.635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.388805091381073, "rewards/margins": 0.12448787689208984, "rewards/rejected": 0.26431721448898315, "step": 2080 }, { "epoch": 0.09703328845350295, "grad_norm": 39.27936553955078, "learning_rate": 2.941919309160128e-07, "logits/chosen": -18.1185359954834, "logits/rejected": -17.06308364868164, "logps/chosen": -446.85467529296875, "logps/rejected": -306.21490478515625, "loss": 0.6363, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.48147112131118774, "rewards/margins": 0.1349666565656662, "rewards/rejected": 0.34650444984436035, "step": 2090 }, { "epoch": 
0.09749756256093597, "grad_norm": 84.56820678710938, "learning_rate": 2.941640744695668e-07, "logits/chosen": -18.420166015625, "logits/rejected": -17.24138069152832, "logps/chosen": -473.13897705078125, "logps/rejected": -362.12237548828125, "loss": 0.6621, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.528256356716156, "rewards/margins": 0.08124299347400665, "rewards/rejected": 0.4470134377479553, "step": 2100 }, { "epoch": 0.09796183666836901, "grad_norm": 76.74879455566406, "learning_rate": 2.9413621802312083e-07, "logits/chosen": -18.0851993560791, "logits/rejected": -17.96174430847168, "logps/chosen": -438.796142578125, "logps/rejected": -442.83441162109375, "loss": 0.6763, "rewards/accuracies": 0.5, "rewards/chosen": 0.391213983297348, "rewards/margins": 0.04090103507041931, "rewards/rejected": 0.3503129780292511, "step": 2110 }, { "epoch": 0.09842611077580203, "grad_norm": 55.27158737182617, "learning_rate": 2.9410836157667487e-07, "logits/chosen": -17.931575775146484, "logits/rejected": -16.786888122558594, "logps/chosen": -406.6466979980469, "logps/rejected": -314.40020751953125, "loss": 0.6382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3842623829841614, "rewards/margins": 0.1227632537484169, "rewards/rejected": 0.2614991366863251, "step": 2120 }, { "epoch": 0.09889038488323507, "grad_norm": 59.82120132446289, "learning_rate": 2.940805051302289e-07, "logits/chosen": -17.59683609008789, "logits/rejected": -17.406314849853516, "logps/chosen": -384.38140869140625, "logps/rejected": -375.7089538574219, "loss": 0.6795, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.4223571717739105, "rewards/margins": 0.035972971469163895, "rewards/rejected": 0.38638418912887573, "step": 2130 }, { "epoch": 0.09935465899066809, "grad_norm": 97.9771499633789, "learning_rate": 2.940526486837829e-07, "logits/chosen": -17.86613655090332, "logits/rejected": -17.23392105102539, "logps/chosen": -401.0716247558594, "logps/rejected": 
-333.76910400390625, "loss": 0.6561, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4365234375, "rewards/margins": 0.07719795405864716, "rewards/rejected": 0.35932546854019165, "step": 2140 }, { "epoch": 0.09981893309810112, "grad_norm": 47.26624298095703, "learning_rate": 2.940247922373369e-07, "logits/chosen": -17.93790054321289, "logits/rejected": -17.425518035888672, "logps/chosen": -448.33929443359375, "logps/rejected": -295.4503173828125, "loss": 0.62, "rewards/accuracies": 1.0, "rewards/chosen": 0.45290547609329224, "rewards/margins": 0.15745477378368378, "rewards/rejected": 0.29545068740844727, "step": 2150 }, { "epoch": 0.10028320720553414, "grad_norm": 137.04139709472656, "learning_rate": 2.939969357908909e-07, "logits/chosen": -17.686899185180664, "logits/rejected": -17.595104217529297, "logps/chosen": -509.9916076660156, "logps/rejected": -498.22454833984375, "loss": 0.6802, "rewards/accuracies": 0.5, "rewards/chosen": 0.47065505385398865, "rewards/margins": 0.033933110535144806, "rewards/rejected": 0.43672195076942444, "step": 2160 }, { "epoch": 0.10074748131296718, "grad_norm": 92.11520385742188, "learning_rate": 2.9396907934444497e-07, "logits/chosen": -19.18813133239746, "logits/rejected": -18.882381439208984, "logps/chosen": -351.7355651855469, "logps/rejected": -359.1639709472656, "loss": 0.7138, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.31262075901031494, "rewards/margins": -0.03831230103969574, "rewards/rejected": 0.3509330153465271, "step": 2170 }, { "epoch": 0.1012117554204002, "grad_norm": 69.61847686767578, "learning_rate": 2.9394122289799896e-07, "logits/chosen": -17.057697296142578, "logits/rejected": -16.119388580322266, "logps/chosen": -425.43280029296875, "logps/rejected": -340.421630859375, "loss": 0.6669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.41338682174682617, "rewards/margins": 0.06056199222803116, "rewards/rejected": 0.3528248369693756, "step": 2180 }, { "epoch": 
0.10167602952783324, "grad_norm": 116.6235580444336, "learning_rate": 2.93913366451553e-07, "logits/chosen": -18.264949798583984, "logits/rejected": -17.41665267944336, "logps/chosen": -476.58514404296875, "logps/rejected": -400.04730224609375, "loss": 0.6736, "rewards/accuracies": 0.5, "rewards/chosen": 0.4544163644313812, "rewards/margins": 0.04947715252637863, "rewards/rejected": 0.4049392342567444, "step": 2190 }, { "epoch": 0.10214030363526626, "grad_norm": 98.34571838378906, "learning_rate": 2.93885510005107e-07, "logits/chosen": -17.558095932006836, "logits/rejected": -16.99049186706543, "logps/chosen": -473.05743408203125, "logps/rejected": -398.03485107421875, "loss": 0.6959, "rewards/accuracies": 0.5, "rewards/chosen": 0.5331467390060425, "rewards/margins": 0.027219165116548538, "rewards/rejected": 0.5059275031089783, "step": 2200 }, { "epoch": 0.1026045777426993, "grad_norm": 117.24947357177734, "learning_rate": 2.93857653558661e-07, "logits/chosen": -17.781877517700195, "logits/rejected": -16.387041091918945, "logps/chosen": -566.8297729492188, "logps/rejected": -407.13153076171875, "loss": 0.5956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6216217279434204, "rewards/margins": 0.2428237944841385, "rewards/rejected": 0.3787979185581207, "step": 2210 }, { "epoch": 0.10306885185013231, "grad_norm": 64.5951919555664, "learning_rate": 2.93829797112215e-07, "logits/chosen": -17.66275978088379, "logits/rejected": -17.547367095947266, "logps/chosen": -290.9940490722656, "logps/rejected": -336.8900451660156, "loss": 0.7029, "rewards/accuracies": 0.5, "rewards/chosen": 0.3602500259876251, "rewards/margins": -0.010232360102236271, "rewards/rejected": 0.3704823851585388, "step": 2220 }, { "epoch": 0.10353312595756535, "grad_norm": 121.57694244384766, "learning_rate": 2.9380194066576905e-07, "logits/chosen": -18.393362045288086, "logits/rejected": -16.798683166503906, "logps/chosen": -496.5001525878906, "logps/rejected": -288.6541748046875, "loss": 
0.5863, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5474986433982849, "rewards/margins": 0.24369315803050995, "rewards/rejected": 0.30380550026893616, "step": 2230 }, { "epoch": 0.10399740006499837, "grad_norm": 69.3285140991211, "learning_rate": 2.937740842193231e-07, "logits/chosen": -18.02028465270996, "logits/rejected": -16.847103118896484, "logps/chosen": -389.0244140625, "logps/rejected": -247.8090057373047, "loss": 0.5948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.47523361444473267, "rewards/margins": 0.22461919486522675, "rewards/rejected": 0.2506144344806671, "step": 2240 }, { "epoch": 0.1044616741724314, "grad_norm": 116.12999725341797, "learning_rate": 2.937462277728771e-07, "logits/chosen": -18.31293296813965, "logits/rejected": -17.605024337768555, "logps/chosen": -366.88690185546875, "logps/rejected": -248.36001586914062, "loss": 0.6382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4008759558200836, "rewards/margins": 0.11964781582355499, "rewards/rejected": 0.28122812509536743, "step": 2250 }, { "epoch": 0.10492594827986443, "grad_norm": 83.57363891601562, "learning_rate": 2.937183713264311e-07, "logits/chosen": -18.854900360107422, "logits/rejected": -18.082599639892578, "logps/chosen": -369.5138854980469, "logps/rejected": -337.96624755859375, "loss": 0.6667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.41834792494773865, "rewards/margins": 0.060997623950242996, "rewards/rejected": 0.35735034942626953, "step": 2260 }, { "epoch": 0.10539022238729746, "grad_norm": 69.43321228027344, "learning_rate": 2.936905148799851e-07, "logits/chosen": -17.775447845458984, "logits/rejected": -17.72513198852539, "logps/chosen": -344.74322509765625, "logps/rejected": -381.22955322265625, "loss": 0.6939, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.384408175945282, "rewards/margins": -0.0006136305746622384, "rewards/rejected": 0.385021835565567, "step": 2270 }, { "epoch": 
0.10585449649473049, "grad_norm": 89.28428649902344, "learning_rate": 2.9366265843353915e-07, "logits/chosen": -18.465213775634766, "logits/rejected": -17.561748504638672, "logps/chosen": -431.83380126953125, "logps/rejected": -357.9499816894531, "loss": 0.6127, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4606074392795563, "rewards/margins": 0.17720063030719757, "rewards/rejected": 0.2834068238735199, "step": 2280 }, { "epoch": 0.10631877060216352, "grad_norm": 107.95597076416016, "learning_rate": 2.9363480198709314e-07, "logits/chosen": -18.24144744873047, "logits/rejected": -17.718408584594727, "logps/chosen": -387.1502990722656, "logps/rejected": -313.4353942871094, "loss": 0.6677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5264445543289185, "rewards/margins": 0.060823969542980194, "rewards/rejected": 0.46562060713768005, "step": 2290 }, { "epoch": 0.10678304470959654, "grad_norm": 87.7799072265625, "learning_rate": 2.936069455406472e-07, "logits/chosen": -17.69540023803711, "logits/rejected": -16.91592025756836, "logps/chosen": -454.6753845214844, "logps/rejected": -328.37652587890625, "loss": 0.6281, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.565287172794342, "rewards/margins": 0.15843841433525085, "rewards/rejected": 0.4068487286567688, "step": 2300 }, { "epoch": 0.10724731881702958, "grad_norm": 65.75328826904297, "learning_rate": 2.935790890942012e-07, "logits/chosen": -17.60936164855957, "logits/rejected": -16.517372131347656, "logps/chosen": -357.75750732421875, "logps/rejected": -249.62588500976562, "loss": 0.67, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.31562623381614685, "rewards/margins": 0.055384982377290726, "rewards/rejected": 0.26024124026298523, "step": 2310 }, { "epoch": 0.1077115929244626, "grad_norm": 125.0814208984375, "learning_rate": 2.935512326477552e-07, "logits/chosen": -17.790937423706055, "logits/rejected": -16.73601531982422, "logps/chosen": -533.8828735351562, 
"logps/rejected": -355.86663818359375, "loss": 0.5895, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5678520798683167, "rewards/margins": 0.23389434814453125, "rewards/rejected": 0.3339576721191406, "step": 2320 }, { "epoch": 0.10817586703189563, "grad_norm": 129.71702575683594, "learning_rate": 2.935233762013092e-07, "logits/chosen": -18.75585174560547, "logits/rejected": -18.09734535217285, "logps/chosen": -507.2671813964844, "logps/rejected": -426.26025390625, "loss": 0.6197, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5479849576950073, "rewards/margins": 0.16063697636127472, "rewards/rejected": 0.3873480558395386, "step": 2330 }, { "epoch": 0.10864014113932866, "grad_norm": 83.96803283691406, "learning_rate": 2.9349551975486324e-07, "logits/chosen": -18.098072052001953, "logits/rejected": -17.09249496459961, "logps/chosen": -439.10882568359375, "logps/rejected": -392.732666015625, "loss": 0.6649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.47312527894973755, "rewards/margins": 0.06594777852296829, "rewards/rejected": 0.4071774482727051, "step": 2340 }, { "epoch": 0.10910441524676169, "grad_norm": 102.87901306152344, "learning_rate": 2.934676633084173e-07, "logits/chosen": -18.36341667175293, "logits/rejected": -16.874292373657227, "logps/chosen": -355.0718688964844, "logps/rejected": -249.75827026367188, "loss": 0.642, "rewards/accuracies": 0.5, "rewards/chosen": 0.4483468532562256, "rewards/margins": 0.12607114017009735, "rewards/rejected": 0.32227572798728943, "step": 2350 }, { "epoch": 0.10956868935419471, "grad_norm": 99.61408996582031, "learning_rate": 2.934398068619713e-07, "logits/chosen": -18.356666564941406, "logits/rejected": -18.030643463134766, "logps/chosen": -395.25457763671875, "logps/rejected": -331.1905822753906, "loss": 0.6604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5262095928192139, "rewards/margins": 0.07666341960430145, "rewards/rejected": 0.4495461583137512, "step": 
2360 }, { "epoch": 0.11003296346162775, "grad_norm": 127.64643859863281, "learning_rate": 2.934119504155253e-07, "logits/chosen": -18.83480453491211, "logits/rejected": -17.071002960205078, "logps/chosen": -488.3157653808594, "logps/rejected": -368.38604736328125, "loss": 0.6069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6149235963821411, "rewards/margins": 0.2070375680923462, "rewards/rejected": 0.40788596868515015, "step": 2370 }, { "epoch": 0.11049723756906077, "grad_norm": 146.921142578125, "learning_rate": 2.9338409396907935e-07, "logits/chosen": -18.15646743774414, "logits/rejected": -18.296403884887695, "logps/chosen": -500.87396240234375, "logps/rejected": -434.93975830078125, "loss": 0.7015, "rewards/accuracies": 0.5, "rewards/chosen": 0.48853740096092224, "rewards/margins": -0.005291207227855921, "rewards/rejected": 0.4938287138938904, "step": 2380 }, { "epoch": 0.1109615116764938, "grad_norm": 152.73519897460938, "learning_rate": 2.9335623752263334e-07, "logits/chosen": -17.993846893310547, "logits/rejected": -17.119958877563477, "logps/chosen": -494.905517578125, "logps/rejected": -403.6960754394531, "loss": 0.6728, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.51864093542099, "rewards/margins": 0.06630164384841919, "rewards/rejected": 0.4523393213748932, "step": 2390 }, { "epoch": 0.11142578578392683, "grad_norm": 119.6451187133789, "learning_rate": 2.933283810761874e-07, "logits/chosen": -19.37399673461914, "logits/rejected": -16.996097564697266, "logps/chosen": -432.838134765625, "logps/rejected": -310.8914489746094, "loss": 0.584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6433349847793579, "rewards/margins": 0.26722413301467896, "rewards/rejected": 0.37611085176467896, "step": 2400 }, { "epoch": 0.11189005989135986, "grad_norm": 108.79930877685547, "learning_rate": 2.9330052462974137e-07, "logits/chosen": -17.331100463867188, "logits/rejected": -17.4986629486084, "logps/chosen": 
-459.21844482421875, "logps/rejected": -425.53369140625, "loss": 0.7185, "rewards/accuracies": 0.5, "rewards/chosen": 0.5072699189186096, "rewards/margins": -0.0450800359249115, "rewards/rejected": 0.5523499250411987, "step": 2410 }, { "epoch": 0.11235433399879288, "grad_norm": 133.32899475097656, "learning_rate": 2.932726681832954e-07, "logits/chosen": -18.770654678344727, "logits/rejected": -18.732707977294922, "logps/chosen": -543.4058837890625, "logps/rejected": -515.3101806640625, "loss": 0.6804, "rewards/accuracies": 0.5, "rewards/chosen": 0.581652820110321, "rewards/margins": 0.03884606435894966, "rewards/rejected": 0.5428067445755005, "step": 2420 }, { "epoch": 0.11281860810622592, "grad_norm": 70.3641586303711, "learning_rate": 2.9324481173684945e-07, "logits/chosen": -18.42136001586914, "logits/rejected": -18.149084091186523, "logps/chosen": -393.3199462890625, "logps/rejected": -355.3845520019531, "loss": 0.6642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4941992163658142, "rewards/margins": 0.0721711665391922, "rewards/rejected": 0.4220280647277832, "step": 2430 }, { "epoch": 0.11328288221365894, "grad_norm": 102.2144546508789, "learning_rate": 2.9321695529040344e-07, "logits/chosen": -18.507863998413086, "logits/rejected": -17.707862854003906, "logps/chosen": -392.7869873046875, "logps/rejected": -319.89019775390625, "loss": 0.6427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5098764300346375, "rewards/margins": 0.11482875049114227, "rewards/rejected": 0.39504772424697876, "step": 2440 }, { "epoch": 0.11374715632109197, "grad_norm": 77.02296447753906, "learning_rate": 2.9318909884395743e-07, "logits/chosen": -17.960371017456055, "logits/rejected": -16.816343307495117, "logps/chosen": -343.97515869140625, "logps/rejected": -227.18960571289062, "loss": 0.6098, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.557820200920105, "rewards/margins": 0.1823653131723404, "rewards/rejected": 0.375454843044281, 
"step": 2450 }, { "epoch": 0.114211430428525, "grad_norm": 54.23787307739258, "learning_rate": 2.9316124239751147e-07, "logits/chosen": -18.42627716064453, "logits/rejected": -18.004812240600586, "logps/chosen": -373.5743103027344, "logps/rejected": -375.004638671875, "loss": 0.7239, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.4348391592502594, "rewards/margins": -0.05367283895611763, "rewards/rejected": 0.48851197957992554, "step": 2460 }, { "epoch": 0.11467570453595803, "grad_norm": 114.40089416503906, "learning_rate": 2.931333859510655e-07, "logits/chosen": -17.823444366455078, "logits/rejected": -18.536396026611328, "logps/chosen": -443.77838134765625, "logps/rejected": -491.49615478515625, "loss": 0.7123, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.553659200668335, "rewards/margins": -0.018247226253151894, "rewards/rejected": 0.5719064474105835, "step": 2470 }, { "epoch": 0.11513997864339105, "grad_norm": 90.09416961669922, "learning_rate": 2.931055295046195e-07, "logits/chosen": -19.49966812133789, "logits/rejected": -17.906827926635742, "logps/chosen": -524.9697875976562, "logps/rejected": -342.7666015625, "loss": 0.5968, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.649405300617218, "rewards/margins": 0.23161426186561584, "rewards/rejected": 0.4177909791469574, "step": 2480 }, { "epoch": 0.11560425275082409, "grad_norm": 94.74651336669922, "learning_rate": 2.9307767305817354e-07, "logits/chosen": -18.48963737487793, "logits/rejected": -17.31123161315918, "logps/chosen": -491.1385192871094, "logps/rejected": -362.1472473144531, "loss": 0.6195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6714617609977722, "rewards/margins": 0.17406722903251648, "rewards/rejected": 0.4973945617675781, "step": 2490 }, { "epoch": 0.11606852685825711, "grad_norm": 87.39982604980469, "learning_rate": 2.9304981661172753e-07, "logits/chosen": -17.852964401245117, "logits/rejected": -17.689090728759766, 
"logps/chosen": -334.19451904296875, "logps/rejected": -285.58673095703125, "loss": 0.6883, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4814261794090271, "rewards/margins": 0.02913704514503479, "rewards/rejected": 0.4522891640663147, "step": 2500 }, { "epoch": 0.11653280096569014, "grad_norm": 46.12029266357422, "learning_rate": 2.9302474580992615e-07, "logits/chosen": -17.86285400390625, "logits/rejected": -17.756877899169922, "logps/chosen": -416.4344177246094, "logps/rejected": -424.205078125, "loss": 0.6987, "rewards/accuracies": 0.5, "rewards/chosen": 0.4794137477874756, "rewards/margins": -0.003964531235396862, "rewards/rejected": 0.4833783209323883, "step": 2510 }, { "epoch": 0.11699707507312317, "grad_norm": 141.23609924316406, "learning_rate": 2.9299967500812477e-07, "logits/chosen": -19.11173439025879, "logits/rejected": -18.446819305419922, "logps/chosen": -553.2741088867188, "logps/rejected": -420.21875, "loss": 0.6568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5288776755332947, "rewards/margins": 0.09293005615472794, "rewards/rejected": 0.4359476566314697, "step": 2520 }, { "epoch": 0.1174613491805562, "grad_norm": 133.0366973876953, "learning_rate": 2.929718185616788e-07, "logits/chosen": -18.4621524810791, "logits/rejected": -17.599931716918945, "logps/chosen": -346.5234069824219, "logps/rejected": -238.26797485351562, "loss": 0.6533, "rewards/accuracies": 0.5, "rewards/chosen": 0.4888295531272888, "rewards/margins": 0.10568852722644806, "rewards/rejected": 0.38314107060432434, "step": 2530 }, { "epoch": 0.11792562328798922, "grad_norm": 84.13883972167969, "learning_rate": 2.9294396211523285e-07, "logits/chosen": -18.95123863220215, "logits/rejected": -18.11006736755371, "logps/chosen": -425.27886962890625, "logps/rejected": -307.2245178222656, "loss": 0.6223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5588261485099792, "rewards/margins": 0.16595840454101562, "rewards/rejected": 
0.3928677439689636, "step": 2540 }, { "epoch": 0.11838989739542226, "grad_norm": 140.81869506835938, "learning_rate": 2.9291610566878684e-07, "logits/chosen": -18.321537017822266, "logits/rejected": -17.908443450927734, "logps/chosen": -360.39801025390625, "logps/rejected": -311.20672607421875, "loss": 0.6539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4542056918144226, "rewards/margins": 0.08810420334339142, "rewards/rejected": 0.3661014437675476, "step": 2550 }, { "epoch": 0.11885417150285528, "grad_norm": 97.58634948730469, "learning_rate": 2.928882492223409e-07, "logits/chosen": -18.230899810791016, "logits/rejected": -17.379352569580078, "logps/chosen": -474.4298400878906, "logps/rejected": -378.90826416015625, "loss": 0.6345, "rewards/accuracies": 0.5, "rewards/chosen": 0.5806986093521118, "rewards/margins": 0.14260998368263245, "rewards/rejected": 0.43808865547180176, "step": 2560 }, { "epoch": 0.11931844561028832, "grad_norm": 25.73659324645996, "learning_rate": 2.9286039277589487e-07, "logits/chosen": -18.347797393798828, "logits/rejected": -17.716228485107422, "logps/chosen": -429.75799560546875, "logps/rejected": -324.7107238769531, "loss": 0.6308, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5402923822402954, "rewards/margins": 0.15132944285869598, "rewards/rejected": 0.38896289467811584, "step": 2570 }, { "epoch": 0.11978271971772134, "grad_norm": 105.99922943115234, "learning_rate": 2.928325363294489e-07, "logits/chosen": -18.668224334716797, "logits/rejected": -17.154422760009766, "logps/chosen": -469.81689453125, "logps/rejected": -342.68212890625, "loss": 0.6551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5236067175865173, "rewards/margins": 0.09125343710184097, "rewards/rejected": 0.43235331773757935, "step": 2580 }, { "epoch": 0.12024699382515437, "grad_norm": 98.23152923583984, "learning_rate": 2.928046798830029e-07, "logits/chosen": -18.62645721435547, "logits/rejected": -17.67536735534668, 
"logps/chosen": -459.4444274902344, "logps/rejected": -391.15008544921875, "loss": 0.6693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.558415949344635, "rewards/margins": 0.052885752171278, "rewards/rejected": 0.5055302381515503, "step": 2590 }, { "epoch": 0.1207112679325874, "grad_norm": 41.02576446533203, "learning_rate": 2.9277682343655694e-07, "logits/chosen": -18.00058364868164, "logits/rejected": -18.013776779174805, "logps/chosen": -423.595703125, "logps/rejected": -337.5746154785156, "loss": 0.6573, "rewards/accuracies": 0.5, "rewards/chosen": 0.5986093282699585, "rewards/margins": 0.10205421596765518, "rewards/rejected": 0.4965550899505615, "step": 2600 }, { "epoch": 0.12117554204002043, "grad_norm": 38.8684196472168, "learning_rate": 2.92748966990111e-07, "logits/chosen": -17.373332977294922, "logits/rejected": -16.00826072692871, "logps/chosen": -372.3862609863281, "logps/rejected": -208.96908569335938, "loss": 0.6278, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.48594555258750916, "rewards/margins": 0.1669907122850418, "rewards/rejected": 0.31895482540130615, "step": 2610 }, { "epoch": 0.12163981614745345, "grad_norm": 143.7066650390625, "learning_rate": 2.9272111054366497e-07, "logits/chosen": -17.591167449951172, "logits/rejected": -16.98748779296875, "logps/chosen": -373.64410400390625, "logps/rejected": -325.9721374511719, "loss": 0.6532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5258827209472656, "rewards/margins": 0.10539042949676514, "rewards/rejected": 0.4204922616481781, "step": 2620 }, { "epoch": 0.12210409025488649, "grad_norm": 104.33373260498047, "learning_rate": 2.9269325409721895e-07, "logits/chosen": -18.01220703125, "logits/rejected": -17.953624725341797, "logps/chosen": -434.7518005371094, "logps/rejected": -476.255615234375, "loss": 0.7496, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.571512758731842, "rewards/margins": -0.07257324457168579, "rewards/rejected": 
0.6440860033035278, "step": 2630 }, { "epoch": 0.1225683643623195, "grad_norm": 86.95465087890625, "learning_rate": 2.92665397650773e-07, "logits/chosen": -18.71133041381836, "logits/rejected": -17.881694793701172, "logps/chosen": -394.70526123046875, "logps/rejected": -410.14752197265625, "loss": 0.6624, "rewards/accuracies": 0.5, "rewards/chosen": 0.5829175710678101, "rewards/margins": 0.08990485221147537, "rewards/rejected": 0.49301281571388245, "step": 2640 }, { "epoch": 0.12303263846975254, "grad_norm": 117.30147552490234, "learning_rate": 2.9263754120432704e-07, "logits/chosen": -18.22380256652832, "logits/rejected": -17.828594207763672, "logps/chosen": -403.84375, "logps/rejected": -352.296875, "loss": 0.6347, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5879834294319153, "rewards/margins": 0.14232197403907776, "rewards/rejected": 0.4456614851951599, "step": 2650 }, { "epoch": 0.12349691257718556, "grad_norm": 62.06559371948242, "learning_rate": 2.92609684757881e-07, "logits/chosen": -17.94173812866211, "logits/rejected": -16.337078094482422, "logps/chosen": -393.2875671386719, "logps/rejected": -246.4335479736328, "loss": 0.5934, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5241793990135193, "rewards/margins": 0.23863014578819275, "rewards/rejected": 0.28554925322532654, "step": 2660 }, { "epoch": 0.1239611866846186, "grad_norm": 96.3547592163086, "learning_rate": 2.9258182831143507e-07, "logits/chosen": -17.599506378173828, "logits/rejected": -16.988847732543945, "logps/chosen": -306.984130859375, "logps/rejected": -224.73379516601562, "loss": 0.6424, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.41347652673721313, "rewards/margins": 0.11722588539123535, "rewards/rejected": 0.296250581741333, "step": 2670 }, { "epoch": 0.12442546079205162, "grad_norm": 93.98988342285156, "learning_rate": 2.9255397186498905e-07, "logits/chosen": -18.076976776123047, "logits/rejected": -17.288387298583984, "logps/chosen": 
-386.3916015625, "logps/rejected": -278.853271484375, "loss": 0.6258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5529699325561523, "rewards/margins": 0.16402655839920044, "rewards/rejected": 0.3889433443546295, "step": 2680 }, { "epoch": 0.12488973489948466, "grad_norm": 111.91241455078125, "learning_rate": 2.925261154185431e-07, "logits/chosen": -18.29023551940918, "logits/rejected": -16.942646026611328, "logps/chosen": -459.9029846191406, "logps/rejected": -355.57586669921875, "loss": 0.6543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6150577664375305, "rewards/margins": 0.0864117369055748, "rewards/rejected": 0.5286461114883423, "step": 2690 }, { "epoch": 0.12535400900691768, "grad_norm": 109.85623931884766, "learning_rate": 2.924982589720971e-07, "logits/chosen": -17.4627742767334, "logits/rejected": -17.347591400146484, "logps/chosen": -494.81756591796875, "logps/rejected": -510.3099670410156, "loss": 0.72, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.4829414486885071, "rewards/margins": -0.037217896431684494, "rewards/rejected": 0.5201593637466431, "step": 2700 }, { "epoch": 0.12581828311435073, "grad_norm": 67.69562530517578, "learning_rate": 2.924704025256511e-07, "logits/chosen": -17.637453079223633, "logits/rejected": -17.46546745300293, "logps/chosen": -429.73980712890625, "logps/rejected": -424.58392333984375, "loss": 0.7357, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.6200366616249084, "rewards/margins": -0.06323118507862091, "rewards/rejected": 0.6832678318023682, "step": 2710 }, { "epoch": 0.12628255722178375, "grad_norm": 74.17218780517578, "learning_rate": 2.9244254607920517e-07, "logits/chosen": -17.904834747314453, "logits/rejected": -16.383026123046875, "logps/chosen": -409.1935729980469, "logps/rejected": -202.79129028320312, "loss": 0.5792, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5693503618240356, "rewards/margins": 0.2726876139640808, 
"rewards/rejected": 0.2966627776622772, "step": 2720 }, { "epoch": 0.12674683132921677, "grad_norm": 111.76895904541016, "learning_rate": 2.924146896327592e-07, "logits/chosen": -18.743213653564453, "logits/rejected": -17.491670608520508, "logps/chosen": -342.40570068359375, "logps/rejected": -276.89520263671875, "loss": 0.6144, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5401535630226135, "rewards/margins": 0.18317437171936035, "rewards/rejected": 0.3569791913032532, "step": 2730 }, { "epoch": 0.1272111054366498, "grad_norm": 139.8992462158203, "learning_rate": 2.923868331863132e-07, "logits/chosen": -17.84526824951172, "logits/rejected": -16.784326553344727, "logps/chosen": -492.2936096191406, "logps/rejected": -356.1236877441406, "loss": 0.5974, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.673771858215332, "rewards/margins": 0.2187775820493698, "rewards/rejected": 0.454994261264801, "step": 2740 }, { "epoch": 0.12767537954408284, "grad_norm": 112.7441177368164, "learning_rate": 2.923589767398672e-07, "logits/chosen": -17.670669555664062, "logits/rejected": -16.914546966552734, "logps/chosen": -329.41748046875, "logps/rejected": -247.47427368164062, "loss": 0.6602, "rewards/accuracies": 0.5, "rewards/chosen": 0.5393210649490356, "rewards/margins": 0.08858601748943329, "rewards/rejected": 0.45073509216308594, "step": 2750 }, { "epoch": 0.12813965365151586, "grad_norm": 68.91506958007812, "learning_rate": 2.923311202934212e-07, "logits/chosen": -18.446319580078125, "logits/rejected": -17.194290161132812, "logps/chosen": -514.8841552734375, "logps/rejected": -387.90570068359375, "loss": 0.6076, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7105926275253296, "rewards/margins": 0.20183086395263672, "rewards/rejected": 0.5087617635726929, "step": 2760 }, { "epoch": 0.12860392775894888, "grad_norm": 110.99430847167969, "learning_rate": 2.9230326384697526e-07, "logits/chosen": -17.70412826538086, "logits/rejected": 
-16.98326873779297, "logps/chosen": -460.4417419433594, "logps/rejected": -334.3287353515625, "loss": 0.6819, "rewards/accuracies": 0.5, "rewards/chosen": 0.688062310218811, "rewards/margins": 0.05100066587328911, "rewards/rejected": 0.6370616555213928, "step": 2770 }, { "epoch": 0.1290682018663819, "grad_norm": 30.84825325012207, "learning_rate": 2.9227540740052925e-07, "logits/chosen": -18.260454177856445, "logits/rejected": -17.60406494140625, "logps/chosen": -413.9754333496094, "logps/rejected": -278.34039306640625, "loss": 0.6046, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6136776208877563, "rewards/margins": 0.23046882450580597, "rewards/rejected": 0.38320887088775635, "step": 2780 }, { "epoch": 0.12953247597381495, "grad_norm": 117.1463623046875, "learning_rate": 2.922475509540833e-07, "logits/chosen": -18.293455123901367, "logits/rejected": -17.323505401611328, "logps/chosen": -431.62896728515625, "logps/rejected": -360.01312255859375, "loss": 0.6182, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6819063425064087, "rewards/margins": 0.1687651425600052, "rewards/rejected": 0.5131412148475647, "step": 2790 }, { "epoch": 0.12999675008124797, "grad_norm": 182.0643768310547, "learning_rate": 2.922196945076373e-07, "logits/chosen": -18.114917755126953, "logits/rejected": -17.657392501831055, "logps/chosen": -549.47314453125, "logps/rejected": -506.1228942871094, "loss": 0.65, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7075631618499756, "rewards/margins": 0.1161053404211998, "rewards/rejected": 0.5914578437805176, "step": 2800 }, { "epoch": 0.130461024188681, "grad_norm": 41.552799224853516, "learning_rate": 2.921918380611913e-07, "logits/chosen": -17.90250587463379, "logits/rejected": -17.177265167236328, "logps/chosen": -341.73870849609375, "logps/rejected": -240.0276336669922, "loss": 0.6302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.4915243685245514, "rewards/margins": 0.13908886909484863, 
"rewards/rejected": 0.35243552923202515, "step": 2810 }, { "epoch": 0.13092529829611402, "grad_norm": 48.841121673583984, "learning_rate": 2.921639816147453e-07, "logits/chosen": -18.225099563598633, "logits/rejected": -17.78683853149414, "logps/chosen": -462.23095703125, "logps/rejected": -417.2350158691406, "loss": 0.6805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6960458755493164, "rewards/margins": 0.03780609741806984, "rewards/rejected": 0.6582397222518921, "step": 2820 }, { "epoch": 0.13138957240354707, "grad_norm": 70.57811737060547, "learning_rate": 2.9213612516829935e-07, "logits/chosen": -18.410694122314453, "logits/rejected": -18.179906845092773, "logps/chosen": -343.89556884765625, "logps/rejected": -346.8002624511719, "loss": 0.6946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5008925199508667, "rewards/margins": 0.01794544979929924, "rewards/rejected": 0.48294705152511597, "step": 2830 }, { "epoch": 0.1318538465109801, "grad_norm": 107.01775360107422, "learning_rate": 2.921082687218534e-07, "logits/chosen": -18.41263198852539, "logits/rejected": -17.355754852294922, "logps/chosen": -575.5589599609375, "logps/rejected": -432.95489501953125, "loss": 0.6447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.932452380657196, "rewards/margins": 0.15736596286296844, "rewards/rejected": 0.7750864624977112, "step": 2840 }, { "epoch": 0.1323181206184131, "grad_norm": 68.18220520019531, "learning_rate": 2.920804122754074e-07, "logits/chosen": -17.642276763916016, "logits/rejected": -17.480518341064453, "logps/chosen": -372.711181640625, "logps/rejected": -292.7355651855469, "loss": 0.6317, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.623234748840332, "rewards/margins": 0.14487698674201965, "rewards/rejected": 0.47835773229599, "step": 2850 }, { "epoch": 0.13278239472584613, "grad_norm": 42.24419021606445, "learning_rate": 2.920525558289614e-07, "logits/chosen": -18.519527435302734, 
"logits/rejected": -18.09327507019043, "logps/chosen": -328.3561706542969, "logps/rejected": -373.750244140625, "loss": 0.7287, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.517419695854187, "rewards/margins": 0.007839268073439598, "rewards/rejected": 0.5095804929733276, "step": 2860 }, { "epoch": 0.13324666883327918, "grad_norm": 64.03739166259766, "learning_rate": 2.920246993825154e-07, "logits/chosen": -18.15378189086914, "logits/rejected": -16.80885887145996, "logps/chosen": -520.7562866210938, "logps/rejected": -396.6856384277344, "loss": 0.5832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7005078196525574, "rewards/margins": 0.2610723674297333, "rewards/rejected": 0.4394353926181793, "step": 2870 }, { "epoch": 0.1337109429407122, "grad_norm": 50.25257873535156, "learning_rate": 2.9199684293606945e-07, "logits/chosen": -17.720304489135742, "logits/rejected": -17.77754783630371, "logps/chosen": -292.13177490234375, "logps/rejected": -318.5281066894531, "loss": 0.7013, "rewards/accuracies": 0.5, "rewards/chosen": 0.39934197068214417, "rewards/margins": -0.010922315530478954, "rewards/rejected": 0.4102643132209778, "step": 2880 }, { "epoch": 0.13417521704814522, "grad_norm": 54.45234680175781, "learning_rate": 2.9196898648962344e-07, "logits/chosen": -17.54033660888672, "logits/rejected": -17.094051361083984, "logps/chosen": -303.3470458984375, "logps/rejected": -232.88525390625, "loss": 0.5941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5578106641769409, "rewards/margins": 0.23895719647407532, "rewards/rejected": 0.31885355710983276, "step": 2890 }, { "epoch": 0.13463949115557824, "grad_norm": 121.95909881591797, "learning_rate": 2.919411300431775e-07, "logits/chosen": -17.696372985839844, "logits/rejected": -16.515098571777344, "logps/chosen": -404.8245849609375, "logps/rejected": -305.32427978515625, "loss": 0.598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5838284492492676, 
"rewards/margins": 0.21946267783641815, "rewards/rejected": 0.36436569690704346, "step": 2900 }, { "epoch": 0.1351037652630113, "grad_norm": 88.00968170166016, "learning_rate": 2.919132735967315e-07, "logits/chosen": -18.635692596435547, "logits/rejected": -18.556188583374023, "logps/chosen": -552.2750244140625, "logps/rejected": -404.36151123046875, "loss": 0.6835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7416518330574036, "rewards/margins": 0.04763137549161911, "rewards/rejected": 0.6940203905105591, "step": 2910 }, { "epoch": 0.13556803937044432, "grad_norm": 74.419677734375, "learning_rate": 2.918854171502855e-07, "logits/chosen": -17.08091163635254, "logits/rejected": -16.40222930908203, "logps/chosen": -369.44378662109375, "logps/rejected": -290.74395751953125, "loss": 0.6622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5879896879196167, "rewards/margins": 0.08492577821016312, "rewards/rejected": 0.5030639171600342, "step": 2920 }, { "epoch": 0.13603231347787734, "grad_norm": 85.98696899414062, "learning_rate": 2.918575607038395e-07, "logits/chosen": -18.315553665161133, "logits/rejected": -18.388227462768555, "logps/chosen": -408.48638916015625, "logps/rejected": -360.72650146484375, "loss": 0.6437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.598084568977356, "rewards/margins": 0.11433807760477066, "rewards/rejected": 0.48374658823013306, "step": 2930 }, { "epoch": 0.13649658758531036, "grad_norm": 107.9933853149414, "learning_rate": 2.9182970425739354e-07, "logits/chosen": -18.441242218017578, "logits/rejected": -18.4577579498291, "logps/chosen": -367.1937561035156, "logps/rejected": -344.79888916015625, "loss": 0.6821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6341129541397095, "rewards/margins": 0.03533799573779106, "rewards/rejected": 0.5987749099731445, "step": 2940 }, { "epoch": 0.1369608616927434, "grad_norm": 88.55716705322266, "learning_rate": 2.918018478109476e-07, 
"logits/chosen": -19.015295028686523, "logits/rejected": -18.278369903564453, "logps/chosen": -502.41375732421875, "logps/rejected": -372.80670166015625, "loss": 0.6314, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7644619941711426, "rewards/margins": 0.14893800020217896, "rewards/rejected": 0.6155239939689636, "step": 2950 }, { "epoch": 0.13742513580017643, "grad_norm": 43.94636154174805, "learning_rate": 2.917739913645016e-07, "logits/chosen": -18.170124053955078, "logits/rejected": -17.01937484741211, "logps/chosen": -393.12579345703125, "logps/rejected": -294.32720947265625, "loss": 0.6694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.53236985206604, "rewards/margins": 0.07757793366909027, "rewards/rejected": 0.4547918736934662, "step": 2960 }, { "epoch": 0.13788940990760945, "grad_norm": 70.03071594238281, "learning_rate": 2.917461349180556e-07, "logits/chosen": -19.0540714263916, "logits/rejected": -17.867258071899414, "logps/chosen": -351.98931884765625, "logps/rejected": -248.00357055664062, "loss": 0.634, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5202369689941406, "rewards/margins": 0.12884186208248138, "rewards/rejected": 0.39139512181282043, "step": 2970 }, { "epoch": 0.13835368401504247, "grad_norm": 98.85297393798828, "learning_rate": 2.9171827847160965e-07, "logits/chosen": -17.487213134765625, "logits/rejected": -16.896324157714844, "logps/chosen": -330.88739013671875, "logps/rejected": -261.8233947753906, "loss": 0.6681, "rewards/accuracies": 0.5, "rewards/chosen": 0.5666121244430542, "rewards/margins": 0.057289134711027145, "rewards/rejected": 0.5093228816986084, "step": 2980 }, { "epoch": 0.13881795812247552, "grad_norm": 48.3664665222168, "learning_rate": 2.9169042202516364e-07, "logits/chosen": -18.48871612548828, "logits/rejected": -17.90102195739746, "logps/chosen": -461.60406494140625, "logps/rejected": -359.2693786621094, "loss": 0.6741, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 0.6142764091491699, "rewards/margins": 0.051905591040849686, "rewards/rejected": 0.5623708963394165, "step": 2990 }, { "epoch": 0.13928223222990854, "grad_norm": 80.2057113647461, "learning_rate": 2.916625655787176e-07, "logits/chosen": -18.101980209350586, "logits/rejected": -18.001798629760742, "logps/chosen": -503.1399841308594, "logps/rejected": -422.78436279296875, "loss": 0.6529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7393986582756042, "rewards/margins": 0.13037022948265076, "rewards/rejected": 0.6090284585952759, "step": 3000 }, { "epoch": 0.13974650633734156, "grad_norm": 66.64413452148438, "learning_rate": 2.9163470913227167e-07, "logits/chosen": -17.471141815185547, "logits/rejected": -17.158720016479492, "logps/chosen": -419.94476318359375, "logps/rejected": -386.5711364746094, "loss": 0.6649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7247463464736938, "rewards/margins": 0.08953866362571716, "rewards/rejected": 0.6352077126502991, "step": 3010 }, { "epoch": 0.14021078044477459, "grad_norm": 122.11930084228516, "learning_rate": 2.916068526858257e-07, "logits/chosen": -18.176820755004883, "logits/rejected": -17.665512084960938, "logps/chosen": -398.35107421875, "logps/rejected": -345.20599365234375, "loss": 0.6967, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4931097626686096, "rewards/margins": 0.011906856670975685, "rewards/rejected": 0.4812029004096985, "step": 3020 }, { "epoch": 0.14067505455220763, "grad_norm": 42.351524353027344, "learning_rate": 2.9157899623937975e-07, "logits/chosen": -17.564607620239258, "logits/rejected": -17.204830169677734, "logps/chosen": -396.10711669921875, "logps/rejected": -343.1501770019531, "loss": 0.7209, "rewards/accuracies": 0.5, "rewards/chosen": 0.5468751788139343, "rewards/margins": -0.0004997908836230636, "rewards/rejected": 0.5473750829696655, "step": 3030 }, { "epoch": 0.14113932865964066, "grad_norm": 85.3527603149414, "learning_rate": 
2.9155113979293374e-07, "logits/chosen": -17.272315979003906, "logits/rejected": -16.878952026367188, "logps/chosen": -417.5428771972656, "logps/rejected": -300.5958251953125, "loss": 0.6355, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.7679678201675415, "rewards/margins": 0.1639557182788849, "rewards/rejected": 0.6040120720863342, "step": 3040 }, { "epoch": 0.14160360276707368, "grad_norm": 68.18299102783203, "learning_rate": 2.915232833464877e-07, "logits/chosen": -17.554325103759766, "logits/rejected": -16.25029182434082, "logps/chosen": -396.691162109375, "logps/rejected": -236.48330688476562, "loss": 0.5809, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6859524846076965, "rewards/margins": 0.2856714725494385, "rewards/rejected": 0.40028101205825806, "step": 3050 }, { "epoch": 0.1420678768745067, "grad_norm": 55.942081451416016, "learning_rate": 2.9149542690004177e-07, "logits/chosen": -17.750118255615234, "logits/rejected": -18.32577133178711, "logps/chosen": -424.2145080566406, "logps/rejected": -482.14056396484375, "loss": 0.7293, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.675886332988739, "rewards/margins": -0.03407769650220871, "rewards/rejected": 0.7099639177322388, "step": 3060 }, { "epoch": 0.14253215098193975, "grad_norm": 65.89762878417969, "learning_rate": 2.914675704535958e-07, "logits/chosen": -17.17725372314453, "logits/rejected": -16.41983413696289, "logps/chosen": -337.40277099609375, "logps/rejected": -338.84869384765625, "loss": 0.6671, "rewards/accuracies": 0.5, "rewards/chosen": 0.6044459342956543, "rewards/margins": 0.0828358381986618, "rewards/rejected": 0.5216101408004761, "step": 3070 }, { "epoch": 0.14299642508937277, "grad_norm": 159.4180145263672, "learning_rate": 2.914397140071498e-07, "logits/chosen": -17.79362678527832, "logits/rejected": -17.550376892089844, "logps/chosen": -460.15911865234375, "logps/rejected": -357.3041076660156, "loss": 0.6984, "rewards/accuracies": 0.5, 
"rewards/chosen": 0.5402505993843079, "rewards/margins": 0.015860864892601967, "rewards/rejected": 0.5243896842002869, "step": 3080 }, { "epoch": 0.1434606991968058, "grad_norm": 37.0462760925293, "learning_rate": 2.9141185756070384e-07, "logits/chosen": -16.783748626708984, "logits/rejected": -17.047956466674805, "logps/chosen": -357.70611572265625, "logps/rejected": -336.3292541503906, "loss": 0.7338, "rewards/accuracies": 0.5, "rewards/chosen": 0.5102128386497498, "rewards/margins": -0.0475364625453949, "rewards/rejected": 0.5577493906021118, "step": 3090 }, { "epoch": 0.1439249733042388, "grad_norm": 56.569454193115234, "learning_rate": 2.913840011142578e-07, "logits/chosen": -19.072908401489258, "logits/rejected": -18.53108024597168, "logps/chosen": -397.14385986328125, "logps/rejected": -339.9366455078125, "loss": 0.6675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6011667251586914, "rewards/margins": 0.0831093043088913, "rewards/rejected": 0.5180574059486389, "step": 3100 }, { "epoch": 0.14438924741167186, "grad_norm": 83.78886413574219, "learning_rate": 2.9135614466781186e-07, "logits/chosen": -18.01613998413086, "logits/rejected": -17.318126678466797, "logps/chosen": -412.94793701171875, "logps/rejected": -345.362548828125, "loss": 0.6166, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6369319558143616, "rewards/margins": 0.1696307510137558, "rewards/rejected": 0.4673011898994446, "step": 3110 }, { "epoch": 0.14485352151910488, "grad_norm": 52.85260009765625, "learning_rate": 2.9132828822136585e-07, "logits/chosen": -18.41288185119629, "logits/rejected": -17.473474502563477, "logps/chosen": -375.5820007324219, "logps/rejected": -331.3514099121094, "loss": 0.6862, "rewards/accuracies": 0.5, "rewards/chosen": 0.5657327771186829, "rewards/margins": 0.04104991629719734, "rewards/rejected": 0.524682879447937, "step": 3120 }, { "epoch": 0.1453177956265379, "grad_norm": 58.928504943847656, "learning_rate": 2.913004317749199e-07, 
"logits/chosen": -17.83778190612793, "logits/rejected": -16.7512264251709, "logps/chosen": -504.8798828125, "logps/rejected": -413.9385681152344, "loss": 0.5965, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9542994499206543, "rewards/margins": 0.2377736121416092, "rewards/rejected": 0.7165258526802063, "step": 3130 }, { "epoch": 0.14578206973397093, "grad_norm": 75.23446655273438, "learning_rate": 2.9127257532847393e-07, "logits/chosen": -17.61599349975586, "logits/rejected": -16.79678726196289, "logps/chosen": -308.44683837890625, "logps/rejected": -260.83697509765625, "loss": 0.6477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.585955798625946, "rewards/margins": 0.1161491647362709, "rewards/rejected": 0.4698066711425781, "step": 3140 }, { "epoch": 0.14624634384140398, "grad_norm": 51.500946044921875, "learning_rate": 2.9124750452667256e-07, "logits/chosen": -18.276451110839844, "logits/rejected": -17.84055519104004, "logps/chosen": -417.26220703125, "logps/rejected": -327.51861572265625, "loss": 0.5889, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7893635034561157, "rewards/margins": 0.24406547844409943, "rewards/rejected": 0.5452979207038879, "step": 3150 }, { "epoch": 0.146710617948837, "grad_norm": 70.17254638671875, "learning_rate": 2.9121964808022654e-07, "logits/chosen": -18.280044555664062, "logits/rejected": -17.73143196105957, "logps/chosen": -440.1888122558594, "logps/rejected": -376.7879638671875, "loss": 0.6711, "rewards/accuracies": 0.5, "rewards/chosen": 0.5858691930770874, "rewards/margins": 0.0605696439743042, "rewards/rejected": 0.5252994894981384, "step": 3160 }, { "epoch": 0.14717489205627002, "grad_norm": 148.74110412597656, "learning_rate": 2.911917916337806e-07, "logits/chosen": -18.855609893798828, "logits/rejected": -17.317066192626953, "logps/chosen": -398.89739990234375, "logps/rejected": -314.18890380859375, "loss": 0.6353, "rewards/accuracies": 0.5, "rewards/chosen": 0.6712985038757324, 
"rewards/margins": 0.15133526921272278, "rewards/rejected": 0.5199633240699768, "step": 3170 }, { "epoch": 0.14763916616370304, "grad_norm": 70.32422637939453, "learning_rate": 2.9116393518733457e-07, "logits/chosen": -17.894290924072266, "logits/rejected": -17.858327865600586, "logps/chosen": -491.1851501464844, "logps/rejected": -496.9193420410156, "loss": 0.6628, "rewards/accuracies": 0.5, "rewards/chosen": 0.792551577091217, "rewards/margins": 0.07835862785577774, "rewards/rejected": 0.7141929864883423, "step": 3180 }, { "epoch": 0.1481034402711361, "grad_norm": 138.70213317871094, "learning_rate": 2.911360787408886e-07, "logits/chosen": -17.6065731048584, "logits/rejected": -17.65072250366211, "logps/chosen": -310.82891845703125, "logps/rejected": -289.9864807128906, "loss": 0.7334, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.47951990365982056, "rewards/margins": -0.06586303561925888, "rewards/rejected": 0.5453829169273376, "step": 3190 }, { "epoch": 0.1485677143785691, "grad_norm": 122.73899841308594, "learning_rate": 2.9110822229444265e-07, "logits/chosen": -18.1839542388916, "logits/rejected": -17.471508026123047, "logps/chosen": -307.037353515625, "logps/rejected": -301.5962829589844, "loss": 0.746, "rewards/accuracies": 0.5, "rewards/chosen": 0.5820664167404175, "rewards/margins": -0.04743293672800064, "rewards/rejected": 0.6294993758201599, "step": 3200 }, { "epoch": 0.14903198848600213, "grad_norm": 70.39419555664062, "learning_rate": 2.9108036584799664e-07, "logits/chosen": -17.920021057128906, "logits/rejected": -17.266530990600586, "logps/chosen": -357.8512878417969, "logps/rejected": -289.7271728515625, "loss": 0.6353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6211289167404175, "rewards/margins": 0.14806345105171204, "rewards/rejected": 0.47306546568870544, "step": 3210 }, { "epoch": 0.14949626259343515, "grad_norm": 125.99661254882812, "learning_rate": 2.910525094015507e-07, "logits/chosen": -18.115116119384766, 
"logits/rejected": -17.914621353149414, "logps/chosen": -434.3263244628906, "logps/rejected": -410.9374084472656, "loss": 0.7345, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.6262685060501099, "rewards/margins": -0.05617981031537056, "rewards/rejected": 0.6824482679367065, "step": 3220 }, { "epoch": 0.1499605367008682, "grad_norm": 75.45013427734375, "learning_rate": 2.9102465295510467e-07, "logits/chosen": -17.994670867919922, "logits/rejected": -17.013134002685547, "logps/chosen": -363.636962890625, "logps/rejected": -257.98101806640625, "loss": 0.6151, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.657495379447937, "rewards/margins": 0.18681108951568604, "rewards/rejected": 0.47068434953689575, "step": 3230 }, { "epoch": 0.15042481080830122, "grad_norm": 77.14061737060547, "learning_rate": 2.909967965086587e-07, "logits/chosen": -17.819202423095703, "logits/rejected": -17.10492706298828, "logps/chosen": -449.8472595214844, "logps/rejected": -344.0034484863281, "loss": 0.6119, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7715551853179932, "rewards/margins": 0.19321730732917786, "rewards/rejected": 0.5783378481864929, "step": 3240 }, { "epoch": 0.15088908491573425, "grad_norm": 68.43038940429688, "learning_rate": 2.909689400622127e-07, "logits/chosen": -18.07083511352539, "logits/rejected": -17.03428840637207, "logps/chosen": -468.3829040527344, "logps/rejected": -307.91680908203125, "loss": 0.6503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7770493626594543, "rewards/margins": 0.12172194570302963, "rewards/rejected": 0.6553274989128113, "step": 3250 }, { "epoch": 0.15135335902316727, "grad_norm": 51.365020751953125, "learning_rate": 2.9094108361576674e-07, "logits/chosen": -18.551158905029297, "logits/rejected": -16.971820831298828, "logps/chosen": -432.4715881347656, "logps/rejected": -344.7942199707031, "loss": 0.629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7030183672904968, 
"rewards/margins": 0.15606904029846191, "rewards/rejected": 0.5469493269920349, "step": 3260 }, { "epoch": 0.15181763313060032, "grad_norm": 61.94179153442383, "learning_rate": 2.909132271693208e-07, "logits/chosen": -18.414297103881836, "logits/rejected": -18.347078323364258, "logps/chosen": -332.2785949707031, "logps/rejected": -331.0310974121094, "loss": 0.6994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5871157050132751, "rewards/margins": 0.025677397847175598, "rewards/rejected": 0.561438262462616, "step": 3270 }, { "epoch": 0.15228190723803334, "grad_norm": 85.75664520263672, "learning_rate": 2.9088537072287477e-07, "logits/chosen": -18.48321533203125, "logits/rejected": -16.901165008544922, "logps/chosen": -519.4710083007812, "logps/rejected": -299.1258239746094, "loss": 0.5789, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7797451615333557, "rewards/margins": 0.27299827337265015, "rewards/rejected": 0.5067468881607056, "step": 3280 }, { "epoch": 0.15274618134546636, "grad_norm": 60.0022087097168, "learning_rate": 2.9085751427642876e-07, "logits/chosen": -18.278186798095703, "logits/rejected": -15.779617309570312, "logps/chosen": -474.0813903808594, "logps/rejected": -184.69705200195312, "loss": 0.4567, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.94605952501297, "rewards/margins": 0.6286482214927673, "rewards/rejected": 0.3174113631248474, "step": 3290 }, { "epoch": 0.15321045545289938, "grad_norm": 37.407596588134766, "learning_rate": 2.908296578299828e-07, "logits/chosen": -17.602649688720703, "logits/rejected": -17.269346237182617, "logps/chosen": -369.5552062988281, "logps/rejected": -306.3258361816406, "loss": 0.6365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6052524447441101, "rewards/margins": 0.14316725730895996, "rewards/rejected": 0.46208518743515015, "step": 3300 }, { "epoch": 0.15367472956033243, "grad_norm": 106.95731353759766, "learning_rate": 2.9080180138353684e-07, 
"logits/chosen": -17.657915115356445, "logits/rejected": -17.052021026611328, "logps/chosen": -532.1570434570312, "logps/rejected": -370.00030517578125, "loss": 0.6575, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.7795381546020508, "rewards/margins": 0.1263556182384491, "rewards/rejected": 0.6531825661659241, "step": 3310 }, { "epoch": 0.15413900366776545, "grad_norm": 112.79298400878906, "learning_rate": 2.9077394493709083e-07, "logits/chosen": -18.554462432861328, "logits/rejected": -17.226207733154297, "logps/chosen": -479.2259216308594, "logps/rejected": -310.5823059082031, "loss": 0.6073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7159344553947449, "rewards/margins": 0.21403995156288147, "rewards/rejected": 0.501894474029541, "step": 3320 }, { "epoch": 0.15460327777519847, "grad_norm": 86.92269134521484, "learning_rate": 2.9074608849064487e-07, "logits/chosen": -18.243572235107422, "logits/rejected": -17.33965492248535, "logps/chosen": -423.675537109375, "logps/rejected": -276.1025390625, "loss": 0.5354, "rewards/accuracies": 1.0, "rewards/chosen": 0.8747159838676453, "rewards/margins": 0.35781413316726685, "rewards/rejected": 0.516901969909668, "step": 3330 }, { "epoch": 0.1550675518826315, "grad_norm": 139.58306884765625, "learning_rate": 2.907182320441989e-07, "logits/chosen": -18.234439849853516, "logits/rejected": -18.199485778808594, "logps/chosen": -451.9588928222656, "logps/rejected": -481.0164489746094, "loss": 0.7419, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.8631041646003723, "rewards/margins": -0.06212729215621948, "rewards/rejected": 0.9252313375473022, "step": 3340 }, { "epoch": 0.15553182599006454, "grad_norm": 64.71827697753906, "learning_rate": 2.906903755977529e-07, "logits/chosen": -19.581146240234375, "logits/rejected": -18.7359619140625, "logps/chosen": -427.49676513671875, "logps/rejected": -393.5354919433594, "loss": 0.6102, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 0.7892228960990906, "rewards/margins": 0.18988510966300964, "rewards/rejected": 0.5993377566337585, "step": 3350 }, { "epoch": 0.15599610009749756, "grad_norm": 92.07847595214844, "learning_rate": 2.906625191513069e-07, "logits/chosen": -18.25357437133789, "logits/rejected": -17.218032836914062, "logps/chosen": -491.811767578125, "logps/rejected": -357.27783203125, "loss": 0.5745, "rewards/accuracies": 1.0, "rewards/chosen": 0.8240594863891602, "rewards/margins": 0.25801852345466614, "rewards/rejected": 0.5660409331321716, "step": 3360 }, { "epoch": 0.15646037420493059, "grad_norm": 40.52136993408203, "learning_rate": 2.9063466270486093e-07, "logits/chosen": -16.821870803833008, "logits/rejected": -16.6726016998291, "logps/chosen": -412.52716064453125, "logps/rejected": -272.38897705078125, "loss": 0.5644, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8396472930908203, "rewards/margins": 0.3174380660057068, "rewards/rejected": 0.5222092270851135, "step": 3370 }, { "epoch": 0.1569246483123636, "grad_norm": 67.51866912841797, "learning_rate": 2.9060680625841497e-07, "logits/chosen": -18.539091110229492, "logits/rejected": -17.58595848083496, "logps/chosen": -433.1643981933594, "logps/rejected": -291.30926513671875, "loss": 0.6281, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6896904706954956, "rewards/margins": 0.15444782376289368, "rewards/rejected": 0.5352426171302795, "step": 3380 }, { "epoch": 0.15738892241979666, "grad_norm": 65.30690002441406, "learning_rate": 2.90578949811969e-07, "logits/chosen": -17.175018310546875, "logits/rejected": -16.7293701171875, "logps/chosen": -370.9895324707031, "logps/rejected": -301.8438720703125, "loss": 0.6514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7305065989494324, "rewards/margins": 0.1103198379278183, "rewards/rejected": 0.6201867461204529, "step": 3390 }, { "epoch": 0.15785319652722968, "grad_norm": 108.76478576660156, "learning_rate": 
2.90551093365523e-07, "logits/chosen": -18.197246551513672, "logits/rejected": -18.072681427001953, "logps/chosen": -462.67474365234375, "logps/rejected": -477.25372314453125, "loss": 0.7082, "rewards/accuracies": 0.5, "rewards/chosen": 0.7318617105484009, "rewards/margins": -0.019367460161447525, "rewards/rejected": 0.7512291669845581, "step": 3400 }, { "epoch": 0.1583174706346627, "grad_norm": 68.49213409423828, "learning_rate": 2.90523236919077e-07, "logits/chosen": -18.9097900390625, "logits/rejected": -17.698673248291016, "logps/chosen": -375.28094482421875, "logps/rejected": -247.40615844726562, "loss": 0.5971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6276172995567322, "rewards/margins": 0.22961732745170593, "rewards/rejected": 0.398000031709671, "step": 3410 }, { "epoch": 0.15878174474209572, "grad_norm": 70.13960266113281, "learning_rate": 2.9049538047263103e-07, "logits/chosen": -18.53003692626953, "logits/rejected": -17.620262145996094, "logps/chosen": -385.30194091796875, "logps/rejected": -264.7559509277344, "loss": 0.6213, "rewards/accuracies": 0.5, "rewards/chosen": 0.811755359172821, "rewards/margins": 0.22128288447856903, "rewards/rejected": 0.5904725790023804, "step": 3420 }, { "epoch": 0.15924601884952877, "grad_norm": 107.54270935058594, "learning_rate": 2.9046752402618507e-07, "logits/chosen": -18.689353942871094, "logits/rejected": -18.098329544067383, "logps/chosen": -545.0781860351562, "logps/rejected": -371.703125, "loss": 0.5637, "rewards/accuracies": 0.5, "rewards/chosen": 1.0197677612304688, "rewards/margins": 0.38296571373939514, "rewards/rejected": 0.6368020176887512, "step": 3430 }, { "epoch": 0.1597102929569618, "grad_norm": 125.14342498779297, "learning_rate": 2.9043966757973906e-07, "logits/chosen": -18.52072525024414, "logits/rejected": -18.730836868286133, "logps/chosen": -452.9527893066406, "logps/rejected": -496.43603515625, "loss": 0.7224, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 
0.7454604506492615, "rewards/margins": -0.007205492351204157, "rewards/rejected": 0.7526659369468689, "step": 3440 }, { "epoch": 0.1601745670643948, "grad_norm": 103.62628936767578, "learning_rate": 2.904118111332931e-07, "logits/chosen": -18.94863510131836, "logits/rejected": -17.87346076965332, "logps/chosen": -525.4996337890625, "logps/rejected": -402.1225280761719, "loss": 0.5964, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9331787824630737, "rewards/margins": 0.23406219482421875, "rewards/rejected": 0.6991165280342102, "step": 3450 }, { "epoch": 0.16063884117182783, "grad_norm": 105.19241333007812, "learning_rate": 2.903839546868471e-07, "logits/chosen": -18.69088363647461, "logits/rejected": -17.972980499267578, "logps/chosen": -496.75, "logps/rejected": -391.17559814453125, "loss": 0.7059, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.8330133557319641, "rewards/margins": 0.0010151624446734786, "rewards/rejected": 0.8319981694221497, "step": 3460 }, { "epoch": 0.16110311527926088, "grad_norm": 111.2286148071289, "learning_rate": 2.9035609824040113e-07, "logits/chosen": -18.404813766479492, "logits/rejected": -17.768177032470703, "logps/chosen": -424.45697021484375, "logps/rejected": -334.0193176269531, "loss": 0.6023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8736817240715027, "rewards/margins": 0.25167250633239746, "rewards/rejected": 0.62200927734375, "step": 3470 }, { "epoch": 0.1615673893866939, "grad_norm": 137.8740234375, "learning_rate": 2.903282417939551e-07, "logits/chosen": -17.867717742919922, "logits/rejected": -17.188243865966797, "logps/chosen": -373.7234802246094, "logps/rejected": -328.3979797363281, "loss": 0.7085, "rewards/accuracies": 0.5, "rewards/chosen": 0.6106628179550171, "rewards/margins": 0.013409310951828957, "rewards/rejected": 0.5972535014152527, "step": 3480 }, { "epoch": 0.16203166349412693, "grad_norm": 73.86395263671875, "learning_rate": 2.9030038534750916e-07, 
"logits/chosen": -19.069913864135742, "logits/rejected": -18.05548095703125, "logps/chosen": -357.9356384277344, "logps/rejected": -319.47003173828125, "loss": 0.6309, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.73241126537323, "rewards/margins": 0.14778530597686768, "rewards/rejected": 0.5846258997917175, "step": 3490 }, { "epoch": 0.16249593760155995, "grad_norm": 93.348876953125, "learning_rate": 2.902725289010632e-07, "logits/chosen": -17.852245330810547, "logits/rejected": -17.17350959777832, "logps/chosen": -481.898681640625, "logps/rejected": -355.45452880859375, "loss": 0.6438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9573438763618469, "rewards/margins": 0.18482384085655212, "rewards/rejected": 0.7725200057029724, "step": 3500 }, { "epoch": 0.162960211708993, "grad_norm": 58.69588851928711, "learning_rate": 2.902446724546172e-07, "logits/chosen": -18.277660369873047, "logits/rejected": -17.315759658813477, "logps/chosen": -454.5224609375, "logps/rejected": -369.480712890625, "loss": 0.6425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8737324476242065, "rewards/margins": 0.1283978372812271, "rewards/rejected": 0.7453345656394958, "step": 3510 }, { "epoch": 0.16342448581642602, "grad_norm": 69.13815307617188, "learning_rate": 2.902168160081712e-07, "logits/chosen": -18.15355682373047, "logits/rejected": -17.308908462524414, "logps/chosen": -419.7845764160156, "logps/rejected": -356.4355163574219, "loss": 0.6201, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7896326780319214, "rewards/margins": 0.16622593998908997, "rewards/rejected": 0.6234067678451538, "step": 3520 }, { "epoch": 0.16388875992385904, "grad_norm": 56.65826416015625, "learning_rate": 2.901889595617252e-07, "logits/chosen": -17.904592514038086, "logits/rejected": -16.925195693969727, "logps/chosen": -387.27874755859375, "logps/rejected": -282.86212158203125, "loss": 0.6311, "rewards/accuracies": 0.5, "rewards/chosen": 
0.8117233514785767, "rewards/margins": 0.1810448169708252, "rewards/rejected": 0.6306785345077515, "step": 3530 }, { "epoch": 0.16435303403129206, "grad_norm": 81.99915313720703, "learning_rate": 2.9016110311527925e-07, "logits/chosen": -18.44211196899414, "logits/rejected": -18.025123596191406, "logps/chosen": -354.6427001953125, "logps/rejected": -347.47991943359375, "loss": 0.7035, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.6244110465049744, "rewards/margins": 0.0012814014917239547, "rewards/rejected": 0.623129665851593, "step": 3540 }, { "epoch": 0.1648173081387251, "grad_norm": 102.32557678222656, "learning_rate": 2.9013324666883324e-07, "logits/chosen": -18.103200912475586, "logits/rejected": -17.443416595458984, "logps/chosen": -361.9154968261719, "logps/rejected": -250.6254119873047, "loss": 0.6013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7149603366851807, "rewards/margins": 0.2079273909330368, "rewards/rejected": 0.5070329904556274, "step": 3550 }, { "epoch": 0.16528158224615813, "grad_norm": 45.47930908203125, "learning_rate": 2.901053902223873e-07, "logits/chosen": -18.448991775512695, "logits/rejected": -17.194196701049805, "logps/chosen": -377.5616149902344, "logps/rejected": -365.54229736328125, "loss": 0.6484, "rewards/accuracies": 0.5, "rewards/chosen": 0.7061413526535034, "rewards/margins": 0.14817726612091064, "rewards/rejected": 0.557964026927948, "step": 3560 }, { "epoch": 0.16574585635359115, "grad_norm": 87.82937622070312, "learning_rate": 2.900775337759413e-07, "logits/chosen": -17.28255271911621, "logits/rejected": -17.169218063354492, "logps/chosen": -327.73419189453125, "logps/rejected": -316.18072509765625, "loss": 0.7446, "rewards/accuracies": 0.5, "rewards/chosen": 0.6713768243789673, "rewards/margins": -0.04303940013051033, "rewards/rejected": 0.7144162058830261, "step": 3570 }, { "epoch": 0.1662101304610242, "grad_norm": 67.75474548339844, "learning_rate": 2.900496773294953e-07, 
"logits/chosen": -18.369600296020508, "logits/rejected": -17.99663543701172, "logps/chosen": -369.5442199707031, "logps/rejected": -315.7245788574219, "loss": 0.6207, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.88597571849823, "rewards/margins": 0.1903459131717682, "rewards/rejected": 0.6956297159194946, "step": 3580 }, { "epoch": 0.16667440456845722, "grad_norm": 145.68846130371094, "learning_rate": 2.9002182088304935e-07, "logits/chosen": -17.728984832763672, "logits/rejected": -17.06857681274414, "logps/chosen": -422.7398376464844, "logps/rejected": -359.82086181640625, "loss": 0.6244, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8468904495239258, "rewards/margins": 0.19857630133628845, "rewards/rejected": 0.6483141183853149, "step": 3590 }, { "epoch": 0.16713867867589025, "grad_norm": 97.37892150878906, "learning_rate": 2.8999396443660334e-07, "logits/chosen": -18.61159896850586, "logits/rejected": -17.525650024414062, "logps/chosen": -451.76458740234375, "logps/rejected": -354.96337890625, "loss": 0.6312, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8908022046089172, "rewards/margins": 0.16709065437316895, "rewards/rejected": 0.7237115502357483, "step": 3600 }, { "epoch": 0.16760295278332327, "grad_norm": 106.27289581298828, "learning_rate": 2.899661079901574e-07, "logits/chosen": -19.098125457763672, "logits/rejected": -17.97098731994629, "logps/chosen": -396.9530944824219, "logps/rejected": -366.44097900390625, "loss": 0.633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8255743980407715, "rewards/margins": 0.16375091671943665, "rewards/rejected": 0.6618233919143677, "step": 3610 }, { "epoch": 0.16806722689075632, "grad_norm": 92.48492431640625, "learning_rate": 2.899382515437114e-07, "logits/chosen": -17.908313751220703, "logits/rejected": -17.1241397857666, "logps/chosen": -497.09710693359375, "logps/rejected": -371.8640441894531, "loss": 0.6344, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 0.8735305070877075, "rewards/margins": 0.147649884223938, "rewards/rejected": 0.7258806228637695, "step": 3620 }, { "epoch": 0.16853150099818934, "grad_norm": 42.66592025756836, "learning_rate": 2.899103950972654e-07, "logits/chosen": -18.03453254699707, "logits/rejected": -16.28874969482422, "logps/chosen": -424.97381591796875, "logps/rejected": -270.1409606933594, "loss": 0.5249, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0114340782165527, "rewards/margins": 0.40433087944984436, "rewards/rejected": 0.6071033477783203, "step": 3630 }, { "epoch": 0.16899577510562236, "grad_norm": 98.54498291015625, "learning_rate": 2.8988253865081945e-07, "logits/chosen": -18.44988250732422, "logits/rejected": -18.784475326538086, "logps/chosen": -458.17401123046875, "logps/rejected": -469.7838439941406, "loss": 0.7468, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.8341930508613586, "rewards/margins": -0.08677148073911667, "rewards/rejected": 0.9209645390510559, "step": 3640 }, { "epoch": 0.16946004921305538, "grad_norm": 63.57672882080078, "learning_rate": 2.8985468220437344e-07, "logits/chosen": -18.171329498291016, "logits/rejected": -17.659496307373047, "logps/chosen": -493.7449645996094, "logps/rejected": -434.7940368652344, "loss": 0.6888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9056129455566406, "rewards/margins": 0.044965602457523346, "rewards/rejected": 0.8606473803520203, "step": 3650 }, { "epoch": 0.16992432332048843, "grad_norm": 109.97968292236328, "learning_rate": 2.8982682575792743e-07, "logits/chosen": -17.006311416625977, "logits/rejected": -17.001222610473633, "logps/chosen": -314.40216064453125, "logps/rejected": -345.60443115234375, "loss": 0.7586, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7330456972122192, "rewards/margins": -0.048852212727069855, "rewards/rejected": 0.7818979024887085, "step": 3660 }, { "epoch": 0.17038859742792145, "grad_norm": 75.47868347167969, 
"learning_rate": 2.8979896931148147e-07, "logits/chosen": -18.631364822387695, "logits/rejected": -17.658432006835938, "logps/chosen": -320.8546447753906, "logps/rejected": -274.802978515625, "loss": 0.7352, "rewards/accuracies": 0.5, "rewards/chosen": 0.5973833799362183, "rewards/margins": -0.020588625222444534, "rewards/rejected": 0.6179720759391785, "step": 3670 }, { "epoch": 0.17085287153535447, "grad_norm": 35.963680267333984, "learning_rate": 2.897711128650355e-07, "logits/chosen": -18.837295532226562, "logits/rejected": -17.625755310058594, "logps/chosen": -534.2899169921875, "logps/rejected": -302.81591796875, "loss": 0.5862, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9347850680351257, "rewards/margins": 0.3576729893684387, "rewards/rejected": 0.5771121382713318, "step": 3680 }, { "epoch": 0.1713171456427875, "grad_norm": 133.17115783691406, "learning_rate": 2.8974325641858955e-07, "logits/chosen": -18.04637336730957, "logits/rejected": -17.871042251586914, "logps/chosen": -463.8168029785156, "logps/rejected": -401.31964111328125, "loss": 0.705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9029020071029663, "rewards/margins": 0.07229019701480865, "rewards/rejected": 0.8306118845939636, "step": 3690 }, { "epoch": 0.17178141975022054, "grad_norm": 145.6256103515625, "learning_rate": 2.8971539997214354e-07, "logits/chosen": -18.466005325317383, "logits/rejected": -18.66688346862793, "logps/chosen": -447.9222717285156, "logps/rejected": -426.371337890625, "loss": 0.73, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.7099112868309021, "rewards/margins": -0.04522966220974922, "rewards/rejected": 0.7551409602165222, "step": 3700 }, { "epoch": 0.17224569385765356, "grad_norm": 88.13247680664062, "learning_rate": 2.8968754352569753e-07, "logits/chosen": -18.419719696044922, "logits/rejected": -18.054882049560547, "logps/chosen": -466.01953125, "logps/rejected": -441.8290100097656, "loss": 0.7093, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8999370336532593, "rewards/margins": 0.01977676711976528, "rewards/rejected": 0.8801602125167847, "step": 3710 }, { "epoch": 0.17270996796508659, "grad_norm": 60.44733428955078, "learning_rate": 2.8965968707925157e-07, "logits/chosen": -18.139453887939453, "logits/rejected": -17.313011169433594, "logps/chosen": -394.65277099609375, "logps/rejected": -310.73150634765625, "loss": 0.5797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8208646774291992, "rewards/margins": 0.28139087557792664, "rewards/rejected": 0.5394737124443054, "step": 3720 }, { "epoch": 0.1731742420725196, "grad_norm": 57.578102111816406, "learning_rate": 2.896318306328056e-07, "logits/chosen": -18.65052032470703, "logits/rejected": -16.905635833740234, "logps/chosen": -415.5105895996094, "logps/rejected": -294.2066345214844, "loss": 0.5806, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9149495959281921, "rewards/margins": 0.2664806842803955, "rewards/rejected": 0.6484689712524414, "step": 3730 }, { "epoch": 0.17363851617995266, "grad_norm": 66.61353302001953, "learning_rate": 2.896039741863596e-07, "logits/chosen": -18.90267562866211, "logits/rejected": -18.101213455200195, "logps/chosen": -479.7018127441406, "logps/rejected": -329.64556884765625, "loss": 0.5736, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9336680173873901, "rewards/margins": 0.2762433886528015, "rewards/rejected": 0.6574246287345886, "step": 3740 }, { "epoch": 0.17410279028738568, "grad_norm": 37.64634704589844, "learning_rate": 2.8957611773991364e-07, "logits/chosen": -18.784387588500977, "logits/rejected": -17.162036895751953, "logps/chosen": -415.5771484375, "logps/rejected": -272.42596435546875, "loss": 0.5919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8701158761978149, "rewards/margins": 0.2758820950984955, "rewards/rejected": 0.5942336916923523, "step": 3750 }, { "epoch": 0.1745670643948187, 
"grad_norm": 74.1735610961914, "learning_rate": 2.895482612934677e-07, "logits/chosen": -18.297414779663086, "logits/rejected": -16.554460525512695, "logps/chosen": -490.72845458984375, "logps/rejected": -301.5240478515625, "loss": 0.5456, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0159566402435303, "rewards/margins": 0.3757534921169281, "rewards/rejected": 0.6402031183242798, "step": 3760 }, { "epoch": 0.17503133850225172, "grad_norm": 45.68318557739258, "learning_rate": 2.8952040484702167e-07, "logits/chosen": -17.71128273010254, "logits/rejected": -17.73048210144043, "logps/chosen": -269.57269287109375, "logps/rejected": -319.0127258300781, "loss": 0.7823, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.515546441078186, "rewards/margins": -0.14082136750221252, "rewards/rejected": 0.6563677787780762, "step": 3770 }, { "epoch": 0.17549561260968477, "grad_norm": 59.66310501098633, "learning_rate": 2.8949254840057566e-07, "logits/chosen": -18.392061233520508, "logits/rejected": -17.79549217224121, "logps/chosen": -482.98016357421875, "logps/rejected": -313.0513916015625, "loss": 0.6229, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9587502479553223, "rewards/margins": 0.20318754017353058, "rewards/rejected": 0.7555626630783081, "step": 3780 }, { "epoch": 0.1759598867171178, "grad_norm": 70.31092071533203, "learning_rate": 2.894646919541297e-07, "logits/chosen": -18.61328125, "logits/rejected": -17.555465698242188, "logps/chosen": -394.764892578125, "logps/rejected": -323.5894775390625, "loss": 0.7276, "rewards/accuracies": 0.5, "rewards/chosen": 0.6827521324157715, "rewards/margins": -0.023070061579346657, "rewards/rejected": 0.7058221697807312, "step": 3790 }, { "epoch": 0.1764241608245508, "grad_norm": 42.385189056396484, "learning_rate": 2.8943683550768374e-07, "logits/chosen": -18.343769073486328, "logits/rejected": -16.739299774169922, "logps/chosen": -544.035888671875, "logps/rejected": -332.43096923828125, 
"loss": 0.6033, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0401705503463745, "rewards/margins": 0.235588937997818, "rewards/rejected": 0.8045816421508789, "step": 3800 }, { "epoch": 0.17688843493198383, "grad_norm": 54.765132904052734, "learning_rate": 2.894089790612378e-07, "logits/chosen": -17.675472259521484, "logits/rejected": -16.82796859741211, "logps/chosen": -481.1417541503906, "logps/rejected": -388.933837890625, "loss": 0.627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.921649158000946, "rewards/margins": 0.16571179032325745, "rewards/rejected": 0.755937397480011, "step": 3810 }, { "epoch": 0.17735270903941688, "grad_norm": 57.22236251831055, "learning_rate": 2.8938112261479177e-07, "logits/chosen": -18.032047271728516, "logits/rejected": -16.861906051635742, "logps/chosen": -452.27423095703125, "logps/rejected": -322.05975341796875, "loss": 0.6552, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9033463597297668, "rewards/margins": 0.11830542236566544, "rewards/rejected": 0.7850409746170044, "step": 3820 }, { "epoch": 0.1778169831468499, "grad_norm": 49.3691520690918, "learning_rate": 2.8935326616834576e-07, "logits/chosen": -17.66417121887207, "logits/rejected": -17.13431167602539, "logps/chosen": -327.31793212890625, "logps/rejected": -248.82666015625, "loss": 0.6295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7257834672927856, "rewards/margins": 0.14757028222084045, "rewards/rejected": 0.5782132744789124, "step": 3830 }, { "epoch": 0.17828125725428293, "grad_norm": 142.70436096191406, "learning_rate": 2.893254097218998e-07, "logits/chosen": -18.322805404663086, "logits/rejected": -18.233604431152344, "logps/chosen": -452.90185546875, "logps/rejected": -458.9656677246094, "loss": 0.7173, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.7385858297348022, "rewards/margins": -0.02412557043135166, "rewards/rejected": 0.7627114057540894, "step": 3840 }, { "epoch": 
0.17874553136171595, "grad_norm": 113.3558349609375, "learning_rate": 2.892975532754538e-07, "logits/chosen": -17.812850952148438, "logits/rejected": -17.387210845947266, "logps/chosen": -413.6573791503906, "logps/rejected": -305.9615478515625, "loss": 0.5777, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8495262861251831, "rewards/margins": 0.26592156291007996, "rewards/rejected": 0.5836048722267151, "step": 3850 }, { "epoch": 0.179209805469149, "grad_norm": 55.67914581298828, "learning_rate": 2.892696968290078e-07, "logits/chosen": -18.06092071533203, "logits/rejected": -16.575489044189453, "logps/chosen": -536.4756469726562, "logps/rejected": -399.5121765136719, "loss": 0.5904, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.2288479804992676, "rewards/margins": 0.31373316049575806, "rewards/rejected": 0.9151147603988647, "step": 3860 }, { "epoch": 0.17967407957658202, "grad_norm": 161.81715393066406, "learning_rate": 2.8924184038256187e-07, "logits/chosen": -17.96221923828125, "logits/rejected": -17.550596237182617, "logps/chosen": -400.2974548339844, "logps/rejected": -314.79864501953125, "loss": 0.6331, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9410125017166138, "rewards/margins": 0.23023250699043274, "rewards/rejected": 0.7107799053192139, "step": 3870 }, { "epoch": 0.18013835368401504, "grad_norm": 54.7454833984375, "learning_rate": 2.8921398393611586e-07, "logits/chosen": -19.539791107177734, "logits/rejected": -18.451704025268555, "logps/chosen": -485.731689453125, "logps/rejected": -340.0448303222656, "loss": 0.6006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0019848346710205, "rewards/margins": 0.2549649178981781, "rewards/rejected": 0.7470198273658752, "step": 3880 }, { "epoch": 0.18060262779144806, "grad_norm": 75.38533782958984, "learning_rate": 2.891861274896699e-07, "logits/chosen": -17.314863204956055, "logits/rejected": -17.04790496826172, "logps/chosen": -324.8284606933594, 
"logps/rejected": -243.7213897705078, "loss": 0.6522, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6474538445472717, "rewards/margins": 0.10784506797790527, "rewards/rejected": 0.5396088361740112, "step": 3890 }, { "epoch": 0.1810669018988811, "grad_norm": 68.67909240722656, "learning_rate": 2.891582710432239e-07, "logits/chosen": -19.317319869995117, "logits/rejected": -17.858659744262695, "logps/chosen": -393.8170166015625, "logps/rejected": -234.17367553710938, "loss": 0.5249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0135642290115356, "rewards/margins": 0.45475301146507263, "rewards/rejected": 0.5588111281394958, "step": 3900 }, { "epoch": 0.18153117600631413, "grad_norm": 49.46918869018555, "learning_rate": 2.891304145967779e-07, "logits/chosen": -17.578842163085938, "logits/rejected": -16.623960494995117, "logps/chosen": -447.12677001953125, "logps/rejected": -335.42218017578125, "loss": 0.6226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8898905515670776, "rewards/margins": 0.17093276977539062, "rewards/rejected": 0.7189578413963318, "step": 3910 }, { "epoch": 0.18199545011374715, "grad_norm": 53.88945388793945, "learning_rate": 2.8910255815033197e-07, "logits/chosen": -17.489116668701172, "logits/rejected": -17.302011489868164, "logps/chosen": -358.4311828613281, "logps/rejected": -304.0968017578125, "loss": 0.616, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8096820116043091, "rewards/margins": 0.18677569925785065, "rewards/rejected": 0.6229063272476196, "step": 3920 }, { "epoch": 0.18245972422118018, "grad_norm": 59.391780853271484, "learning_rate": 2.8907470170388595e-07, "logits/chosen": -18.305633544921875, "logits/rejected": -17.751161575317383, "logps/chosen": -518.0864868164062, "logps/rejected": -383.9032287597656, "loss": 0.5863, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0621097087860107, "rewards/margins": 0.26207435131073, "rewards/rejected": 
0.8000353574752808, "step": 3930 }, { "epoch": 0.18292399832861322, "grad_norm": 82.84818267822266, "learning_rate": 2.8904684525744e-07, "logits/chosen": -17.889881134033203, "logits/rejected": -17.81840705871582, "logps/chosen": -414.068115234375, "logps/rejected": -374.9361877441406, "loss": 0.6968, "rewards/accuracies": 0.5, "rewards/chosen": 0.8933679461479187, "rewards/margins": 0.08285211026668549, "rewards/rejected": 0.8105157613754272, "step": 3940 }, { "epoch": 0.18338827243604625, "grad_norm": 64.50325012207031, "learning_rate": 2.89018988810994e-07, "logits/chosen": -17.349384307861328, "logits/rejected": -17.242473602294922, "logps/chosen": -440.4056701660156, "logps/rejected": -360.0589904785156, "loss": 0.6178, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9992188215255737, "rewards/margins": 0.18041293323040009, "rewards/rejected": 0.8188058733940125, "step": 3950 }, { "epoch": 0.18385254654347927, "grad_norm": 150.23587036132812, "learning_rate": 2.88991132364548e-07, "logits/chosen": -18.459177017211914, "logits/rejected": -17.845563888549805, "logps/chosen": -432.13629150390625, "logps/rejected": -388.76495361328125, "loss": 0.7364, "rewards/accuracies": 0.5, "rewards/chosen": 0.8000583648681641, "rewards/margins": -0.02432234026491642, "rewards/rejected": 0.8243807554244995, "step": 3960 }, { "epoch": 0.1843168206509123, "grad_norm": 61.140785217285156, "learning_rate": 2.88963275918102e-07, "logits/chosen": -18.513486862182617, "logits/rejected": -18.17730712890625, "logps/chosen": -375.84930419921875, "logps/rejected": -340.7678527832031, "loss": 0.7119, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.7807188630104065, "rewards/margins": 0.02407488226890564, "rewards/rejected": 0.7566439509391785, "step": 3970 }, { "epoch": 0.18478109475834534, "grad_norm": 37.918701171875, "learning_rate": 2.8893541947165605e-07, "logits/chosen": -17.831716537475586, "logits/rejected": -16.344451904296875, "logps/chosen": 
-426.30322265625, "logps/rejected": -263.54107666015625, "loss": 0.5416, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9702584147453308, "rewards/margins": 0.34817051887512207, "rewards/rejected": 0.6220880150794983, "step": 3980 }, { "epoch": 0.18524536886577836, "grad_norm": 53.990928649902344, "learning_rate": 2.889075630252101e-07, "logits/chosen": -17.909854888916016, "logits/rejected": -17.2224178314209, "logps/chosen": -315.61724853515625, "logps/rejected": -276.8010559082031, "loss": 0.6408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6940831542015076, "rewards/margins": 0.11735428869724274, "rewards/rejected": 0.5767289400100708, "step": 3990 }, { "epoch": 0.18570964297321138, "grad_norm": 109.27045440673828, "learning_rate": 2.888797065787641e-07, "logits/chosen": -18.015888214111328, "logits/rejected": -17.964494705200195, "logps/chosen": -334.90667724609375, "logps/rejected": -335.2496032714844, "loss": 0.6731, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7708240747451782, "rewards/margins": 0.04534969851374626, "rewards/rejected": 0.72547447681427, "step": 4000 }, { "epoch": 0.1861739170806444, "grad_norm": 104.88585662841797, "learning_rate": 2.888518501323181e-07, "logits/chosen": -17.674755096435547, "logits/rejected": -17.0737361907959, "logps/chosen": -406.4237976074219, "logps/rejected": -309.0255126953125, "loss": 0.6796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.815044105052948, "rewards/margins": 0.09733322262763977, "rewards/rejected": 0.7177108526229858, "step": 4010 }, { "epoch": 0.18663819118807745, "grad_norm": 76.21430969238281, "learning_rate": 2.888239936858721e-07, "logits/chosen": -18.397380828857422, "logits/rejected": -16.839109420776367, "logps/chosen": -422.6553649902344, "logps/rejected": -275.6712341308594, "loss": 0.5852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9418696165084839, "rewards/margins": 0.292479932308197, "rewards/rejected": 
0.6493896245956421, "step": 4020 }, { "epoch": 0.18710246529551047, "grad_norm": 168.1366729736328, "learning_rate": 2.8879613723942615e-07, "logits/chosen": -18.972929000854492, "logits/rejected": -18.08329963684082, "logps/chosen": -411.8028259277344, "logps/rejected": -331.34271240234375, "loss": 0.6, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9358307123184204, "rewards/margins": 0.2696632146835327, "rewards/rejected": 0.6661674976348877, "step": 4030 }, { "epoch": 0.1875667394029435, "grad_norm": 114.5055923461914, "learning_rate": 2.887682807929802e-07, "logits/chosen": -17.746326446533203, "logits/rejected": -16.828189849853516, "logps/chosen": -379.99029541015625, "logps/rejected": -274.06646728515625, "loss": 0.6343, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8558380007743835, "rewards/margins": 0.20714671909809113, "rewards/rejected": 0.6486912369728088, "step": 4040 }, { "epoch": 0.18803101351037652, "grad_norm": 118.6678695678711, "learning_rate": 2.887404243465342e-07, "logits/chosen": -18.10117530822754, "logits/rejected": -17.085464477539062, "logps/chosen": -458.62713623046875, "logps/rejected": -336.72552490234375, "loss": 0.5814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0091619491577148, "rewards/margins": 0.2672252058982849, "rewards/rejected": 0.7419367432594299, "step": 4050 }, { "epoch": 0.18849528761780956, "grad_norm": 39.659637451171875, "learning_rate": 2.887125679000882e-07, "logits/chosen": -18.16163444519043, "logits/rejected": -17.599178314208984, "logps/chosen": -368.3918151855469, "logps/rejected": -327.98162841796875, "loss": 0.6357, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7664273977279663, "rewards/margins": 0.16138823330402374, "rewards/rejected": 0.6050391793251038, "step": 4060 }, { "epoch": 0.1889595617252426, "grad_norm": 144.30401611328125, "learning_rate": 2.886847114536422e-07, "logits/chosen": -18.309690475463867, "logits/rejected": 
-18.004261016845703, "logps/chosen": -449.2353515625, "logps/rejected": -452.8033142089844, "loss": 0.7732, "rewards/accuracies": 0.5, "rewards/chosen": 1.0612815618515015, "rewards/margins": -0.026850616559386253, "rewards/rejected": 1.0881322622299194, "step": 4070 }, { "epoch": 0.1894238358326756, "grad_norm": 57.895023345947266, "learning_rate": 2.886568550071962e-07, "logits/chosen": -18.613550186157227, "logits/rejected": -17.50552749633789, "logps/chosen": -416.96527099609375, "logps/rejected": -273.9273681640625, "loss": 0.5098, "rewards/accuracies": 1.0, "rewards/chosen": 1.1183792352676392, "rewards/margins": 0.4640520215034485, "rewards/rejected": 0.6543272137641907, "step": 4080 }, { "epoch": 0.18988810994010863, "grad_norm": 66.1661376953125, "learning_rate": 2.8862899856075024e-07, "logits/chosen": -17.197368621826172, "logits/rejected": -17.29431915283203, "logps/chosen": -352.8525085449219, "logps/rejected": -275.99786376953125, "loss": 0.6108, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9041886329650879, "rewards/margins": 0.20338168740272522, "rewards/rejected": 0.7008069753646851, "step": 4090 }, { "epoch": 0.19035238404754168, "grad_norm": 132.4310302734375, "learning_rate": 2.886011421143043e-07, "logits/chosen": -18.03774642944336, "logits/rejected": -16.97662353515625, "logps/chosen": -518.8304443359375, "logps/rejected": -307.4454650878906, "loss": 0.5332, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0773259401321411, "rewards/margins": 0.40823015570640564, "rewards/rejected": 0.6690956950187683, "step": 4100 }, { "epoch": 0.1908166581549747, "grad_norm": 54.26932907104492, "learning_rate": 2.885732856678583e-07, "logits/chosen": -18.531517028808594, "logits/rejected": -17.89997673034668, "logps/chosen": -408.29095458984375, "logps/rejected": -315.5237731933594, "loss": 0.5637, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.944024384021759, "rewards/margins": 0.33894801139831543, 
"rewards/rejected": 0.6050763726234436, "step": 4110 }, { "epoch": 0.19128093226240772, "grad_norm": 123.6554183959961, "learning_rate": 2.885454292214123e-07, "logits/chosen": -18.27670669555664, "logits/rejected": -18.04290199279785, "logps/chosen": -445.74688720703125, "logps/rejected": -448.3915100097656, "loss": 0.7218, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.057823896408081, "rewards/margins": -0.025948649272322655, "rewards/rejected": 1.0837726593017578, "step": 4120 }, { "epoch": 0.19174520636984074, "grad_norm": 92.48731231689453, "learning_rate": 2.885175727749663e-07, "logits/chosen": -17.885231018066406, "logits/rejected": -18.12515640258789, "logps/chosen": -437.33270263671875, "logps/rejected": -407.5586853027344, "loss": 0.7242, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.8593894243240356, "rewards/margins": -0.048792269080877304, "rewards/rejected": 0.9081816673278809, "step": 4130 }, { "epoch": 0.1922094804772738, "grad_norm": 148.34117126464844, "learning_rate": 2.8848971632852034e-07, "logits/chosen": -18.334880828857422, "logits/rejected": -17.218318939208984, "logps/chosen": -467.3954162597656, "logps/rejected": -365.9916076660156, "loss": 0.6929, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9997628331184387, "rewards/margins": 0.0841258242726326, "rewards/rejected": 0.9156370162963867, "step": 4140 }, { "epoch": 0.1926737545847068, "grad_norm": 73.67308807373047, "learning_rate": 2.884618598820744e-07, "logits/chosen": -17.610597610473633, "logits/rejected": -17.71367645263672, "logps/chosen": -441.8355407714844, "logps/rejected": -472.42950439453125, "loss": 0.6878, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.0529850721359253, "rewards/margins": 0.09340177476406097, "rewards/rejected": 0.9595832824707031, "step": 4150 }, { "epoch": 0.19313802869213983, "grad_norm": 80.07567596435547, "learning_rate": 2.8843400343562837e-07, "logits/chosen": -18.336956024169922, 
"logits/rejected": -17.0593318939209, "logps/chosen": -394.5023498535156, "logps/rejected": -189.3816680908203, "loss": 0.5345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9451409578323364, "rewards/margins": 0.4319620132446289, "rewards/rejected": 0.5131788849830627, "step": 4160 }, { "epoch": 0.19360230279957286, "grad_norm": 67.56866455078125, "learning_rate": 2.884061469891824e-07, "logits/chosen": -18.991741180419922, "logits/rejected": -17.53668212890625, "logps/chosen": -484.7347717285156, "logps/rejected": -345.241943359375, "loss": 0.5649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9950191378593445, "rewards/margins": 0.336718887090683, "rewards/rejected": 0.6583001613616943, "step": 4170 }, { "epoch": 0.1940665769070059, "grad_norm": 68.71648406982422, "learning_rate": 2.8837829054273645e-07, "logits/chosen": -17.622547149658203, "logits/rejected": -16.796506881713867, "logps/chosen": -366.5741271972656, "logps/rejected": -282.76214599609375, "loss": 0.6052, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8556531667709351, "rewards/margins": 0.3401675522327423, "rewards/rejected": 0.5154855847358704, "step": 4180 }, { "epoch": 0.19453085101443893, "grad_norm": 53.25246810913086, "learning_rate": 2.8835043409629044e-07, "logits/chosen": -18.163393020629883, "logits/rejected": -17.65278434753418, "logps/chosen": -343.0597229003906, "logps/rejected": -322.29425048828125, "loss": 0.6719, "rewards/accuracies": 0.5, "rewards/chosen": 0.7745493054389954, "rewards/margins": 0.07498122751712799, "rewards/rejected": 0.6995680928230286, "step": 4190 }, { "epoch": 0.19499512512187195, "grad_norm": 72.04731750488281, "learning_rate": 2.883225776498444e-07, "logits/chosen": -18.15133285522461, "logits/rejected": -16.952777862548828, "logps/chosen": -398.87298583984375, "logps/rejected": -300.1581115722656, "loss": 0.6235, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9849754571914673, "rewards/margins": 
0.16711123287677765, "rewards/rejected": 0.8178642392158508, "step": 4200 }, { "epoch": 0.19545939922930497, "grad_norm": 29.965511322021484, "learning_rate": 2.8829472120339847e-07, "logits/chosen": -17.754053115844727, "logits/rejected": -17.667194366455078, "logps/chosen": -455.4969787597656, "logps/rejected": -405.8942565917969, "loss": 0.6922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1571381092071533, "rewards/margins": 0.03544095903635025, "rewards/rejected": 1.1216973066329956, "step": 4210 }, { "epoch": 0.19592367333673802, "grad_norm": 36.032474517822266, "learning_rate": 2.882668647569525e-07, "logits/chosen": -17.551822662353516, "logits/rejected": -16.686113357543945, "logps/chosen": -370.76177978515625, "logps/rejected": -334.5241394042969, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": 0.8216192126274109, "rewards/margins": 0.08998923003673553, "rewards/rejected": 0.7316300272941589, "step": 4220 }, { "epoch": 0.19638794744417104, "grad_norm": 71.63134002685547, "learning_rate": 2.8823900831050655e-07, "logits/chosen": -17.707517623901367, "logits/rejected": -16.74738883972168, "logps/chosen": -476.98529052734375, "logps/rejected": -369.32318115234375, "loss": 0.6311, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0795470476150513, "rewards/margins": 0.17096967995166779, "rewards/rejected": 0.9085773229598999, "step": 4230 }, { "epoch": 0.19685222155160406, "grad_norm": 42.240413665771484, "learning_rate": 2.8821115186406054e-07, "logits/chosen": -18.14344024658203, "logits/rejected": -17.147262573242188, "logps/chosen": -458.55853271484375, "logps/rejected": -268.45550537109375, "loss": 0.4521, "rewards/accuracies": 1.0, "rewards/chosen": 1.2197086811065674, "rewards/margins": 0.5801258087158203, "rewards/rejected": 0.6395827531814575, "step": 4240 }, { "epoch": 0.19731649565903708, "grad_norm": 56.74647521972656, "learning_rate": 2.881832954176145e-07, "logits/chosen": -18.505733489990234, 
"logits/rejected": -16.945329666137695, "logps/chosen": -343.31219482421875, "logps/rejected": -213.69027709960938, "loss": 0.5497, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8441710472106934, "rewards/margins": 0.3567783534526825, "rewards/rejected": 0.48739272356033325, "step": 4250 }, { "epoch": 0.19778076976647013, "grad_norm": 144.3299102783203, "learning_rate": 2.8815543897116857e-07, "logits/chosen": -18.1607723236084, "logits/rejected": -16.490848541259766, "logps/chosen": -514.7621459960938, "logps/rejected": -404.71484375, "loss": 0.5426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0983120203018188, "rewards/margins": 0.3809082508087158, "rewards/rejected": 0.7174037098884583, "step": 4260 }, { "epoch": 0.19824504387390315, "grad_norm": 63.78759002685547, "learning_rate": 2.8812758252472255e-07, "logits/chosen": -18.27560043334961, "logits/rejected": -18.077869415283203, "logps/chosen": -468.13568115234375, "logps/rejected": -423.0293884277344, "loss": 0.684, "rewards/accuracies": 0.5, "rewards/chosen": 0.8900771141052246, "rewards/margins": 0.06568281352519989, "rewards/rejected": 0.8243943452835083, "step": 4270 }, { "epoch": 0.19870931798133618, "grad_norm": 130.67543029785156, "learning_rate": 2.880997260782766e-07, "logits/chosen": -18.319364547729492, "logits/rejected": -17.289386749267578, "logps/chosen": -396.33819580078125, "logps/rejected": -363.81256103515625, "loss": 0.6782, "rewards/accuracies": 0.5, "rewards/chosen": 0.8652394413948059, "rewards/margins": 0.06301857531070709, "rewards/rejected": 0.8022208213806152, "step": 4280 }, { "epoch": 0.1991735920887692, "grad_norm": 104.45003509521484, "learning_rate": 2.8807186963183064e-07, "logits/chosen": -18.554462432861328, "logits/rejected": -17.658233642578125, "logps/chosen": -478.15118408203125, "logps/rejected": -395.7998046875, "loss": 0.5537, "rewards/accuracies": 1.0, "rewards/chosen": 1.0832879543304443, "rewards/margins": 0.3163506090641022, 
"rewards/rejected": 0.7669373750686646, "step": 4290 }, { "epoch": 0.19963786619620225, "grad_norm": 127.29073333740234, "learning_rate": 2.880440131853846e-07, "logits/chosen": -17.925945281982422, "logits/rejected": -17.227745056152344, "logps/chosen": -373.36248779296875, "logps/rejected": -334.2540283203125, "loss": 0.7109, "rewards/accuracies": 0.5, "rewards/chosen": 0.8720463514328003, "rewards/margins": -0.0015397698152810335, "rewards/rejected": 0.8735860586166382, "step": 4300 }, { "epoch": 0.20010214030363527, "grad_norm": 79.43114471435547, "learning_rate": 2.8801615673893867e-07, "logits/chosen": -18.33576011657715, "logits/rejected": -18.670804977416992, "logps/chosen": -424.9229431152344, "logps/rejected": -392.5019836425781, "loss": 0.6688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0054502487182617, "rewards/margins": 0.07578439265489578, "rewards/rejected": 0.9296658635139465, "step": 4310 }, { "epoch": 0.2005664144110683, "grad_norm": 28.082958221435547, "learning_rate": 2.8798830029249265e-07, "logits/chosen": -18.167522430419922, "logits/rejected": -17.400449752807617, "logps/chosen": -409.10064697265625, "logps/rejected": -310.33135986328125, "loss": 0.6193, "rewards/accuracies": 0.5, "rewards/chosen": 0.9503809213638306, "rewards/margins": 0.263537734746933, "rewards/rejected": 0.6868431568145752, "step": 4320 }, { "epoch": 0.2010306885185013, "grad_norm": 125.32650756835938, "learning_rate": 2.879604438460467e-07, "logits/chosen": -17.998525619506836, "logits/rejected": -17.987865447998047, "logps/chosen": -395.47674560546875, "logps/rejected": -316.57879638671875, "loss": 0.669, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9634727239608765, "rewards/margins": 0.14726538956165314, "rewards/rejected": 0.8162074089050293, "step": 4330 }, { "epoch": 0.20149496262593436, "grad_norm": 75.9280776977539, "learning_rate": 2.8793258739960074e-07, "logits/chosen": -18.555944442749023, "logits/rejected": 
-17.319839477539062, "logps/chosen": -454.09619140625, "logps/rejected": -296.20098876953125, "loss": 0.5264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1706143617630005, "rewards/margins": 0.5299721956253052, "rewards/rejected": 0.6406421661376953, "step": 4340 }, { "epoch": 0.20195923673336738, "grad_norm": 56.18987274169922, "learning_rate": 2.879047309531547e-07, "logits/chosen": -17.55342674255371, "logits/rejected": -18.027652740478516, "logps/chosen": -359.16131591796875, "logps/rejected": -382.71343994140625, "loss": 0.7772, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.9539246559143066, "rewards/margins": -0.11480927467346191, "rewards/rejected": 1.0687339305877686, "step": 4350 }, { "epoch": 0.2024235108408004, "grad_norm": 135.17181396484375, "learning_rate": 2.8787687450670876e-07, "logits/chosen": -19.0618839263916, "logits/rejected": -18.129863739013672, "logps/chosen": -440.39666748046875, "logps/rejected": -380.03790283203125, "loss": 0.5831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.192724585533142, "rewards/margins": 0.2916297912597656, "rewards/rejected": 0.9010947942733765, "step": 4360 }, { "epoch": 0.20288778494823342, "grad_norm": 102.84747314453125, "learning_rate": 2.8784901806026275e-07, "logits/chosen": -18.1896915435791, "logits/rejected": -17.86383819580078, "logps/chosen": -444.59259033203125, "logps/rejected": -364.2959899902344, "loss": 0.6597, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8777710795402527, "rewards/margins": 0.076629638671875, "rewards/rejected": 0.8011414408683777, "step": 4370 }, { "epoch": 0.20335205905566647, "grad_norm": 161.70437622070312, "learning_rate": 2.878211616138168e-07, "logits/chosen": -17.663015365600586, "logits/rejected": -18.169708251953125, "logps/chosen": -343.372802734375, "logps/rejected": -386.4754638671875, "loss": 0.7218, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.9672203063964844, "rewards/margins": 
0.030373627319931984, "rewards/rejected": 0.9368468523025513, "step": 4380 }, { "epoch": 0.2038163331630995, "grad_norm": 110.16325378417969, "learning_rate": 2.877933051673708e-07, "logits/chosen": -17.528350830078125, "logits/rejected": -17.322603225708008, "logps/chosen": -258.25836181640625, "logps/rejected": -304.0145568847656, "loss": 0.7366, "rewards/accuracies": 0.5, "rewards/chosen": 0.8226372003555298, "rewards/margins": -0.03329024091362953, "rewards/rejected": 0.8559274673461914, "step": 4390 }, { "epoch": 0.20428060727053252, "grad_norm": 55.52617263793945, "learning_rate": 2.877654487209248e-07, "logits/chosen": -18.187585830688477, "logits/rejected": -16.585676193237305, "logps/chosen": -464.2945251464844, "logps/rejected": -293.53094482421875, "loss": 0.5888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9673522710800171, "rewards/margins": 0.3178408741950989, "rewards/rejected": 0.6495113968849182, "step": 4400 }, { "epoch": 0.20474488137796554, "grad_norm": 78.29500579833984, "learning_rate": 2.8773759227447886e-07, "logits/chosen": -18.002197265625, "logits/rejected": -18.174428939819336, "logps/chosen": -443.41949462890625, "logps/rejected": -440.36285400390625, "loss": 0.71, "rewards/accuracies": 0.5, "rewards/chosen": 1.077500581741333, "rewards/margins": 0.003356248140335083, "rewards/rejected": 1.0741443634033203, "step": 4410 }, { "epoch": 0.2052091554853986, "grad_norm": 75.23421478271484, "learning_rate": 2.8770973582803285e-07, "logits/chosen": -18.893648147583008, "logits/rejected": -18.25806999206543, "logps/chosen": -389.9239807128906, "logps/rejected": -337.2721862792969, "loss": 0.6312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9936637878417969, "rewards/margins": 0.2241876870393753, "rewards/rejected": 0.769476056098938, "step": 4420 }, { "epoch": 0.2056734295928316, "grad_norm": 37.530338287353516, "learning_rate": 2.876818793815869e-07, "logits/chosen": -16.454029083251953, "logits/rejected": 
-16.676725387573242, "logps/chosen": -373.91131591796875, "logps/rejected": -339.4589538574219, "loss": 0.7146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8087382316589355, "rewards/margins": 0.06374292075634003, "rewards/rejected": 0.7449952363967896, "step": 4430 }, { "epoch": 0.20613770370026463, "grad_norm": 68.37398529052734, "learning_rate": 2.876540229351409e-07, "logits/chosen": -18.000085830688477, "logits/rejected": -16.817276000976562, "logps/chosen": -432.903076171875, "logps/rejected": -330.992431640625, "loss": 0.646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0323704481124878, "rewards/margins": 0.18274006247520447, "rewards/rejected": 0.8496305346488953, "step": 4440 }, { "epoch": 0.20660197780769765, "grad_norm": 21.15660858154297, "learning_rate": 2.876261664886949e-07, "logits/chosen": -16.875967025756836, "logits/rejected": -16.752180099487305, "logps/chosen": -352.5218505859375, "logps/rejected": -354.1351013183594, "loss": 0.6781, "rewards/accuracies": 0.5, "rewards/chosen": 0.7594403028488159, "rewards/margins": 0.040189653635025024, "rewards/rejected": 0.7192505598068237, "step": 4450 }, { "epoch": 0.2070662519151307, "grad_norm": 71.2194595336914, "learning_rate": 2.875983100422489e-07, "logits/chosen": -18.429040908813477, "logits/rejected": -17.41753387451172, "logps/chosen": -517.5739135742188, "logps/rejected": -413.42413330078125, "loss": 0.6236, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1382255554199219, "rewards/margins": 0.2184806764125824, "rewards/rejected": 0.9197448492050171, "step": 4460 }, { "epoch": 0.20753052602256372, "grad_norm": 102.32286071777344, "learning_rate": 2.8757045359580295e-07, "logits/chosen": -18.670927047729492, "logits/rejected": -18.33689308166504, "logps/chosen": -382.1297912597656, "logps/rejected": -386.545166015625, "loss": 0.7421, "rewards/accuracies": 0.5, "rewards/chosen": 0.8409803509712219, "rewards/margins": -0.05572450906038284, 
"rewards/rejected": 0.8967048525810242, "step": 4470 }, { "epoch": 0.20799480012999674, "grad_norm": 101.84711456298828, "learning_rate": 2.87542597149357e-07, "logits/chosen": -17.348047256469727, "logits/rejected": -16.696409225463867, "logps/chosen": -407.05181884765625, "logps/rejected": -401.19390869140625, "loss": 0.6756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9405258893966675, "rewards/margins": 0.10023341327905655, "rewards/rejected": 0.8402925729751587, "step": 4480 }, { "epoch": 0.2084590742374298, "grad_norm": 51.107078552246094, "learning_rate": 2.87514740702911e-07, "logits/chosen": -17.596708297729492, "logits/rejected": -17.402034759521484, "logps/chosen": -427.704833984375, "logps/rejected": -375.3165588378906, "loss": 0.6532, "rewards/accuracies": 0.5, "rewards/chosen": 1.0757417678833008, "rewards/margins": 0.16847968101501465, "rewards/rejected": 0.9072620272636414, "step": 4490 }, { "epoch": 0.2089233483448628, "grad_norm": 57.237239837646484, "learning_rate": 2.8748688425646497e-07, "logits/chosen": -18.995006561279297, "logits/rejected": -18.060848236083984, "logps/chosen": -434.064453125, "logps/rejected": -324.506103515625, "loss": 0.6047, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0196452140808105, "rewards/margins": 0.21567144989967346, "rewards/rejected": 0.8039737939834595, "step": 4500 }, { "epoch": 0.20938762245229584, "grad_norm": 53.911075592041016, "learning_rate": 2.87459027810019e-07, "logits/chosen": -16.96784782409668, "logits/rejected": -16.206281661987305, "logps/chosen": -414.6551208496094, "logps/rejected": -297.02056884765625, "loss": 0.6758, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.945966899394989, "rewards/margins": 0.17118097841739655, "rewards/rejected": 0.7747858762741089, "step": 4510 }, { "epoch": 0.20985189655972886, "grad_norm": 69.80020141601562, "learning_rate": 2.8743117136357305e-07, "logits/chosen": -16.987964630126953, "logits/rejected": 
-18.282459259033203, "logps/chosen": -313.1648254394531, "logps/rejected": -389.4388732910156, "loss": 0.7319, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.8180480003356934, "rewards/margins": -0.049755729734897614, "rewards/rejected": 0.867803692817688, "step": 4520 }, { "epoch": 0.2103161706671619, "grad_norm": 44.09538650512695, "learning_rate": 2.874033149171271e-07, "logits/chosen": -18.808130264282227, "logits/rejected": -17.393781661987305, "logps/chosen": -578.83251953125, "logps/rejected": -490.521484375, "loss": 0.5778, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.194612741470337, "rewards/margins": 0.2678738236427307, "rewards/rejected": 0.9267389178276062, "step": 4530 }, { "epoch": 0.21078044477459493, "grad_norm": 125.37730407714844, "learning_rate": 2.873754584706811e-07, "logits/chosen": -18.976993560791016, "logits/rejected": -17.872238159179688, "logps/chosen": -459.6896057128906, "logps/rejected": -356.53021240234375, "loss": 0.6186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9753411412239075, "rewards/margins": 0.2063916027545929, "rewards/rejected": 0.768949568271637, "step": 4540 }, { "epoch": 0.21124471888202795, "grad_norm": 98.76469421386719, "learning_rate": 2.8734760202423507e-07, "logits/chosen": -18.46615219116211, "logits/rejected": -17.970195770263672, "logps/chosen": -449.21441650390625, "logps/rejected": -359.6968078613281, "loss": 0.6446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9390686750411987, "rewards/margins": 0.1144084483385086, "rewards/rejected": 0.8246603012084961, "step": 4550 }, { "epoch": 0.21170899298946097, "grad_norm": 67.46662139892578, "learning_rate": 2.873197455777891e-07, "logits/chosen": -17.79813575744629, "logits/rejected": -16.153682708740234, "logps/chosen": -395.4600524902344, "logps/rejected": -211.281494140625, "loss": 0.5105, "rewards/accuracies": 1.0, "rewards/chosen": 1.0504350662231445, "rewards/margins": 0.417081356048584, 
"rewards/rejected": 0.6333537697792053, "step": 4560 }, { "epoch": 0.21217326709689402, "grad_norm": 59.30665969848633, "learning_rate": 2.8729188913134315e-07, "logits/chosen": -18.106914520263672, "logits/rejected": -17.656246185302734, "logps/chosen": -516.9124755859375, "logps/rejected": -477.94378662109375, "loss": 0.8108, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.0020906925201416, "rewards/margins": -0.17267531156539917, "rewards/rejected": 1.1747660636901855, "step": 4570 }, { "epoch": 0.21263754120432704, "grad_norm": 68.41188049316406, "learning_rate": 2.8726403268489714e-07, "logits/chosen": -19.36652183532715, "logits/rejected": -17.87046241760254, "logps/chosen": -424.41510009765625, "logps/rejected": -266.5649108886719, "loss": 0.5327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9955042004585266, "rewards/margins": 0.4051719605922699, "rewards/rejected": 0.5903321504592896, "step": 4580 }, { "epoch": 0.21310181531176006, "grad_norm": 78.92621612548828, "learning_rate": 2.872361762384512e-07, "logits/chosen": -17.81015968322754, "logits/rejected": -17.268672943115234, "logps/chosen": -422.59552001953125, "logps/rejected": -344.85467529296875, "loss": 0.6893, "rewards/accuracies": 0.5, "rewards/chosen": 1.090578317642212, "rewards/margins": 0.09287668764591217, "rewards/rejected": 0.9977016448974609, "step": 4590 }, { "epoch": 0.21356608941919308, "grad_norm": 49.0003547668457, "learning_rate": 2.872083197920052e-07, "logits/chosen": -17.964019775390625, "logits/rejected": -18.229019165039062, "logps/chosen": -445.7533264160156, "logps/rejected": -337.9835510253906, "loss": 0.5846, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1964107751846313, "rewards/margins": 0.29319489002227783, "rewards/rejected": 0.9032160043716431, "step": 4600 }, { "epoch": 0.21403036352662613, "grad_norm": 64.71725463867188, "learning_rate": 2.871804633455592e-07, "logits/chosen": -17.913557052612305, "logits/rejected": 
-16.579017639160156, "logps/chosen": -420.5682678222656, "logps/rejected": -255.36959838867188, "loss": 0.5453, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8708723783493042, "rewards/margins": 0.3769404888153076, "rewards/rejected": 0.49393200874328613, "step": 4610 }, { "epoch": 0.21449463763405915, "grad_norm": 176.4026641845703, "learning_rate": 2.871526068991132e-07, "logits/chosen": -19.36798667907715, "logits/rejected": -19.174028396606445, "logps/chosen": -391.1190490722656, "logps/rejected": -397.3308410644531, "loss": 0.7732, "rewards/accuracies": 0.5, "rewards/chosen": 0.8410083055496216, "rewards/margins": -0.05969369411468506, "rewards/rejected": 0.9007019996643066, "step": 4620 }, { "epoch": 0.21495891174149218, "grad_norm": 45.064056396484375, "learning_rate": 2.8712475045266724e-07, "logits/chosen": -17.938121795654297, "logits/rejected": -17.704614639282227, "logps/chosen": -434.86199951171875, "logps/rejected": -416.9164123535156, "loss": 0.7089, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1172951459884644, "rewards/margins": 0.03598739951848984, "rewards/rejected": 1.0813078880310059, "step": 4630 }, { "epoch": 0.2154231858489252, "grad_norm": 58.15275573730469, "learning_rate": 2.870968940062213e-07, "logits/chosen": -17.79999351501465, "logits/rejected": -17.254093170166016, "logps/chosen": -424.10809326171875, "logps/rejected": -328.97601318359375, "loss": 0.6063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.111320972442627, "rewards/margins": 0.2483116090297699, "rewards/rejected": 0.8630093336105347, "step": 4640 }, { "epoch": 0.21588745995635825, "grad_norm": 45.9622688293457, "learning_rate": 2.8706903755977527e-07, "logits/chosen": -17.9300594329834, "logits/rejected": -17.094751358032227, "logps/chosen": -343.4571228027344, "logps/rejected": -291.18585205078125, "loss": 0.6729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9621256589889526, "rewards/margins": 
0.08551427721977234, "rewards/rejected": 0.8766114115715027, "step": 4650 }, { "epoch": 0.21635173406379127, "grad_norm": 50.5578498840332, "learning_rate": 2.870411811133293e-07, "logits/chosen": -17.563594818115234, "logits/rejected": -16.28110122680664, "logps/chosen": -389.8401794433594, "logps/rejected": -229.78158569335938, "loss": 0.5245, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.181412935256958, "rewards/margins": 0.4402555525302887, "rewards/rejected": 0.7411574125289917, "step": 4660 }, { "epoch": 0.2168160081712243, "grad_norm": 59.38346481323242, "learning_rate": 2.870133246668833e-07, "logits/chosen": -18.284542083740234, "logits/rejected": -17.371109008789062, "logps/chosen": -417.5257263183594, "logps/rejected": -354.27239990234375, "loss": 0.6408, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0712112188339233, "rewards/margins": 0.19602057337760925, "rewards/rejected": 0.8751906156539917, "step": 4670 }, { "epoch": 0.2172802822786573, "grad_norm": 78.5655288696289, "learning_rate": 2.8698546822043734e-07, "logits/chosen": -18.34337615966797, "logits/rejected": -17.249629974365234, "logps/chosen": -550.1319580078125, "logps/rejected": -407.94091796875, "loss": 0.5412, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4531471729278564, "rewards/margins": 0.45644673705101013, "rewards/rejected": 0.9967004060745239, "step": 4680 }, { "epoch": 0.21774455638609036, "grad_norm": 50.07729721069336, "learning_rate": 2.869576117739913e-07, "logits/chosen": -18.284564971923828, "logits/rejected": -16.925844192504883, "logps/chosen": -377.50909423828125, "logps/rejected": -215.45449829101562, "loss": 0.5006, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0505521297454834, "rewards/margins": 0.49176788330078125, "rewards/rejected": 0.5587840676307678, "step": 4690 }, { "epoch": 0.21820883049352338, "grad_norm": 49.75025939941406, "learning_rate": 2.8692975532754536e-07, "logits/chosen": 
-18.576820373535156, "logits/rejected": -17.384721755981445, "logps/chosen": -438.4013671875, "logps/rejected": -375.3874206542969, "loss": 0.6674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.97810298204422, "rewards/margins": 0.06974105536937714, "rewards/rejected": 0.9083617925643921, "step": 4700 }, { "epoch": 0.2186731046009564, "grad_norm": 64.64168548583984, "learning_rate": 2.869018988810994e-07, "logits/chosen": -17.453502655029297, "logits/rejected": -16.136150360107422, "logps/chosen": -457.25567626953125, "logps/rejected": -230.32376098632812, "loss": 0.5667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1801164150238037, "rewards/margins": 0.42692461609840393, "rewards/rejected": 0.7531918883323669, "step": 4710 }, { "epoch": 0.21913737870838942, "grad_norm": 72.24520111083984, "learning_rate": 2.868740424346534e-07, "logits/chosen": -18.4731502532959, "logits/rejected": -17.721092224121094, "logps/chosen": -449.07720947265625, "logps/rejected": -346.024658203125, "loss": 0.6041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0961090326309204, "rewards/margins": 0.22971686720848083, "rewards/rejected": 0.8663923144340515, "step": 4720 }, { "epoch": 0.21960165281582247, "grad_norm": 74.26905822753906, "learning_rate": 2.8684618598820743e-07, "logits/chosen": -18.299707412719727, "logits/rejected": -17.018192291259766, "logps/chosen": -325.9400329589844, "logps/rejected": -241.94754028320312, "loss": 0.5988, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.896953284740448, "rewards/margins": 0.21014046669006348, "rewards/rejected": 0.6868129372596741, "step": 4730 }, { "epoch": 0.2200659269232555, "grad_norm": 147.49478149414062, "learning_rate": 2.868183295417614e-07, "logits/chosen": -17.987863540649414, "logits/rejected": -16.869211196899414, "logps/chosen": -440.81011962890625, "logps/rejected": -342.986083984375, "loss": 0.6915, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 
1.1040294170379639, "rewards/margins": 0.09266646951436996, "rewards/rejected": 1.01136314868927, "step": 4740 }, { "epoch": 0.22053020103068852, "grad_norm": 61.60625076293945, "learning_rate": 2.8679047309531546e-07, "logits/chosen": -17.081531524658203, "logits/rejected": -16.56760025024414, "logps/chosen": -368.71783447265625, "logps/rejected": -297.03863525390625, "loss": 0.6489, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0070503950119019, "rewards/margins": 0.12978364527225494, "rewards/rejected": 0.8772667646408081, "step": 4750 }, { "epoch": 0.22099447513812154, "grad_norm": 57.25096893310547, "learning_rate": 2.867626166488695e-07, "logits/chosen": -17.71824836730957, "logits/rejected": -17.16972541809082, "logps/chosen": -381.62994384765625, "logps/rejected": -338.00653076171875, "loss": 0.6668, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9130890965461731, "rewards/margins": 0.14183638989925385, "rewards/rejected": 0.7712526321411133, "step": 4760 }, { "epoch": 0.2214587492455546, "grad_norm": 85.29606628417969, "learning_rate": 2.867347602024235e-07, "logits/chosen": -18.05654525756836, "logits/rejected": -17.405431747436523, "logps/chosen": -395.28375244140625, "logps/rejected": -347.62481689453125, "loss": 0.6824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.06593656539917, "rewards/margins": 0.1318880319595337, "rewards/rejected": 0.9340485334396362, "step": 4770 }, { "epoch": 0.2219230233529876, "grad_norm": 44.89200210571289, "learning_rate": 2.8670690375597753e-07, "logits/chosen": -19.084312438964844, "logits/rejected": -18.176998138427734, "logps/chosen": -447.24945068359375, "logps/rejected": -404.04351806640625, "loss": 0.6404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2043808698654175, "rewards/margins": 0.1865081489086151, "rewards/rejected": 1.0178728103637695, "step": 4780 }, { "epoch": 0.22238729746042063, "grad_norm": 96.22667694091797, "learning_rate": 
2.866790473095315e-07, "logits/chosen": -18.947158813476562, "logits/rejected": -17.61870002746582, "logps/chosen": -448.3558044433594, "logps/rejected": -343.8897705078125, "loss": 0.5913, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2376458644866943, "rewards/margins": 0.2994541525840759, "rewards/rejected": 0.9381917119026184, "step": 4790 }, { "epoch": 0.22285157156785365, "grad_norm": 92.2330093383789, "learning_rate": 2.8665119086308556e-07, "logits/chosen": -18.199535369873047, "logits/rejected": -17.67157554626465, "logps/chosen": -388.663330078125, "logps/rejected": -311.4795837402344, "loss": 0.661, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0494245290756226, "rewards/margins": 0.09763672202825546, "rewards/rejected": 0.9517878293991089, "step": 4800 }, { "epoch": 0.2233158456752867, "grad_norm": 89.92005157470703, "learning_rate": 2.8662333441663955e-07, "logits/chosen": -17.989334106445312, "logits/rejected": -17.867259979248047, "logps/chosen": -338.82318115234375, "logps/rejected": -318.41400146484375, "loss": 0.7306, "rewards/accuracies": 0.5, "rewards/chosen": 0.8176368474960327, "rewards/margins": -0.030736494809389114, "rewards/rejected": 0.848373293876648, "step": 4810 }, { "epoch": 0.22378011978271972, "grad_norm": 136.36492919921875, "learning_rate": 2.865954779701936e-07, "logits/chosen": -17.90657615661621, "logits/rejected": -17.2901668548584, "logps/chosen": -406.1366271972656, "logps/rejected": -298.71185302734375, "loss": 0.6368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9472452402114868, "rewards/margins": 0.19242338836193085, "rewards/rejected": 0.75482177734375, "step": 4820 }, { "epoch": 0.22424439389015274, "grad_norm": 57.8651237487793, "learning_rate": 2.8656762152374763e-07, "logits/chosen": -17.855976104736328, "logits/rejected": -17.493934631347656, "logps/chosen": -363.08197021484375, "logps/rejected": -328.5620422363281, "loss": 0.7399, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 0.8307450413703918, "rewards/margins": -0.0627308264374733, "rewards/rejected": 0.8934758305549622, "step": 4830 }, { "epoch": 0.22470866799758576, "grad_norm": 98.692138671875, "learning_rate": 2.865397650773016e-07, "logits/chosen": -17.84244728088379, "logits/rejected": -17.88890266418457, "logps/chosen": -452.92822265625, "logps/rejected": -404.5495910644531, "loss": 0.7083, "rewards/accuracies": 0.5, "rewards/chosen": 0.9723037481307983, "rewards/margins": -0.0054008751176297665, "rewards/rejected": 0.977704644203186, "step": 4840 }, { "epoch": 0.22517294210501881, "grad_norm": 70.90565490722656, "learning_rate": 2.8651190863085566e-07, "logits/chosen": -17.837764739990234, "logits/rejected": -16.620697021484375, "logps/chosen": -460.21923828125, "logps/rejected": -293.2234802246094, "loss": 0.6014, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.990347683429718, "rewards/margins": 0.2209126502275467, "rewards/rejected": 0.7694350481033325, "step": 4850 }, { "epoch": 0.22563721621245184, "grad_norm": 83.40522003173828, "learning_rate": 2.8648405218440965e-07, "logits/chosen": -18.14415740966797, "logits/rejected": -17.214872360229492, "logps/chosen": -371.1001281738281, "logps/rejected": -313.79473876953125, "loss": 0.5841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1481746435165405, "rewards/margins": 0.36958450078964233, "rewards/rejected": 0.7785903215408325, "step": 4860 }, { "epoch": 0.22610149031988486, "grad_norm": 83.25127410888672, "learning_rate": 2.864561957379637e-07, "logits/chosen": -17.41317367553711, "logits/rejected": -17.476167678833008, "logps/chosen": -416.8440856933594, "logps/rejected": -438.70428466796875, "loss": 0.7141, "rewards/accuracies": 0.5, "rewards/chosen": 1.1011117696762085, "rewards/margins": -0.0016798407305032015, "rewards/rejected": 1.1027915477752686, "step": 4870 }, { "epoch": 0.22656576442731788, "grad_norm": 40.828121185302734, "learning_rate": 
2.864283392915177e-07, "logits/chosen": -16.789291381835938, "logits/rejected": -16.58831214904785, "logps/chosen": -252.673583984375, "logps/rejected": -277.0472106933594, "loss": 0.7536, "rewards/accuracies": 0.5, "rewards/chosen": 0.6922749280929565, "rewards/margins": -0.05462505295872688, "rewards/rejected": 0.7468999624252319, "step": 4880 }, { "epoch": 0.22703003853475093, "grad_norm": 46.38114929199219, "learning_rate": 2.864004828450717e-07, "logits/chosen": -17.50882339477539, "logits/rejected": -17.24483871459961, "logps/chosen": -408.152587890625, "logps/rejected": -385.4029235839844, "loss": 0.6362, "rewards/accuracies": 0.5, "rewards/chosen": 1.1063320636749268, "rewards/margins": 0.2051454484462738, "rewards/rejected": 0.9011866450309753, "step": 4890 }, { "epoch": 0.22749431264218395, "grad_norm": 137.9785614013672, "learning_rate": 2.8637262639862576e-07, "logits/chosen": -18.284465789794922, "logits/rejected": -17.33695411682129, "logps/chosen": -501.44525146484375, "logps/rejected": -401.377197265625, "loss": 0.599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2118489742279053, "rewards/margins": 0.24030153453350067, "rewards/rejected": 0.9715474247932434, "step": 4900 }, { "epoch": 0.22795858674961697, "grad_norm": 63.401824951171875, "learning_rate": 2.8634476995217975e-07, "logits/chosen": -17.687686920166016, "logits/rejected": -16.97152328491211, "logps/chosen": -372.76708984375, "logps/rejected": -324.0478820800781, "loss": 0.6057, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.032700777053833, "rewards/margins": 0.25688856840133667, "rewards/rejected": 0.7758121490478516, "step": 4910 }, { "epoch": 0.22842286085705, "grad_norm": 76.78984069824219, "learning_rate": 2.8631691350573374e-07, "logits/chosen": -18.441362380981445, "logits/rejected": -17.2286376953125, "logps/chosen": -459.13690185546875, "logps/rejected": -256.2331237792969, "loss": 0.5294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
1.2000315189361572, "rewards/margins": 0.4534642696380615, "rewards/rejected": 0.7465672492980957, "step": 4920 }, { "epoch": 0.22888713496448304, "grad_norm": 60.02265167236328, "learning_rate": 2.862890570592878e-07, "logits/chosen": -17.655385971069336, "logits/rejected": -16.581090927124023, "logps/chosen": -444.7256774902344, "logps/rejected": -283.3291015625, "loss": 0.5589, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0944761037826538, "rewards/margins": 0.37313520908355713, "rewards/rejected": 0.7213408350944519, "step": 4930 }, { "epoch": 0.22935140907191606, "grad_norm": 54.834293365478516, "learning_rate": 2.862612006128418e-07, "logits/chosen": -19.278385162353516, "logits/rejected": -17.874805450439453, "logps/chosen": -472.5941467285156, "logps/rejected": -444.63232421875, "loss": 0.5643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3967478275299072, "rewards/margins": 0.3126637041568756, "rewards/rejected": 1.084084153175354, "step": 4940 }, { "epoch": 0.22981568317934908, "grad_norm": 169.4990997314453, "learning_rate": 2.8623334416639586e-07, "logits/chosen": -18.46364974975586, "logits/rejected": -16.928781509399414, "logps/chosen": -396.4701843261719, "logps/rejected": -273.3775329589844, "loss": 0.5819, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.082417607307434, "rewards/margins": 0.35843944549560547, "rewards/rejected": 0.7239781618118286, "step": 4950 }, { "epoch": 0.2302799572867821, "grad_norm": 56.91423797607422, "learning_rate": 2.8620548771994985e-07, "logits/chosen": -18.053810119628906, "logits/rejected": -17.225116729736328, "logps/chosen": -384.4461975097656, "logps/rejected": -288.6903991699219, "loss": 0.6007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.917990505695343, "rewards/margins": 0.23787155747413635, "rewards/rejected": 0.6801189184188843, "step": 4960 }, { "epoch": 0.23074423139421515, "grad_norm": 100.45523071289062, "learning_rate": 
2.8617763127350384e-07, "logits/chosen": -17.869243621826172, "logits/rejected": -17.314279556274414, "logps/chosen": -350.9238586425781, "logps/rejected": -344.1046447753906, "loss": 0.6384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0657336711883545, "rewards/margins": 0.16731080412864685, "rewards/rejected": 0.8984228372573853, "step": 4970 }, { "epoch": 0.23120850550164818, "grad_norm": 52.90190124511719, "learning_rate": 2.861497748270579e-07, "logits/chosen": -18.14702033996582, "logits/rejected": -17.521907806396484, "logps/chosen": -382.24029541015625, "logps/rejected": -350.7237243652344, "loss": 0.6209, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1050622463226318, "rewards/margins": 0.169800266623497, "rewards/rejected": 0.9352619051933289, "step": 4980 }, { "epoch": 0.2316727796090812, "grad_norm": 20.624624252319336, "learning_rate": 2.861219183806119e-07, "logits/chosen": -17.10013771057129, "logits/rejected": -16.49679183959961, "logps/chosen": -393.119384765625, "logps/rejected": -335.2757568359375, "loss": 0.76, "rewards/accuracies": 0.5, "rewards/chosen": 0.9859763979911804, "rewards/margins": 0.05775586888194084, "rewards/rejected": 0.9282205700874329, "step": 4990 }, { "epoch": 0.23213705371651422, "grad_norm": 68.05693054199219, "learning_rate": 2.860940619341659e-07, "logits/chosen": -17.661596298217773, "logits/rejected": -17.059139251708984, "logps/chosen": -436.39385986328125, "logps/rejected": -347.05059814453125, "loss": 0.6205, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.10430109500885, "rewards/margins": 0.24824412167072296, "rewards/rejected": 0.8560568690299988, "step": 5000 }, { "epoch": 0.23260132782394727, "grad_norm": 56.23577117919922, "learning_rate": 2.8606620548771995e-07, "logits/chosen": -18.130233764648438, "logits/rejected": -17.54892349243164, "logps/chosen": -480.25262451171875, "logps/rejected": -414.89947509765625, "loss": 0.668, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 1.1961305141448975, "rewards/margins": 0.10628805309534073, "rewards/rejected": 1.089842438697815, "step": 5010 }, { "epoch": 0.2330656019313803, "grad_norm": 75.58949279785156, "learning_rate": 2.86038349041274e-07, "logits/chosen": -18.743118286132812, "logits/rejected": -17.55527687072754, "logps/chosen": -418.32403564453125, "logps/rejected": -332.10870361328125, "loss": 0.7037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3444569110870361, "rewards/margins": 0.12975284457206726, "rewards/rejected": 1.2147042751312256, "step": 5020 }, { "epoch": 0.2335298760388133, "grad_norm": 25.633216857910156, "learning_rate": 2.86010492594828e-07, "logits/chosen": -18.37950325012207, "logits/rejected": -17.125280380249023, "logps/chosen": -661.7210693359375, "logps/rejected": -333.6180419921875, "loss": 0.4752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.726593017578125, "rewards/margins": 0.7231378555297852, "rewards/rejected": 1.0034550428390503, "step": 5030 }, { "epoch": 0.23399415014624633, "grad_norm": 124.6767349243164, "learning_rate": 2.8598263614838196e-07, "logits/chosen": -18.627771377563477, "logits/rejected": -17.568798065185547, "logps/chosen": -465.06976318359375, "logps/rejected": -350.00665283203125, "loss": 0.551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1956286430358887, "rewards/margins": 0.41267991065979004, "rewards/rejected": 0.7829487919807434, "step": 5040 }, { "epoch": 0.23445842425367938, "grad_norm": 134.77919006347656, "learning_rate": 2.85954779701936e-07, "logits/chosen": -18.25107192993164, "logits/rejected": -18.011226654052734, "logps/chosen": -493.81158447265625, "logps/rejected": -465.8528747558594, "loss": 0.7302, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.2808282375335693, "rewards/margins": -0.002821665955707431, "rewards/rejected": 1.2836499214172363, "step": 5050 }, { "epoch": 0.2349226983611124, "grad_norm": 
52.39257049560547, "learning_rate": 2.8592692325549005e-07, "logits/chosen": -18.14494514465332, "logits/rejected": -18.43283462524414, "logps/chosen": -401.6662292480469, "logps/rejected": -430.27984619140625, "loss": 0.6979, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.0940340757369995, "rewards/margins": 0.03962203115224838, "rewards/rejected": 1.0544121265411377, "step": 5060 }, { "epoch": 0.23538697246854542, "grad_norm": 45.29792785644531, "learning_rate": 2.8589906680904403e-07, "logits/chosen": -18.37929916381836, "logits/rejected": -17.03180694580078, "logps/chosen": -611.2840576171875, "logps/rejected": -361.9637451171875, "loss": 0.4973, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5999114513397217, "rewards/margins": 0.600090503692627, "rewards/rejected": 0.9998205900192261, "step": 5070 }, { "epoch": 0.23585124657597845, "grad_norm": 92.78739166259766, "learning_rate": 2.858712103625981e-07, "logits/chosen": -18.871461868286133, "logits/rejected": -18.38599395751953, "logps/chosen": -413.83990478515625, "logps/rejected": -403.51873779296875, "loss": 0.6403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.084266185760498, "rewards/margins": 0.15102358162403107, "rewards/rejected": 0.933242678642273, "step": 5080 }, { "epoch": 0.2363155206834115, "grad_norm": 59.93891906738281, "learning_rate": 2.8584335391615206e-07, "logits/chosen": -17.540578842163086, "logits/rejected": -17.114477157592773, "logps/chosen": -427.18328857421875, "logps/rejected": -351.6246643066406, "loss": 0.6001, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.08748459815979, "rewards/margins": 0.2150939404964447, "rewards/rejected": 0.872390627861023, "step": 5090 }, { "epoch": 0.23677979479084452, "grad_norm": 76.12251281738281, "learning_rate": 2.858154974697061e-07, "logits/chosen": -16.836986541748047, "logits/rejected": -16.372312545776367, "logps/chosen": -292.67413330078125, "logps/rejected": -274.80413818359375, 
"loss": 0.6746, "rewards/accuracies": 0.5, "rewards/chosen": 0.9589205980300903, "rewards/margins": 0.12279431521892548, "rewards/rejected": 0.8361262083053589, "step": 5100 }, { "epoch": 0.23724406889827754, "grad_norm": 76.75672912597656, "learning_rate": 2.857876410232601e-07, "logits/chosen": -19.199596405029297, "logits/rejected": -18.263690948486328, "logps/chosen": -586.4119262695312, "logps/rejected": -472.40557861328125, "loss": 0.6069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3598315715789795, "rewards/margins": 0.24281708896160126, "rewards/rejected": 1.1170145273208618, "step": 5110 }, { "epoch": 0.23770834300571056, "grad_norm": 77.86820220947266, "learning_rate": 2.8575978457681413e-07, "logits/chosen": -17.682952880859375, "logits/rejected": -17.323095321655273, "logps/chosen": -450.78729248046875, "logps/rejected": -408.1517333984375, "loss": 0.7308, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0760996341705322, "rewards/margins": 0.06802015006542206, "rewards/rejected": 1.0080795288085938, "step": 5120 }, { "epoch": 0.2381726171131436, "grad_norm": 46.06122970581055, "learning_rate": 2.857319281303682e-07, "logits/chosen": -17.05001449584961, "logits/rejected": -16.430959701538086, "logps/chosen": -312.84454345703125, "logps/rejected": -277.15899658203125, "loss": 0.5952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1885615587234497, "rewards/margins": 0.3118475377559662, "rewards/rejected": 0.8767139315605164, "step": 5130 }, { "epoch": 0.23863689122057663, "grad_norm": 56.996578216552734, "learning_rate": 2.8570407168392216e-07, "logits/chosen": -18.19634246826172, "logits/rejected": -17.73285484313965, "logps/chosen": -540.5237426757812, "logps/rejected": -442.4817810058594, "loss": 0.6477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4698278903961182, "rewards/margins": 0.15735992789268494, "rewards/rejected": 1.3124679327011108, "step": 5140 }, { "epoch": 
0.23910116532800965, "grad_norm": 143.6078643798828, "learning_rate": 2.856762152374762e-07, "logits/chosen": -18.184280395507812, "logits/rejected": -17.548171997070312, "logps/chosen": -465.4551696777344, "logps/rejected": -456.50555419921875, "loss": 0.7356, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.1051218509674072, "rewards/margins": 0.015428602695465088, "rewards/rejected": 1.089693307876587, "step": 5150 }, { "epoch": 0.23956543943544267, "grad_norm": 119.43887329101562, "learning_rate": 2.856483587910302e-07, "logits/chosen": -18.70871925354004, "logits/rejected": -19.01114273071289, "logps/chosen": -388.79827880859375, "logps/rejected": -427.991455078125, "loss": 0.6983, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.0899300575256348, "rewards/margins": 0.007724320981651545, "rewards/rejected": 1.0822057723999023, "step": 5160 }, { "epoch": 0.24002971354287572, "grad_norm": 59.88078689575195, "learning_rate": 2.8562050234458423e-07, "logits/chosen": -19.239093780517578, "logits/rejected": -17.913103103637695, "logps/chosen": -479.83319091796875, "logps/rejected": -325.940673828125, "loss": 0.5114, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.360792875289917, "rewards/margins": 0.44755420088768005, "rewards/rejected": 0.9132386445999146, "step": 5170 }, { "epoch": 0.24049398765030874, "grad_norm": 73.08869934082031, "learning_rate": 2.855926458981383e-07, "logits/chosen": -18.934377670288086, "logits/rejected": -17.9959716796875, "logps/chosen": -432.20361328125, "logps/rejected": -381.12689208984375, "loss": 0.5923, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1940810680389404, "rewards/margins": 0.29511839151382446, "rewards/rejected": 0.8989627957344055, "step": 5180 }, { "epoch": 0.24095826175774177, "grad_norm": 101.90892791748047, "learning_rate": 2.8556478945169226e-07, "logits/chosen": -18.090572357177734, "logits/rejected": -18.10669708251953, "logps/chosen": -332.4056091308594, 
"logps/rejected": -356.21868896484375, "loss": 0.6695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9186347126960754, "rewards/margins": 0.09100769460201263, "rewards/rejected": 0.8276268839836121, "step": 5190 }, { "epoch": 0.2414225358651748, "grad_norm": 74.24759674072266, "learning_rate": 2.855369330052463e-07, "logits/chosen": -18.617462158203125, "logits/rejected": -18.56329917907715, "logps/chosen": -325.6420593261719, "logps/rejected": -298.5250244140625, "loss": 0.7393, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.9840266108512878, "rewards/margins": -0.04983597993850708, "rewards/rejected": 1.0338627099990845, "step": 5200 }, { "epoch": 0.24188680997260784, "grad_norm": 52.44668197631836, "learning_rate": 2.855090765588003e-07, "logits/chosen": -18.33194923400879, "logits/rejected": -17.90555191040039, "logps/chosen": -481.74688720703125, "logps/rejected": -292.26641845703125, "loss": 0.4969, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.2512178421020508, "rewards/margins": 0.501774787902832, "rewards/rejected": 0.7494430541992188, "step": 5210 }, { "epoch": 0.24235108408004086, "grad_norm": 122.09659576416016, "learning_rate": 2.8548122011235433e-07, "logits/chosen": -18.094890594482422, "logits/rejected": -17.52042579650879, "logps/chosen": -305.1699523925781, "logps/rejected": -268.525390625, "loss": 0.6959, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8782604932785034, "rewards/margins": 0.08133852481842041, "rewards/rejected": 0.7969220280647278, "step": 5220 }, { "epoch": 0.24281535818747388, "grad_norm": 147.31704711914062, "learning_rate": 2.854533636659083e-07, "logits/chosen": -19.001323699951172, "logits/rejected": -18.2458438873291, "logps/chosen": -543.1766967773438, "logps/rejected": -390.4980773925781, "loss": 0.5937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3941758871078491, "rewards/margins": 0.2781301736831665, "rewards/rejected": 1.1160457134246826, 
"step": 5230 }, { "epoch": 0.2432796322949069, "grad_norm": 67.06189727783203, "learning_rate": 2.8542550721946236e-07, "logits/chosen": -17.714649200439453, "logits/rejected": -16.838102340698242, "logps/chosen": -442.4021911621094, "logps/rejected": -310.940673828125, "loss": 0.5667, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.461525559425354, "rewards/margins": 0.40587979555130005, "rewards/rejected": 1.0556457042694092, "step": 5240 }, { "epoch": 0.24374390640233995, "grad_norm": 222.1954803466797, "learning_rate": 2.853976507730164e-07, "logits/chosen": -19.284971237182617, "logits/rejected": -17.2246036529541, "logps/chosen": -474.8768005371094, "logps/rejected": -340.0010986328125, "loss": 0.5494, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3206846714019775, "rewards/margins": 0.41967636346817017, "rewards/rejected": 0.9010082483291626, "step": 5250 }, { "epoch": 0.24420818050977297, "grad_norm": 138.72103881835938, "learning_rate": 2.853697943265704e-07, "logits/chosen": -16.29001235961914, "logits/rejected": -15.916157722473145, "logps/chosen": -395.69610595703125, "logps/rejected": -334.0440673828125, "loss": 0.6628, "rewards/accuracies": 0.5, "rewards/chosen": 1.0954519510269165, "rewards/margins": 0.20495715737342834, "rewards/rejected": 0.8904949426651001, "step": 5260 }, { "epoch": 0.244672454617206, "grad_norm": 96.02104187011719, "learning_rate": 2.8534193788012443e-07, "logits/chosen": -18.440723419189453, "logits/rejected": -18.309106826782227, "logps/chosen": -331.7604675292969, "logps/rejected": -315.4805603027344, "loss": 0.7035, "rewards/accuracies": 0.5, "rewards/chosen": 1.0580880641937256, "rewards/margins": 0.008399697951972485, "rewards/rejected": 1.0496883392333984, "step": 5270 }, { "epoch": 0.245136728724639, "grad_norm": 119.13866424560547, "learning_rate": 2.853140814336784e-07, "logits/chosen": -18.39320182800293, "logits/rejected": -17.905725479125977, "logps/chosen": -421.1162109375, 
"logps/rejected": -378.52935791015625, "loss": 0.6551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3115195035934448, "rewards/margins": 0.2177324742078781, "rewards/rejected": 1.0937870740890503, "step": 5280 }, { "epoch": 0.24560100283207206, "grad_norm": 27.36235809326172, "learning_rate": 2.8528622498723246e-07, "logits/chosen": -19.257320404052734, "logits/rejected": -18.331501007080078, "logps/chosen": -423.53668212890625, "logps/rejected": -254.63900756835938, "loss": 0.5072, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3203494548797607, "rewards/margins": 0.5748404264450073, "rewards/rejected": 0.7455089092254639, "step": 5290 }, { "epoch": 0.24606527693950508, "grad_norm": 178.11354064941406, "learning_rate": 2.8525836854078645e-07, "logits/chosen": -18.335628509521484, "logits/rejected": -16.859079360961914, "logps/chosen": -434.15899658203125, "logps/rejected": -284.17425537109375, "loss": 0.5459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1741148233413696, "rewards/margins": 0.45146113634109497, "rewards/rejected": 0.7226538062095642, "step": 5300 }, { "epoch": 0.2465295510469381, "grad_norm": 31.4332275390625, "learning_rate": 2.852305120943405e-07, "logits/chosen": -17.557918548583984, "logits/rejected": -16.618284225463867, "logps/chosen": -378.7447204589844, "logps/rejected": -243.24734497070312, "loss": 0.5385, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1126925945281982, "rewards/margins": 0.4785217344760895, "rewards/rejected": 0.6341708898544312, "step": 5310 }, { "epoch": 0.24699382515437113, "grad_norm": 47.62028121948242, "learning_rate": 2.8520265564789453e-07, "logits/chosen": -18.321142196655273, "logits/rejected": -17.475345611572266, "logps/chosen": -443.2754821777344, "logps/rejected": -280.68505859375, "loss": 0.4471, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4674307107925415, "rewards/margins": 0.7458375692367554, "rewards/rejected": 
0.7215932607650757, "step": 5320 }, { "epoch": 0.24745809926180418, "grad_norm": 51.53606414794922, "learning_rate": 2.851747992014485e-07, "logits/chosen": -17.723676681518555, "logits/rejected": -16.600040435791016, "logps/chosen": -284.51654052734375, "logps/rejected": -180.17251586914062, "loss": 0.5802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.769919753074646, "rewards/margins": 0.2998882532119751, "rewards/rejected": 0.4700315594673157, "step": 5330 }, { "epoch": 0.2479223733692372, "grad_norm": 71.61044311523438, "learning_rate": 2.851469427550025e-07, "logits/chosen": -17.84402847290039, "logits/rejected": -18.89737892150879, "logps/chosen": -404.655517578125, "logps/rejected": -389.03582763671875, "loss": 0.7117, "rewards/accuracies": 0.5, "rewards/chosen": 1.1275371313095093, "rewards/margins": 0.02747141197323799, "rewards/rejected": 1.1000655889511108, "step": 5340 }, { "epoch": 0.24838664747667022, "grad_norm": 67.41191864013672, "learning_rate": 2.8511908630855655e-07, "logits/chosen": -18.814498901367188, "logits/rejected": -17.501361846923828, "logps/chosen": -507.7730407714844, "logps/rejected": -341.7479248046875, "loss": 0.4752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.493617296218872, "rewards/margins": 0.5956856608390808, "rewards/rejected": 0.897931694984436, "step": 5350 }, { "epoch": 0.24885092158410324, "grad_norm": 54.36946487426758, "learning_rate": 2.850912298621106e-07, "logits/chosen": -18.39284896850586, "logits/rejected": -16.689647674560547, "logps/chosen": -410.5733337402344, "logps/rejected": -333.4823303222656, "loss": 0.5336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3342839479446411, "rewards/margins": 0.442709743976593, "rewards/rejected": 0.8915742039680481, "step": 5360 }, { "epoch": 0.2493151956915363, "grad_norm": 92.85326385498047, "learning_rate": 2.8506337341566463e-07, "logits/chosen": -17.797719955444336, "logits/rejected": -17.368755340576172, "logps/chosen": 
-476.97509765625, "logps/rejected": -490.1650390625, "loss": 0.6527, "rewards/accuracies": 0.5, "rewards/chosen": 1.585209846496582, "rewards/margins": 0.14084358513355255, "rewards/rejected": 1.444366216659546, "step": 5370 }, { "epoch": 0.2497794697989693, "grad_norm": 101.65103149414062, "learning_rate": 2.850355169692186e-07, "logits/chosen": -17.919525146484375, "logits/rejected": -18.49100112915039, "logps/chosen": -296.3255920410156, "logps/rejected": -291.7710876464844, "loss": 0.7687, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.7577011585235596, "rewards/margins": -0.09568385034799576, "rewards/rejected": 0.8533849716186523, "step": 5380 }, { "epoch": 0.25024374390640236, "grad_norm": 98.32312774658203, "learning_rate": 2.850076605227726e-07, "logits/chosen": -19.056795120239258, "logits/rejected": -17.89362144470215, "logps/chosen": -397.839599609375, "logps/rejected": -302.12603759765625, "loss": 0.6672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1962679624557495, "rewards/margins": 0.13940861821174622, "rewards/rejected": 1.0568593740463257, "step": 5390 }, { "epoch": 0.25070801801383535, "grad_norm": 52.81856918334961, "learning_rate": 2.8497980407632665e-07, "logits/chosen": -18.823986053466797, "logits/rejected": -15.870323181152344, "logps/chosen": -564.3347778320312, "logps/rejected": -279.62451171875, "loss": 0.4717, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5670745372772217, "rewards/margins": 0.6558693051338196, "rewards/rejected": 0.9112052917480469, "step": 5400 }, { "epoch": 0.2511722921212684, "grad_norm": 68.94639587402344, "learning_rate": 2.849519476298807e-07, "logits/chosen": -16.8799991607666, "logits/rejected": -16.330272674560547, "logps/chosen": -318.72930908203125, "logps/rejected": -234.7156982421875, "loss": 0.6535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8701542019844055, "rewards/margins": 0.11211661994457245, "rewards/rejected": 0.7580376863479614, 
"step": 5410 }, { "epoch": 0.25163656622870145, "grad_norm": 56.56523895263672, "learning_rate": 2.849240911834347e-07, "logits/chosen": -17.401281356811523, "logits/rejected": -16.723224639892578, "logps/chosen": -437.43658447265625, "logps/rejected": -345.8362121582031, "loss": 0.6328, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6057298183441162, "rewards/margins": 0.3197987675666809, "rewards/rejected": 1.2859313488006592, "step": 5420 }, { "epoch": 0.25210084033613445, "grad_norm": 110.13633728027344, "learning_rate": 2.848962347369887e-07, "logits/chosen": -18.874849319458008, "logits/rejected": -18.308670043945312, "logps/chosen": -520.1251831054688, "logps/rejected": -469.1781311035156, "loss": 0.7585, "rewards/accuracies": 0.5, "rewards/chosen": 1.4609835147857666, "rewards/margins": -0.03145811706781387, "rewards/rejected": 1.4924416542053223, "step": 5430 }, { "epoch": 0.2525651144435675, "grad_norm": 104.10205841064453, "learning_rate": 2.8486837829054276e-07, "logits/chosen": -17.22673797607422, "logits/rejected": -17.184497833251953, "logps/chosen": -412.28094482421875, "logps/rejected": -352.904052734375, "loss": 0.7872, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.025655746459961, "rewards/margins": -0.07778792083263397, "rewards/rejected": 1.1034437417984009, "step": 5440 }, { "epoch": 0.2530293885510005, "grad_norm": 117.81503295898438, "learning_rate": 2.8484052184409675e-07, "logits/chosen": -17.60367202758789, "logits/rejected": -18.294925689697266, "logps/chosen": -385.55145263671875, "logps/rejected": -325.7090759277344, "loss": 0.826, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.968909740447998, "rewards/margins": -0.23599457740783691, "rewards/rejected": 1.2049041986465454, "step": 5450 }, { "epoch": 0.25349366265843354, "grad_norm": 57.613441467285156, "learning_rate": 2.8481266539765073e-07, "logits/chosen": -16.76311492919922, "logits/rejected": -16.495834350585938, "logps/chosen": 
-372.286376953125, "logps/rejected": -280.94659423828125, "loss": 0.6364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9393056035041809, "rewards/margins": 0.16526946425437927, "rewards/rejected": 0.774036169052124, "step": 5460 }, { "epoch": 0.2539579367658666, "grad_norm": 93.16790008544922, "learning_rate": 2.847848089512048e-07, "logits/chosen": -17.9488468170166, "logits/rejected": -17.293123245239258, "logps/chosen": -466.2003479003906, "logps/rejected": -427.83367919921875, "loss": 0.6277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3949158191680908, "rewards/margins": 0.36429139971733093, "rewards/rejected": 1.0306243896484375, "step": 5470 }, { "epoch": 0.2544222108732996, "grad_norm": 151.01158142089844, "learning_rate": 2.847569525047588e-07, "logits/chosen": -18.57393455505371, "logits/rejected": -18.079177856445312, "logps/chosen": -530.1072998046875, "logps/rejected": -452.65887451171875, "loss": 0.6257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3268356323242188, "rewards/margins": 0.21824951469898224, "rewards/rejected": 1.1085861921310425, "step": 5480 }, { "epoch": 0.25488648498073263, "grad_norm": 139.4373321533203, "learning_rate": 2.847290960583128e-07, "logits/chosen": -18.034839630126953, "logits/rejected": -17.164278030395508, "logps/chosen": -471.03961181640625, "logps/rejected": -355.0730285644531, "loss": 0.7603, "rewards/accuracies": 0.5, "rewards/chosen": 1.1967313289642334, "rewards/margins": -0.051144879311323166, "rewards/rejected": 1.2478762865066528, "step": 5490 }, { "epoch": 0.2553507590881657, "grad_norm": 35.68324661254883, "learning_rate": 2.8470123961186685e-07, "logits/chosen": -17.513154983520508, "logits/rejected": -17.559350967407227, "logps/chosen": -427.1998596191406, "logps/rejected": -352.28704833984375, "loss": 0.6456, "rewards/accuracies": 0.5, "rewards/chosen": 1.2698432207107544, "rewards/margins": 0.16138045489788055, "rewards/rejected": 1.1084626913070679, 
"step": 5500 }, { "epoch": 0.2558150331955987, "grad_norm": 127.03516387939453, "learning_rate": 2.8467338316542083e-07, "logits/chosen": -18.324832916259766, "logits/rejected": -17.660951614379883, "logps/chosen": -504.9381408691406, "logps/rejected": -377.5086669921875, "loss": 0.5854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3741111755371094, "rewards/margins": 0.3322618007659912, "rewards/rejected": 1.0418493747711182, "step": 5510 }, { "epoch": 0.2562793073030317, "grad_norm": 53.91299819946289, "learning_rate": 2.846455267189749e-07, "logits/chosen": -18.056089401245117, "logits/rejected": -17.605350494384766, "logps/chosen": -341.9381408691406, "logps/rejected": -263.4127502441406, "loss": 0.6022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1598435640335083, "rewards/margins": 0.38771185278892517, "rewards/rejected": 0.772131621837616, "step": 5520 }, { "epoch": 0.2567435814104647, "grad_norm": 30.52581787109375, "learning_rate": 2.8461767027252886e-07, "logits/chosen": -18.099641799926758, "logits/rejected": -17.74758529663086, "logps/chosen": -369.76593017578125, "logps/rejected": -321.20751953125, "loss": 0.6286, "rewards/accuracies": 0.5, "rewards/chosen": 0.9751185178756714, "rewards/margins": 0.2045937031507492, "rewards/rejected": 0.7705248594284058, "step": 5530 }, { "epoch": 0.25720785551789777, "grad_norm": 151.14459228515625, "learning_rate": 2.845898138260829e-07, "logits/chosen": -18.124563217163086, "logits/rejected": -17.793855667114258, "logps/chosen": -412.79034423828125, "logps/rejected": -291.31976318359375, "loss": 0.6339, "rewards/accuracies": 0.5, "rewards/chosen": 1.271551489830017, "rewards/margins": 0.4182102084159851, "rewards/rejected": 0.8533414006233215, "step": 5540 }, { "epoch": 0.2576721296253308, "grad_norm": 117.51629638671875, "learning_rate": 2.8456195737963694e-07, "logits/chosen": -17.37653350830078, "logits/rejected": -17.185277938842773, "logps/chosen": -428.52752685546875, 
"logps/rejected": -356.10125732421875, "loss": 0.6403, "rewards/accuracies": 0.5, "rewards/chosen": 1.2486752271652222, "rewards/margins": 0.20899668335914612, "rewards/rejected": 1.0396784543991089, "step": 5550 }, { "epoch": 0.2581364037327638, "grad_norm": 56.417606353759766, "learning_rate": 2.8453410093319093e-07, "logits/chosen": -18.04244041442871, "logits/rejected": -16.690256118774414, "logps/chosen": -495.914794921875, "logps/rejected": -310.18878173828125, "loss": 0.4779, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5609524250030518, "rewards/margins": 0.7115113139152527, "rewards/rejected": 0.8494412302970886, "step": 5560 }, { "epoch": 0.25860067784019686, "grad_norm": 140.16184997558594, "learning_rate": 2.8450624448674497e-07, "logits/chosen": -18.784090042114258, "logits/rejected": -18.53175163269043, "logps/chosen": -407.45684814453125, "logps/rejected": -381.2064514160156, "loss": 0.7517, "rewards/accuracies": 0.5, "rewards/chosen": 1.2508275508880615, "rewards/margins": -0.016730424016714096, "rewards/rejected": 1.2675578594207764, "step": 5570 }, { "epoch": 0.2590649519476299, "grad_norm": 29.246267318725586, "learning_rate": 2.8447838804029896e-07, "logits/chosen": -17.571346282958984, "logits/rejected": -16.72505760192871, "logps/chosen": -348.135986328125, "logps/rejected": -316.5165710449219, "loss": 0.6122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9199895858764648, "rewards/margins": 0.2198220044374466, "rewards/rejected": 0.7001675367355347, "step": 5580 }, { "epoch": 0.2595292260550629, "grad_norm": 16.520998001098633, "learning_rate": 2.84450531593853e-07, "logits/chosen": -19.030807495117188, "logits/rejected": -17.58944320678711, "logps/chosen": -357.6689453125, "logps/rejected": -320.0427551269531, "loss": 0.7594, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.9046198129653931, "rewards/margins": 0.0017458796501159668, "rewards/rejected": 0.9028738737106323, "step": 5590 }, { 
"epoch": 0.25999350016249595, "grad_norm": 48.176536560058594, "learning_rate": 2.8442267514740704e-07, "logits/chosen": -18.44316864013672, "logits/rejected": -16.579818725585938, "logps/chosen": -391.97760009765625, "logps/rejected": -203.78271484375, "loss": 0.425, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4552829265594482, "rewards/margins": 0.7683486342430115, "rewards/rejected": 0.6869341135025024, "step": 5600 }, { "epoch": 0.26045777426992894, "grad_norm": 42.55251693725586, "learning_rate": 2.8439481870096103e-07, "logits/chosen": -18.58191680908203, "logits/rejected": -17.170642852783203, "logps/chosen": -363.65338134765625, "logps/rejected": -182.65530395507812, "loss": 0.5592, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9637312889099121, "rewards/margins": 0.3766840100288391, "rewards/rejected": 0.5870472192764282, "step": 5610 }, { "epoch": 0.260922048377362, "grad_norm": 46.851165771484375, "learning_rate": 2.8436696225451507e-07, "logits/chosen": -17.952693939208984, "logits/rejected": -17.498619079589844, "logps/chosen": -281.4603576660156, "logps/rejected": -234.693603515625, "loss": 0.5682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0469937324523926, "rewards/margins": 0.3517417311668396, "rewards/rejected": 0.6952520608901978, "step": 5620 }, { "epoch": 0.26138632248479504, "grad_norm": 52.60160827636719, "learning_rate": 2.8433910580806906e-07, "logits/chosen": -18.04552459716797, "logits/rejected": -17.249143600463867, "logps/chosen": -328.5736999511719, "logps/rejected": -262.7996520996094, "loss": 0.624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.101150393486023, "rewards/margins": 0.21692109107971191, "rewards/rejected": 0.8842293620109558, "step": 5630 }, { "epoch": 0.26185059659222804, "grad_norm": 47.50684356689453, "learning_rate": 2.8431124936162305e-07, "logits/chosen": -18.024730682373047, "logits/rejected": -17.8968563079834, "logps/chosen": -386.4278869628906, 
"logps/rejected": -436.233154296875, "loss": 0.7656, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.156690239906311, "rewards/margins": -0.06140853092074394, "rewards/rejected": 1.218098759651184, "step": 5640 }, { "epoch": 0.2623148706996611, "grad_norm": 30.394371032714844, "learning_rate": 2.842833929151771e-07, "logits/chosen": -17.764131546020508, "logits/rejected": -17.40953826904297, "logps/chosen": -386.974365234375, "logps/rejected": -327.70416259765625, "loss": 0.7079, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0947850942611694, "rewards/margins": 0.13834871351718903, "rewards/rejected": 0.9564364552497864, "step": 5650 }, { "epoch": 0.26277914480709413, "grad_norm": 103.80447387695312, "learning_rate": 2.8425553646873113e-07, "logits/chosen": -17.000667572021484, "logits/rejected": -16.66374397277832, "logps/chosen": -376.09112548828125, "logps/rejected": -273.88128662109375, "loss": 0.6062, "rewards/accuracies": 0.5, "rewards/chosen": 1.1164056062698364, "rewards/margins": 0.33031216263771057, "rewards/rejected": 0.786093533039093, "step": 5660 }, { "epoch": 0.2632434189145271, "grad_norm": 74.98225402832031, "learning_rate": 2.8422768002228517e-07, "logits/chosen": -18.208255767822266, "logits/rejected": -17.39864730834961, "logps/chosen": -428.165283203125, "logps/rejected": -301.45147705078125, "loss": 0.6665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6406663656234741, "rewards/margins": 0.21150870621204376, "rewards/rejected": 1.4291576147079468, "step": 5670 }, { "epoch": 0.2637076930219602, "grad_norm": 61.52045822143555, "learning_rate": 2.8419982357583916e-07, "logits/chosen": -17.728883743286133, "logits/rejected": -17.54781723022461, "logps/chosen": -299.2795715332031, "logps/rejected": -307.7386169433594, "loss": 0.6827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8128770589828491, "rewards/margins": 0.06912261247634888, "rewards/rejected": 0.7437543869018555, "step": 5680 
}, { "epoch": 0.26417196712939317, "grad_norm": 81.35936737060547, "learning_rate": 2.841719671293932e-07, "logits/chosen": -18.47464370727539, "logits/rejected": -18.444595336914062, "logps/chosen": -464.2998046875, "logps/rejected": -411.41680908203125, "loss": 0.6456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3816540241241455, "rewards/margins": 0.1758098304271698, "rewards/rejected": 1.2058441638946533, "step": 5690 }, { "epoch": 0.2646362412368262, "grad_norm": 91.24411010742188, "learning_rate": 2.841441106829472e-07, "logits/chosen": -17.799457550048828, "logits/rejected": -17.334524154663086, "logps/chosen": -426.08465576171875, "logps/rejected": -397.59991455078125, "loss": 0.6803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.27030611038208, "rewards/margins": 0.13009071350097656, "rewards/rejected": 1.1402153968811035, "step": 5700 }, { "epoch": 0.26510051534425927, "grad_norm": 92.78794860839844, "learning_rate": 2.8411625423650123e-07, "logits/chosen": -18.935871124267578, "logits/rejected": -18.4376220703125, "logps/chosen": -325.72650146484375, "logps/rejected": -305.5580139160156, "loss": 0.6501, "rewards/accuracies": 0.5, "rewards/chosen": 1.0956329107284546, "rewards/margins": 0.19518904387950897, "rewards/rejected": 0.9004438519477844, "step": 5710 }, { "epoch": 0.26556478945169226, "grad_norm": 116.644287109375, "learning_rate": 2.840883977900552e-07, "logits/chosen": -18.21790313720703, "logits/rejected": -17.518224716186523, "logps/chosen": -543.6649169921875, "logps/rejected": -459.7926330566406, "loss": 0.6783, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4475934505462646, "rewards/margins": 0.08770699799060822, "rewards/rejected": 1.3598864078521729, "step": 5720 }, { "epoch": 0.2660290635591253, "grad_norm": 70.30846405029297, "learning_rate": 2.8406054134360926e-07, "logits/chosen": -17.59714126586914, "logits/rejected": -17.23799705505371, "logps/chosen": -474.3416442871094, 
"logps/rejected": -436.72296142578125, "loss": 0.7405, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.4045649766921997, "rewards/margins": -0.011160284280776978, "rewards/rejected": 1.4157252311706543, "step": 5730 }, { "epoch": 0.26649333766655836, "grad_norm": 37.58671951293945, "learning_rate": 2.840326848971633e-07, "logits/chosen": -18.611047744750977, "logits/rejected": -18.420366287231445, "logps/chosen": -367.35369873046875, "logps/rejected": -361.42364501953125, "loss": 0.6785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.296018362045288, "rewards/margins": 0.11741559207439423, "rewards/rejected": 1.178602933883667, "step": 5740 }, { "epoch": 0.26695761177399135, "grad_norm": 128.61940002441406, "learning_rate": 2.840048284507173e-07, "logits/chosen": -18.484067916870117, "logits/rejected": -18.219005584716797, "logps/chosen": -511.8435974121094, "logps/rejected": -388.30615234375, "loss": 0.5716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.430830717086792, "rewards/margins": 0.34319019317626953, "rewards/rejected": 1.0876405239105225, "step": 5750 }, { "epoch": 0.2674218858814244, "grad_norm": 117.94403839111328, "learning_rate": 2.839769720042713e-07, "logits/chosen": -17.326915740966797, "logits/rejected": -16.38994026184082, "logps/chosen": -464.23760986328125, "logps/rejected": -334.8603820800781, "loss": 0.5337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4161572456359863, "rewards/margins": 0.4077610969543457, "rewards/rejected": 1.0083961486816406, "step": 5760 }, { "epoch": 0.2678861599888574, "grad_norm": 51.78324890136719, "learning_rate": 2.839491155578253e-07, "logits/chosen": -18.285633087158203, "logits/rejected": -17.036401748657227, "logps/chosen": -473.02655029296875, "logps/rejected": -315.1029052734375, "loss": 0.4994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3938701152801514, "rewards/margins": 0.5535570979118347, "rewards/rejected": 
0.8403130769729614, "step": 5770 }, { "epoch": 0.26835043409629045, "grad_norm": 104.38003540039062, "learning_rate": 2.8392125911137936e-07, "logits/chosen": -17.120038986206055, "logits/rejected": -17.290254592895508, "logps/chosen": -242.31216430664062, "logps/rejected": -313.56695556640625, "loss": 0.7918, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.8448537588119507, "rewards/margins": -0.075102299451828, "rewards/rejected": 0.9199560880661011, "step": 5780 }, { "epoch": 0.2688147082037235, "grad_norm": 80.54580688476562, "learning_rate": 2.838934026649334e-07, "logits/chosen": -18.245004653930664, "logits/rejected": -18.533403396606445, "logps/chosen": -450.58184814453125, "logps/rejected": -455.082763671875, "loss": 0.7623, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.1276335716247559, "rewards/margins": -0.08058623969554901, "rewards/rejected": 1.2082197666168213, "step": 5790 }, { "epoch": 0.2692789823111565, "grad_norm": 94.714599609375, "learning_rate": 2.838655462184874e-07, "logits/chosen": -18.381982803344727, "logits/rejected": -16.77960205078125, "logps/chosen": -517.3436889648438, "logps/rejected": -332.55859375, "loss": 0.452, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8716633319854736, "rewards/margins": 0.7249577641487122, "rewards/rejected": 1.1467056274414062, "step": 5800 }, { "epoch": 0.26974325641858954, "grad_norm": 57.283203125, "learning_rate": 2.838376897720414e-07, "logits/chosen": -17.694534301757812, "logits/rejected": -17.651561737060547, "logps/chosen": -433.9910583496094, "logps/rejected": -302.82769775390625, "loss": 0.667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0464438199996948, "rewards/margins": 0.12954135239124298, "rewards/rejected": 0.9169025421142578, "step": 5810 }, { "epoch": 0.2702075305260226, "grad_norm": 72.26593780517578, "learning_rate": 2.838098333255954e-07, "logits/chosen": -17.220836639404297, "logits/rejected": -16.871244430541992, 
"logps/chosen": -329.8766174316406, "logps/rejected": -313.7626037597656, "loss": 0.7893, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9436739087104797, "rewards/margins": -0.01840991899371147, "rewards/rejected": 0.9620838165283203, "step": 5820 }, { "epoch": 0.2706718046334556, "grad_norm": 38.75969314575195, "learning_rate": 2.837819768791494e-07, "logits/chosen": -17.347698211669922, "logits/rejected": -17.626049041748047, "logps/chosen": -373.2635803222656, "logps/rejected": -396.3816223144531, "loss": 0.8042, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.0139431953430176, "rewards/margins": -0.18291538953781128, "rewards/rejected": 1.196858525276184, "step": 5830 }, { "epoch": 0.27113607874088863, "grad_norm": 52.63250732421875, "learning_rate": 2.8375412043270345e-07, "logits/chosen": -18.089096069335938, "logits/rejected": -17.898338317871094, "logps/chosen": -294.9027099609375, "logps/rejected": -282.5901184082031, "loss": 0.6613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8833093643188477, "rewards/margins": 0.11324381828308105, "rewards/rejected": 0.7700655460357666, "step": 5840 }, { "epoch": 0.2716003528483216, "grad_norm": 105.0658950805664, "learning_rate": 2.837262639862575e-07, "logits/chosen": -18.964122772216797, "logits/rejected": -18.358675003051758, "logps/chosen": -621.6658935546875, "logps/rejected": -513.9593505859375, "loss": 0.6447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6823713779449463, "rewards/margins": 0.23622623085975647, "rewards/rejected": 1.4461450576782227, "step": 5850 }, { "epoch": 0.2720646269557547, "grad_norm": 81.09480285644531, "learning_rate": 2.8369840753981153e-07, "logits/chosen": -18.82179832458496, "logits/rejected": -16.746845245361328, "logps/chosen": -444.3614807128906, "logps/rejected": -280.1859130859375, "loss": 0.62, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3633806705474854, "rewards/margins": 0.21798133850097656, 
"rewards/rejected": 1.1453993320465088, "step": 5860 }, { "epoch": 0.2725289010631877, "grad_norm": 128.9797821044922, "learning_rate": 2.836705510933655e-07, "logits/chosen": -17.885303497314453, "logits/rejected": -17.062774658203125, "logps/chosen": -426.4461975097656, "logps/rejected": -343.80560302734375, "loss": 0.6635, "rewards/accuracies": 0.5, "rewards/chosen": 1.1932342052459717, "rewards/margins": 0.13702313601970673, "rewards/rejected": 1.056211233139038, "step": 5870 }, { "epoch": 0.2729931751706207, "grad_norm": 104.99828338623047, "learning_rate": 2.836426946469195e-07, "logits/chosen": -17.593059539794922, "logits/rejected": -16.73614501953125, "logps/chosen": -403.3441467285156, "logps/rejected": -242.77023315429688, "loss": 0.5135, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2575901746749878, "rewards/margins": 0.5141729712486267, "rewards/rejected": 0.7434172630310059, "step": 5880 }, { "epoch": 0.27345744927805377, "grad_norm": 137.20758056640625, "learning_rate": 2.8361483820047354e-07, "logits/chosen": -17.958086013793945, "logits/rejected": -17.181652069091797, "logps/chosen": -428.03594970703125, "logps/rejected": -405.1958923339844, "loss": 0.6714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3711007833480835, "rewards/margins": 0.12516868114471436, "rewards/rejected": 1.2459322214126587, "step": 5890 }, { "epoch": 0.2739217233854868, "grad_norm": 62.65420913696289, "learning_rate": 2.835869817540276e-07, "logits/chosen": -17.507917404174805, "logits/rejected": -16.60812759399414, "logps/chosen": -334.7933654785156, "logps/rejected": -258.69549560546875, "loss": 0.6455, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.185690999031067, "rewards/margins": 0.28292331099510193, "rewards/rejected": 0.9027677774429321, "step": 5900 }, { "epoch": 0.2743859974929198, "grad_norm": 147.26205444335938, "learning_rate": 2.835591253075816e-07, "logits/chosen": -18.224964141845703, "logits/rejected": 
-18.537057876586914, "logps/chosen": -458.9078674316406, "logps/rejected": -478.62152099609375, "loss": 0.853, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.4408048391342163, "rewards/margins": -0.235199972987175, "rewards/rejected": 1.6760050058364868, "step": 5910 }, { "epoch": 0.27485027160035286, "grad_norm": 76.8711929321289, "learning_rate": 2.835312688611356e-07, "logits/chosen": -18.698036193847656, "logits/rejected": -17.949813842773438, "logps/chosen": -420.56646728515625, "logps/rejected": -342.1430358886719, "loss": 0.5097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4901659488677979, "rewards/margins": 0.4740417003631592, "rewards/rejected": 1.0161243677139282, "step": 5920 }, { "epoch": 0.27531454570778585, "grad_norm": 179.94488525390625, "learning_rate": 2.835034124146896e-07, "logits/chosen": -18.106245040893555, "logits/rejected": -17.936893463134766, "logps/chosen": -442.04278564453125, "logps/rejected": -395.531494140625, "loss": 0.8512, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.0645036697387695, "rewards/margins": -0.20902717113494873, "rewards/rejected": 1.2735308408737183, "step": 5930 }, { "epoch": 0.2757788198152189, "grad_norm": 117.38388061523438, "learning_rate": 2.8347555596824364e-07, "logits/chosen": -18.14739990234375, "logits/rejected": -17.459796905517578, "logps/chosen": -256.3905334472656, "logps/rejected": -238.35205078125, "loss": 0.6726, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9296575784683228, "rewards/margins": 0.1269720494747162, "rewards/rejected": 0.802685558795929, "step": 5940 }, { "epoch": 0.27624309392265195, "grad_norm": 63.408111572265625, "learning_rate": 2.8344769952179763e-07, "logits/chosen": -17.421825408935547, "logits/rejected": -16.96817970275879, "logps/chosen": -428.635009765625, "logps/rejected": -367.59228515625, "loss": 0.6812, "rewards/accuracies": 0.5, "rewards/chosen": 1.138279676437378, "rewards/margins": 0.12373474985361099, 
"rewards/rejected": 1.0145447254180908, "step": 5950 }, { "epoch": 0.27670736803008494, "grad_norm": 72.04630279541016, "learning_rate": 2.8341984307535167e-07, "logits/chosen": -17.961322784423828, "logits/rejected": -17.3222599029541, "logps/chosen": -515.8750610351562, "logps/rejected": -400.58984375, "loss": 0.6541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5779104232788086, "rewards/margins": 0.2208854705095291, "rewards/rejected": 1.357024908065796, "step": 5960 }, { "epoch": 0.277171642137518, "grad_norm": 25.547151565551758, "learning_rate": 2.833919866289057e-07, "logits/chosen": -18.268047332763672, "logits/rejected": -17.257549285888672, "logps/chosen": -471.73065185546875, "logps/rejected": -313.69610595703125, "loss": 0.5228, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6250641345977783, "rewards/margins": 0.47199225425720215, "rewards/rejected": 1.1530717611312866, "step": 5970 }, { "epoch": 0.27763591624495104, "grad_norm": 48.07146072387695, "learning_rate": 2.833641301824597e-07, "logits/chosen": -18.63670539855957, "logits/rejected": -18.508987426757812, "logps/chosen": -365.6355895996094, "logps/rejected": -425.7191467285156, "loss": 0.6285, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.13224196434021, "rewards/margins": 0.16568435728549957, "rewards/rejected": 0.9665576815605164, "step": 5980 }, { "epoch": 0.27810019035238404, "grad_norm": 24.51225471496582, "learning_rate": 2.8333627373601374e-07, "logits/chosen": -18.776432037353516, "logits/rejected": -18.342411041259766, "logps/chosen": -500.8999938964844, "logps/rejected": -435.83819580078125, "loss": 0.7372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.28928804397583, "rewards/margins": 0.04133499786257744, "rewards/rejected": 1.247953176498413, "step": 5990 }, { "epoch": 0.2785644644598171, "grad_norm": 61.45894241333008, "learning_rate": 2.8330841728956773e-07, "logits/chosen": -18.622760772705078, "logits/rejected": 
-17.072677612304688, "logps/chosen": -474.80047607421875, "logps/rejected": -226.7122344970703, "loss": 0.4417, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.609180212020874, "rewards/margins": 0.6366636157035828, "rewards/rejected": 0.9725168347358704, "step": 6000 }, { "epoch": 0.2790287385672501, "grad_norm": 76.98654174804688, "learning_rate": 2.8328056084312177e-07, "logits/chosen": -17.935550689697266, "logits/rejected": -16.30316734313965, "logps/chosen": -351.7173767089844, "logps/rejected": -193.454345703125, "loss": 0.4756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1161466836929321, "rewards/margins": 0.6226522326469421, "rewards/rejected": 0.49349457025527954, "step": 6010 }, { "epoch": 0.27949301267468313, "grad_norm": 103.42042541503906, "learning_rate": 2.832527043966758e-07, "logits/chosen": -17.475793838500977, "logits/rejected": -16.934551239013672, "logps/chosen": -433.910400390625, "logps/rejected": -394.2396545410156, "loss": 0.6564, "rewards/accuracies": 0.5, "rewards/chosen": 1.3244984149932861, "rewards/margins": 0.158858984708786, "rewards/rejected": 1.1656395196914673, "step": 6020 }, { "epoch": 0.2799572867821162, "grad_norm": 183.05471801757812, "learning_rate": 2.832248479502298e-07, "logits/chosen": -19.286684036254883, "logits/rejected": -17.80526351928711, "logps/chosen": -353.97576904296875, "logps/rejected": -301.635009765625, "loss": 0.6445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9968827366828918, "rewards/margins": 0.19566211104393005, "rewards/rejected": 0.8012205958366394, "step": 6030 }, { "epoch": 0.28042156088954917, "grad_norm": 62.768333435058594, "learning_rate": 2.8319699150378384e-07, "logits/chosen": -18.548450469970703, "logits/rejected": -18.120738983154297, "logps/chosen": -403.75177001953125, "logps/rejected": -364.31048583984375, "loss": 0.7058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.973127007484436, "rewards/margins": 
0.011012649163603783, "rewards/rejected": 0.9621143341064453, "step": 6040 }, { "epoch": 0.2808858349969822, "grad_norm": 119.8240737915039, "learning_rate": 2.8316913505733783e-07, "logits/chosen": -18.029865264892578, "logits/rejected": -17.54848289489746, "logps/chosen": -443.3814392089844, "logps/rejected": -366.99267578125, "loss": 0.7026, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6384010314941406, "rewards/margins": 0.15922769904136658, "rewards/rejected": 1.4791734218597412, "step": 6050 }, { "epoch": 0.28135010910441527, "grad_norm": 137.02662658691406, "learning_rate": 2.831412786108918e-07, "logits/chosen": -18.641077041625977, "logits/rejected": -17.775320053100586, "logps/chosen": -488.67889404296875, "logps/rejected": -352.40679931640625, "loss": 0.5257, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5539904832839966, "rewards/margins": 0.5440573692321777, "rewards/rejected": 1.0099331140518188, "step": 6060 }, { "epoch": 0.28181438321184826, "grad_norm": 146.76548767089844, "learning_rate": 2.8311342216444586e-07, "logits/chosen": -18.804691314697266, "logits/rejected": -18.364044189453125, "logps/chosen": -421.884765625, "logps/rejected": -371.81317138671875, "loss": 0.7033, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2863714694976807, "rewards/margins": 0.09038345515727997, "rewards/rejected": 1.1959879398345947, "step": 6070 }, { "epoch": 0.2822786573192813, "grad_norm": 46.79974365234375, "learning_rate": 2.830855657179999e-07, "logits/chosen": -18.540761947631836, "logits/rejected": -17.814796447753906, "logps/chosen": -432.44134521484375, "logps/rejected": -430.5287170410156, "loss": 0.7728, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4875731468200684, "rewards/margins": -0.07463376969099045, "rewards/rejected": 1.5622069835662842, "step": 6080 }, { "epoch": 0.2827429314267143, "grad_norm": 74.26770782470703, "learning_rate": 2.8305770927155394e-07, "logits/chosen": 
-19.036590576171875, "logits/rejected": -17.74308204650879, "logps/chosen": -583.1907958984375, "logps/rejected": -411.49603271484375, "loss": 0.5886, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8251116275787354, "rewards/margins": 0.4260888695716858, "rewards/rejected": 1.3990226984024048, "step": 6090 }, { "epoch": 0.28320720553414735, "grad_norm": 115.00211334228516, "learning_rate": 2.8302985282510793e-07, "logits/chosen": -17.30453872680664, "logits/rejected": -16.822668075561523, "logps/chosen": -329.105712890625, "logps/rejected": -303.49346923828125, "loss": 0.6107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.177074670791626, "rewards/margins": 0.27725619077682495, "rewards/rejected": 0.8998183012008667, "step": 6100 }, { "epoch": 0.2836714796415804, "grad_norm": 67.98278045654297, "learning_rate": 2.8300199637866197e-07, "logits/chosen": -17.274633407592773, "logits/rejected": -17.32047462463379, "logps/chosen": -320.60870361328125, "logps/rejected": -267.09149169921875, "loss": 0.7431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9338949918746948, "rewards/margins": -0.009626142680644989, "rewards/rejected": 0.9435211420059204, "step": 6110 }, { "epoch": 0.2841357537490134, "grad_norm": 89.03011322021484, "learning_rate": 2.8297413993221596e-07, "logits/chosen": -18.863788604736328, "logits/rejected": -17.253007888793945, "logps/chosen": -376.0623474121094, "logps/rejected": -304.656005859375, "loss": 0.6072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.584093689918518, "rewards/margins": 0.4219602644443512, "rewards/rejected": 1.1621334552764893, "step": 6120 }, { "epoch": 0.28460002785644645, "grad_norm": 26.72378158569336, "learning_rate": 2.8294628348577e-07, "logits/chosen": -17.55353355407715, "logits/rejected": -16.203392028808594, "logps/chosen": -494.48388671875, "logps/rejected": -276.81781005859375, "loss": 0.5055, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
1.6937475204467773, "rewards/margins": 0.5217657089233398, "rewards/rejected": 1.171981930732727, "step": 6130 }, { "epoch": 0.2850643019638795, "grad_norm": 51.63068389892578, "learning_rate": 2.82918427039324e-07, "logits/chosen": -18.560985565185547, "logits/rejected": -18.067218780517578, "logps/chosen": -475.61102294921875, "logps/rejected": -430.01763916015625, "loss": 0.6396, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5694725513458252, "rewards/margins": 0.1596061736345291, "rewards/rejected": 1.409866452217102, "step": 6140 }, { "epoch": 0.2855285760713125, "grad_norm": 82.06026458740234, "learning_rate": 2.8289057059287803e-07, "logits/chosen": -18.61037254333496, "logits/rejected": -17.82164764404297, "logps/chosen": -468.47735595703125, "logps/rejected": -360.1468505859375, "loss": 0.6181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5359247922897339, "rewards/margins": 0.2517154812812805, "rewards/rejected": 1.2842094898223877, "step": 6150 }, { "epoch": 0.28599285017874554, "grad_norm": 62.923099517822266, "learning_rate": 2.8286271414643207e-07, "logits/chosen": -19.402099609375, "logits/rejected": -16.786083221435547, "logps/chosen": -502.7772521972656, "logps/rejected": -243.9799346923828, "loss": 0.4589, "rewards/accuracies": 1.0, "rewards/chosen": 1.694968581199646, "rewards/margins": 0.7065141797065735, "rewards/rejected": 0.9884544610977173, "step": 6160 }, { "epoch": 0.28645712428617853, "grad_norm": 59.27924346923828, "learning_rate": 2.8283485769998606e-07, "logits/chosen": -17.080177307128906, "logits/rejected": -17.549680709838867, "logps/chosen": -370.97271728515625, "logps/rejected": -372.0350646972656, "loss": 0.7503, "rewards/accuracies": 0.5, "rewards/chosen": 1.1414381265640259, "rewards/margins": -0.006283545400947332, "rewards/rejected": 1.147721529006958, "step": 6170 }, { "epoch": 0.2869213983936116, "grad_norm": 50.003150939941406, "learning_rate": 2.8280700125354005e-07, "logits/chosen": 
-18.16727066040039, "logits/rejected": -17.933242797851562, "logps/chosen": -372.8255920410156, "logps/rejected": -409.9263610839844, "loss": 0.6847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2645918130874634, "rewards/margins": 0.08485645055770874, "rewards/rejected": 1.1797354221343994, "step": 6180 }, { "epoch": 0.28738567250104463, "grad_norm": 69.32410430908203, "learning_rate": 2.827791448070941e-07, "logits/chosen": -17.88956642150879, "logits/rejected": -17.340051651000977, "logps/chosen": -429.12237548828125, "logps/rejected": -326.9961242675781, "loss": 0.6196, "rewards/accuracies": 0.5, "rewards/chosen": 1.3674525022506714, "rewards/margins": 0.3282436728477478, "rewards/rejected": 1.039209008216858, "step": 6190 }, { "epoch": 0.2878499466084776, "grad_norm": 137.34503173828125, "learning_rate": 2.8275128836064813e-07, "logits/chosen": -18.78687286376953, "logits/rejected": -18.24026107788086, "logps/chosen": -350.4273986816406, "logps/rejected": -262.16986083984375, "loss": 0.6989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.228506326675415, "rewards/margins": 0.1230059266090393, "rewards/rejected": 1.1055004596710205, "step": 6200 }, { "epoch": 0.2883142207159107, "grad_norm": 118.94216918945312, "learning_rate": 2.8272343191420217e-07, "logits/chosen": -17.17477798461914, "logits/rejected": -16.805044174194336, "logps/chosen": -395.12176513671875, "logps/rejected": -339.94012451171875, "loss": 0.5596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3302277326583862, "rewards/margins": 0.3540373742580414, "rewards/rejected": 0.976190447807312, "step": 6210 }, { "epoch": 0.2887784948233437, "grad_norm": 44.63764190673828, "learning_rate": 2.8269557546775616e-07, "logits/chosen": -18.544694900512695, "logits/rejected": -17.504150390625, "logps/chosen": -417.3500061035156, "logps/rejected": -296.192626953125, "loss": 0.5154, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.493332028388977, 
"rewards/margins": 0.467967689037323, "rewards/rejected": 1.0253641605377197, "step": 6220 }, { "epoch": 0.2892427689307767, "grad_norm": 130.9678192138672, "learning_rate": 2.8266771902131014e-07, "logits/chosen": -19.99203109741211, "logits/rejected": -19.248186111450195, "logps/chosen": -462.48468017578125, "logps/rejected": -377.44256591796875, "loss": 0.6476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4001561403274536, "rewards/margins": 0.16986477375030518, "rewards/rejected": 1.230291485786438, "step": 6230 }, { "epoch": 0.28970704303820977, "grad_norm": 69.01789093017578, "learning_rate": 2.826398625748642e-07, "logits/chosen": -18.532215118408203, "logits/rejected": -17.823850631713867, "logps/chosen": -447.27301025390625, "logps/rejected": -404.43048095703125, "loss": 0.7053, "rewards/accuracies": 0.5, "rewards/chosen": 1.4483015537261963, "rewards/margins": 0.024968111887574196, "rewards/rejected": 1.4233334064483643, "step": 6240 }, { "epoch": 0.29017131714564276, "grad_norm": 41.483367919921875, "learning_rate": 2.826120061284182e-07, "logits/chosen": -18.86798667907715, "logits/rejected": -17.445777893066406, "logps/chosen": -530.5051879882812, "logps/rejected": -327.4862365722656, "loss": 0.5072, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5819834470748901, "rewards/margins": 0.49047842621803284, "rewards/rejected": 1.0915049314498901, "step": 6250 }, { "epoch": 0.2906355912530758, "grad_norm": 92.78592681884766, "learning_rate": 2.825841496819722e-07, "logits/chosen": -19.505619049072266, "logits/rejected": -18.030986785888672, "logps/chosen": -424.40625, "logps/rejected": -324.09613037109375, "loss": 0.5261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4252722263336182, "rewards/margins": 0.46682968735694885, "rewards/rejected": 0.9584425687789917, "step": 6260 }, { "epoch": 0.29109986536050886, "grad_norm": 58.446556091308594, "learning_rate": 2.8255629323552626e-07, "logits/chosen": 
-18.585485458374023, "logits/rejected": -18.224620819091797, "logps/chosen": -452.4828186035156, "logps/rejected": -334.24371337890625, "loss": 0.6001, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.500612497329712, "rewards/margins": 0.28857311606407166, "rewards/rejected": 1.2120394706726074, "step": 6270 }, { "epoch": 0.29156413946794185, "grad_norm": 101.11536407470703, "learning_rate": 2.825284367890803e-07, "logits/chosen": -18.618335723876953, "logits/rejected": -17.462039947509766, "logps/chosen": -428.39288330078125, "logps/rejected": -336.6691589355469, "loss": 0.7224, "rewards/accuracies": 0.5, "rewards/chosen": 1.423764705657959, "rewards/margins": 0.08443666994571686, "rewards/rejected": 1.3393280506134033, "step": 6280 }, { "epoch": 0.2920284135753749, "grad_norm": 111.57018280029297, "learning_rate": 2.825005803426343e-07, "logits/chosen": -17.585599899291992, "logits/rejected": -16.84695053100586, "logps/chosen": -476.17889404296875, "logps/rejected": -373.00244140625, "loss": 0.5649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.335265874862671, "rewards/margins": 0.343305379152298, "rewards/rejected": 0.9919605255126953, "step": 6290 }, { "epoch": 0.29249268768280795, "grad_norm": 91.83248138427734, "learning_rate": 2.8247272389618827e-07, "logits/chosen": -19.215547561645508, "logits/rejected": -18.014968872070312, "logps/chosen": -429.798583984375, "logps/rejected": -287.77203369140625, "loss": 0.5692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4850707054138184, "rewards/margins": 0.38084790110588074, "rewards/rejected": 1.1042226552963257, "step": 6300 }, { "epoch": 0.29295696179024094, "grad_norm": 53.73744583129883, "learning_rate": 2.824448674497423e-07, "logits/chosen": -18.46267318725586, "logits/rejected": -17.24526596069336, "logps/chosen": -439.4100646972656, "logps/rejected": -255.94485473632812, "loss": 0.5069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
1.2796475887298584, "rewards/margins": 0.5367183089256287, "rewards/rejected": 0.7429292798042297, "step": 6310 }, { "epoch": 0.293421235897674, "grad_norm": 108.6399917602539, "learning_rate": 2.8241701100329635e-07, "logits/chosen": -18.003551483154297, "logits/rejected": -17.655630111694336, "logps/chosen": -511.2579040527344, "logps/rejected": -412.0205078125, "loss": 0.6124, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5798183679580688, "rewards/margins": 0.2903912663459778, "rewards/rejected": 1.2894270420074463, "step": 6320 }, { "epoch": 0.29388551000510704, "grad_norm": 73.51225280761719, "learning_rate": 2.8238915455685034e-07, "logits/chosen": -18.548866271972656, "logits/rejected": -17.324928283691406, "logps/chosen": -456.2220764160156, "logps/rejected": -265.81170654296875, "loss": 0.5008, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.567233681678772, "rewards/margins": 0.5786924362182617, "rewards/rejected": 0.9885414242744446, "step": 6330 }, { "epoch": 0.29434978411254004, "grad_norm": 70.2595443725586, "learning_rate": 2.823612981104044e-07, "logits/chosen": -19.06447982788086, "logits/rejected": -18.362293243408203, "logps/chosen": -470.2616271972656, "logps/rejected": -382.6977233886719, "loss": 0.5509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5257689952850342, "rewards/margins": 0.5156899094581604, "rewards/rejected": 1.0100791454315186, "step": 6340 }, { "epoch": 0.2948140582199731, "grad_norm": 122.36858367919922, "learning_rate": 2.8233344166395837e-07, "logits/chosen": -18.380327224731445, "logits/rejected": -17.206884384155273, "logps/chosen": -450.4966735839844, "logps/rejected": -322.4947204589844, "loss": 0.5193, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5410102605819702, "rewards/margins": 0.4672970771789551, "rewards/rejected": 1.0737131834030151, "step": 6350 }, { "epoch": 0.2952783323274061, "grad_norm": 129.09808349609375, "learning_rate": 2.823055852175124e-07, 
"logits/chosen": -18.288965225219727, "logits/rejected": -17.30660629272461, "logps/chosen": -388.5520324707031, "logps/rejected": -310.36553955078125, "loss": 0.5984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2197234630584717, "rewards/margins": 0.3069465756416321, "rewards/rejected": 0.9127769470214844, "step": 6360 }, { "epoch": 0.29574260643483913, "grad_norm": 109.0119400024414, "learning_rate": 2.822777287710664e-07, "logits/chosen": -18.040081024169922, "logits/rejected": -18.727169036865234, "logps/chosen": -461.17578125, "logps/rejected": -460.989990234375, "loss": 0.7295, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.2715356349945068, "rewards/margins": -0.03309275954961777, "rewards/rejected": 1.3046284914016724, "step": 6370 }, { "epoch": 0.2962068805422722, "grad_norm": 25.091819763183594, "learning_rate": 2.8224987232462044e-07, "logits/chosen": -18.185970306396484, "logits/rejected": -16.358501434326172, "logps/chosen": -502.3348083496094, "logps/rejected": -272.4356384277344, "loss": 0.4433, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0520167350769043, "rewards/margins": 0.8284175992012024, "rewards/rejected": 1.2235994338989258, "step": 6380 }, { "epoch": 0.29667115464970517, "grad_norm": 99.55687713623047, "learning_rate": 2.822220158781745e-07, "logits/chosen": -19.431934356689453, "logits/rejected": -18.433216094970703, "logps/chosen": -556.4983520507812, "logps/rejected": -418.8359375, "loss": 0.4902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0930094718933105, "rewards/margins": 0.644646167755127, "rewards/rejected": 1.4483630657196045, "step": 6390 }, { "epoch": 0.2971354287571382, "grad_norm": 109.49957275390625, "learning_rate": 2.8219415943172847e-07, "logits/chosen": -18.32586097717285, "logits/rejected": -17.250566482543945, "logps/chosen": -417.5484313964844, "logps/rejected": -326.86468505859375, "loss": 0.6053, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 1.47306489944458, "rewards/margins": 0.315652072429657, "rewards/rejected": 1.1574128866195679, "step": 6400 }, { "epoch": 0.29759970286457127, "grad_norm": 56.043521881103516, "learning_rate": 2.821663029852825e-07, "logits/chosen": -18.395689010620117, "logits/rejected": -18.286104202270508, "logps/chosen": -374.7069091796875, "logps/rejected": -379.99688720703125, "loss": 0.7416, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.2220662832260132, "rewards/margins": 0.03307078033685684, "rewards/rejected": 1.1889954805374146, "step": 6410 }, { "epoch": 0.29806397697200426, "grad_norm": 60.407737731933594, "learning_rate": 2.821384465388365e-07, "logits/chosen": -17.66402816772461, "logits/rejected": -17.07553482055664, "logps/chosen": -375.3789978027344, "logps/rejected": -316.27838134765625, "loss": 0.5493, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6158912181854248, "rewards/margins": 0.4323519766330719, "rewards/rejected": 1.1835391521453857, "step": 6420 }, { "epoch": 0.2985282510794373, "grad_norm": 50.344818115234375, "learning_rate": 2.8211059009239054e-07, "logits/chosen": -18.3256893157959, "logits/rejected": -17.45761489868164, "logps/chosen": -397.7962951660156, "logps/rejected": -291.5664367675781, "loss": 0.5572, "rewards/accuracies": 0.5, "rewards/chosen": 1.5297950506210327, "rewards/margins": 0.48333436250686646, "rewards/rejected": 1.046460747718811, "step": 6430 }, { "epoch": 0.2989925251868703, "grad_norm": 42.91642379760742, "learning_rate": 2.8208273364594453e-07, "logits/chosen": -18.09988784790039, "logits/rejected": -18.25196647644043, "logps/chosen": -384.46893310546875, "logps/rejected": -401.0810241699219, "loss": 0.7039, "rewards/accuracies": 0.5, "rewards/chosen": 1.4050171375274658, "rewards/margins": 0.04554818943142891, "rewards/rejected": 1.3594688177108765, "step": 6440 }, { "epoch": 0.29945679929430336, "grad_norm": 125.60847473144531, "learning_rate": 2.8205487719949857e-07, 
"logits/chosen": -17.905109405517578, "logits/rejected": -17.771556854248047, "logps/chosen": -377.4832458496094, "logps/rejected": -338.3680114746094, "loss": 0.6105, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4158743619918823, "rewards/margins": 0.24602875113487244, "rewards/rejected": 1.169845700263977, "step": 6450 }, { "epoch": 0.2999210734017364, "grad_norm": 38.680728912353516, "learning_rate": 2.820270207530526e-07, "logits/chosen": -18.0065975189209, "logits/rejected": -16.99590492248535, "logps/chosen": -373.888427734375, "logps/rejected": -233.5717315673828, "loss": 0.4811, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6826832294464111, "rewards/margins": 0.7467484474182129, "rewards/rejected": 0.935934841632843, "step": 6460 }, { "epoch": 0.3003853475091694, "grad_norm": 64.48577880859375, "learning_rate": 2.819991643066066e-07, "logits/chosen": -18.490135192871094, "logits/rejected": -18.069259643554688, "logps/chosen": -308.54205322265625, "logps/rejected": -322.30999755859375, "loss": 0.7602, "rewards/accuracies": 0.5, "rewards/chosen": 1.3602468967437744, "rewards/margins": -0.0191363338381052, "rewards/rejected": 1.3793830871582031, "step": 6470 }, { "epoch": 0.30084962161660245, "grad_norm": 91.1697769165039, "learning_rate": 2.819713078601606e-07, "logits/chosen": -18.160694122314453, "logits/rejected": -17.24542808532715, "logps/chosen": -425.2110900878906, "logps/rejected": -287.70379638671875, "loss": 0.5479, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3635101318359375, "rewards/margins": 0.39809730648994446, "rewards/rejected": 0.9654127359390259, "step": 6480 }, { "epoch": 0.3013138957240355, "grad_norm": 146.2998809814453, "learning_rate": 2.8194345141371463e-07, "logits/chosen": -18.427507400512695, "logits/rejected": -17.16115951538086, "logps/chosen": -469.259521484375, "logps/rejected": -325.6911315917969, "loss": 0.6221, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
1.7381865978240967, "rewards/margins": 0.4330078959465027, "rewards/rejected": 1.3051787614822388, "step": 6490 }, { "epoch": 0.3017781698314685, "grad_norm": 85.85530853271484, "learning_rate": 2.8191559496726867e-07, "logits/chosen": -17.89688491821289, "logits/rejected": -17.579843521118164, "logps/chosen": -447.1761169433594, "logps/rejected": -318.3563232421875, "loss": 0.6629, "rewards/accuracies": 0.5, "rewards/chosen": 1.1919182538986206, "rewards/margins": 0.139937624335289, "rewards/rejected": 1.05198073387146, "step": 6500 }, { "epoch": 0.30224244393890154, "grad_norm": 143.8192901611328, "learning_rate": 2.818877385208227e-07, "logits/chosen": -18.457897186279297, "logits/rejected": -18.310039520263672, "logps/chosen": -500.47503662109375, "logps/rejected": -482.7120666503906, "loss": 0.707, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9293113946914673, "rewards/margins": 0.12991440296173096, "rewards/rejected": 1.7993968725204468, "step": 6510 }, { "epoch": 0.30270671804633453, "grad_norm": 56.77265930175781, "learning_rate": 2.818598820743767e-07, "logits/chosen": -17.535404205322266, "logits/rejected": -17.074249267578125, "logps/chosen": -398.1362609863281, "logps/rejected": -348.6992492675781, "loss": 0.7183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5433324575424194, "rewards/margins": 0.4506151080131531, "rewards/rejected": 1.0927174091339111, "step": 6520 }, { "epoch": 0.3031709921537676, "grad_norm": 88.76654052734375, "learning_rate": 2.8183202562793074e-07, "logits/chosen": -18.035079956054688, "logits/rejected": -17.888072967529297, "logps/chosen": -463.99273681640625, "logps/rejected": -349.57562255859375, "loss": 0.5689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7861030101776123, "rewards/margins": 0.4719111919403076, "rewards/rejected": 1.3141918182373047, "step": 6530 }, { "epoch": 0.30363526626120063, "grad_norm": 18.968961715698242, "learning_rate": 2.8180416918148473e-07, 
"logits/chosen": -18.488300323486328, "logits/rejected": -16.78458595275879, "logps/chosen": -443.03680419921875, "logps/rejected": -315.15521240234375, "loss": 0.498, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6143019199371338, "rewards/margins": 0.5525808930397034, "rewards/rejected": 1.0617210865020752, "step": 6540 }, { "epoch": 0.3040995403686336, "grad_norm": 76.80751037597656, "learning_rate": 2.8177631273503877e-07, "logits/chosen": -17.962366104125977, "logits/rejected": -17.148452758789062, "logps/chosen": -416.58941650390625, "logps/rejected": -333.3707275390625, "loss": 0.5968, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.268651008605957, "rewards/margins": 0.27581357955932617, "rewards/rejected": 0.9928374290466309, "step": 6550 }, { "epoch": 0.3045638144760667, "grad_norm": 73.5513687133789, "learning_rate": 2.8174845628859276e-07, "logits/chosen": -17.06705665588379, "logits/rejected": -16.742450714111328, "logps/chosen": -337.60284423828125, "logps/rejected": -322.6252746582031, "loss": 0.7415, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3849458694458008, "rewards/margins": 0.08571286499500275, "rewards/rejected": 1.299232840538025, "step": 6560 }, { "epoch": 0.3050280885834997, "grad_norm": 49.62143325805664, "learning_rate": 2.817205998421468e-07, "logits/chosen": -17.040491104125977, "logits/rejected": -16.689472198486328, "logps/chosen": -411.87481689453125, "logps/rejected": -366.6802062988281, "loss": 0.7805, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.3449465036392212, "rewards/margins": 0.07157131284475327, "rewards/rejected": 1.2733752727508545, "step": 6570 }, { "epoch": 0.3054923626909327, "grad_norm": 93.186279296875, "learning_rate": 2.8169274339570084e-07, "logits/chosen": -18.118022918701172, "logits/rejected": -17.45657730102539, "logps/chosen": -493.8006286621094, "logps/rejected": -363.58282470703125, "loss": 0.5271, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 1.6259729862213135, "rewards/margins": 0.4241371154785156, "rewards/rejected": 1.2018356323242188, "step": 6580 }, { "epoch": 0.30595663679836577, "grad_norm": 47.24249267578125, "learning_rate": 2.8166488694925483e-07, "logits/chosen": -18.508243560791016, "logits/rejected": -18.08218765258789, "logps/chosen": -360.0394287109375, "logps/rejected": -331.64495849609375, "loss": 0.739, "rewards/accuracies": 0.5, "rewards/chosen": 1.1753575801849365, "rewards/margins": -0.0827508419752121, "rewards/rejected": 1.258108377456665, "step": 6590 }, { "epoch": 0.30642091090579876, "grad_norm": 107.99543762207031, "learning_rate": 2.816370305028088e-07, "logits/chosen": -18.127643585205078, "logits/rejected": -17.638521194458008, "logps/chosen": -404.1790466308594, "logps/rejected": -301.44366455078125, "loss": 0.4914, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7666727304458618, "rewards/margins": 0.6050260066986084, "rewards/rejected": 1.1616467237472534, "step": 6600 }, { "epoch": 0.3068851850132318, "grad_norm": 35.324432373046875, "learning_rate": 2.8160917405636286e-07, "logits/chosen": -18.748149871826172, "logits/rejected": -17.75318717956543, "logps/chosen": -364.2628173828125, "logps/rejected": -276.2629089355469, "loss": 0.6007, "rewards/accuracies": 0.5, "rewards/chosen": 1.3070074319839478, "rewards/margins": 0.27829068899154663, "rewards/rejected": 1.028716802597046, "step": 6610 }, { "epoch": 0.30734945912066486, "grad_norm": 152.48382568359375, "learning_rate": 2.815813176099169e-07, "logits/chosen": -18.38277244567871, "logits/rejected": -18.160425186157227, "logps/chosen": -370.100830078125, "logps/rejected": -349.63885498046875, "loss": 0.6997, "rewards/accuracies": 0.5, "rewards/chosen": 1.0753172636032104, "rewards/margins": 0.06828060001134872, "rewards/rejected": 1.007036805152893, "step": 6620 }, { "epoch": 0.30781373322809785, "grad_norm": 21.321514129638672, "learning_rate": 2.815534611634709e-07, "logits/chosen": 
-18.8270320892334, "logits/rejected": -17.335651397705078, "logps/chosen": -558.6619873046875, "logps/rejected": -364.73687744140625, "loss": 0.3879, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8932971954345703, "rewards/margins": 0.9299437403678894, "rewards/rejected": 0.9633535146713257, "step": 6630 }, { "epoch": 0.3082780073355309, "grad_norm": 26.66524314880371, "learning_rate": 2.815256047170249e-07, "logits/chosen": -18.013259887695312, "logits/rejected": -18.321142196655273, "logps/chosen": -321.890869140625, "logps/rejected": -350.5838623046875, "loss": 0.6691, "rewards/accuracies": 0.5, "rewards/chosen": 1.0831702947616577, "rewards/margins": 0.12315788120031357, "rewards/rejected": 0.9600124359130859, "step": 6640 }, { "epoch": 0.30874228144296395, "grad_norm": 76.88909149169922, "learning_rate": 2.814977482705789e-07, "logits/chosen": -18.64292335510254, "logits/rejected": -18.061237335205078, "logps/chosen": -472.91314697265625, "logps/rejected": -385.52734375, "loss": 0.5603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3628723621368408, "rewards/margins": 0.3841468393802643, "rewards/rejected": 0.9787254333496094, "step": 6650 }, { "epoch": 0.30920655555039694, "grad_norm": 112.46466827392578, "learning_rate": 2.8146989182413296e-07, "logits/chosen": -17.938491821289062, "logits/rejected": -18.173114776611328, "logps/chosen": -331.68743896484375, "logps/rejected": -345.90704345703125, "loss": 0.9414, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.2522732019424438, "rewards/margins": -0.2915376126766205, "rewards/rejected": 1.5438108444213867, "step": 6660 }, { "epoch": 0.30967082965783, "grad_norm": 116.09253692626953, "learning_rate": 2.8144203537768694e-07, "logits/chosen": -19.124479293823242, "logits/rejected": -18.374134063720703, "logps/chosen": -461.5088806152344, "logps/rejected": -316.69207763671875, "loss": 0.532, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6069176197052002, 
"rewards/margins": 0.5118556618690491, "rewards/rejected": 1.0950617790222168, "step": 6670 }, { "epoch": 0.310135103765263, "grad_norm": 59.62317657470703, "learning_rate": 2.81414178931241e-07, "logits/chosen": -18.41995620727539, "logits/rejected": -18.081493377685547, "logps/chosen": -380.1661071777344, "logps/rejected": -273.17132568359375, "loss": 0.6554, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.2909971475601196, "rewards/margins": 0.11437149345874786, "rewards/rejected": 1.1766256093978882, "step": 6680 }, { "epoch": 0.31059937787269604, "grad_norm": 77.33695220947266, "learning_rate": 2.81386322484795e-07, "logits/chosen": -18.250564575195312, "logits/rejected": -17.021394729614258, "logps/chosen": -480.4158630371094, "logps/rejected": -311.26287841796875, "loss": 0.5602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3930468559265137, "rewards/margins": 0.40283042192459106, "rewards/rejected": 0.9902163743972778, "step": 6690 }, { "epoch": 0.3110636519801291, "grad_norm": 47.98262405395508, "learning_rate": 2.8135846603834907e-07, "logits/chosen": -17.85920524597168, "logits/rejected": -16.534944534301758, "logps/chosen": -491.18170166015625, "logps/rejected": -292.2484130859375, "loss": 0.5018, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7942798137664795, "rewards/margins": 0.7100499272346497, "rewards/rejected": 1.084229826927185, "step": 6700 }, { "epoch": 0.3115279260875621, "grad_norm": 188.03109741210938, "learning_rate": 2.8133060959190305e-07, "logits/chosen": -18.059228897094727, "logits/rejected": -17.78693199157715, "logps/chosen": -459.23760986328125, "logps/rejected": -382.53021240234375, "loss": 0.6054, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.025832176208496, "rewards/margins": 0.2965324819087982, "rewards/rejected": 1.729299783706665, "step": 6710 }, { "epoch": 0.31199220019499513, "grad_norm": 24.55229949951172, "learning_rate": 2.8130275314545704e-07, "logits/chosen": 
-18.37141227722168, "logits/rejected": -18.526737213134766, "logps/chosen": -343.1701354980469, "logps/rejected": -347.18072509765625, "loss": 0.8769, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.1716612577438354, "rewards/margins": -0.16895544528961182, "rewards/rejected": 1.3406168222427368, "step": 6720 }, { "epoch": 0.3124564743024282, "grad_norm": 65.84721374511719, "learning_rate": 2.812748966990111e-07, "logits/chosen": -17.884689331054688, "logits/rejected": -17.925622940063477, "logps/chosen": -459.9960021972656, "logps/rejected": -467.6410217285156, "loss": 0.6799, "rewards/accuracies": 0.5, "rewards/chosen": 1.5333025455474854, "rewards/margins": 0.14885130524635315, "rewards/rejected": 1.3844512701034546, "step": 6730 }, { "epoch": 0.31292074840986117, "grad_norm": 153.4997100830078, "learning_rate": 2.812470402525651e-07, "logits/chosen": -17.76352310180664, "logits/rejected": -17.324626922607422, "logps/chosen": -335.35540771484375, "logps/rejected": -334.90484619140625, "loss": 0.8633, "rewards/accuracies": 0.5, "rewards/chosen": 1.1774958372116089, "rewards/margins": -0.14180955290794373, "rewards/rejected": 1.319305419921875, "step": 6740 }, { "epoch": 0.3133850225172942, "grad_norm": 44.345943450927734, "learning_rate": 2.812191838061191e-07, "logits/chosen": -18.442136764526367, "logits/rejected": -17.712915420532227, "logps/chosen": -275.7879333496094, "logps/rejected": -218.2089080810547, "loss": 0.5815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.133555293083191, "rewards/margins": 0.2554624378681183, "rewards/rejected": 0.8780929446220398, "step": 6750 }, { "epoch": 0.3138492966247272, "grad_norm": 35.23334884643555, "learning_rate": 2.8119132735967315e-07, "logits/chosen": -18.541885375976562, "logits/rejected": -18.408573150634766, "logps/chosen": -453.5835876464844, "logps/rejected": -416.99102783203125, "loss": 0.7021, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7003355026245117, 
"rewards/margins": 0.0976884588599205, "rewards/rejected": 1.602647066116333, "step": 6760 }, { "epoch": 0.31431357073216026, "grad_norm": 63.83586502075195, "learning_rate": 2.8116347091322714e-07, "logits/chosen": -19.394662857055664, "logits/rejected": -19.174118041992188, "logps/chosen": -434.18438720703125, "logps/rejected": -400.8966979980469, "loss": 0.6829, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.429903268814087, "rewards/margins": 0.09234434366226196, "rewards/rejected": 1.3375589847564697, "step": 6770 }, { "epoch": 0.3147778448395933, "grad_norm": 76.29812622070312, "learning_rate": 2.811356144667812e-07, "logits/chosen": -19.73082733154297, "logits/rejected": -18.397445678710938, "logps/chosen": -439.5008850097656, "logps/rejected": -268.5002746582031, "loss": 0.4895, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6709321737289429, "rewards/margins": 0.577869176864624, "rewards/rejected": 1.0930629968643188, "step": 6780 }, { "epoch": 0.3152421189470263, "grad_norm": 27.4822940826416, "learning_rate": 2.8110775802033517e-07, "logits/chosen": -18.100099563598633, "logits/rejected": -17.057098388671875, "logps/chosen": -468.0010681152344, "logps/rejected": -313.4549560546875, "loss": 0.5263, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7884098291397095, "rewards/margins": 0.5824893116950989, "rewards/rejected": 1.205920696258545, "step": 6790 }, { "epoch": 0.31570639305445936, "grad_norm": 70.33642578125, "learning_rate": 2.810799015738892e-07, "logits/chosen": -18.785015106201172, "logits/rejected": -18.489465713500977, "logps/chosen": -501.2879943847656, "logps/rejected": -490.99627685546875, "loss": 0.6601, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9496183395385742, "rewards/margins": 0.17771203815937042, "rewards/rejected": 1.7719064950942993, "step": 6800 }, { "epoch": 0.3161706671618924, "grad_norm": 43.197784423828125, "learning_rate": 2.8105204512744325e-07, "logits/chosen": 
-18.648761749267578, "logits/rejected": -18.706256866455078, "logps/chosen": -439.80072021484375, "logps/rejected": -348.316650390625, "loss": 0.4786, "rewards/accuracies": 1.0, "rewards/chosen": 1.7538106441497803, "rewards/margins": 0.5730448961257935, "rewards/rejected": 1.1807657480239868, "step": 6810 }, { "epoch": 0.3166349412693254, "grad_norm": 60.896419525146484, "learning_rate": 2.8102418868099724e-07, "logits/chosen": -17.563182830810547, "logits/rejected": -17.28200912475586, "logps/chosen": -593.21728515625, "logps/rejected": -421.8974609375, "loss": 0.5856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.889600157737732, "rewards/margins": 0.5475308299064636, "rewards/rejected": 1.342069387435913, "step": 6820 }, { "epoch": 0.31709921537675845, "grad_norm": 11.568193435668945, "learning_rate": 2.809963322345513e-07, "logits/chosen": -18.813190460205078, "logits/rejected": -17.80489730834961, "logps/chosen": -340.48992919921875, "logps/rejected": -257.06768798828125, "loss": 0.5021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.282505989074707, "rewards/margins": 0.5350127816200256, "rewards/rejected": 0.7474932074546814, "step": 6830 }, { "epoch": 0.31756348948419144, "grad_norm": 70.51226806640625, "learning_rate": 2.8096847578810527e-07, "logits/chosen": -19.267131805419922, "logits/rejected": -18.92329216003418, "logps/chosen": -389.77862548828125, "logps/rejected": -394.56982421875, "loss": 0.8312, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.4445805549621582, "rewards/margins": -0.2157914936542511, "rewards/rejected": 1.6603721380233765, "step": 6840 }, { "epoch": 0.3180277635916245, "grad_norm": 67.24008178710938, "learning_rate": 2.809406193416593e-07, "logits/chosen": -18.1120548248291, "logits/rejected": -17.556121826171875, "logps/chosen": -379.216552734375, "logps/rejected": -337.84661865234375, "loss": 0.6446, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.300068736076355, 
"rewards/margins": 0.18975140154361725, "rewards/rejected": 1.1103174686431885, "step": 6850 }, { "epoch": 0.31849203769905754, "grad_norm": 200.96524047851562, "learning_rate": 2.809127628952133e-07, "logits/chosen": -19.783157348632812, "logits/rejected": -18.506145477294922, "logps/chosen": -458.1900329589844, "logps/rejected": -294.65362548828125, "loss": 0.6784, "rewards/accuracies": 0.5, "rewards/chosen": 1.3314646482467651, "rewards/margins": 0.29903221130371094, "rewards/rejected": 1.0324325561523438, "step": 6860 }, { "epoch": 0.31895631180649053, "grad_norm": 104.68376922607422, "learning_rate": 2.8088490644876734e-07, "logits/chosen": -19.867504119873047, "logits/rejected": -19.57015609741211, "logps/chosen": -487.3409729003906, "logps/rejected": -479.03668212890625, "loss": 0.6527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0932774543762207, "rewards/margins": 0.13850708305835724, "rewards/rejected": 1.9547703266143799, "step": 6870 }, { "epoch": 0.3194205859139236, "grad_norm": 92.66474151611328, "learning_rate": 2.808570500023214e-07, "logits/chosen": -18.025251388549805, "logits/rejected": -18.49704360961914, "logps/chosen": -344.63665771484375, "logps/rejected": -341.73895263671875, "loss": 0.7477, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.1796082258224487, "rewards/margins": -0.04466930031776428, "rewards/rejected": 1.2242774963378906, "step": 6880 }, { "epoch": 0.31988486002135663, "grad_norm": 76.6246109008789, "learning_rate": 2.8082919355587537e-07, "logits/chosen": -18.671144485473633, "logits/rejected": -18.111730575561523, "logps/chosen": -485.99884033203125, "logps/rejected": -402.06060791015625, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 1.4136466979980469, "rewards/margins": 0.09575339406728745, "rewards/rejected": 1.3178932666778564, "step": 6890 }, { "epoch": 0.3203491341287896, "grad_norm": 53.54124450683594, "learning_rate": 2.8080133710942936e-07, "logits/chosen": 
-17.496057510375977, "logits/rejected": -17.120756149291992, "logps/chosen": -289.29541015625, "logps/rejected": -232.9736328125, "loss": 0.5776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1957231760025024, "rewards/margins": 0.3338671922683716, "rewards/rejected": 0.8618558645248413, "step": 6900 }, { "epoch": 0.3208134082362227, "grad_norm": 50.52534866333008, "learning_rate": 2.807734806629834e-07, "logits/chosen": -18.23827362060547, "logits/rejected": -17.19830894470215, "logps/chosen": -394.6719665527344, "logps/rejected": -206.61911010742188, "loss": 0.3615, "rewards/accuracies": 1.0, "rewards/chosen": 1.6783697605133057, "rewards/margins": 0.9664250612258911, "rewards/rejected": 0.7119446992874146, "step": 6910 }, { "epoch": 0.32127768234365567, "grad_norm": 259.3757629394531, "learning_rate": 2.8074562421653744e-07, "logits/chosen": -17.323650360107422, "logits/rejected": -18.19501495361328, "logps/chosen": -395.16015625, "logps/rejected": -520.1090087890625, "loss": 1.1885, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.2442225217819214, "rewards/margins": -0.6863669157028198, "rewards/rejected": 1.9305893182754517, "step": 6920 }, { "epoch": 0.3217419564510887, "grad_norm": 129.6094207763672, "learning_rate": 2.807177677700915e-07, "logits/chosen": -17.592823028564453, "logits/rejected": -17.1661434173584, "logps/chosen": -366.8015441894531, "logps/rejected": -343.7069091796875, "loss": 0.6434, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5432007312774658, "rewards/margins": 0.2170935422182083, "rewards/rejected": 1.3261072635650635, "step": 6930 }, { "epoch": 0.32220623055852177, "grad_norm": 18.27103042602539, "learning_rate": 2.8068991132364547e-07, "logits/chosen": -18.558361053466797, "logits/rejected": -17.718738555908203, "logps/chosen": -474.66375732421875, "logps/rejected": -403.8751220703125, "loss": 0.6264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.631842851638794, 
"rewards/margins": 0.3539004921913147, "rewards/rejected": 1.277942419052124, "step": 6940 }, { "epoch": 0.32267050466595476, "grad_norm": 170.20700073242188, "learning_rate": 2.806620548771995e-07, "logits/chosen": -18.27337646484375, "logits/rejected": -18.581449508666992, "logps/chosen": -486.47845458984375, "logps/rejected": -527.5504150390625, "loss": 0.8108, "rewards/accuracies": 0.5, "rewards/chosen": 1.7841627597808838, "rewards/margins": -0.09967061132192612, "rewards/rejected": 1.8838332891464233, "step": 6950 }, { "epoch": 0.3231347787733878, "grad_norm": 245.69357299804688, "learning_rate": 2.806341984307535e-07, "logits/chosen": -18.61670684814453, "logits/rejected": -17.16604232788086, "logps/chosen": -525.7271728515625, "logps/rejected": -345.21661376953125, "loss": 0.4775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9543211460113525, "rewards/margins": 0.8402231335639954, "rewards/rejected": 1.114098072052002, "step": 6960 }, { "epoch": 0.32359905288082086, "grad_norm": 93.59210205078125, "learning_rate": 2.8060634198430754e-07, "logits/chosen": -17.88860511779785, "logits/rejected": -18.00082015991211, "logps/chosen": -371.815673828125, "logps/rejected": -274.77410888671875, "loss": 0.6442, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1540887355804443, "rewards/margins": 0.21076953411102295, "rewards/rejected": 0.9433192014694214, "step": 6970 }, { "epoch": 0.32406332698825385, "grad_norm": 83.9805679321289, "learning_rate": 2.805784855378615e-07, "logits/chosen": -18.08831024169922, "logits/rejected": -18.32344627380371, "logps/chosen": -425.2972717285156, "logps/rejected": -410.47894287109375, "loss": 0.6883, "rewards/accuracies": 0.5, "rewards/chosen": 1.4787300825119019, "rewards/margins": 0.11867067962884903, "rewards/rejected": 1.3600596189498901, "step": 6980 }, { "epoch": 0.3245276010956869, "grad_norm": 89.6276626586914, "learning_rate": 2.8055062909141557e-07, "logits/chosen": -19.012155532836914, 
"logits/rejected": -18.469955444335938, "logps/chosen": -423.4425354003906, "logps/rejected": -280.6160583496094, "loss": 0.5641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5443909168243408, "rewards/margins": 0.4766172766685486, "rewards/rejected": 1.0677735805511475, "step": 6990 }, { "epoch": 0.3249918752031199, "grad_norm": 146.36956787109375, "learning_rate": 2.805227726449696e-07, "logits/chosen": -17.824840545654297, "logits/rejected": -16.924636840820312, "logps/chosen": -511.1319274902344, "logps/rejected": -450.20086669921875, "loss": 0.6516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8054378032684326, "rewards/margins": 0.3274969458580017, "rewards/rejected": 1.4779407978057861, "step": 7000 }, { "epoch": 0.32545614931055294, "grad_norm": 160.03102111816406, "learning_rate": 2.804949161985236e-07, "logits/chosen": -18.49166488647461, "logits/rejected": -17.94329071044922, "logps/chosen": -377.96710205078125, "logps/rejected": -312.5229187011719, "loss": 0.6221, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5118191242218018, "rewards/margins": 0.21422795951366425, "rewards/rejected": 1.297591209411621, "step": 7010 }, { "epoch": 0.325920423417986, "grad_norm": 110.88301849365234, "learning_rate": 2.804670597520776e-07, "logits/chosen": -18.589235305786133, "logits/rejected": -17.560861587524414, "logps/chosen": -454.85040283203125, "logps/rejected": -311.08502197265625, "loss": 0.5296, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5156357288360596, "rewards/margins": 0.5036636590957642, "rewards/rejected": 1.011972188949585, "step": 7020 }, { "epoch": 0.326384697525419, "grad_norm": 60.73857116699219, "learning_rate": 2.804392033056316e-07, "logits/chosen": -17.907222747802734, "logits/rejected": -16.878721237182617, "logps/chosen": -348.4668273925781, "logps/rejected": -184.5360870361328, "loss": 0.525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4061920642852783, 
"rewards/margins": 0.6343857049942017, "rewards/rejected": 0.7718062996864319, "step": 7030 }, { "epoch": 0.32684897163285204, "grad_norm": 48.40562057495117, "learning_rate": 2.8041134685918567e-07, "logits/chosen": -18.559316635131836, "logits/rejected": -17.558486938476562, "logps/chosen": -484.3365173339844, "logps/rejected": -376.0651550292969, "loss": 0.6128, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7160364389419556, "rewards/margins": 0.5167444348335266, "rewards/rejected": 1.1992919445037842, "step": 7040 }, { "epoch": 0.3273132457402851, "grad_norm": 76.92587280273438, "learning_rate": 2.8038349041273965e-07, "logits/chosen": -18.225160598754883, "logits/rejected": -17.320690155029297, "logps/chosen": -443.77935791015625, "logps/rejected": -275.91900634765625, "loss": 0.5121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5473458766937256, "rewards/margins": 0.46725305914878845, "rewards/rejected": 1.0800927877426147, "step": 7050 }, { "epoch": 0.3277775198477181, "grad_norm": 49.07536697387695, "learning_rate": 2.803556339662937e-07, "logits/chosen": -19.272268295288086, "logits/rejected": -17.11189842224121, "logps/chosen": -443.2857360839844, "logps/rejected": -282.78131103515625, "loss": 0.425, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6503784656524658, "rewards/margins": 0.827567458152771, "rewards/rejected": 0.82281094789505, "step": 7060 }, { "epoch": 0.32824179395515113, "grad_norm": 23.040382385253906, "learning_rate": 2.803277775198477e-07, "logits/chosen": -19.234848022460938, "logits/rejected": -17.38795280456543, "logps/chosen": -556.3185424804688, "logps/rejected": -274.6829833984375, "loss": 0.4398, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9134063720703125, "rewards/margins": 0.7877304553985596, "rewards/rejected": 1.125675916671753, "step": 7070 }, { "epoch": 0.3287060680625841, "grad_norm": 59.18164825439453, "learning_rate": 2.802999210734017e-07, "logits/chosen": 
-18.318056106567383, "logits/rejected": -17.499755859375, "logps/chosen": -496.7470703125, "logps/rejected": -355.55682373046875, "loss": 0.5379, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7095394134521484, "rewards/margins": 0.4190872609615326, "rewards/rejected": 1.290452241897583, "step": 7080 }, { "epoch": 0.32917034217001717, "grad_norm": 35.5888671875, "learning_rate": 2.802720646269557e-07, "logits/chosen": -18.057470321655273, "logits/rejected": -16.7681827545166, "logps/chosen": -427.589599609375, "logps/rejected": -280.07574462890625, "loss": 0.5379, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4357521533966064, "rewards/margins": 0.5229074954986572, "rewards/rejected": 0.9128445386886597, "step": 7090 }, { "epoch": 0.3296346162774502, "grad_norm": 47.96428298950195, "learning_rate": 2.8024420818050975e-07, "logits/chosen": -17.683544158935547, "logits/rejected": -17.844341278076172, "logps/chosen": -220.23733520507812, "logps/rejected": -211.57852172851562, "loss": 0.6416, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8803306818008423, "rewards/margins": 0.14372453093528748, "rewards/rejected": 0.736606240272522, "step": 7100 }, { "epoch": 0.3300988903848832, "grad_norm": 116.56368255615234, "learning_rate": 2.802163517340638e-07, "logits/chosen": -18.205354690551758, "logits/rejected": -17.934249877929688, "logps/chosen": -346.7800598144531, "logps/rejected": -341.26806640625, "loss": 0.6507, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.542680025100708, "rewards/margins": 0.1747635304927826, "rewards/rejected": 1.367916464805603, "step": 7110 }, { "epoch": 0.33056316449231626, "grad_norm": 285.2057800292969, "learning_rate": 2.8018849528761784e-07, "logits/chosen": -18.159175872802734, "logits/rejected": -18.299028396606445, "logps/chosen": -307.30535888671875, "logps/rejected": -287.46441650390625, "loss": 0.702, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
1.3150060176849365, "rewards/margins": 0.1146450862288475, "rewards/rejected": 1.200360894203186, "step": 7120 }, { "epoch": 0.3310274385997493, "grad_norm": 47.99126434326172, "learning_rate": 2.801606388411718e-07, "logits/chosen": -18.408430099487305, "logits/rejected": -17.916460037231445, "logps/chosen": -412.2501525878906, "logps/rejected": -454.63348388671875, "loss": 0.7117, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6742639541625977, "rewards/margins": 0.15936417877674103, "rewards/rejected": 1.514899730682373, "step": 7130 }, { "epoch": 0.3314917127071823, "grad_norm": 89.01355743408203, "learning_rate": 2.801327823947258e-07, "logits/chosen": -18.131790161132812, "logits/rejected": -16.958818435668945, "logps/chosen": -375.52850341796875, "logps/rejected": -274.48052978515625, "loss": 0.6184, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8683176040649414, "rewards/margins": 0.3504473567008972, "rewards/rejected": 1.5178704261779785, "step": 7140 }, { "epoch": 0.33195598681461536, "grad_norm": 100.959716796875, "learning_rate": 2.8010492594827985e-07, "logits/chosen": -18.94738006591797, "logits/rejected": -18.639934539794922, "logps/chosen": -468.68511962890625, "logps/rejected": -385.4607849121094, "loss": 0.7621, "rewards/accuracies": 0.5, "rewards/chosen": 1.5921754837036133, "rewards/margins": -0.015793394297361374, "rewards/rejected": 1.6079689264297485, "step": 7150 }, { "epoch": 0.3324202609220484, "grad_norm": 137.566162109375, "learning_rate": 2.800770695018339e-07, "logits/chosen": -18.634578704833984, "logits/rejected": -18.237586975097656, "logps/chosen": -427.7691955566406, "logps/rejected": -365.31097412109375, "loss": 0.5099, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8022035360336304, "rewards/margins": 0.47633615136146545, "rewards/rejected": 1.3258672952651978, "step": 7160 }, { "epoch": 0.3328845350294814, "grad_norm": 52.11871337890625, "learning_rate": 2.800492130553879e-07, 
"logits/chosen": -19.272371292114258, "logits/rejected": -18.850749969482422, "logps/chosen": -321.9498291015625, "logps/rejected": -258.68731689453125, "loss": 0.6397, "rewards/accuracies": 0.5, "rewards/chosen": 1.2525551319122314, "rewards/margins": 0.20168261229991913, "rewards/rejected": 1.050872564315796, "step": 7170 }, { "epoch": 0.33334880913691445, "grad_norm": 32.8437385559082, "learning_rate": 2.800213566089419e-07, "logits/chosen": -18.3865909576416, "logits/rejected": -17.873445510864258, "logps/chosen": -403.06121826171875, "logps/rejected": -328.3410339355469, "loss": 0.583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3934369087219238, "rewards/margins": 0.3232802450656891, "rewards/rejected": 1.0701568126678467, "step": 7180 }, { "epoch": 0.33381308324434744, "grad_norm": 24.122915267944336, "learning_rate": 2.799935001624959e-07, "logits/chosen": -20.54042625427246, "logits/rejected": -18.71632957458496, "logps/chosen": -380.2112121582031, "logps/rejected": -248.3370819091797, "loss": 0.4996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6007112264633179, "rewards/margins": 0.6015897989273071, "rewards/rejected": 0.9991215467453003, "step": 7190 }, { "epoch": 0.3342773573517805, "grad_norm": 141.17819213867188, "learning_rate": 2.7996564371604995e-07, "logits/chosen": -18.353256225585938, "logits/rejected": -18.19085693359375, "logps/chosen": -466.65557861328125, "logps/rejected": -472.4537048339844, "loss": 0.804, "rewards/accuracies": 0.5, "rewards/chosen": 1.7395544052124023, "rewards/margins": -0.009082317352294922, "rewards/rejected": 1.7486368417739868, "step": 7200 }, { "epoch": 0.33474163145921354, "grad_norm": 174.9177703857422, "learning_rate": 2.7993778726960394e-07, "logits/chosen": -18.031383514404297, "logits/rejected": -17.280773162841797, "logps/chosen": -579.1685180664062, "logps/rejected": -427.32281494140625, "loss": 0.5873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
1.6790668964385986, "rewards/margins": 0.3315415680408478, "rewards/rejected": 1.3475251197814941, "step": 7210 }, { "epoch": 0.33520590556664653, "grad_norm": 59.506168365478516, "learning_rate": 2.79909930823158e-07, "logits/chosen": -18.073152542114258, "logits/rejected": -17.65085792541504, "logps/chosen": -321.7232971191406, "logps/rejected": -243.7949676513672, "loss": 0.7633, "rewards/accuracies": 0.5, "rewards/chosen": 1.0461302995681763, "rewards/margins": 0.06383343040943146, "rewards/rejected": 0.9822966456413269, "step": 7220 }, { "epoch": 0.3356701796740796, "grad_norm": 82.4965591430664, "learning_rate": 2.79882074376712e-07, "logits/chosen": -17.969263076782227, "logits/rejected": -17.65166473388672, "logps/chosen": -490.3548889160156, "logps/rejected": -404.1051330566406, "loss": 0.7196, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.020758628845215, "rewards/margins": 0.22904424369335175, "rewards/rejected": 1.7917144298553467, "step": 7230 }, { "epoch": 0.33613445378151263, "grad_norm": 179.67884826660156, "learning_rate": 2.79854217930266e-07, "logits/chosen": -18.515562057495117, "logits/rejected": -18.55472183227539, "logps/chosen": -435.16845703125, "logps/rejected": -457.1288146972656, "loss": 0.8274, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5204596519470215, "rewards/margins": 0.040426768362522125, "rewards/rejected": 1.4800328016281128, "step": 7240 }, { "epoch": 0.3365987278889456, "grad_norm": 48.5597038269043, "learning_rate": 2.7982636148382005e-07, "logits/chosen": -18.378032684326172, "logits/rejected": -17.009132385253906, "logps/chosen": -479.93072509765625, "logps/rejected": -300.244873046875, "loss": 0.4177, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7653567790985107, "rewards/margins": 0.7614725232124329, "rewards/rejected": 1.0038843154907227, "step": 7250 }, { "epoch": 0.3370630019963787, "grad_norm": 53.326927185058594, "learning_rate": 2.7979850503737404e-07, 
"logits/chosen": -17.952381134033203, "logits/rejected": -16.619247436523438, "logps/chosen": -391.87457275390625, "logps/rejected": -263.60369873046875, "loss": 0.5817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.320143461227417, "rewards/margins": 0.3998619318008423, "rewards/rejected": 0.9202815890312195, "step": 7260 }, { "epoch": 0.33752727610381167, "grad_norm": 47.65810775756836, "learning_rate": 2.797706485909281e-07, "logits/chosen": -18.020626068115234, "logits/rejected": -17.0617618560791, "logps/chosen": -456.02471923828125, "logps/rejected": -338.48956298828125, "loss": 0.6119, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6668018102645874, "rewards/margins": 0.369037002325058, "rewards/rejected": 1.297764778137207, "step": 7270 }, { "epoch": 0.3379915502112447, "grad_norm": 96.36617279052734, "learning_rate": 2.7974279214448207e-07, "logits/chosen": -18.246740341186523, "logits/rejected": -17.207393646240234, "logps/chosen": -464.47186279296875, "logps/rejected": -321.87664794921875, "loss": 0.4849, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.130047082901001, "rewards/margins": 0.7371367812156677, "rewards/rejected": 1.392910122871399, "step": 7280 }, { "epoch": 0.33845582431867777, "grad_norm": 50.024078369140625, "learning_rate": 2.797149356980361e-07, "logits/chosen": -17.678937911987305, "logits/rejected": -18.182907104492188, "logps/chosen": -256.17694091796875, "logps/rejected": -309.75494384765625, "loss": 0.821, "rewards/accuracies": 0.5, "rewards/chosen": 1.0904765129089355, "rewards/margins": -0.1203470453619957, "rewards/rejected": 1.210823655128479, "step": 7290 }, { "epoch": 0.33892009842611076, "grad_norm": 66.58306884765625, "learning_rate": 2.7968707925159015e-07, "logits/chosen": -19.473526000976562, "logits/rejected": -18.787540435791016, "logps/chosen": -484.2720642089844, "logps/rejected": -352.4587097167969, "loss": 0.5297, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
1.6281286478042603, "rewards/margins": 0.5873858332633972, "rewards/rejected": 1.0407428741455078, "step": 7300 }, { "epoch": 0.3393843725335438, "grad_norm": 71.42357635498047, "learning_rate": 2.7965922280514414e-07, "logits/chosen": -18.340030670166016, "logits/rejected": -17.91631507873535, "logps/chosen": -516.1771240234375, "logps/rejected": -368.5782775878906, "loss": 0.6866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6958200931549072, "rewards/margins": 0.16872993111610413, "rewards/rejected": 1.5270901918411255, "step": 7310 }, { "epoch": 0.33984864664097686, "grad_norm": 77.76092529296875, "learning_rate": 2.796313663586981e-07, "logits/chosen": -18.689468383789062, "logits/rejected": -18.550992965698242, "logps/chosen": -240.17367553710938, "logps/rejected": -254.63232421875, "loss": 0.77, "rewards/accuracies": 0.5, "rewards/chosen": 1.0441856384277344, "rewards/margins": -0.07231410592794418, "rewards/rejected": 1.116499662399292, "step": 7320 }, { "epoch": 0.34031292074840985, "grad_norm": 23.269624710083008, "learning_rate": 2.7960350991225217e-07, "logits/chosen": -18.630619049072266, "logits/rejected": -17.33681869506836, "logps/chosen": -610.2496337890625, "logps/rejected": -412.7857971191406, "loss": 0.4493, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9109508991241455, "rewards/margins": 0.7765964269638062, "rewards/rejected": 1.134354591369629, "step": 7330 }, { "epoch": 0.3407771948558429, "grad_norm": 284.6211242675781, "learning_rate": 2.795756534658062e-07, "logits/chosen": -18.16451072692871, "logits/rejected": -17.44550323486328, "logps/chosen": -490.0525817871094, "logps/rejected": -397.61004638671875, "loss": 0.7372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.47737717628479, "rewards/margins": 0.23414330184459686, "rewards/rejected": 1.2432339191436768, "step": 7340 }, { "epoch": 0.3412414689632759, "grad_norm": 50.66896057128906, "learning_rate": 2.7954779701936025e-07, 
"logits/chosen": -18.5948543548584, "logits/rejected": -17.938766479492188, "logps/chosen": -403.88153076171875, "logps/rejected": -329.822265625, "loss": 0.6072, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.365615725517273, "rewards/margins": 0.25280916690826416, "rewards/rejected": 1.1128064393997192, "step": 7350 }, { "epoch": 0.34170574307070894, "grad_norm": 38.706085205078125, "learning_rate": 2.7951994057291424e-07, "logits/chosen": -18.518857955932617, "logits/rejected": -17.596542358398438, "logps/chosen": -455.0003967285156, "logps/rejected": -281.2146911621094, "loss": 0.5134, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4337419271469116, "rewards/margins": 0.592411994934082, "rewards/rejected": 0.8413299322128296, "step": 7360 }, { "epoch": 0.342170017178142, "grad_norm": 33.726985931396484, "learning_rate": 2.794920841264683e-07, "logits/chosen": -18.69191551208496, "logits/rejected": -17.200180053710938, "logps/chosen": -440.6861877441406, "logps/rejected": -228.95968627929688, "loss": 0.4582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8502622842788696, "rewards/margins": 0.8186323046684265, "rewards/rejected": 1.0316296815872192, "step": 7370 }, { "epoch": 0.342634291285575, "grad_norm": 126.91024017333984, "learning_rate": 2.7946422768002227e-07, "logits/chosen": -17.695308685302734, "logits/rejected": -16.918716430664062, "logps/chosen": -462.060302734375, "logps/rejected": -318.45465087890625, "loss": 0.6433, "rewards/accuracies": 0.5, "rewards/chosen": 1.7678897380828857, "rewards/margins": 0.37361353635787964, "rewards/rejected": 1.3942762613296509, "step": 7380 }, { "epoch": 0.34309856539300804, "grad_norm": 22.7235164642334, "learning_rate": 2.794363712335763e-07, "logits/chosen": -18.532407760620117, "logits/rejected": -17.37083625793457, "logps/chosen": -491.9266052246094, "logps/rejected": -366.3209533691406, "loss": 0.547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
1.7136789560317993, "rewards/margins": 0.45910587906837463, "rewards/rejected": 1.254573106765747, "step": 7390 }, { "epoch": 0.3435628395004411, "grad_norm": 114.00438690185547, "learning_rate": 2.794085147871303e-07, "logits/chosen": -18.282941818237305, "logits/rejected": -18.1820125579834, "logps/chosen": -332.2216491699219, "logps/rejected": -365.63995361328125, "loss": 0.8105, "rewards/accuracies": 0.5, "rewards/chosen": 1.07915198802948, "rewards/margins": -0.09899941831827164, "rewards/rejected": 1.1781513690948486, "step": 7400 }, { "epoch": 0.3440271136078741, "grad_norm": 94.64289855957031, "learning_rate": 2.7938065834068434e-07, "logits/chosen": -17.75035285949707, "logits/rejected": -17.791015625, "logps/chosen": -389.468994140625, "logps/rejected": -414.40142822265625, "loss": 0.8209, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.261155366897583, "rewards/margins": -0.17185427248477936, "rewards/rejected": 1.4330095052719116, "step": 7410 }, { "epoch": 0.34449138771530713, "grad_norm": 59.28997802734375, "learning_rate": 2.793528018942384e-07, "logits/chosen": -18.198490142822266, "logits/rejected": -16.93327522277832, "logps/chosen": -437.9358825683594, "logps/rejected": -333.3762512207031, "loss": 0.6794, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6518503427505493, "rewards/margins": 0.3551749587059021, "rewards/rejected": 1.296675205230713, "step": 7420 }, { "epoch": 0.3449556618227401, "grad_norm": 108.329833984375, "learning_rate": 2.7932494544779237e-07, "logits/chosen": -19.01488494873047, "logits/rejected": -18.141616821289062, "logps/chosen": -536.9613037109375, "logps/rejected": -413.1268005371094, "loss": 0.4888, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8656189441680908, "rewards/margins": 0.5835222005844116, "rewards/rejected": 1.2820967435836792, "step": 7430 }, { "epoch": 0.34541993593017317, "grad_norm": 77.92210388183594, "learning_rate": 2.7929708900134635e-07, "logits/chosen": 
-17.082317352294922, "logits/rejected": -16.825117111206055, "logps/chosen": -422.174072265625, "logps/rejected": -393.3301696777344, "loss": 0.772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5141083002090454, "rewards/margins": 0.11605857312679291, "rewards/rejected": 1.3980497121810913, "step": 7440 }, { "epoch": 0.3458842100376062, "grad_norm": 15.936656951904297, "learning_rate": 2.792692325549004e-07, "logits/chosen": -19.078922271728516, "logits/rejected": -17.760255813598633, "logps/chosen": -419.32989501953125, "logps/rejected": -254.353271484375, "loss": 0.5196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7305132150650024, "rewards/margins": 0.6905150413513184, "rewards/rejected": 1.0399982929229736, "step": 7450 }, { "epoch": 0.3463484841450392, "grad_norm": 99.32466125488281, "learning_rate": 2.7924137610845444e-07, "logits/chosen": -18.207977294921875, "logits/rejected": -17.517175674438477, "logps/chosen": -345.8283996582031, "logps/rejected": -326.6045227050781, "loss": 0.604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4320571422576904, "rewards/margins": 0.21903328597545624, "rewards/rejected": 1.2130237817764282, "step": 7460 }, { "epoch": 0.34681275825247226, "grad_norm": 89.58547973632812, "learning_rate": 2.792135196620084e-07, "logits/chosen": -18.09238052368164, "logits/rejected": -17.299894332885742, "logps/chosen": -388.1407165527344, "logps/rejected": -287.958740234375, "loss": 0.5655, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.2917269468307495, "rewards/margins": 0.33245453238487244, "rewards/rejected": 0.9592725038528442, "step": 7470 }, { "epoch": 0.3472770323599053, "grad_norm": 107.97056579589844, "learning_rate": 2.7918566321556246e-07, "logits/chosen": -18.015491485595703, "logits/rejected": -18.04979705810547, "logps/chosen": -419.5716247558594, "logps/rejected": -374.10546875, "loss": 0.7898, "rewards/accuracies": 0.5, "rewards/chosen": 1.4258439540863037, 
"rewards/margins": 0.012500124983489513, "rewards/rejected": 1.413343906402588, "step": 7480 }, { "epoch": 0.3477413064673383, "grad_norm": 48.145328521728516, "learning_rate": 2.7915780676911645e-07, "logits/chosen": -18.358150482177734, "logits/rejected": -18.840087890625, "logps/chosen": -290.1719970703125, "logps/rejected": -303.9529724121094, "loss": 0.6732, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1506739854812622, "rewards/margins": 0.24167008697986603, "rewards/rejected": 0.9090039134025574, "step": 7490 }, { "epoch": 0.34820558057477136, "grad_norm": 101.35101318359375, "learning_rate": 2.791299503226705e-07, "logits/chosen": -17.98162841796875, "logits/rejected": -17.69950294494629, "logps/chosen": -508.7242126464844, "logps/rejected": -419.80780029296875, "loss": 0.6619, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5213103294372559, "rewards/margins": 0.0919618308544159, "rewards/rejected": 1.4293485879898071, "step": 7500 }, { "epoch": 0.34866985468220435, "grad_norm": 37.72317886352539, "learning_rate": 2.791020938762245e-07, "logits/chosen": -18.71195411682129, "logits/rejected": -18.12958335876465, "logps/chosen": -385.1598205566406, "logps/rejected": -376.00933837890625, "loss": 0.7464, "rewards/accuracies": 0.5, "rewards/chosen": 1.3765777349472046, "rewards/margins": -0.00926055945456028, "rewards/rejected": 1.385838270187378, "step": 7510 }, { "epoch": 0.3491341287896374, "grad_norm": 31.977052688598633, "learning_rate": 2.790742374297785e-07, "logits/chosen": -19.168487548828125, "logits/rejected": -17.58739471435547, "logps/chosen": -452.4588317871094, "logps/rejected": -298.68682861328125, "loss": 0.4838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8533462285995483, "rewards/margins": 0.669775128364563, "rewards/rejected": 1.183571219444275, "step": 7520 }, { "epoch": 0.34959840289707045, "grad_norm": 219.74435424804688, "learning_rate": 2.7904638098333256e-07, "logits/chosen": 
-19.18263816833496, "logits/rejected": -18.87028694152832, "logps/chosen": -505.4278869628906, "logps/rejected": -443.15338134765625, "loss": 0.7305, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6230520009994507, "rewards/margins": 0.14439931511878967, "rewards/rejected": 1.4786527156829834, "step": 7530 }, { "epoch": 0.35006267700450344, "grad_norm": 33.5095329284668, "learning_rate": 2.790185245368866e-07, "logits/chosen": -17.87835121154785, "logits/rejected": -17.2302188873291, "logps/chosen": -292.29742431640625, "logps/rejected": -263.2423400878906, "loss": 0.6782, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3351130485534668, "rewards/margins": 0.3314077854156494, "rewards/rejected": 1.003705382347107, "step": 7540 }, { "epoch": 0.3505269511119365, "grad_norm": 65.58287811279297, "learning_rate": 2.789906680904406e-07, "logits/chosen": -17.69675064086914, "logits/rejected": -17.377792358398438, "logps/chosen": -343.8879699707031, "logps/rejected": -249.9992218017578, "loss": 0.5679, "rewards/accuracies": 0.5, "rewards/chosen": 1.7869231700897217, "rewards/margins": 0.5712206959724426, "rewards/rejected": 1.2157024145126343, "step": 7550 }, { "epoch": 0.35099122521936954, "grad_norm": 123.70972442626953, "learning_rate": 2.789628116439946e-07, "logits/chosen": -18.132078170776367, "logits/rejected": -17.593168258666992, "logps/chosen": -402.9356994628906, "logps/rejected": -333.23114013671875, "loss": 0.6438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5387108325958252, "rewards/margins": 0.19739513099193573, "rewards/rejected": 1.341315507888794, "step": 7560 }, { "epoch": 0.35145549932680253, "grad_norm": 82.71231842041016, "learning_rate": 2.789349551975486e-07, "logits/chosen": -18.306743621826172, "logits/rejected": -17.68103790283203, "logps/chosen": -435.8028259277344, "logps/rejected": -372.0507507324219, "loss": 0.5325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.708074927330017, 
"rewards/margins": 0.45681333541870117, "rewards/rejected": 1.251261830329895, "step": 7570 }, { "epoch": 0.3519197734342356, "grad_norm": 90.4440689086914, "learning_rate": 2.7890709875110266e-07, "logits/chosen": -18.260719299316406, "logits/rejected": -17.686222076416016, "logps/chosen": -504.74835205078125, "logps/rejected": -397.22882080078125, "loss": 0.7043, "rewards/accuracies": 0.5, "rewards/chosen": 1.7792609930038452, "rewards/margins": 0.2310440093278885, "rewards/rejected": 1.5482171773910522, "step": 7580 }, { "epoch": 0.3523840475416686, "grad_norm": 175.4667205810547, "learning_rate": 2.7887924230465665e-07, "logits/chosen": -18.455434799194336, "logits/rejected": -17.425418853759766, "logps/chosen": -478.1778869628906, "logps/rejected": -420.4872131347656, "loss": 0.6024, "rewards/accuracies": 0.5, "rewards/chosen": 2.067391872406006, "rewards/margins": 0.4483807682991028, "rewards/rejected": 1.6190112829208374, "step": 7590 }, { "epoch": 0.3528483216491016, "grad_norm": 139.8739013671875, "learning_rate": 2.788513858582107e-07, "logits/chosen": -17.56696128845215, "logits/rejected": -17.218448638916016, "logps/chosen": -426.24530029296875, "logps/rejected": -377.6814880371094, "loss": 0.6301, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8925081491470337, "rewards/margins": 0.2877032160758972, "rewards/rejected": 1.6048047542572021, "step": 7600 }, { "epoch": 0.3533125957565347, "grad_norm": 88.40879821777344, "learning_rate": 2.788235294117647e-07, "logits/chosen": -18.544368743896484, "logits/rejected": -16.90390396118164, "logps/chosen": -497.2106018066406, "logps/rejected": -328.7745666503906, "loss": 0.5123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6663566827774048, "rewards/margins": 0.5738624334335327, "rewards/rejected": 1.092494249343872, "step": 7610 }, { "epoch": 0.35377686986396767, "grad_norm": 51.73884201049805, "learning_rate": 2.787956729653187e-07, "logits/chosen": -17.733043670654297, 
"logits/rejected": -16.741718292236328, "logps/chosen": -365.5931091308594, "logps/rejected": -269.05767822265625, "loss": 0.4889, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5358554124832153, "rewards/margins": 0.5836185216903687, "rewards/rejected": 0.9522369503974915, "step": 7620 }, { "epoch": 0.3542411439714007, "grad_norm": 190.63197326660156, "learning_rate": 2.787678165188727e-07, "logits/chosen": -17.950803756713867, "logits/rejected": -17.412649154663086, "logps/chosen": -377.8394775390625, "logps/rejected": -284.4771728515625, "loss": 0.6097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.005436658859253, "rewards/margins": 0.5817980170249939, "rewards/rejected": 1.4236387014389038, "step": 7630 }, { "epoch": 0.35470541807883377, "grad_norm": 45.8267822265625, "learning_rate": 2.7873996007242675e-07, "logits/chosen": -18.15277671813965, "logits/rejected": -17.482521057128906, "logps/chosen": -383.67706298828125, "logps/rejected": -321.8901062011719, "loss": 0.977, "rewards/accuracies": 0.5, "rewards/chosen": 1.542009711265564, "rewards/margins": -0.15621024370193481, "rewards/rejected": 1.698219895362854, "step": 7640 }, { "epoch": 0.35516969218626676, "grad_norm": 119.60762023925781, "learning_rate": 2.787121036259808e-07, "logits/chosen": -19.106706619262695, "logits/rejected": -18.859567642211914, "logps/chosen": -441.5118103027344, "logps/rejected": -364.4373474121094, "loss": 0.5899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6301565170288086, "rewards/margins": 0.2938545048236847, "rewards/rejected": 1.3363020420074463, "step": 7650 }, { "epoch": 0.3556339662936998, "grad_norm": 25.490468978881836, "learning_rate": 2.786842471795348e-07, "logits/chosen": -17.679189682006836, "logits/rejected": -16.924306869506836, "logps/chosen": -400.19451904296875, "logps/rejected": -361.510009765625, "loss": 0.6519, "rewards/accuracies": 0.5, "rewards/chosen": 1.674975037574768, "rewards/margins": 
0.15980315208435059, "rewards/rejected": 1.5151718854904175, "step": 7660 }, { "epoch": 0.3560982404011328, "grad_norm": 35.81780242919922, "learning_rate": 2.786563907330888e-07, "logits/chosen": -18.798358917236328, "logits/rejected": -16.759700775146484, "logps/chosen": -467.42352294921875, "logps/rejected": -248.5077362060547, "loss": 0.4417, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8032734394073486, "rewards/margins": 0.6932486295700073, "rewards/rejected": 1.1100248098373413, "step": 7670 }, { "epoch": 0.35656251450856585, "grad_norm": 40.064178466796875, "learning_rate": 2.786285342866428e-07, "logits/chosen": -19.437137603759766, "logits/rejected": -18.027793884277344, "logps/chosen": -570.6904296875, "logps/rejected": -403.1722412109375, "loss": 0.4665, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.2612483501434326, "rewards/margins": 0.655205249786377, "rewards/rejected": 1.6060432195663452, "step": 7680 }, { "epoch": 0.3570267886159989, "grad_norm": 140.47035217285156, "learning_rate": 2.7860067784019685e-07, "logits/chosen": -17.98012351989746, "logits/rejected": -17.894826889038086, "logps/chosen": -357.274658203125, "logps/rejected": -377.9755859375, "loss": 0.9467, "rewards/accuracies": 0.5, "rewards/chosen": 1.481264352798462, "rewards/margins": -0.2691879868507385, "rewards/rejected": 1.7504523992538452, "step": 7690 }, { "epoch": 0.3574910627234319, "grad_norm": 54.43815612792969, "learning_rate": 2.7857282139375084e-07, "logits/chosen": -18.97592544555664, "logits/rejected": -17.81205177307129, "logps/chosen": -434.7647399902344, "logps/rejected": -306.20819091796875, "loss": 0.5943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2734897136688232, "rewards/margins": 0.2925547659397125, "rewards/rejected": 0.9809349775314331, "step": 7700 }, { "epoch": 0.35795533683086495, "grad_norm": 109.69985961914062, "learning_rate": 2.785449649473049e-07, "logits/chosen": -18.970890045166016, 
"logits/rejected": -17.269990921020508, "logps/chosen": -503.1312561035156, "logps/rejected": -313.16070556640625, "loss": 0.5299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8871984481811523, "rewards/margins": 0.5664150714874268, "rewards/rejected": 1.3207833766937256, "step": 7710 }, { "epoch": 0.358419610938298, "grad_norm": 160.8581085205078, "learning_rate": 2.785171085008589e-07, "logits/chosen": -17.4078311920166, "logits/rejected": -16.938583374023438, "logps/chosen": -363.25213623046875, "logps/rejected": -373.80035400390625, "loss": 0.8199, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.3410742282867432, "rewards/margins": -0.14659187197685242, "rewards/rejected": 1.487666368484497, "step": 7720 }, { "epoch": 0.358883885045731, "grad_norm": 170.3343048095703, "learning_rate": 2.784892520544129e-07, "logits/chosen": -17.72509002685547, "logits/rejected": -17.718990325927734, "logps/chosen": -454.414306640625, "logps/rejected": -471.1851501464844, "loss": 0.7708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7411072254180908, "rewards/margins": 0.02078350819647312, "rewards/rejected": 1.7203235626220703, "step": 7730 }, { "epoch": 0.35934815915316404, "grad_norm": 54.958251953125, "learning_rate": 2.784613956079669e-07, "logits/chosen": -17.920873641967773, "logits/rejected": -17.660221099853516, "logps/chosen": -320.89508056640625, "logps/rejected": -282.0245056152344, "loss": 0.7549, "rewards/accuracies": 0.5, "rewards/chosen": 1.3149018287658691, "rewards/margins": -0.019554417580366135, "rewards/rejected": 1.334456205368042, "step": 7740 }, { "epoch": 0.35981243326059703, "grad_norm": 63.706546783447266, "learning_rate": 2.7843353916152094e-07, "logits/chosen": -17.803646087646484, "logits/rejected": -16.791414260864258, "logps/chosen": -442.79241943359375, "logps/rejected": -305.505859375, "loss": 0.5468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7768198251724243, "rewards/margins": 
0.5036585330963135, "rewards/rejected": 1.2731611728668213, "step": 7750 }, { "epoch": 0.3602767073680301, "grad_norm": 141.13926696777344, "learning_rate": 2.78405682715075e-07, "logits/chosen": -18.818614959716797, "logits/rejected": -18.449085235595703, "logps/chosen": -379.2069091796875, "logps/rejected": -403.09039306640625, "loss": 0.6681, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5678774118423462, "rewards/margins": 0.21677589416503906, "rewards/rejected": 1.3511016368865967, "step": 7760 }, { "epoch": 0.36074098147546313, "grad_norm": 58.15289306640625, "learning_rate": 2.78377826268629e-07, "logits/chosen": -17.526386260986328, "logits/rejected": -17.360931396484375, "logps/chosen": -319.09466552734375, "logps/rejected": -326.0968017578125, "loss": 0.6303, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2217843532562256, "rewards/margins": 0.1835257112979889, "rewards/rejected": 1.038258671760559, "step": 7770 }, { "epoch": 0.3612052555828961, "grad_norm": 75.70230102539062, "learning_rate": 2.78349969822183e-07, "logits/chosen": -18.91893768310547, "logits/rejected": -18.104713439941406, "logps/chosen": -481.4046936035156, "logps/rejected": -385.7168884277344, "loss": 0.6181, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8161529302597046, "rewards/margins": 0.36475670337677, "rewards/rejected": 1.4513962268829346, "step": 7780 }, { "epoch": 0.3616695296903292, "grad_norm": 127.98685455322266, "learning_rate": 2.7832211337573705e-07, "logits/chosen": -18.44475746154785, "logits/rejected": -18.10881233215332, "logps/chosen": -413.16180419921875, "logps/rejected": -405.4207763671875, "loss": 0.5769, "rewards/accuracies": 0.5, "rewards/chosen": 1.7537477016448975, "rewards/margins": 0.3971439301967621, "rewards/rejected": 1.3566038608551025, "step": 7790 }, { "epoch": 0.3621338037977622, "grad_norm": 18.368974685668945, "learning_rate": 2.7829425692929104e-07, "logits/chosen": -17.534303665161133, 
"logits/rejected": -17.155344009399414, "logps/chosen": -406.56182861328125, "logps/rejected": -357.6475830078125, "loss": 0.5741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8261706829071045, "rewards/margins": 0.5072072744369507, "rewards/rejected": 1.3189634084701538, "step": 7800 }, { "epoch": 0.3625980779051952, "grad_norm": 144.92764282226562, "learning_rate": 2.78266400482845e-07, "logits/chosen": -18.180313110351562, "logits/rejected": -18.152027130126953, "logps/chosen": -461.89715576171875, "logps/rejected": -548.322021484375, "loss": 0.8708, "rewards/accuracies": 0.5, "rewards/chosen": 1.9848105907440186, "rewards/margins": -0.22253835201263428, "rewards/rejected": 2.2073488235473633, "step": 7810 }, { "epoch": 0.36306235201262826, "grad_norm": 51.68321228027344, "learning_rate": 2.7823854403639906e-07, "logits/chosen": -19.067676544189453, "logits/rejected": -17.372465133666992, "logps/chosen": -403.8335266113281, "logps/rejected": -332.9347229003906, "loss": 0.4299, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9013242721557617, "rewards/margins": 0.7825475931167603, "rewards/rejected": 1.118776559829712, "step": 7820 }, { "epoch": 0.36352662612006126, "grad_norm": 26.55601692199707, "learning_rate": 2.782106875899531e-07, "logits/chosen": -17.736331939697266, "logits/rejected": -16.426198959350586, "logps/chosen": -392.29864501953125, "logps/rejected": -265.56231689453125, "loss": 0.5925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4438596963882446, "rewards/margins": 0.3878316283226013, "rewards/rejected": 1.0560280084609985, "step": 7830 }, { "epoch": 0.3639909002274943, "grad_norm": 44.22624969482422, "learning_rate": 2.7818283114350715e-07, "logits/chosen": -18.533029556274414, "logits/rejected": -17.47597312927246, "logps/chosen": -414.8041076660156, "logps/rejected": -315.6793518066406, "loss": 0.5676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.779748558998108, 
"rewards/margins": 0.45965051651000977, "rewards/rejected": 1.320097804069519, "step": 7840 }, { "epoch": 0.36445517433492736, "grad_norm": 75.10415649414062, "learning_rate": 2.7815497469706113e-07, "logits/chosen": -18.619792938232422, "logits/rejected": -17.819252014160156, "logps/chosen": -442.79248046875, "logps/rejected": -379.2632141113281, "loss": 0.7103, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7807753086090088, "rewards/margins": 0.16533584892749786, "rewards/rejected": 1.6154394149780273, "step": 7850 }, { "epoch": 0.36491944844236035, "grad_norm": 56.88018035888672, "learning_rate": 2.781271182506151e-07, "logits/chosen": -18.80430793762207, "logits/rejected": -17.359214782714844, "logps/chosen": -344.4886169433594, "logps/rejected": -243.90206909179688, "loss": 0.5118, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3582087755203247, "rewards/margins": 0.51071697473526, "rewards/rejected": 0.8474918603897095, "step": 7860 }, { "epoch": 0.3653837225497934, "grad_norm": 67.79288482666016, "learning_rate": 2.7809926180416916e-07, "logits/chosen": -18.54314613342285, "logits/rejected": -18.219730377197266, "logps/chosen": -374.01287841796875, "logps/rejected": -302.417724609375, "loss": 0.7001, "rewards/accuracies": 0.5, "rewards/chosen": 1.324648141860962, "rewards/margins": 0.056572578847408295, "rewards/rejected": 1.268075704574585, "step": 7870 }, { "epoch": 0.36584799665722645, "grad_norm": 114.32649993896484, "learning_rate": 2.780714053577232e-07, "logits/chosen": -18.161880493164062, "logits/rejected": -18.05451011657715, "logps/chosen": -461.8146057128906, "logps/rejected": -295.6021728515625, "loss": 0.5985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.660363793373108, "rewards/margins": 0.39300766587257385, "rewards/rejected": 1.267356038093567, "step": 7880 }, { "epoch": 0.36631227076465944, "grad_norm": 75.83302307128906, "learning_rate": 2.780435489112772e-07, "logits/chosen": 
-18.569293975830078, "logits/rejected": -17.40433692932129, "logps/chosen": -493.1631774902344, "logps/rejected": -294.78082275390625, "loss": 0.3987, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.0411489009857178, "rewards/margins": 0.896639347076416, "rewards/rejected": 1.1445096731185913, "step": 7890 }, { "epoch": 0.3667765448720925, "grad_norm": 70.23931121826172, "learning_rate": 2.7801569246483123e-07, "logits/chosen": -18.668745040893555, "logits/rejected": -17.356475830078125, "logps/chosen": -340.09161376953125, "logps/rejected": -278.16632080078125, "loss": 0.5579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4283009767532349, "rewards/margins": 0.43363848328590393, "rewards/rejected": 0.9946624636650085, "step": 7900 }, { "epoch": 0.3672408189795255, "grad_norm": 96.5959701538086, "learning_rate": 2.779878360183852e-07, "logits/chosen": -18.85895347595215, "logits/rejected": -18.138269424438477, "logps/chosen": -409.73468017578125, "logps/rejected": -355.1189270019531, "loss": 0.5092, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8051477670669556, "rewards/margins": 0.5193825960159302, "rewards/rejected": 1.2857650518417358, "step": 7910 }, { "epoch": 0.36770509308695853, "grad_norm": 118.61804962158203, "learning_rate": 2.7795997957193926e-07, "logits/chosen": -18.97921371459961, "logits/rejected": -17.851200103759766, "logps/chosen": -387.0513916015625, "logps/rejected": -309.39691162109375, "loss": 0.5745, "rewards/accuracies": 0.5, "rewards/chosen": 1.8115030527114868, "rewards/margins": 0.4856429100036621, "rewards/rejected": 1.3258601427078247, "step": 7920 }, { "epoch": 0.3681693671943916, "grad_norm": 32.80154800415039, "learning_rate": 2.7793212312549325e-07, "logits/chosen": -18.722692489624023, "logits/rejected": -18.296689987182617, "logps/chosen": -362.9092102050781, "logps/rejected": -358.6781311035156, "loss": 0.6304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
1.2375397682189941, "rewards/margins": 0.2669677138328552, "rewards/rejected": 0.9705721139907837, "step": 7930 }, { "epoch": 0.3686336413018246, "grad_norm": 111.51610565185547, "learning_rate": 2.779042666790473e-07, "logits/chosen": -18.61150360107422, "logits/rejected": -18.341838836669922, "logps/chosen": -428.28448486328125, "logps/rejected": -377.2044982910156, "loss": 0.6978, "rewards/accuracies": 0.5, "rewards/chosen": 1.590476632118225, "rewards/margins": 0.06372947990894318, "rewards/rejected": 1.5267473459243774, "step": 7940 }, { "epoch": 0.3690979154092576, "grad_norm": 83.05818176269531, "learning_rate": 2.7787641023260133e-07, "logits/chosen": -17.889860153198242, "logits/rejected": -17.145164489746094, "logps/chosen": -421.8893127441406, "logps/rejected": -313.18402099609375, "loss": 0.5353, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0807595252990723, "rewards/margins": 0.5894237756729126, "rewards/rejected": 1.4913359880447388, "step": 7950 }, { "epoch": 0.3695621895166907, "grad_norm": 95.72740936279297, "learning_rate": 2.778485537861554e-07, "logits/chosen": -17.704689025878906, "logits/rejected": -16.850425720214844, "logps/chosen": -352.85858154296875, "logps/rejected": -203.08157348632812, "loss": 0.4964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3824584484100342, "rewards/margins": 0.5742272138595581, "rewards/rejected": 0.8082312345504761, "step": 7960 }, { "epoch": 0.37002646362412367, "grad_norm": 76.55502319335938, "learning_rate": 2.7782069733970936e-07, "logits/chosen": -18.5167293548584, "logits/rejected": -17.0944766998291, "logps/chosen": -368.4093322753906, "logps/rejected": -212.5495147705078, "loss": 0.5334, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5387136936187744, "rewards/margins": 0.520837128162384, "rewards/rejected": 1.017876386642456, "step": 7970 }, { "epoch": 0.3704907377315567, "grad_norm": 76.44478607177734, "learning_rate": 2.7779284089326335e-07, 
"logits/chosen": -18.624309539794922, "logits/rejected": -17.93841552734375, "logps/chosen": -404.1372985839844, "logps/rejected": -364.5601806640625, "loss": 0.6686, "rewards/accuracies": 0.5, "rewards/chosen": 1.6454858779907227, "rewards/margins": 0.18940044939517975, "rewards/rejected": 1.456085443496704, "step": 7980 }, { "epoch": 0.3709550118389897, "grad_norm": 122.847412109375, "learning_rate": 2.777649844468174e-07, "logits/chosen": -18.364879608154297, "logits/rejected": -17.529804229736328, "logps/chosen": -423.9610290527344, "logps/rejected": -315.67218017578125, "loss": 0.6212, "rewards/accuracies": 0.5, "rewards/chosen": 1.8542976379394531, "rewards/margins": 0.42522215843200684, "rewards/rejected": 1.4290752410888672, "step": 7990 }, { "epoch": 0.37141928594642276, "grad_norm": 58.825172424316406, "learning_rate": 2.777371280003714e-07, "logits/chosen": -17.992570877075195, "logits/rejected": -16.483858108520508, "logps/chosen": -352.0699768066406, "logps/rejected": -220.1132049560547, "loss": 0.5014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4267513751983643, "rewards/margins": 0.6410213708877563, "rewards/rejected": 0.7857301235198975, "step": 8000 }, { "epoch": 0.3718835600538558, "grad_norm": 87.62298583984375, "learning_rate": 2.777092715539254e-07, "logits/chosen": -18.734256744384766, "logits/rejected": -18.304080963134766, "logps/chosen": -351.6702880859375, "logps/rejected": -267.1173400878906, "loss": 0.6043, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5628725290298462, "rewards/margins": 0.23957212269306183, "rewards/rejected": 1.3233003616333008, "step": 8010 }, { "epoch": 0.3723478341612888, "grad_norm": 198.6377410888672, "learning_rate": 2.7768141510747946e-07, "logits/chosen": -18.417926788330078, "logits/rejected": -17.4855899810791, "logps/chosen": -428.6575622558594, "logps/rejected": -330.6783752441406, "loss": 0.6826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
1.8873735666275024, "rewards/margins": 0.3314420282840729, "rewards/rejected": 1.555931806564331, "step": 8020 }, { "epoch": 0.37281210826872185, "grad_norm": 173.1042022705078, "learning_rate": 2.7765355866103345e-07, "logits/chosen": -19.60749053955078, "logits/rejected": -19.15178108215332, "logps/chosen": -430.6700134277344, "logps/rejected": -408.1106872558594, "loss": 0.5423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7851426601409912, "rewards/margins": 0.4255690574645996, "rewards/rejected": 1.3595738410949707, "step": 8030 }, { "epoch": 0.3732763823761549, "grad_norm": 50.57197189331055, "learning_rate": 2.776257022145875e-07, "logits/chosen": -19.347728729248047, "logits/rejected": -18.74161720275879, "logps/chosen": -532.794921875, "logps/rejected": -408.78887939453125, "loss": 0.6323, "rewards/accuracies": 0.5, "rewards/chosen": 2.150142192840576, "rewards/margins": 0.35598430037498474, "rewards/rejected": 1.7941579818725586, "step": 8040 }, { "epoch": 0.3737406564835879, "grad_norm": 88.3667984008789, "learning_rate": 2.775978457681415e-07, "logits/chosen": -19.384822845458984, "logits/rejected": -17.620790481567383, "logps/chosen": -389.4205017089844, "logps/rejected": -300.20928955078125, "loss": 0.5557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8194715976715088, "rewards/margins": 0.5396736860275269, "rewards/rejected": 1.279798150062561, "step": 8050 }, { "epoch": 0.37420493059102095, "grad_norm": 112.57505798339844, "learning_rate": 2.775699893216955e-07, "logits/chosen": -17.847003936767578, "logits/rejected": -17.012319564819336, "logps/chosen": -364.8690490722656, "logps/rejected": -224.5890655517578, "loss": 0.4863, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7362864017486572, "rewards/margins": 0.6636099815368652, "rewards/rejected": 1.072676420211792, "step": 8060 }, { "epoch": 0.374669204698454, "grad_norm": 101.33639526367188, "learning_rate": 2.7754213287524956e-07, "logits/chosen": 
-19.089237213134766, "logits/rejected": -18.023136138916016, "logps/chosen": -586.9718627929688, "logps/rejected": -437.1886291503906, "loss": 0.5103, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9755347967147827, "rewards/margins": 0.559700071811676, "rewards/rejected": 1.415834665298462, "step": 8070 }, { "epoch": 0.375133478805887, "grad_norm": 189.34922790527344, "learning_rate": 2.7751427642880355e-07, "logits/chosen": -17.41657066345215, "logits/rejected": -17.079967498779297, "logps/chosen": -363.12371826171875, "logps/rejected": -342.1452331542969, "loss": 0.7048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7267968654632568, "rewards/margins": 0.2369937002658844, "rewards/rejected": 1.4898035526275635, "step": 8080 }, { "epoch": 0.37559775291332004, "grad_norm": 130.64596557617188, "learning_rate": 2.774864199823576e-07, "logits/chosen": -17.629806518554688, "logits/rejected": -17.613521575927734, "logps/chosen": -396.8274230957031, "logps/rejected": -373.48907470703125, "loss": 0.7223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7889009714126587, "rewards/margins": 0.21284040808677673, "rewards/rejected": 1.5760605335235596, "step": 8090 }, { "epoch": 0.37606202702075303, "grad_norm": 58.54801940917969, "learning_rate": 2.774585635359116e-07, "logits/chosen": -17.184276580810547, "logits/rejected": -16.134647369384766, "logps/chosen": -276.97845458984375, "logps/rejected": -209.0432586669922, "loss": 0.5507, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5281697511672974, "rewards/margins": 0.6541174650192261, "rewards/rejected": 0.8740523457527161, "step": 8100 }, { "epoch": 0.3765263011281861, "grad_norm": 73.75045776367188, "learning_rate": 2.774307070894656e-07, "logits/chosen": -18.83572006225586, "logits/rejected": -17.09821128845215, "logps/chosen": -420.1578063964844, "logps/rejected": -274.198974609375, "loss": 0.4371, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
1.654531478881836, "rewards/margins": 0.7944265007972717, "rewards/rejected": 0.860105037689209, "step": 8110 }, { "epoch": 0.37699057523561913, "grad_norm": 66.61492156982422, "learning_rate": 2.774028506430196e-07, "logits/chosen": -18.49230194091797, "logits/rejected": -16.995311737060547, "logps/chosen": -497.7848205566406, "logps/rejected": -316.87957763671875, "loss": 0.4505, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7069259881973267, "rewards/margins": 0.6370813250541687, "rewards/rejected": 1.0698444843292236, "step": 8120 }, { "epoch": 0.3774548493430521, "grad_norm": 39.56071472167969, "learning_rate": 2.7737499419657365e-07, "logits/chosen": -18.376333236694336, "logits/rejected": -17.57716941833496, "logps/chosen": -412.5279846191406, "logps/rejected": -317.9140625, "loss": 0.6144, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.857377052307129, "rewards/margins": 0.38147541880607605, "rewards/rejected": 1.4759016036987305, "step": 8130 }, { "epoch": 0.3779191234504852, "grad_norm": 126.83989715576172, "learning_rate": 2.773471377501277e-07, "logits/chosen": -18.355083465576172, "logits/rejected": -19.035797119140625, "logps/chosen": -414.82666015625, "logps/rejected": -447.5631408691406, "loss": 0.8422, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.5146664381027222, "rewards/margins": -0.19596785306930542, "rewards/rejected": 1.710634469985962, "step": 8140 }, { "epoch": 0.3783833975579182, "grad_norm": 34.95094299316406, "learning_rate": 2.773192813036817e-07, "logits/chosen": -18.195213317871094, "logits/rejected": -17.656991958618164, "logps/chosen": -361.17877197265625, "logps/rejected": -276.8155212402344, "loss": 0.7215, "rewards/accuracies": 0.5, "rewards/chosen": 1.518131971359253, "rewards/margins": 0.23349833488464355, "rewards/rejected": 1.2846336364746094, "step": 8150 }, { "epoch": 0.3788476716653512, "grad_norm": 48.90740203857422, "learning_rate": 2.7729142485723567e-07, "logits/chosen": 
-18.97928237915039, "logits/rejected": -17.900806427001953, "logps/chosen": -518.870361328125, "logps/rejected": -331.72894287109375, "loss": 0.4401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0948171615600586, "rewards/margins": 0.8141806721687317, "rewards/rejected": 1.2806364297866821, "step": 8160 }, { "epoch": 0.37931194577278426, "grad_norm": 110.8256607055664, "learning_rate": 2.772635684107897e-07, "logits/chosen": -17.9208927154541, "logits/rejected": -17.512630462646484, "logps/chosen": -435.41180419921875, "logps/rejected": -338.84722900390625, "loss": 0.5125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2168045043945312, "rewards/margins": 0.6110354661941528, "rewards/rejected": 1.6057689189910889, "step": 8170 }, { "epoch": 0.37977621988021726, "grad_norm": 27.566198348999023, "learning_rate": 2.7723571196434375e-07, "logits/chosen": -18.316211700439453, "logits/rejected": -17.040752410888672, "logps/chosen": -542.36767578125, "logps/rejected": -306.2732849121094, "loss": 0.4403, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.10001540184021, "rewards/margins": 0.869475245475769, "rewards/rejected": 1.2305400371551514, "step": 8180 }, { "epoch": 0.3802404939876503, "grad_norm": 37.09906768798828, "learning_rate": 2.772078555178978e-07, "logits/chosen": -19.625690460205078, "logits/rejected": -18.453699111938477, "logps/chosen": -357.83807373046875, "logps/rejected": -233.2054901123047, "loss": 0.5095, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5711097717285156, "rewards/margins": 0.5562691688537598, "rewards/rejected": 1.0148406028747559, "step": 8190 }, { "epoch": 0.38070476809508336, "grad_norm": 195.77394104003906, "learning_rate": 2.771799990714518e-07, "logits/chosen": -17.81902503967285, "logits/rejected": -17.193836212158203, "logps/chosen": -496.4659729003906, "logps/rejected": -406.2695007324219, "loss": 0.6064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
1.780591368675232, "rewards/margins": 0.3088168501853943, "rewards/rejected": 1.471774697303772, "step": 8200 }, { "epoch": 0.38116904220251635, "grad_norm": 172.7240447998047, "learning_rate": 2.771521426250058e-07, "logits/chosen": -18.682369232177734, "logits/rejected": -18.39559555053711, "logps/chosen": -416.84637451171875, "logps/rejected": -334.8705139160156, "loss": 0.6634, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7239145040512085, "rewards/margins": 0.18061573803424835, "rewards/rejected": 1.5432987213134766, "step": 8210 }, { "epoch": 0.3816333163099494, "grad_norm": 238.91583251953125, "learning_rate": 2.771242861785598e-07, "logits/chosen": -18.202045440673828, "logits/rejected": -18.315902709960938, "logps/chosen": -418.8609313964844, "logps/rejected": -424.76544189453125, "loss": 0.8288, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5613679885864258, "rewards/margins": 0.0026144206058233976, "rewards/rejected": 1.5587536096572876, "step": 8220 }, { "epoch": 0.38209759041738245, "grad_norm": 86.13294219970703, "learning_rate": 2.770964297321138e-07, "logits/chosen": -17.957000732421875, "logits/rejected": -17.38083267211914, "logps/chosen": -441.08062744140625, "logps/rejected": -357.42352294921875, "loss": 0.5364, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.608149766921997, "rewards/margins": 0.3992193341255188, "rewards/rejected": 1.208930253982544, "step": 8230 }, { "epoch": 0.38256186452481544, "grad_norm": 29.083951950073242, "learning_rate": 2.7706857328566783e-07, "logits/chosen": -18.15059471130371, "logits/rejected": -16.373018264770508, "logps/chosen": -358.2613220214844, "logps/rejected": -197.11679077148438, "loss": 0.4303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5531904697418213, "rewards/margins": 0.7822600603103638, "rewards/rejected": 0.7709304690361023, "step": 8240 }, { "epoch": 0.3830261386322485, "grad_norm": 103.83134460449219, "learning_rate": 
2.770407168392219e-07, "logits/chosen": -17.907657623291016, "logits/rejected": -17.810161590576172, "logps/chosen": -372.17987060546875, "logps/rejected": -364.8680725097656, "loss": 0.6425, "rewards/accuracies": 0.5, "rewards/chosen": 1.6787296533584595, "rewards/margins": 0.19098356366157532, "rewards/rejected": 1.4877461194992065, "step": 8250 }, { "epoch": 0.3834904127396815, "grad_norm": 44.86310958862305, "learning_rate": 2.770128603927759e-07, "logits/chosen": -17.60790252685547, "logits/rejected": -17.911876678466797, "logps/chosen": -349.42510986328125, "logps/rejected": -388.28277587890625, "loss": 0.7654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5852832794189453, "rewards/margins": 0.11397626250982285, "rewards/rejected": 1.4713071584701538, "step": 8260 }, { "epoch": 0.38395468684711453, "grad_norm": 113.65021514892578, "learning_rate": 2.769850039463299e-07, "logits/chosen": -17.712339401245117, "logits/rejected": -17.42567253112793, "logps/chosen": -286.7957763671875, "logps/rejected": -282.0284729003906, "loss": 0.6963, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5353515148162842, "rewards/margins": 0.2610934376716614, "rewards/rejected": 1.2742582559585571, "step": 8270 }, { "epoch": 0.3844189609545476, "grad_norm": 60.00703811645508, "learning_rate": 2.769571474998839e-07, "logits/chosen": -18.022022247314453, "logits/rejected": -18.207393646240234, "logps/chosen": -424.132568359375, "logps/rejected": -480.81683349609375, "loss": 0.9668, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.5979597568511963, "rewards/margins": -0.36232662200927734, "rewards/rejected": 1.9602861404418945, "step": 8280 }, { "epoch": 0.3848832350619806, "grad_norm": 92.92088317871094, "learning_rate": 2.7692929105343793e-07, "logits/chosen": -17.89535903930664, "logits/rejected": -17.798784255981445, "logps/chosen": -411.7608947753906, "logps/rejected": -319.53399658203125, "loss": 0.608, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 1.8565056324005127, "rewards/margins": 0.3548924922943115, "rewards/rejected": 1.501612901687622, "step": 8290 }, { "epoch": 0.3853475091694136, "grad_norm": 37.44837951660156, "learning_rate": 2.76901434606992e-07, "logits/chosen": -18.578495025634766, "logits/rejected": -18.2799072265625, "logps/chosen": -474.99371337890625, "logps/rejected": -396.14739990234375, "loss": 0.738, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8250033855438232, "rewards/margins": 0.12727610766887665, "rewards/rejected": 1.6977274417877197, "step": 8300 }, { "epoch": 0.3858117832768467, "grad_norm": 15.122879981994629, "learning_rate": 2.768763638051906e-07, "logits/chosen": -17.726306915283203, "logits/rejected": -16.6313533782959, "logps/chosen": -391.36956787109375, "logps/rejected": -333.4660339355469, "loss": 0.6006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6286436319351196, "rewards/margins": 0.5351325273513794, "rewards/rejected": 1.0935109853744507, "step": 8310 }, { "epoch": 0.38627605738427967, "grad_norm": 31.063386917114258, "learning_rate": 2.768485073587446e-07, "logits/chosen": -18.260852813720703, "logits/rejected": -17.03647232055664, "logps/chosen": -426.0755920410156, "logps/rejected": -278.305908203125, "loss": 0.4264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9514262676239014, "rewards/margins": 0.8406173586845398, "rewards/rejected": 1.110809087753296, "step": 8320 }, { "epoch": 0.3867403314917127, "grad_norm": 98.32080078125, "learning_rate": 2.768206509122986e-07, "logits/chosen": -18.139427185058594, "logits/rejected": -17.906431198120117, "logps/chosen": -477.54046630859375, "logps/rejected": -398.7455139160156, "loss": 0.6389, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8840748071670532, "rewards/margins": 0.21725299954414368, "rewards/rejected": 1.666822075843811, "step": 8330 }, { "epoch": 0.3872046055991457, "grad_norm": 97.83011627197266, 
"learning_rate": 2.767927944658526e-07, "logits/chosen": -17.74966049194336, "logits/rejected": -17.538373947143555, "logps/chosen": -371.8831481933594, "logps/rejected": -332.8742370605469, "loss": 0.8098, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.3979885578155518, "rewards/margins": -0.0686236247420311, "rewards/rejected": 1.4666121006011963, "step": 8340 }, { "epoch": 0.38766887970657876, "grad_norm": 132.7923583984375, "learning_rate": 2.7676493801940665e-07, "logits/chosen": -18.45637321472168, "logits/rejected": -18.58644676208496, "logps/chosen": -416.88232421875, "logps/rejected": -430.2711486816406, "loss": 0.7665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8567079305648804, "rewards/margins": -0.02526226080954075, "rewards/rejected": 1.8819701671600342, "step": 8350 }, { "epoch": 0.3881331538140118, "grad_norm": 94.72901153564453, "learning_rate": 2.7673708157296064e-07, "logits/chosen": -18.4586124420166, "logits/rejected": -18.353307723999023, "logps/chosen": -424.3387145996094, "logps/rejected": -408.40625, "loss": 0.7971, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.475488305091858, "rewards/margins": -0.12364419549703598, "rewards/rejected": 1.5991325378417969, "step": 8360 }, { "epoch": 0.3885974279214448, "grad_norm": 98.64228820800781, "learning_rate": 2.767092251265147e-07, "logits/chosen": -17.530637741088867, "logits/rejected": -17.52112579345703, "logps/chosen": -305.729248046875, "logps/rejected": -284.1754455566406, "loss": 0.8682, "rewards/accuracies": 0.5, "rewards/chosen": 0.9032360315322876, "rewards/margins": -0.16804002225399017, "rewards/rejected": 1.071276068687439, "step": 8370 }, { "epoch": 0.38906170202887785, "grad_norm": 40.31504440307617, "learning_rate": 2.766813686800687e-07, "logits/chosen": -19.1435489654541, "logits/rejected": -19.481998443603516, "logps/chosen": -352.64459228515625, "logps/rejected": -449.0728454589844, "loss": 0.9691, "rewards/accuracies": 0.5, 
"rewards/chosen": 1.4138307571411133, "rewards/margins": -0.30790331959724426, "rewards/rejected": 1.7217340469360352, "step": 8380 }, { "epoch": 0.3895259761363109, "grad_norm": 138.5943145751953, "learning_rate": 2.766535122336227e-07, "logits/chosen": -18.723896026611328, "logits/rejected": -18.179393768310547, "logps/chosen": -386.9305419921875, "logps/rejected": -275.2694396972656, "loss": 0.6468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.342969298362732, "rewards/margins": 0.20444932579994202, "rewards/rejected": 1.1385201215744019, "step": 8390 }, { "epoch": 0.3899902502437439, "grad_norm": 55.690120697021484, "learning_rate": 2.766256557871767e-07, "logits/chosen": -18.542835235595703, "logits/rejected": -17.801050186157227, "logps/chosen": -394.91357421875, "logps/rejected": -302.1562194824219, "loss": 0.6801, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4796804189682007, "rewards/margins": 0.2591560482978821, "rewards/rejected": 1.2205243110656738, "step": 8400 }, { "epoch": 0.39045452435117695, "grad_norm": 70.61920928955078, "learning_rate": 2.7659779934073074e-07, "logits/chosen": -18.73837661743164, "logits/rejected": -17.910701751708984, "logps/chosen": -351.1997985839844, "logps/rejected": -266.86090087890625, "loss": 0.6012, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5656907558441162, "rewards/margins": 0.3558880388736725, "rewards/rejected": 1.2098028659820557, "step": 8410 }, { "epoch": 0.39091879845860994, "grad_norm": 31.119279861450195, "learning_rate": 2.765699428942848e-07, "logits/chosen": -17.49501609802246, "logits/rejected": -17.424144744873047, "logps/chosen": -279.2254333496094, "logps/rejected": -290.3982849121094, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": 1.0894410610198975, "rewards/margins": 0.038120973855257034, "rewards/rejected": 1.051320195198059, "step": 8420 }, { "epoch": 0.391383072566043, "grad_norm": 67.5772705078125, "learning_rate": 
2.765420864478388e-07, "logits/chosen": -17.941486358642578, "logits/rejected": -17.693424224853516, "logps/chosen": -502.730712890625, "logps/rejected": -457.25103759765625, "loss": 0.6554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.304002046585083, "rewards/margins": 0.39812493324279785, "rewards/rejected": 1.9058771133422852, "step": 8430 }, { "epoch": 0.39184734667347604, "grad_norm": 206.6841583251953, "learning_rate": 2.765142300013928e-07, "logits/chosen": -18.231849670410156, "logits/rejected": -17.941547393798828, "logps/chosen": -426.56939697265625, "logps/rejected": -356.5230407714844, "loss": 0.6916, "rewards/accuracies": 0.5, "rewards/chosen": 1.6029189825057983, "rewards/margins": 0.302966833114624, "rewards/rejected": 1.2999522686004639, "step": 8440 }, { "epoch": 0.39231162078090903, "grad_norm": 10.061957359313965, "learning_rate": 2.7648637355494685e-07, "logits/chosen": -18.33741569519043, "logits/rejected": -17.932857513427734, "logps/chosen": -487.91705322265625, "logps/rejected": -411.28125, "loss": 0.668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.117710590362549, "rewards/margins": 0.2854386270046234, "rewards/rejected": 1.8322721719741821, "step": 8450 }, { "epoch": 0.3927758948883421, "grad_norm": 79.93700408935547, "learning_rate": 2.7645851710850084e-07, "logits/chosen": -18.773733139038086, "logits/rejected": -18.104412078857422, "logps/chosen": -405.680908203125, "logps/rejected": -281.85064697265625, "loss": 0.581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7974176406860352, "rewards/margins": 0.36168649792671204, "rewards/rejected": 1.4357311725616455, "step": 8460 }, { "epoch": 0.39324016899577513, "grad_norm": 138.1877899169922, "learning_rate": 2.7643066066205483e-07, "logits/chosen": -17.866743087768555, "logits/rejected": -17.570148468017578, "logps/chosen": -444.3954162597656, "logps/rejected": -437.2501525878906, "loss": 0.7964, "rewards/accuracies": 0.4000000059604645, 
"rewards/chosen": 2.1194915771484375, "rewards/margins": 0.014207261614501476, "rewards/rejected": 2.1052842140197754, "step": 8470 }, { "epoch": 0.3937044431032081, "grad_norm": 65.93042755126953, "learning_rate": 2.7640280421560887e-07, "logits/chosen": -18.223791122436523, "logits/rejected": -17.557859420776367, "logps/chosen": -369.3104553222656, "logps/rejected": -320.2506408691406, "loss": 0.5551, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7709026336669922, "rewards/margins": 0.42190760374069214, "rewards/rejected": 1.3489948511123657, "step": 8480 }, { "epoch": 0.3941687172106412, "grad_norm": 122.96089935302734, "learning_rate": 2.763749477691629e-07, "logits/chosen": -18.719507217407227, "logits/rejected": -17.77353286743164, "logps/chosen": -338.5353698730469, "logps/rejected": -291.42706298828125, "loss": 0.4981, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7868000268936157, "rewards/margins": 0.572731614112854, "rewards/rejected": 1.2140682935714722, "step": 8490 }, { "epoch": 0.39463299131807417, "grad_norm": 74.76060485839844, "learning_rate": 2.7634709132271695e-07, "logits/chosen": -18.55646324157715, "logits/rejected": -17.374555587768555, "logps/chosen": -397.02923583984375, "logps/rejected": -278.5535583496094, "loss": 0.5256, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5672489404678345, "rewards/margins": 0.558434247970581, "rewards/rejected": 1.0088146924972534, "step": 8500 }, { "epoch": 0.3950972654255072, "grad_norm": 36.777801513671875, "learning_rate": 2.7631923487627094e-07, "logits/chosen": -18.89901351928711, "logits/rejected": -18.464937210083008, "logps/chosen": -492.92877197265625, "logps/rejected": -383.6197509765625, "loss": 0.5665, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8156156539916992, "rewards/margins": 0.3765949606895447, "rewards/rejected": 1.4390206336975098, "step": 8510 }, { "epoch": 0.39556153953294027, "grad_norm": 114.01363372802734, 
"learning_rate": 2.7629137842982493e-07, "logits/chosen": -18.050386428833008, "logits/rejected": -17.302135467529297, "logps/chosen": -314.2519836425781, "logps/rejected": -186.29022216796875, "loss": 0.5674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4495315551757812, "rewards/margins": 0.6652695536613464, "rewards/rejected": 0.7842620611190796, "step": 8520 }, { "epoch": 0.39602581364037326, "grad_norm": 81.50763702392578, "learning_rate": 2.7626352198337897e-07, "logits/chosen": -19.625057220458984, "logits/rejected": -18.796907424926758, "logps/chosen": -396.43719482421875, "logps/rejected": -275.78961181640625, "loss": 0.6176, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5235580205917358, "rewards/margins": 0.309867262840271, "rewards/rejected": 1.213690996170044, "step": 8530 }, { "epoch": 0.3964900877478063, "grad_norm": 48.9761848449707, "learning_rate": 2.76235665536933e-07, "logits/chosen": -18.850337982177734, "logits/rejected": -17.320985794067383, "logps/chosen": -461.0699157714844, "logps/rejected": -301.9815979003906, "loss": 0.4892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.864856481552124, "rewards/margins": 0.7105998992919922, "rewards/rejected": 1.1542565822601318, "step": 8540 }, { "epoch": 0.39695436185523936, "grad_norm": 83.49512481689453, "learning_rate": 2.76207809090487e-07, "logits/chosen": -18.230045318603516, "logits/rejected": -18.432348251342773, "logps/chosen": -431.2254333496094, "logps/rejected": -444.05535888671875, "loss": 0.7527, "rewards/accuracies": 0.5, "rewards/chosen": 2.1505510807037354, "rewards/margins": 0.01944814994931221, "rewards/rejected": 2.131103038787842, "step": 8550 }, { "epoch": 0.39741863596267235, "grad_norm": 62.96059799194336, "learning_rate": 2.7617995264404104e-07, "logits/chosen": -18.432310104370117, "logits/rejected": -17.097469329833984, "logps/chosen": -475.45947265625, "logps/rejected": -272.05401611328125, "loss": 0.4605, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.3052592277526855, "rewards/margins": 1.039770483970642, "rewards/rejected": 1.265488624572754, "step": 8560 }, { "epoch": 0.3978829100701054, "grad_norm": 134.88601684570312, "learning_rate": 2.76152096197595e-07, "logits/chosen": -19.599437713623047, "logits/rejected": -16.8408203125, "logps/chosen": -395.28607177734375, "logps/rejected": -199.3760528564453, "loss": 0.4495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4072452783584595, "rewards/margins": 0.702013373374939, "rewards/rejected": 0.7052319049835205, "step": 8570 }, { "epoch": 0.3983471841775384, "grad_norm": 34.97526168823242, "learning_rate": 2.7612423975114907e-07, "logits/chosen": -17.71707534790039, "logits/rejected": -18.006031036376953, "logps/chosen": -378.1949462890625, "logps/rejected": -380.0614318847656, "loss": 0.7598, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6983009576797485, "rewards/margins": 0.08859749138355255, "rewards/rejected": 1.6097034215927124, "step": 8580 }, { "epoch": 0.39881145828497144, "grad_norm": 36.649436950683594, "learning_rate": 2.7609638330470306e-07, "logits/chosen": -17.330554962158203, "logits/rejected": -17.2418155670166, "logps/chosen": -334.24591064453125, "logps/rejected": -341.0941467285156, "loss": 0.8671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5039623975753784, "rewards/margins": -0.010801387019455433, "rewards/rejected": 1.5147638320922852, "step": 8590 }, { "epoch": 0.3992757323924045, "grad_norm": 74.69209289550781, "learning_rate": 2.760685268582571e-07, "logits/chosen": -18.631622314453125, "logits/rejected": -17.785436630249023, "logps/chosen": -386.4765625, "logps/rejected": -263.7236022949219, "loss": 0.5645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.037477493286133, "rewards/margins": 0.4134988784790039, "rewards/rejected": 1.6239783763885498, "step": 8600 }, { "epoch": 0.3997400064998375, "grad_norm": 79.78136444091797, 
"learning_rate": 2.7604067041181114e-07, "logits/chosen": -17.920751571655273, "logits/rejected": -17.77021026611328, "logps/chosen": -427.4122009277344, "logps/rejected": -359.4266662597656, "loss": 0.5945, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7790443897247314, "rewards/margins": 0.3401832580566406, "rewards/rejected": 1.4388608932495117, "step": 8610 }, { "epoch": 0.40020428060727053, "grad_norm": 62.182899475097656, "learning_rate": 2.760128139653652e-07, "logits/chosen": -19.016094207763672, "logits/rejected": -19.117267608642578, "logps/chosen": -418.2398376464844, "logps/rejected": -383.2612609863281, "loss": 0.8412, "rewards/accuracies": 0.5, "rewards/chosen": 1.5863494873046875, "rewards/margins": -0.17081117630004883, "rewards/rejected": 1.7571605443954468, "step": 8620 }, { "epoch": 0.4006685547147036, "grad_norm": 14.454983711242676, "learning_rate": 2.7598495751891917e-07, "logits/chosen": -18.227750778198242, "logits/rejected": -18.165176391601562, "logps/chosen": -367.5467224121094, "logps/rejected": -391.95269775390625, "loss": 0.7683, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7018569707870483, "rewards/margins": 0.06830492615699768, "rewards/rejected": 1.6335519552230835, "step": 8630 }, { "epoch": 0.4011328288221366, "grad_norm": 100.25208282470703, "learning_rate": 2.7595710107247315e-07, "logits/chosen": -17.966936111450195, "logits/rejected": -17.286672592163086, "logps/chosen": -396.6233825683594, "logps/rejected": -324.7394104003906, "loss": 0.6519, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5989655256271362, "rewards/margins": 0.2545253038406372, "rewards/rejected": 1.3444401025772095, "step": 8640 }, { "epoch": 0.4015971029295696, "grad_norm": 27.519132614135742, "learning_rate": 2.759292446260272e-07, "logits/chosen": -18.17660140991211, "logits/rejected": -17.773515701293945, "logps/chosen": -356.9553527832031, "logps/rejected": -306.81011962890625, "loss": 0.5383, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6409183740615845, "rewards/margins": 0.43825626373291016, "rewards/rejected": 1.2026621103286743, "step": 8650 }, { "epoch": 0.4020613770370026, "grad_norm": 67.64582061767578, "learning_rate": 2.759013881795812e-07, "logits/chosen": -18.362812042236328, "logits/rejected": -16.574764251708984, "logps/chosen": -483.660400390625, "logps/rejected": -248.13525390625, "loss": 0.4364, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7977721691131592, "rewards/margins": 0.7242233157157898, "rewards/rejected": 1.0735487937927246, "step": 8660 }, { "epoch": 0.40252565114443567, "grad_norm": 17.06365203857422, "learning_rate": 2.758735317331352e-07, "logits/chosen": -18.483257293701172, "logits/rejected": -18.094127655029297, "logps/chosen": -456.53204345703125, "logps/rejected": -413.0887145996094, "loss": 0.6672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.083430767059326, "rewards/margins": 0.43618983030319214, "rewards/rejected": 1.6472409963607788, "step": 8670 }, { "epoch": 0.4029899252518687, "grad_norm": 143.40737915039062, "learning_rate": 2.7584567528668927e-07, "logits/chosen": -19.066585540771484, "logits/rejected": -18.749862670898438, "logps/chosen": -449.3182678222656, "logps/rejected": -377.7928466796875, "loss": 0.6396, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.626940369606018, "rewards/margins": 0.24391897022724152, "rewards/rejected": 1.3830214738845825, "step": 8680 }, { "epoch": 0.4034541993593017, "grad_norm": 78.62646484375, "learning_rate": 2.7581781884024325e-07, "logits/chosen": -18.47870445251465, "logits/rejected": -18.976320266723633, "logps/chosen": -399.3321228027344, "logps/rejected": -385.5757751464844, "loss": 0.6818, "rewards/accuracies": 0.5, "rewards/chosen": 1.8135631084442139, "rewards/margins": 0.21327634155750275, "rewards/rejected": 1.6002867221832275, "step": 8690 }, { "epoch": 0.40391847346673476, "grad_norm": 
59.991004943847656, "learning_rate": 2.757899623937973e-07, "logits/chosen": -18.10542106628418, "logits/rejected": -17.675851821899414, "logps/chosen": -291.04376220703125, "logps/rejected": -350.9228515625, "loss": 0.7744, "rewards/accuracies": 0.5, "rewards/chosen": 1.7228796482086182, "rewards/margins": 0.22437334060668945, "rewards/rejected": 1.4985063076019287, "step": 8700 }, { "epoch": 0.4043827475741678, "grad_norm": 90.69773864746094, "learning_rate": 2.757621059473513e-07, "logits/chosen": -17.67110824584961, "logits/rejected": -17.031400680541992, "logps/chosen": -361.82989501953125, "logps/rejected": -277.3132019042969, "loss": 0.6039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.680965781211853, "rewards/margins": 0.38845759630203247, "rewards/rejected": 1.2925081253051758, "step": 8710 }, { "epoch": 0.4048470216816008, "grad_norm": 20.558626174926758, "learning_rate": 2.757342495009053e-07, "logits/chosen": -18.041454315185547, "logits/rejected": -16.843748092651367, "logps/chosen": -484.65435791015625, "logps/rejected": -304.7716064453125, "loss": 0.4555, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9952493906021118, "rewards/margins": 0.8587244749069214, "rewards/rejected": 1.1365249156951904, "step": 8720 }, { "epoch": 0.40531129578903385, "grad_norm": 18.190454483032227, "learning_rate": 2.7570639305445936e-07, "logits/chosen": -19.218881607055664, "logits/rejected": -18.743087768554688, "logps/chosen": -440.5901794433594, "logps/rejected": -286.6293640136719, "loss": 0.4277, "rewards/accuracies": 1.0, "rewards/chosen": 1.671065330505371, "rewards/margins": 0.7085392475128174, "rewards/rejected": 0.9625261425971985, "step": 8730 }, { "epoch": 0.40577556989646685, "grad_norm": 144.83615112304688, "learning_rate": 2.7567853660801335e-07, "logits/chosen": -18.331045150756836, "logits/rejected": -18.183277130126953, "logps/chosen": -455.769775390625, "logps/rejected": -464.08184814453125, "loss": 0.6293, 
"rewards/accuracies": 0.5, "rewards/chosen": 1.9321880340576172, "rewards/margins": 0.22796984016895294, "rewards/rejected": 1.7042181491851807, "step": 8740 }, { "epoch": 0.4062398440038999, "grad_norm": 13.916476249694824, "learning_rate": 2.756506801615674e-07, "logits/chosen": -19.396841049194336, "logits/rejected": -17.72519302368164, "logps/chosen": -539.453125, "logps/rejected": -383.69683837890625, "loss": 0.4975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1351213455200195, "rewards/margins": 0.7144050598144531, "rewards/rejected": 1.4207160472869873, "step": 8750 }, { "epoch": 0.40670411811133295, "grad_norm": 69.30799102783203, "learning_rate": 2.756228237151214e-07, "logits/chosen": -18.788164138793945, "logits/rejected": -17.69993019104004, "logps/chosen": -436.7405700683594, "logps/rejected": -288.5376892089844, "loss": 0.4741, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8062286376953125, "rewards/margins": 0.5962594151496887, "rewards/rejected": 1.209969401359558, "step": 8760 }, { "epoch": 0.40716839221876594, "grad_norm": 121.33567810058594, "learning_rate": 2.755949672686754e-07, "logits/chosen": -18.488895416259766, "logits/rejected": -17.678653717041016, "logps/chosen": -458.3797912597656, "logps/rejected": -367.2872619628906, "loss": 0.4735, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9932667016983032, "rewards/margins": 0.793963611125946, "rewards/rejected": 1.199303150177002, "step": 8770 }, { "epoch": 0.407632666326199, "grad_norm": 49.37060546875, "learning_rate": 2.755671108222294e-07, "logits/chosen": -19.396852493286133, "logits/rejected": -18.681886672973633, "logps/chosen": -432.991943359375, "logps/rejected": -297.6932067871094, "loss": 0.4831, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9465614557266235, "rewards/margins": 0.6627194881439209, "rewards/rejected": 1.2838419675827026, "step": 8780 }, { "epoch": 0.40809694043363204, "grad_norm": 31.765918731689453, 
"learning_rate": 2.7553925437578345e-07, "logits/chosen": -18.13568878173828, "logits/rejected": -17.567874908447266, "logps/chosen": -482.02130126953125, "logps/rejected": -385.5749816894531, "loss": 0.8233, "rewards/accuracies": 0.5, "rewards/chosen": 1.8575435876846313, "rewards/margins": -0.05875883251428604, "rewards/rejected": 1.9163024425506592, "step": 8790 }, { "epoch": 0.40856121454106503, "grad_norm": 230.27105712890625, "learning_rate": 2.755113979293375e-07, "logits/chosen": -19.212005615234375, "logits/rejected": -19.27701187133789, "logps/chosen": -374.55303955078125, "logps/rejected": -469.0899353027344, "loss": 0.9807, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.594125747680664, "rewards/margins": -0.43421420454978943, "rewards/rejected": 2.0283398628234863, "step": 8800 }, { "epoch": 0.4090254886484981, "grad_norm": 28.018320083618164, "learning_rate": 2.754835414828915e-07, "logits/chosen": -18.4556827545166, "logits/rejected": -18.997249603271484, "logps/chosen": -289.059326171875, "logps/rejected": -316.0637512207031, "loss": 0.943, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.3917715549468994, "rewards/margins": -0.3472934663295746, "rewards/rejected": 1.739065170288086, "step": 8810 }, { "epoch": 0.4094897627559311, "grad_norm": 51.45188522338867, "learning_rate": 2.7545568503644547e-07, "logits/chosen": -19.407764434814453, "logits/rejected": -17.735424041748047, "logps/chosen": -395.25750732421875, "logps/rejected": -218.74185180664062, "loss": 0.4142, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8534810543060303, "rewards/margins": 1.0616859197616577, "rewards/rejected": 0.7917951345443726, "step": 8820 }, { "epoch": 0.4099540368633641, "grad_norm": 78.56816101074219, "learning_rate": 2.754278285899995e-07, "logits/chosen": -18.675846099853516, "logits/rejected": -18.194263458251953, "logps/chosen": -333.697021484375, "logps/rejected": -256.32281494140625, "loss": 0.6536, 
"rewards/accuracies": 0.5, "rewards/chosen": 1.3405002355575562, "rewards/margins": 0.12715116143226624, "rewards/rejected": 1.2133491039276123, "step": 8830 }, { "epoch": 0.4104183109707972, "grad_norm": 36.54701232910156, "learning_rate": 2.7539997214355355e-07, "logits/chosen": -18.85738754272461, "logits/rejected": -17.645999908447266, "logps/chosen": -383.63018798828125, "logps/rejected": -311.48187255859375, "loss": 0.7499, "rewards/accuracies": 0.5, "rewards/chosen": 1.9012902975082397, "rewards/margins": 0.1521330624818802, "rewards/rejected": 1.7491573095321655, "step": 8840 }, { "epoch": 0.41088258507823017, "grad_norm": 105.39581298828125, "learning_rate": 2.753721156971076e-07, "logits/chosen": -18.886754989624023, "logits/rejected": -17.091106414794922, "logps/chosen": -476.6993103027344, "logps/rejected": -288.38128662109375, "loss": 0.403, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2788710594177246, "rewards/margins": 1.0906254053115845, "rewards/rejected": 1.1882457733154297, "step": 8850 }, { "epoch": 0.4113468591856632, "grad_norm": 155.23046875, "learning_rate": 2.753442592506616e-07, "logits/chosen": -17.656726837158203, "logits/rejected": -17.547840118408203, "logps/chosen": -346.57818603515625, "logps/rejected": -398.96832275390625, "loss": 0.8309, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.4149503707885742, "rewards/margins": -0.2013498842716217, "rewards/rejected": 1.6162999868392944, "step": 8860 }, { "epoch": 0.41181113329309627, "grad_norm": 93.27201080322266, "learning_rate": 2.753164028042156e-07, "logits/chosen": -19.559526443481445, "logits/rejected": -18.974239349365234, "logps/chosen": -396.34588623046875, "logps/rejected": -299.9289855957031, "loss": 0.6065, "rewards/accuracies": 0.5, "rewards/chosen": 2.1972908973693848, "rewards/margins": 0.6659784317016602, "rewards/rejected": 1.5313127040863037, "step": 8870 }, { "epoch": 0.41227540740052926, "grad_norm": 181.7848358154297, "learning_rate": 
2.752885463577696e-07, "logits/chosen": -17.867389678955078, "logits/rejected": -17.020397186279297, "logps/chosen": -409.32958984375, "logps/rejected": -289.6178283691406, "loss": 0.5977, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6225292682647705, "rewards/margins": 0.315019428730011, "rewards/rejected": 1.3075097799301147, "step": 8880 }, { "epoch": 0.4127396815079623, "grad_norm": 48.640621185302734, "learning_rate": 2.752606899113236e-07, "logits/chosen": -18.05668830871582, "logits/rejected": -17.340858459472656, "logps/chosen": -373.9678649902344, "logps/rejected": -330.43878173828125, "loss": 0.7224, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.589712142944336, "rewards/margins": 0.10784971714019775, "rewards/rejected": 1.4818624258041382, "step": 8890 }, { "epoch": 0.4132039556153953, "grad_norm": 90.19160461425781, "learning_rate": 2.7523283346487764e-07, "logits/chosen": -18.7042179107666, "logits/rejected": -17.952404022216797, "logps/chosen": -436.986572265625, "logps/rejected": -379.58648681640625, "loss": 0.5423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9426950216293335, "rewards/margins": 0.41414108872413635, "rewards/rejected": 1.52855384349823, "step": 8900 }, { "epoch": 0.41366822972282835, "grad_norm": 114.99078369140625, "learning_rate": 2.752049770184317e-07, "logits/chosen": -17.594905853271484, "logits/rejected": -16.900638580322266, "logps/chosen": -416.11724853515625, "logps/rejected": -298.3802795410156, "loss": 0.5613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5775394439697266, "rewards/margins": 0.40684741735458374, "rewards/rejected": 1.1706918478012085, "step": 8910 }, { "epoch": 0.4141325038302614, "grad_norm": 163.8309783935547, "learning_rate": 2.751771205719857e-07, "logits/chosen": -19.305482864379883, "logits/rejected": -18.66269874572754, "logps/chosen": -566.43212890625, "logps/rejected": -452.2222595214844, "loss": 0.4984, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 2.301908493041992, "rewards/margins": 0.5127164125442505, "rewards/rejected": 1.7891921997070312, "step": 8920 }, { "epoch": 0.4145967779376944, "grad_norm": 141.19964599609375, "learning_rate": 2.751492641255397e-07, "logits/chosen": -17.473140716552734, "logits/rejected": -17.405805587768555, "logps/chosen": -326.6771240234375, "logps/rejected": -339.6312561035156, "loss": 0.6532, "rewards/accuracies": 0.5, "rewards/chosen": 1.4520037174224854, "rewards/margins": 0.29720166325569153, "rewards/rejected": 1.1548020839691162, "step": 8930 }, { "epoch": 0.41506105204512744, "grad_norm": 109.10100555419922, "learning_rate": 2.751214076790937e-07, "logits/chosen": -19.453693389892578, "logits/rejected": -18.26125717163086, "logps/chosen": -423.13714599609375, "logps/rejected": -275.61651611328125, "loss": 0.4069, "rewards/accuracies": 1.0, "rewards/chosen": 1.8114789724349976, "rewards/margins": 0.8538286089897156, "rewards/rejected": 0.9576500654220581, "step": 8940 }, { "epoch": 0.4155253261525605, "grad_norm": 30.790470123291016, "learning_rate": 2.7509355123264774e-07, "logits/chosen": -19.185312271118164, "logits/rejected": -18.284984588623047, "logps/chosen": -328.46954345703125, "logps/rejected": -259.60174560546875, "loss": 0.5846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6269590854644775, "rewards/margins": 0.3667483925819397, "rewards/rejected": 1.2602107524871826, "step": 8950 }, { "epoch": 0.4159896002599935, "grad_norm": 56.26020431518555, "learning_rate": 2.750656947862018e-07, "logits/chosen": -19.071617126464844, "logits/rejected": -17.93134880065918, "logps/chosen": -447.60650634765625, "logps/rejected": -343.7995910644531, "loss": 0.4205, "rewards/accuracies": 1.0, "rewards/chosen": 2.3231799602508545, "rewards/margins": 0.9044309854507446, "rewards/rejected": 1.4187489748001099, "step": 8960 }, { "epoch": 0.41645387436742654, "grad_norm": 55.22600173950195, "learning_rate": 
2.7503783833975577e-07, "logits/chosen": -18.781349182128906, "logits/rejected": -17.556690216064453, "logps/chosen": -437.5464782714844, "logps/rejected": -235.0965118408203, "loss": 0.4708, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8876616954803467, "rewards/margins": 0.5654396414756775, "rewards/rejected": 1.322222113609314, "step": 8970 }, { "epoch": 0.4169181484748596, "grad_norm": 157.2919921875, "learning_rate": 2.750099818933098e-07, "logits/chosen": -19.543292999267578, "logits/rejected": -19.96469497680664, "logps/chosen": -467.76617431640625, "logps/rejected": -495.1064453125, "loss": 0.9274, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9670346975326538, "rewards/margins": -0.27617812156677246, "rewards/rejected": 2.2432126998901367, "step": 8980 }, { "epoch": 0.4173824225822926, "grad_norm": 79.29895782470703, "learning_rate": 2.749821254468638e-07, "logits/chosen": -18.283700942993164, "logits/rejected": -17.049346923828125, "logps/chosen": -445.0274353027344, "logps/rejected": -343.23760986328125, "loss": 0.7084, "rewards/accuracies": 0.5, "rewards/chosen": 1.5271899700164795, "rewards/margins": 0.05434682220220566, "rewards/rejected": 1.4728432893753052, "step": 8990 }, { "epoch": 0.4178466966897256, "grad_norm": 40.3316535949707, "learning_rate": 2.7495426900041784e-07, "logits/chosen": -17.967037200927734, "logits/rejected": -16.619056701660156, "logps/chosen": -364.53497314453125, "logps/rejected": -277.80609130859375, "loss": 0.4782, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1786317825317383, "rewards/margins": 0.8862847089767456, "rewards/rejected": 1.292346715927124, "step": 9000 }, { "epoch": 0.4183109707971586, "grad_norm": 134.88706970214844, "learning_rate": 2.749264125539718e-07, "logits/chosen": -19.43596076965332, "logits/rejected": -18.201671600341797, "logps/chosen": -579.5101318359375, "logps/rejected": -422.5990295410156, "loss": 0.713, "rewards/accuracies": 0.5, 
"rewards/chosen": 2.378296375274658, "rewards/margins": 0.2930000424385071, "rewards/rejected": 2.085296154022217, "step": 9010 }, { "epoch": 0.41877524490459167, "grad_norm": 99.59207153320312, "learning_rate": 2.7489855610752587e-07, "logits/chosen": -19.107065200805664, "logits/rejected": -17.087602615356445, "logps/chosen": -541.2847290039062, "logps/rejected": -372.48126220703125, "loss": 0.4082, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.231923818588257, "rewards/margins": 0.8923105001449585, "rewards/rejected": 1.339613437652588, "step": 9020 }, { "epoch": 0.4192395190120247, "grad_norm": 18.788925170898438, "learning_rate": 2.748706996610799e-07, "logits/chosen": -17.536584854125977, "logits/rejected": -17.439390182495117, "logps/chosen": -361.32183837890625, "logps/rejected": -336.83062744140625, "loss": 0.8681, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6778173446655273, "rewards/margins": -0.08118553459644318, "rewards/rejected": 1.7590030431747437, "step": 9030 }, { "epoch": 0.4197037931194577, "grad_norm": 78.21605682373047, "learning_rate": 2.7484284321463395e-07, "logits/chosen": -18.433452606201172, "logits/rejected": -17.841520309448242, "logps/chosen": -452.77069091796875, "logps/rejected": -332.2026672363281, "loss": 0.5469, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8927301168441772, "rewards/margins": 0.46445998549461365, "rewards/rejected": 1.4282701015472412, "step": 9040 }, { "epoch": 0.42016806722689076, "grad_norm": 32.61295700073242, "learning_rate": 2.7481498676818794e-07, "logits/chosen": -20.302379608154297, "logits/rejected": -19.886707305908203, "logps/chosen": -425.387939453125, "logps/rejected": -381.66131591796875, "loss": 0.6428, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8607021570205688, "rewards/margins": 0.21486949920654297, "rewards/rejected": 1.6458327770233154, "step": 9050 }, { "epoch": 0.4206323413343238, "grad_norm": 94.77680206298828, 
"learning_rate": 2.747871303217419e-07, "logits/chosen": -17.634035110473633, "logits/rejected": -17.09927749633789, "logps/chosen": -383.38360595703125, "logps/rejected": -297.1947326660156, "loss": 0.6673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.772120475769043, "rewards/margins": 0.2722609043121338, "rewards/rejected": 1.4998595714569092, "step": 9060 }, { "epoch": 0.4210966154417568, "grad_norm": 107.48065185546875, "learning_rate": 2.7475927387529596e-07, "logits/chosen": -17.68699073791504, "logits/rejected": -17.41046905517578, "logps/chosen": -373.79583740234375, "logps/rejected": -373.5689392089844, "loss": 0.6455, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.750560998916626, "rewards/margins": 0.20795121788978577, "rewards/rejected": 1.5426098108291626, "step": 9070 }, { "epoch": 0.42156088954918985, "grad_norm": 26.159442901611328, "learning_rate": 2.7473141742884995e-07, "logits/chosen": -18.466960906982422, "logits/rejected": -17.181076049804688, "logps/chosen": -356.794921875, "logps/rejected": -285.53857421875, "loss": 0.566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6240148544311523, "rewards/margins": 0.4930381774902344, "rewards/rejected": 1.130976676940918, "step": 9080 }, { "epoch": 0.42202516365662285, "grad_norm": 73.57875061035156, "learning_rate": 2.74703560982404e-07, "logits/chosen": -18.11366081237793, "logits/rejected": -17.104278564453125, "logps/chosen": -522.3304443359375, "logps/rejected": -400.6622009277344, "loss": 0.4924, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.216014862060547, "rewards/margins": 0.6384314298629761, "rewards/rejected": 1.5775833129882812, "step": 9090 }, { "epoch": 0.4224894377640559, "grad_norm": 22.976835250854492, "learning_rate": 2.7467570453595803e-07, "logits/chosen": -17.923851013183594, "logits/rejected": -17.277164459228516, "logps/chosen": -381.3388977050781, "logps/rejected": -298.843505859375, "loss": 0.4911, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.006448268890381, "rewards/margins": 0.5630982518196106, "rewards/rejected": 1.443349838256836, "step": 9100 }, { "epoch": 0.42295371187148895, "grad_norm": 49.25988006591797, "learning_rate": 2.74647848089512e-07, "logits/chosen": -18.248228073120117, "logits/rejected": -17.75022315979004, "logps/chosen": -338.4006042480469, "logps/rejected": -305.05328369140625, "loss": 0.7178, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6844892501831055, "rewards/margins": 0.14092738926410675, "rewards/rejected": 1.5435616970062256, "step": 9110 }, { "epoch": 0.42341798597892194, "grad_norm": 64.55186462402344, "learning_rate": 2.7461999164306606e-07, "logits/chosen": -18.914480209350586, "logits/rejected": -17.724441528320312, "logps/chosen": -449.3067932128906, "logps/rejected": -427.42803955078125, "loss": 0.5799, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5602760314941406, "rewards/margins": 0.372948557138443, "rewards/rejected": 2.1873278617858887, "step": 9120 }, { "epoch": 0.423882260086355, "grad_norm": 52.79824447631836, "learning_rate": 2.7459213519662005e-07, "logits/chosen": -18.680898666381836, "logits/rejected": -18.634449005126953, "logps/chosen": -488.1565856933594, "logps/rejected": -471.05023193359375, "loss": 0.8439, "rewards/accuracies": 0.5, "rewards/chosen": 1.938122034072876, "rewards/margins": -0.10245788097381592, "rewards/rejected": 2.0405797958374023, "step": 9130 }, { "epoch": 0.42434653419378804, "grad_norm": 42.22023010253906, "learning_rate": 2.745642787501741e-07, "logits/chosen": -18.867773056030273, "logits/rejected": -17.16851806640625, "logps/chosen": -442.1431579589844, "logps/rejected": -275.4366149902344, "loss": 0.4647, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1040291786193848, "rewards/margins": 0.840922474861145, "rewards/rejected": 1.2631065845489502, "step": 9140 }, { "epoch": 0.42481080830122103, "grad_norm": 
154.16354370117188, "learning_rate": 2.7453642230372813e-07, "logits/chosen": -17.773197174072266, "logits/rejected": -17.204206466674805, "logps/chosen": -389.5381164550781, "logps/rejected": -352.04034423828125, "loss": 0.6482, "rewards/accuracies": 0.5, "rewards/chosen": 1.734881043434143, "rewards/margins": 0.34054532647132874, "rewards/rejected": 1.3943357467651367, "step": 9150 }, { "epoch": 0.4252750824086541, "grad_norm": 215.1483917236328, "learning_rate": 2.745085658572821e-07, "logits/chosen": -18.25752830505371, "logits/rejected": -17.831619262695312, "logps/chosen": -415.5282287597656, "logps/rejected": -354.99359130859375, "loss": 0.847, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.952052116394043, "rewards/margins": 0.09042499214410782, "rewards/rejected": 1.861627221107483, "step": 9160 }, { "epoch": 0.4257393565160871, "grad_norm": 32.62652587890625, "learning_rate": 2.7448070941083616e-07, "logits/chosen": -18.304851531982422, "logits/rejected": -18.51321029663086, "logps/chosen": -406.8687438964844, "logps/rejected": -362.5358581542969, "loss": 0.9386, "rewards/accuracies": 0.5, "rewards/chosen": 1.6979233026504517, "rewards/margins": -0.24009504914283752, "rewards/rejected": 1.9380180835723877, "step": 9170 }, { "epoch": 0.4262036306235201, "grad_norm": 72.17466735839844, "learning_rate": 2.7445285296439015e-07, "logits/chosen": -19.7049503326416, "logits/rejected": -19.245159149169922, "logps/chosen": -477.767333984375, "logps/rejected": -353.69622802734375, "loss": 0.621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.842952013015747, "rewards/margins": 0.2575111985206604, "rewards/rejected": 1.5854408740997314, "step": 9180 }, { "epoch": 0.4266679047309532, "grad_norm": 67.53492736816406, "learning_rate": 2.744249965179442e-07, "logits/chosen": -19.06668472290039, "logits/rejected": -17.79570960998535, "logps/chosen": -361.42437744140625, "logps/rejected": -285.2068176269531, "loss": 0.629, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 1.9203850030899048, "rewards/margins": 0.3476417660713196, "rewards/rejected": 1.5727430582046509, "step": 9190 }, { "epoch": 0.42713217883838617, "grad_norm": 53.65003967285156, "learning_rate": 2.743971400714982e-07, "logits/chosen": -17.32110023498535, "logits/rejected": -16.865222930908203, "logps/chosen": -344.74456787109375, "logps/rejected": -281.71966552734375, "loss": 0.5672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6632331609725952, "rewards/margins": 0.4402545988559723, "rewards/rejected": 1.2229787111282349, "step": 9200 }, { "epoch": 0.4275964529458192, "grad_norm": 15.387460708618164, "learning_rate": 2.743692836250522e-07, "logits/chosen": -17.814342498779297, "logits/rejected": -17.746816635131836, "logps/chosen": -375.20751953125, "logps/rejected": -314.49163818359375, "loss": 0.5777, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6857601404190063, "rewards/margins": 0.433783620595932, "rewards/rejected": 1.251976490020752, "step": 9210 }, { "epoch": 0.42806072705325227, "grad_norm": 225.9003143310547, "learning_rate": 2.7434142717860626e-07, "logits/chosen": -17.99590492248535, "logits/rejected": -16.354820251464844, "logps/chosen": -590.6268310546875, "logps/rejected": -360.7095642089844, "loss": 0.5816, "rewards/accuracies": 0.5, "rewards/chosen": 2.386652946472168, "rewards/margins": 0.6035507917404175, "rewards/rejected": 1.783102035522461, "step": 9220 }, { "epoch": 0.42852500116068526, "grad_norm": 171.7530975341797, "learning_rate": 2.7431357073216025e-07, "logits/chosen": -19.176504135131836, "logits/rejected": -19.22279167175293, "logps/chosen": -322.06195068359375, "logps/rejected": -378.1884460449219, "loss": 0.9429, "rewards/accuracies": 0.5, "rewards/chosen": 1.646989107131958, "rewards/margins": -0.16960880160331726, "rewards/rejected": 1.8165979385375977, "step": 9230 }, { "epoch": 0.4289892752681183, "grad_norm": 105.52753448486328, "learning_rate": 
2.7428571428571424e-07, "logits/chosen": -18.234371185302734, "logits/rejected": -17.82012176513672, "logps/chosen": -420.9466247558594, "logps/rejected": -350.79620361328125, "loss": 0.5332, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0104470252990723, "rewards/margins": 0.4580070376396179, "rewards/rejected": 1.55243980884552, "step": 9240 }, { "epoch": 0.4294535493755513, "grad_norm": 59.82086181640625, "learning_rate": 2.742578578392683e-07, "logits/chosen": -17.96567153930664, "logits/rejected": -17.10868263244629, "logps/chosen": -404.86572265625, "logps/rejected": -270.95867919921875, "loss": 0.6147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0287442207336426, "rewards/margins": 0.47751063108444214, "rewards/rejected": 1.5512335300445557, "step": 9250 }, { "epoch": 0.42991782348298435, "grad_norm": 71.38220977783203, "learning_rate": 2.742300013928223e-07, "logits/chosen": -17.514476776123047, "logits/rejected": -16.707599639892578, "logps/chosen": -453.1058654785156, "logps/rejected": -377.96820068359375, "loss": 0.8174, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7084858417510986, "rewards/margins": 0.06764446198940277, "rewards/rejected": 1.6408412456512451, "step": 9260 }, { "epoch": 0.4303820975904174, "grad_norm": 124.83385467529297, "learning_rate": 2.742021449463763e-07, "logits/chosen": -19.391523361206055, "logits/rejected": -18.059640884399414, "logps/chosen": -555.835693359375, "logps/rejected": -384.2210388183594, "loss": 0.4702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.761971950531006, "rewards/margins": 1.0207597017288208, "rewards/rejected": 1.741212248802185, "step": 9270 }, { "epoch": 0.4308463716978504, "grad_norm": 59.09063720703125, "learning_rate": 2.7417428849993035e-07, "logits/chosen": -18.625011444091797, "logits/rejected": -18.475629806518555, "logps/chosen": -387.9542236328125, "logps/rejected": -437.17437744140625, "loss": 0.7731, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 1.978856086730957, "rewards/margins": 0.005147808697074652, "rewards/rejected": 1.9737085103988647, "step": 9280 }, { "epoch": 0.43131064580528344, "grad_norm": 19.303260803222656, "learning_rate": 2.741464320534844e-07, "logits/chosen": -18.15257453918457, "logits/rejected": -17.609790802001953, "logps/chosen": -538.4744873046875, "logps/rejected": -394.7798767089844, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 1.9519649744033813, "rewards/margins": 0.2914368212223053, "rewards/rejected": 1.6605281829833984, "step": 9290 }, { "epoch": 0.4317749199127165, "grad_norm": 23.41473960876465, "learning_rate": 2.741185756070384e-07, "logits/chosen": -17.63010025024414, "logits/rejected": -16.905864715576172, "logps/chosen": -321.797119140625, "logps/rejected": -227.41738891601562, "loss": 0.6814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.211378812789917, "rewards/margins": 0.39368802309036255, "rewards/rejected": 0.8176907300949097, "step": 9300 }, { "epoch": 0.4322391940201495, "grad_norm": 41.35853958129883, "learning_rate": 2.7409071916059237e-07, "logits/chosen": -17.74459457397461, "logits/rejected": -18.491107940673828, "logps/chosen": -377.7262268066406, "logps/rejected": -374.64276123046875, "loss": 0.889, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.3387473821640015, "rewards/margins": -0.2738892436027527, "rewards/rejected": 1.6126365661621094, "step": 9310 }, { "epoch": 0.43270346812758254, "grad_norm": 165.66551208496094, "learning_rate": 2.740628627141464e-07, "logits/chosen": -18.3707218170166, "logits/rejected": -18.141582489013672, "logps/chosen": -308.16717529296875, "logps/rejected": -330.73089599609375, "loss": 0.7131, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9972565174102783, "rewards/margins": 0.07430374622344971, "rewards/rejected": 1.92295241355896, "step": 9320 }, { "epoch": 0.43316774223501553, "grad_norm": 85.07548522949219, 
"learning_rate": 2.7403500626770045e-07, "logits/chosen": -18.629436492919922, "logits/rejected": -17.919490814208984, "logps/chosen": -507.59429931640625, "logps/rejected": -421.5956115722656, "loss": 0.5249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.136432647705078, "rewards/margins": 0.41942518949508667, "rewards/rejected": 1.7170073986053467, "step": 9330 }, { "epoch": 0.4336320163424486, "grad_norm": 28.875730514526367, "learning_rate": 2.740071498212545e-07, "logits/chosen": -18.652671813964844, "logits/rejected": -17.787940979003906, "logps/chosen": -362.25689697265625, "logps/rejected": -292.534423828125, "loss": 0.5797, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.0285496711730957, "rewards/margins": 0.3402458131313324, "rewards/rejected": 1.6883039474487305, "step": 9340 }, { "epoch": 0.4340962904498816, "grad_norm": 66.259765625, "learning_rate": 2.739792933748085e-07, "logits/chosen": -19.122385025024414, "logits/rejected": -18.39290428161621, "logps/chosen": -423.26910400390625, "logps/rejected": -344.6065368652344, "loss": 0.5121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0863845348358154, "rewards/margins": 0.6131883263587952, "rewards/rejected": 1.473196268081665, "step": 9350 }, { "epoch": 0.4345605645573146, "grad_norm": 88.25328063964844, "learning_rate": 2.7395143692836247e-07, "logits/chosen": -17.220355987548828, "logits/rejected": -16.967594146728516, "logps/chosen": -270.5519104003906, "logps/rejected": -231.9259490966797, "loss": 0.6573, "rewards/accuracies": 0.5, "rewards/chosen": 1.2634727954864502, "rewards/margins": 0.20028848946094513, "rewards/rejected": 1.063184380531311, "step": 9360 }, { "epoch": 0.43502483866474767, "grad_norm": 74.40144348144531, "learning_rate": 2.739235804819165e-07, "logits/chosen": -17.679304122924805, "logits/rejected": -17.958621978759766, "logps/chosen": -460.1653747558594, "logps/rejected": -438.38360595703125, "loss": 0.7706, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 2.0889370441436768, "rewards/margins": -0.08197470009326935, "rewards/rejected": 2.1709115505218506, "step": 9370 }, { "epoch": 0.4354891127721807, "grad_norm": 91.2371597290039, "learning_rate": 2.7389572403547055e-07, "logits/chosen": -17.73490333557129, "logits/rejected": -16.979406356811523, "logps/chosen": -406.632568359375, "logps/rejected": -341.20294189453125, "loss": 0.697, "rewards/accuracies": 0.5, "rewards/chosen": 1.6742677688598633, "rewards/margins": 0.3011311888694763, "rewards/rejected": 1.3731367588043213, "step": 9380 }, { "epoch": 0.4359533868796137, "grad_norm": 134.6151580810547, "learning_rate": 2.7386786758902454e-07, "logits/chosen": -19.26583480834961, "logits/rejected": -18.488061904907227, "logps/chosen": -373.69293212890625, "logps/rejected": -305.7380065917969, "loss": 0.4596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8678967952728271, "rewards/margins": 0.7535626292228699, "rewards/rejected": 1.114334225654602, "step": 9390 }, { "epoch": 0.43641766098704676, "grad_norm": 40.762229919433594, "learning_rate": 2.738400111425786e-07, "logits/chosen": -18.268476486206055, "logits/rejected": -16.978788375854492, "logps/chosen": -397.93658447265625, "logps/rejected": -248.88525390625, "loss": 0.3839, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7954508066177368, "rewards/margins": 0.9930175542831421, "rewards/rejected": 0.8024331331253052, "step": 9400 }, { "epoch": 0.43688193509447976, "grad_norm": 32.19047164916992, "learning_rate": 2.7381215469613257e-07, "logits/chosen": -18.355365753173828, "logits/rejected": -18.72044563293457, "logps/chosen": -400.0652770996094, "logps/rejected": -440.513671875, "loss": 0.7765, "rewards/accuracies": 0.5, "rewards/chosen": 1.6524975299835205, "rewards/margins": -0.08297620713710785, "rewards/rejected": 1.7354736328125, "step": 9410 }, { "epoch": 0.4373462092019128, "grad_norm": 58.173309326171875, "learning_rate": 
2.737842982496866e-07, "logits/chosen": -18.421154022216797, "logits/rejected": -17.318973541259766, "logps/chosen": -508.96759033203125, "logps/rejected": -325.8247985839844, "loss": 0.4704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4382541179656982, "rewards/margins": 0.8278770446777344, "rewards/rejected": 1.610377311706543, "step": 9420 }, { "epoch": 0.43781048330934585, "grad_norm": 109.02799987792969, "learning_rate": 2.737564418032406e-07, "logits/chosen": -18.714248657226562, "logits/rejected": -17.64932632446289, "logps/chosen": -431.99163818359375, "logps/rejected": -343.1660461425781, "loss": 0.5475, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.10872220993042, "rewards/margins": 0.5924828052520752, "rewards/rejected": 1.5162392854690552, "step": 9430 }, { "epoch": 0.43827475741677885, "grad_norm": 48.754371643066406, "learning_rate": 2.7372858535679464e-07, "logits/chosen": -17.658178329467773, "logits/rejected": -17.885541915893555, "logps/chosen": -262.98260498046875, "logps/rejected": -277.87835693359375, "loss": 0.8023, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3724563121795654, "rewards/margins": -0.07110093533992767, "rewards/rejected": 1.4435573816299438, "step": 9440 }, { "epoch": 0.4387390315242119, "grad_norm": 25.107839584350586, "learning_rate": 2.737007289103487e-07, "logits/chosen": -17.439640045166016, "logits/rejected": -17.444854736328125, "logps/chosen": -349.9244384765625, "logps/rejected": -344.55133056640625, "loss": 0.8174, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5383938550949097, "rewards/margins": 0.043497953563928604, "rewards/rejected": 1.4948959350585938, "step": 9450 }, { "epoch": 0.43920330563164495, "grad_norm": 37.194583892822266, "learning_rate": 2.7367287246390266e-07, "logits/chosen": -18.370725631713867, "logits/rejected": -18.541406631469727, "logps/chosen": -359.8744812011719, "logps/rejected": -363.1944274902344, "loss": 0.7908, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6052700281143188, "rewards/margins": -0.10351194441318512, "rewards/rejected": 1.7087819576263428, "step": 9460 }, { "epoch": 0.43966757973907794, "grad_norm": 150.46734619140625, "learning_rate": 2.736450160174567e-07, "logits/chosen": -19.101837158203125, "logits/rejected": -18.2381534576416, "logps/chosen": -357.0393981933594, "logps/rejected": -337.990478515625, "loss": 0.5286, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6024582386016846, "rewards/margins": 0.4993689954280853, "rewards/rejected": 1.1030892133712769, "step": 9470 }, { "epoch": 0.440131853846511, "grad_norm": 35.094974517822266, "learning_rate": 2.736171595710107e-07, "logits/chosen": -18.201108932495117, "logits/rejected": -17.098573684692383, "logps/chosen": -425.5556640625, "logps/rejected": -283.149658203125, "loss": 0.469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9609991312026978, "rewards/margins": 0.7202621102333069, "rewards/rejected": 1.240736961364746, "step": 9480 }, { "epoch": 0.440596127953944, "grad_norm": 119.33016204833984, "learning_rate": 2.7358930312456473e-07, "logits/chosen": -19.332008361816406, "logits/rejected": -17.708110809326172, "logps/chosen": -443.4814453125, "logps/rejected": -354.1727294921875, "loss": 0.6723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.067826271057129, "rewards/margins": 0.2808471918106079, "rewards/rejected": 1.786979079246521, "step": 9490 }, { "epoch": 0.44106040206137703, "grad_norm": 101.46951293945312, "learning_rate": 2.735614466781187e-07, "logits/chosen": -18.12635612487793, "logits/rejected": -18.19614028930664, "logps/chosen": -478.3837890625, "logps/rejected": -468.93255615234375, "loss": 0.7768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9384406805038452, "rewards/margins": -0.05672916769981384, "rewards/rejected": 1.995169997215271, "step": 9500 }, { "epoch": 0.4415246761688101, "grad_norm": 
43.11406326293945, "learning_rate": 2.7353359023167276e-07, "logits/chosen": -18.088024139404297, "logits/rejected": -17.45915412902832, "logps/chosen": -408.8866271972656, "logps/rejected": -351.43133544921875, "loss": 0.594, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7155907154083252, "rewards/margins": 0.30127567052841187, "rewards/rejected": 1.4143149852752686, "step": 9510 }, { "epoch": 0.4419889502762431, "grad_norm": 88.85134887695312, "learning_rate": 2.735057337852268e-07, "logits/chosen": -19.012548446655273, "logits/rejected": -17.6386661529541, "logps/chosen": -446.30645751953125, "logps/rejected": -307.15631103515625, "loss": 0.4818, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.006650447845459, "rewards/margins": 0.7413125038146973, "rewards/rejected": 1.2653377056121826, "step": 9520 }, { "epoch": 0.4424532243836761, "grad_norm": 31.128637313842773, "learning_rate": 2.734778773387808e-07, "logits/chosen": -17.86207389831543, "logits/rejected": -17.193967819213867, "logps/chosen": -389.94598388671875, "logps/rejected": -300.1686096191406, "loss": 0.6337, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.975081205368042, "rewards/margins": 0.26312094926834106, "rewards/rejected": 1.7119601964950562, "step": 9530 }, { "epoch": 0.4429174984911092, "grad_norm": 68.74943542480469, "learning_rate": 2.7345002089233483e-07, "logits/chosen": -18.68821907043457, "logits/rejected": -17.79971694946289, "logps/chosen": -355.0784912109375, "logps/rejected": -222.66567993164062, "loss": 0.5282, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.533699870109558, "rewards/margins": 0.43418532609939575, "rewards/rejected": 1.0995147228240967, "step": 9540 }, { "epoch": 0.44338177259854217, "grad_norm": 44.291507720947266, "learning_rate": 2.734221644458888e-07, "logits/chosen": -18.121152877807617, "logits/rejected": -17.366809844970703, "logps/chosen": -388.34619140625, "logps/rejected": -308.3915100097656, 
"loss": 0.667, "rewards/accuracies": 0.5, "rewards/chosen": 1.1958272457122803, "rewards/margins": 0.09704618901014328, "rewards/rejected": 1.0987812280654907, "step": 9550 }, { "epoch": 0.4438460467059752, "grad_norm": 162.94772338867188, "learning_rate": 2.7339430799944286e-07, "logits/chosen": -17.60009002685547, "logits/rejected": -17.423114776611328, "logps/chosen": -397.54608154296875, "logps/rejected": -362.54266357421875, "loss": 0.7427, "rewards/accuracies": 0.5, "rewards/chosen": 1.8001911640167236, "rewards/margins": 0.2439977377653122, "rewards/rejected": 1.5561933517456055, "step": 9560 }, { "epoch": 0.4443103208134082, "grad_norm": 40.55009078979492, "learning_rate": 2.733664515529969e-07, "logits/chosen": -18.891645431518555, "logits/rejected": -18.427759170532227, "logps/chosen": -488.28192138671875, "logps/rejected": -506.5376892089844, "loss": 0.8249, "rewards/accuracies": 0.5, "rewards/chosen": 2.1935856342315674, "rewards/margins": 0.03389499709010124, "rewards/rejected": 2.1596906185150146, "step": 9570 }, { "epoch": 0.44477459492084126, "grad_norm": 177.9410858154297, "learning_rate": 2.733385951065509e-07, "logits/chosen": -18.22329330444336, "logits/rejected": -18.050952911376953, "logps/chosen": -312.81146240234375, "logps/rejected": -335.1623840332031, "loss": 0.914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6038566827774048, "rewards/margins": -0.06434933841228485, "rewards/rejected": 1.668205976486206, "step": 9580 }, { "epoch": 0.4452388690282743, "grad_norm": 15.556985855102539, "learning_rate": 2.7331073866010493e-07, "logits/chosen": -18.663471221923828, "logits/rejected": -17.482746124267578, "logps/chosen": -466.0791015625, "logps/rejected": -338.06719970703125, "loss": 0.5667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8856357336044312, "rewards/margins": 0.5312737226486206, "rewards/rejected": 1.3543620109558105, "step": 9590 }, { "epoch": 0.4457031431357073, "grad_norm": 119.3281478881836, 
"learning_rate": 2.732828822136589e-07, "logits/chosen": -18.461132049560547, "logits/rejected": -18.101390838623047, "logps/chosen": -408.3330078125, "logps/rejected": -331.11724853515625, "loss": 0.6991, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6630773544311523, "rewards/margins": 0.06081853061914444, "rewards/rejected": 1.6022586822509766, "step": 9600 }, { "epoch": 0.44616741724314035, "grad_norm": 24.599624633789062, "learning_rate": 2.7325502576721296e-07, "logits/chosen": -18.077959060668945, "logits/rejected": -16.73186683654785, "logps/chosen": -491.18194580078125, "logps/rejected": -310.8529357910156, "loss": 0.5725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.253227710723877, "rewards/margins": 0.6568130254745483, "rewards/rejected": 1.5964148044586182, "step": 9610 }, { "epoch": 0.4466316913505734, "grad_norm": 118.66889190673828, "learning_rate": 2.7322716932076695e-07, "logits/chosen": -17.62100601196289, "logits/rejected": -18.590049743652344, "logps/chosen": -471.9576110839844, "logps/rejected": -420.34375, "loss": 0.7729, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9694989919662476, "rewards/margins": 0.1338154375553131, "rewards/rejected": 1.8356835842132568, "step": 9620 }, { "epoch": 0.4470959654580064, "grad_norm": 10.356508255004883, "learning_rate": 2.73199312874321e-07, "logits/chosen": -17.602909088134766, "logits/rejected": -16.5037899017334, "logps/chosen": -398.7862243652344, "logps/rejected": -295.53857421875, "loss": 0.6328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.031015157699585, "rewards/margins": 0.5431274175643921, "rewards/rejected": 1.487887978553772, "step": 9630 }, { "epoch": 0.44756023956543944, "grad_norm": 98.26142120361328, "learning_rate": 2.7317145642787503e-07, "logits/chosen": -19.592147827148438, "logits/rejected": -19.420026779174805, "logps/chosen": -428.6949157714844, "logps/rejected": -368.46368408203125, "loss": 0.6501, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.801897644996643, "rewards/margins": 0.22431650757789612, "rewards/rejected": 1.5775812864303589, "step": 9640 }, { "epoch": 0.44802451367287244, "grad_norm": 173.89697265625, "learning_rate": 2.73143599981429e-07, "logits/chosen": -19.06264305114746, "logits/rejected": -18.40057373046875, "logps/chosen": -425.84783935546875, "logps/rejected": -354.1115417480469, "loss": 0.6632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.786500334739685, "rewards/margins": 0.24689912796020508, "rewards/rejected": 1.53960120677948, "step": 9650 }, { "epoch": 0.4484887877803055, "grad_norm": 46.00081253051758, "learning_rate": 2.73115743534983e-07, "logits/chosen": -19.476558685302734, "logits/rejected": -18.781354904174805, "logps/chosen": -373.39202880859375, "logps/rejected": -268.59771728515625, "loss": 0.5038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.210897207260132, "rewards/margins": 0.6803816556930542, "rewards/rejected": 1.530515432357788, "step": 9660 }, { "epoch": 0.44895306188773854, "grad_norm": 6.533696174621582, "learning_rate": 2.7308788708853705e-07, "logits/chosen": -17.626846313476562, "logits/rejected": -16.520626068115234, "logps/chosen": -402.02813720703125, "logps/rejected": -266.80242919921875, "loss": 0.4818, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9246032238006592, "rewards/margins": 0.8653233647346497, "rewards/rejected": 1.0592799186706543, "step": 9670 }, { "epoch": 0.44941733599517153, "grad_norm": 38.55417251586914, "learning_rate": 2.730600306420911e-07, "logits/chosen": -18.85675621032715, "logits/rejected": -17.636463165283203, "logps/chosen": -434.3018493652344, "logps/rejected": -242.5912322998047, "loss": 0.4432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9763410091400146, "rewards/margins": 1.032958745956421, "rewards/rejected": 0.9433820843696594, "step": 9680 }, { "epoch": 0.4498816101026046, "grad_norm": 
23.821273803710938, "learning_rate": 2.730321741956451e-07, "logits/chosen": -18.523365020751953, "logits/rejected": -17.526830673217773, "logps/chosen": -371.10064697265625, "logps/rejected": -305.5059814453125, "loss": 0.6467, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7975813150405884, "rewards/margins": 0.2709834575653076, "rewards/rejected": 1.5265977382659912, "step": 9690 }, { "epoch": 0.45034588421003763, "grad_norm": 44.10536575317383, "learning_rate": 2.730043177491991e-07, "logits/chosen": -19.112863540649414, "logits/rejected": -17.79677963256836, "logps/chosen": -393.4976806640625, "logps/rejected": -245.4240264892578, "loss": 0.4029, "rewards/accuracies": 1.0, "rewards/chosen": 2.1342084407806396, "rewards/margins": 0.823609471321106, "rewards/rejected": 1.3105990886688232, "step": 9700 }, { "epoch": 0.4508101583174706, "grad_norm": 30.988645553588867, "learning_rate": 2.7297646130275316e-07, "logits/chosen": -19.179052352905273, "logits/rejected": -17.6661376953125, "logps/chosen": -378.5989685058594, "logps/rejected": -184.44168090820312, "loss": 0.3191, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3219263553619385, "rewards/margins": 1.6336543560028076, "rewards/rejected": 0.6882719397544861, "step": 9710 }, { "epoch": 0.45127443242490367, "grad_norm": 130.73974609375, "learning_rate": 2.7294860485630715e-07, "logits/chosen": -19.728193283081055, "logits/rejected": -18.491779327392578, "logps/chosen": -410.5299377441406, "logps/rejected": -343.69244384765625, "loss": 0.5743, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8215023279190063, "rewards/margins": 0.3637823462486267, "rewards/rejected": 1.4577199220657349, "step": 9720 }, { "epoch": 0.45173870653233666, "grad_norm": 29.839054107666016, "learning_rate": 2.7292074840986114e-07, "logits/chosen": -19.06475830078125, "logits/rejected": -18.273391723632812, "logps/chosen": -305.2467346191406, "logps/rejected": -195.2567901611328, "loss": 0.529, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3938252925872803, "rewards/margins": 0.5354663133621216, "rewards/rejected": 0.8583589792251587, "step": 9730 }, { "epoch": 0.4522029806397697, "grad_norm": 159.8729705810547, "learning_rate": 2.728928919634152e-07, "logits/chosen": -17.069080352783203, "logits/rejected": -16.89511489868164, "logps/chosen": -471.416748046875, "logps/rejected": -398.88958740234375, "loss": 1.0467, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.336381196975708, "rewards/margins": 0.148064523935318, "rewards/rejected": 2.188316822052002, "step": 9740 }, { "epoch": 0.45266725474720276, "grad_norm": 67.04930114746094, "learning_rate": 2.728650355169692e-07, "logits/chosen": -18.176198959350586, "logits/rejected": -18.313983917236328, "logps/chosen": -368.4016418457031, "logps/rejected": -353.4605407714844, "loss": 0.7924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8172047138214111, "rewards/margins": -0.054291654378175735, "rewards/rejected": 1.8714962005615234, "step": 9750 }, { "epoch": 0.45313152885463576, "grad_norm": 33.23154830932617, "learning_rate": 2.7283717907052326e-07, "logits/chosen": -18.405582427978516, "logits/rejected": -16.484375, "logps/chosen": -395.5404052734375, "logps/rejected": -187.17042541503906, "loss": 0.4746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9597800970077515, "rewards/margins": 0.9680252075195312, "rewards/rejected": 0.9917551875114441, "step": 9760 }, { "epoch": 0.4535958029620688, "grad_norm": 47.770050048828125, "learning_rate": 2.7280932262407725e-07, "logits/chosen": -19.530250549316406, "logits/rejected": -18.731494903564453, "logps/chosen": -429.6858825683594, "logps/rejected": -346.298583984375, "loss": 0.5696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9829689264297485, "rewards/margins": 0.40745124220848083, "rewards/rejected": 1.5755176544189453, "step": 9770 }, { "epoch": 0.45406007706950186, "grad_norm": 
33.40598678588867, "learning_rate": 2.7278146617763124e-07, "logits/chosen": -18.708383560180664, "logits/rejected": -18.46187973022461, "logps/chosen": -430.9365234375, "logps/rejected": -388.7882080078125, "loss": 0.8033, "rewards/accuracies": 0.5, "rewards/chosen": 1.7746391296386719, "rewards/margins": 0.01615973748266697, "rewards/rejected": 1.7584794759750366, "step": 9780 }, { "epoch": 0.45452435117693485, "grad_norm": 30.227149963378906, "learning_rate": 2.727536097311853e-07, "logits/chosen": -18.55091667175293, "logits/rejected": -17.06338119506836, "logps/chosen": -429.3763732910156, "logps/rejected": -335.2082824707031, "loss": 0.7154, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9438467025756836, "rewards/margins": 0.5400727987289429, "rewards/rejected": 1.4037737846374512, "step": 9790 }, { "epoch": 0.4549886252843679, "grad_norm": 50.19799041748047, "learning_rate": 2.727257532847393e-07, "logits/chosen": -17.577138900756836, "logits/rejected": -17.774030685424805, "logps/chosen": -367.7490539550781, "logps/rejected": -335.0439758300781, "loss": 0.6501, "rewards/accuracies": 0.5, "rewards/chosen": 1.7652654647827148, "rewards/margins": 0.19989228248596191, "rewards/rejected": 1.5653730630874634, "step": 9800 }, { "epoch": 0.4554528993918009, "grad_norm": 26.41798210144043, "learning_rate": 2.726978968382933e-07, "logits/chosen": -19.0660457611084, "logits/rejected": -18.893735885620117, "logps/chosen": -451.80194091796875, "logps/rejected": -426.691162109375, "loss": 0.6295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3204641342163086, "rewards/margins": 0.37104547023773193, "rewards/rejected": 1.9494186639785767, "step": 9810 }, { "epoch": 0.45591717349923394, "grad_norm": 29.79341697692871, "learning_rate": 2.7267004039184735e-07, "logits/chosen": -18.514501571655273, "logits/rejected": -17.839651107788086, "logps/chosen": -410.1637268066406, "logps/rejected": -306.68218994140625, "loss": 0.5385, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9975957870483398, "rewards/margins": 0.7485059499740601, "rewards/rejected": 1.2490899562835693, "step": 9820 }, { "epoch": 0.456381447606667, "grad_norm": 11.754162788391113, "learning_rate": 2.7264218394540133e-07, "logits/chosen": -18.056106567382812, "logits/rejected": -17.690290451049805, "logps/chosen": -436.3501892089844, "logps/rejected": -277.33709716796875, "loss": 0.5161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.101789951324463, "rewards/margins": 1.0082224607467651, "rewards/rejected": 1.0935674905776978, "step": 9830 }, { "epoch": 0.4568457217141, "grad_norm": 157.6493682861328, "learning_rate": 2.726143274989554e-07, "logits/chosen": -18.400392532348633, "logits/rejected": -18.05879783630371, "logps/chosen": -361.3909606933594, "logps/rejected": -318.09442138671875, "loss": 0.7787, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.5971882343292236, "rewards/margins": -0.011504960246384144, "rewards/rejected": 1.6086928844451904, "step": 9840 }, { "epoch": 0.45730999582153303, "grad_norm": 169.3597412109375, "learning_rate": 2.7258647105250936e-07, "logits/chosen": -18.492847442626953, "logits/rejected": -17.861026763916016, "logps/chosen": -403.33038330078125, "logps/rejected": -301.4912109375, "loss": 0.6868, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.509932279586792, "rewards/margins": 0.1847793608903885, "rewards/rejected": 1.3251529932022095, "step": 9850 }, { "epoch": 0.4577742699289661, "grad_norm": 60.30362319946289, "learning_rate": 2.725586146060634e-07, "logits/chosen": -18.87606430053711, "logits/rejected": -18.71232032775879, "logps/chosen": -355.87884521484375, "logps/rejected": -349.3451843261719, "loss": 0.8624, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7539829015731812, "rewards/margins": -0.18503157794475555, "rewards/rejected": 1.9390144348144531, "step": 9860 }, { "epoch": 0.4582385440363991, 
"grad_norm": 127.36334991455078, "learning_rate": 2.7253075815961745e-07, "logits/chosen": -18.03817367553711, "logits/rejected": -17.646127700805664, "logps/chosen": -431.0484924316406, "logps/rejected": -262.6787109375, "loss": 0.5144, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8808526992797852, "rewards/margins": 0.6249815821647644, "rewards/rejected": 1.255871057510376, "step": 9870 }, { "epoch": 0.4587028181438321, "grad_norm": 43.65994644165039, "learning_rate": 2.7250290171317143e-07, "logits/chosen": -18.563579559326172, "logits/rejected": -17.61760139465332, "logps/chosen": -483.3321228027344, "logps/rejected": -362.52630615234375, "loss": 0.5562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.619305372238159, "rewards/margins": 0.5235936641693115, "rewards/rejected": 2.0957114696502686, "step": 9880 }, { "epoch": 0.4591670922512652, "grad_norm": 74.26038360595703, "learning_rate": 2.724750452667255e-07, "logits/chosen": -18.830738067626953, "logits/rejected": -18.611392974853516, "logps/chosen": -467.66253662109375, "logps/rejected": -467.18487548828125, "loss": 0.6215, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2211172580718994, "rewards/margins": 0.37214764952659607, "rewards/rejected": 1.8489696979522705, "step": 9890 }, { "epoch": 0.45963136635869817, "grad_norm": 11.274187088012695, "learning_rate": 2.7244718882027946e-07, "logits/chosen": -19.206119537353516, "logits/rejected": -17.862308502197266, "logps/chosen": -589.7640380859375, "logps/rejected": -352.0383605957031, "loss": 0.4623, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.502347469329834, "rewards/margins": 0.899885356426239, "rewards/rejected": 1.6024621725082397, "step": 9900 }, { "epoch": 0.4600956404661312, "grad_norm": 10.742657661437988, "learning_rate": 2.724193323738335e-07, "logits/chosen": -18.38477325439453, "logits/rejected": -18.362428665161133, "logps/chosen": -377.39495849609375, "logps/rejected": 
-394.26605224609375, "loss": 0.6883, "rewards/accuracies": 0.5, "rewards/chosen": 1.965506911277771, "rewards/margins": 0.17228463292121887, "rewards/rejected": 1.793222188949585, "step": 9910 }, { "epoch": 0.4605599145735642, "grad_norm": 71.8483657836914, "learning_rate": 2.723914759273875e-07, "logits/chosen": -18.078527450561523, "logits/rejected": -17.672367095947266, "logps/chosen": -342.5365295410156, "logps/rejected": -364.44244384765625, "loss": 0.8153, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6369407176971436, "rewards/margins": -0.09073507785797119, "rewards/rejected": 1.7276757955551147, "step": 9920 }, { "epoch": 0.46102418868099726, "grad_norm": 80.71910095214844, "learning_rate": 2.7236361948094153e-07, "logits/chosen": -18.94686508178711, "logits/rejected": -17.898588180541992, "logps/chosen": -510.5400390625, "logps/rejected": -407.86151123046875, "loss": 0.5184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4547834396362305, "rewards/margins": 0.5197772979736328, "rewards/rejected": 1.9350059032440186, "step": 9930 }, { "epoch": 0.4614884627884303, "grad_norm": 20.743526458740234, "learning_rate": 2.723357630344956e-07, "logits/chosen": -17.858686447143555, "logits/rejected": -17.530073165893555, "logps/chosen": -304.0336608886719, "logps/rejected": -248.32980346679688, "loss": 0.7374, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.4512730836868286, "rewards/margins": -0.0029023110400885344, "rewards/rejected": 1.454175353050232, "step": 9940 }, { "epoch": 0.4619527368958633, "grad_norm": 90.43614959716797, "learning_rate": 2.7230790658804956e-07, "logits/chosen": -17.708572387695312, "logits/rejected": -18.246429443359375, "logps/chosen": -347.5236511230469, "logps/rejected": -394.37945556640625, "loss": 0.8535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6191387176513672, "rewards/margins": -0.1666857898235321, "rewards/rejected": 1.7858244180679321, "step": 9950 }, { "epoch": 
0.46241701100329635, "grad_norm": 45.562747955322266, "learning_rate": 2.722800501416036e-07, "logits/chosen": -18.824329376220703, "logits/rejected": -18.074634552001953, "logps/chosen": -352.39459228515625, "logps/rejected": -224.0380096435547, "loss": 0.5151, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.981187105178833, "rewards/margins": 0.6157304048538208, "rewards/rejected": 1.3654568195343018, "step": 9960 }, { "epoch": 0.4628812851107294, "grad_norm": 48.678035736083984, "learning_rate": 2.722521936951576e-07, "logits/chosen": -17.805683135986328, "logits/rejected": -17.762414932250977, "logps/chosen": -408.1592102050781, "logps/rejected": -458.0977478027344, "loss": 0.9863, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.250861644744873, "rewards/margins": -0.30472543835639954, "rewards/rejected": 2.5555872917175293, "step": 9970 }, { "epoch": 0.4633455592181624, "grad_norm": 18.125316619873047, "learning_rate": 2.7222433724871163e-07, "logits/chosen": -19.178010940551758, "logits/rejected": -18.416147232055664, "logps/chosen": -380.71160888671875, "logps/rejected": -335.0304260253906, "loss": 0.5862, "rewards/accuracies": 0.5, "rewards/chosen": 1.633681297302246, "rewards/margins": 0.36648380756378174, "rewards/rejected": 1.267197608947754, "step": 9980 }, { "epoch": 0.46380983332559544, "grad_norm": 36.26239013671875, "learning_rate": 2.7219648080226567e-07, "logits/chosen": -18.05093765258789, "logits/rejected": -16.853260040283203, "logps/chosen": -464.9571838378906, "logps/rejected": -313.28472900390625, "loss": 0.5197, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1659464836120605, "rewards/margins": 0.7572036981582642, "rewards/rejected": 1.4087427854537964, "step": 9990 }, { "epoch": 0.46427410743302844, "grad_norm": 83.8856201171875, "learning_rate": 2.7216862435581966e-07, "logits/chosen": -18.219985961914062, "logits/rejected": -18.50850486755371, "logps/chosen": -377.3536376953125, "logps/rejected": 
-340.3675537109375, "loss": 0.8295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.420764684677124, "rewards/margins": -0.0804833173751831, "rewards/rejected": 1.5012480020523071, "step": 10000 }, { "epoch": 0.4647383815404615, "grad_norm": 96.20325469970703, "learning_rate": 2.721407679093737e-07, "logits/chosen": -17.93280601501465, "logits/rejected": -16.744674682617188, "logps/chosen": -385.9697265625, "logps/rejected": -238.8911590576172, "loss": 0.5027, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5075138807296753, "rewards/margins": 0.4839532971382141, "rewards/rejected": 1.0235605239868164, "step": 10010 }, { "epoch": 0.46520265564789454, "grad_norm": 146.6331024169922, "learning_rate": 2.721129114629277e-07, "logits/chosen": -19.07605743408203, "logits/rejected": -18.109699249267578, "logps/chosen": -456.2718200683594, "logps/rejected": -358.73419189453125, "loss": 0.6948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.206511974334717, "rewards/margins": 0.1749337911605835, "rewards/rejected": 2.0315780639648438, "step": 10020 }, { "epoch": 0.46566692975532753, "grad_norm": 80.75054168701172, "learning_rate": 2.7208505501648173e-07, "logits/chosen": -19.53677749633789, "logits/rejected": -19.056259155273438, "logps/chosen": -350.98052978515625, "logps/rejected": -260.25567626953125, "loss": 0.7178, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6222079992294312, "rewards/margins": 0.05878344178199768, "rewards/rejected": 1.5634245872497559, "step": 10030 }, { "epoch": 0.4661312038627606, "grad_norm": 115.7652816772461, "learning_rate": 2.720571985700357e-07, "logits/chosen": -18.046918869018555, "logits/rejected": -18.700286865234375, "logps/chosen": -396.1060485839844, "logps/rejected": -420.33416748046875, "loss": 0.6882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9372211694717407, "rewards/margins": 0.36001184582710266, "rewards/rejected": 1.57720947265625, "step": 10040 }, { 
"epoch": 0.46659547797019363, "grad_norm": 95.31525421142578, "learning_rate": 2.7202934212358976e-07, "logits/chosen": -18.229541778564453, "logits/rejected": -16.816974639892578, "logps/chosen": -417.7842712402344, "logps/rejected": -305.81939697265625, "loss": 0.5285, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7211583852767944, "rewards/margins": 0.5021710991859436, "rewards/rejected": 1.2189874649047852, "step": 10050 }, { "epoch": 0.4670597520776266, "grad_norm": 36.675453186035156, "learning_rate": 2.720014856771438e-07, "logits/chosen": -18.26882553100586, "logits/rejected": -17.627004623413086, "logps/chosen": -353.98199462890625, "logps/rejected": -379.70281982421875, "loss": 0.7801, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8172340393066406, "rewards/margins": 0.23686270415782928, "rewards/rejected": 1.5803711414337158, "step": 10060 }, { "epoch": 0.46752402618505967, "grad_norm": 21.982378005981445, "learning_rate": 2.719736292306978e-07, "logits/chosen": -19.785324096679688, "logits/rejected": -17.723318099975586, "logps/chosen": -504.99346923828125, "logps/rejected": -288.7574768066406, "loss": 0.4663, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5137054920196533, "rewards/margins": 0.9105117917060852, "rewards/rejected": 1.6031936407089233, "step": 10070 }, { "epoch": 0.46798830029249266, "grad_norm": 78.18333435058594, "learning_rate": 2.719457727842518e-07, "logits/chosen": -17.991647720336914, "logits/rejected": -18.250003814697266, "logps/chosen": -413.9312438964844, "logps/rejected": -410.3487854003906, "loss": 0.8906, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.4949804544448853, "rewards/margins": -0.23083026707172394, "rewards/rejected": 1.7258106470108032, "step": 10080 }, { "epoch": 0.4684525743999257, "grad_norm": 98.49935913085938, "learning_rate": 2.719179163378058e-07, "logits/chosen": -19.26998519897461, "logits/rejected": -18.11709213256836, "logps/chosen": 
-359.0856018066406, "logps/rejected": -283.57452392578125, "loss": 0.6302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6025123596191406, "rewards/margins": 0.33003491163253784, "rewards/rejected": 1.272477388381958, "step": 10090 }, { "epoch": 0.46891684850735876, "grad_norm": 44.78898620605469, "learning_rate": 2.7189005989135986e-07, "logits/chosen": -17.978466033935547, "logits/rejected": -17.412425994873047, "logps/chosen": -388.0556335449219, "logps/rejected": -324.19439697265625, "loss": 0.6639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.919338583946228, "rewards/margins": 0.3155623972415924, "rewards/rejected": 1.603776216506958, "step": 10100 }, { "epoch": 0.46938112261479176, "grad_norm": 31.041519165039062, "learning_rate": 2.7186220344491385e-07, "logits/chosen": -18.209758758544922, "logits/rejected": -17.134143829345703, "logps/chosen": -471.0082092285156, "logps/rejected": -317.4961242675781, "loss": 0.4975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.169301986694336, "rewards/margins": 0.7325555682182312, "rewards/rejected": 1.43674635887146, "step": 10110 }, { "epoch": 0.4698453967222248, "grad_norm": 44.43429183959961, "learning_rate": 2.718343469984679e-07, "logits/chosen": -18.20813751220703, "logits/rejected": -17.185190200805664, "logps/chosen": -477.8919982910156, "logps/rejected": -340.9844665527344, "loss": 0.5209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2735049724578857, "rewards/margins": 0.6209110021591187, "rewards/rejected": 1.652593970298767, "step": 10120 }, { "epoch": 0.47030967082965786, "grad_norm": 144.93423461914062, "learning_rate": 2.7180649055202193e-07, "logits/chosen": -18.259279251098633, "logits/rejected": -17.560989379882812, "logps/chosen": -385.4202575683594, "logps/rejected": -357.5373229980469, "loss": 0.7488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5202915668487549, "rewards/margins": 0.02735617198050022, 
"rewards/rejected": 1.4929354190826416, "step": 10130 }, { "epoch": 0.47077394493709085, "grad_norm": 147.90704345703125, "learning_rate": 2.717786341055759e-07, "logits/chosen": -18.895885467529297, "logits/rejected": -18.554977416992188, "logps/chosen": -400.29266357421875, "logps/rejected": -300.51336669921875, "loss": 0.6028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6706777811050415, "rewards/margins": 0.288219153881073, "rewards/rejected": 1.3824586868286133, "step": 10140 }, { "epoch": 0.4712382190445239, "grad_norm": 83.32110595703125, "learning_rate": 2.717507776591299e-07, "logits/chosen": -18.412845611572266, "logits/rejected": -18.011672973632812, "logps/chosen": -500.2102966308594, "logps/rejected": -470.0872497558594, "loss": 0.7099, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3069660663604736, "rewards/margins": 0.18071208894252777, "rewards/rejected": 2.126253843307495, "step": 10150 }, { "epoch": 0.4717024931519569, "grad_norm": 65.39945983886719, "learning_rate": 2.7172292121268395e-07, "logits/chosen": -19.215749740600586, "logits/rejected": -18.325878143310547, "logps/chosen": -353.88323974609375, "logps/rejected": -302.9779968261719, "loss": 0.6407, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.88234543800354, "rewards/margins": 0.41696953773498535, "rewards/rejected": 1.4653757810592651, "step": 10160 }, { "epoch": 0.47216676725938994, "grad_norm": 51.66756820678711, "learning_rate": 2.71695064766238e-07, "logits/chosen": -18.67911148071289, "logits/rejected": -18.048545837402344, "logps/chosen": -501.58782958984375, "logps/rejected": -423.740234375, "loss": 0.7048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.071284770965576, "rewards/margins": 0.171672984957695, "rewards/rejected": 1.8996117115020752, "step": 10170 }, { "epoch": 0.472631041366823, "grad_norm": 79.01083374023438, "learning_rate": 2.7166720831979203e-07, "logits/chosen": -18.482057571411133, 
"logits/rejected": -17.32680892944336, "logps/chosen": -323.6944274902344, "logps/rejected": -247.595703125, "loss": 0.6283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8018264770507812, "rewards/margins": 0.3576703667640686, "rewards/rejected": 1.4441560506820679, "step": 10180 }, { "epoch": 0.473095315474256, "grad_norm": 74.91710662841797, "learning_rate": 2.71639351873346e-07, "logits/chosen": -18.370567321777344, "logits/rejected": -18.067678451538086, "logps/chosen": -376.7530212402344, "logps/rejected": -296.6768493652344, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": 1.1136701107025146, "rewards/margins": 0.12429451942443848, "rewards/rejected": 0.9893754720687866, "step": 10190 }, { "epoch": 0.47355958958168903, "grad_norm": 138.03419494628906, "learning_rate": 2.716114954269e-07, "logits/chosen": -18.508121490478516, "logits/rejected": -17.547161102294922, "logps/chosen": -366.21246337890625, "logps/rejected": -277.50311279296875, "loss": 0.5768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9228299856185913, "rewards/margins": 0.3550785183906555, "rewards/rejected": 1.5677515268325806, "step": 10200 }, { "epoch": 0.4740238636891221, "grad_norm": 18.817811965942383, "learning_rate": 2.7158363898045405e-07, "logits/chosen": -18.576608657836914, "logits/rejected": -16.978174209594727, "logps/chosen": -380.28778076171875, "logps/rejected": -259.6916809082031, "loss": 0.5342, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7757478952407837, "rewards/margins": 0.5251225233078003, "rewards/rejected": 1.2506253719329834, "step": 10210 }, { "epoch": 0.4744881377965551, "grad_norm": 168.3452606201172, "learning_rate": 2.715557825340081e-07, "logits/chosen": -19.467103958129883, "logits/rejected": -18.749530792236328, "logps/chosen": -378.7640380859375, "logps/rejected": -299.4952087402344, "loss": 0.6855, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7177101373672485, "rewards/margins": 
0.15175171196460724, "rewards/rejected": 1.5659582614898682, "step": 10220 }, { "epoch": 0.4749524119039881, "grad_norm": 131.2265625, "learning_rate": 2.715279260875621e-07, "logits/chosen": -18.293842315673828, "logits/rejected": -17.87485122680664, "logps/chosen": -383.5770263671875, "logps/rejected": -347.6847229003906, "loss": 0.6339, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9334423542022705, "rewards/margins": 0.17319461703300476, "rewards/rejected": 1.7602475881576538, "step": 10230 }, { "epoch": 0.4754166860114211, "grad_norm": 172.2218475341797, "learning_rate": 2.715000696411161e-07, "logits/chosen": -18.46261978149414, "logits/rejected": -17.710866928100586, "logps/chosen": -396.59222412109375, "logps/rejected": -361.1764221191406, "loss": 0.7264, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9429864883422852, "rewards/margins": 0.06768742948770523, "rewards/rejected": 1.8752988576889038, "step": 10240 }, { "epoch": 0.47588096011885417, "grad_norm": 79.62821960449219, "learning_rate": 2.714722131946701e-07, "logits/chosen": -18.45667266845703, "logits/rejected": -18.3985595703125, "logps/chosen": -350.2241516113281, "logps/rejected": -305.4018249511719, "loss": 0.6443, "rewards/accuracies": 0.5, "rewards/chosen": 1.7197402715682983, "rewards/margins": 0.12693020701408386, "rewards/rejected": 1.5928099155426025, "step": 10250 }, { "epoch": 0.4763452342262872, "grad_norm": 52.36394500732422, "learning_rate": 2.7144435674822414e-07, "logits/chosen": -18.372175216674805, "logits/rejected": -18.39626693725586, "logps/chosen": -441.91680908203125, "logps/rejected": -400.101806640625, "loss": 0.7079, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3248202800750732, "rewards/margins": 0.08945424109697342, "rewards/rejected": 2.235366106033325, "step": 10260 }, { "epoch": 0.4768095083337202, "grad_norm": 81.6632308959961, "learning_rate": 2.7141650030177813e-07, "logits/chosen": -18.77884292602539, 
"logits/rejected": -18.726110458374023, "logps/chosen": -423.6935119628906, "logps/rejected": -366.6564025878906, "loss": 0.754, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8522632122039795, "rewards/margins": -0.04543835669755936, "rewards/rejected": 1.8977015018463135, "step": 10270 }, { "epoch": 0.47727378244115326, "grad_norm": 83.54418182373047, "learning_rate": 2.713886438553322e-07, "logits/chosen": -19.058130264282227, "logits/rejected": -18.392459869384766, "logps/chosen": -408.1628112792969, "logps/rejected": -391.5566711425781, "loss": 0.6073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9197813272476196, "rewards/margins": 0.3031959533691406, "rewards/rejected": 1.616585373878479, "step": 10280 }, { "epoch": 0.4777380565485863, "grad_norm": 20.193796157836914, "learning_rate": 2.713607874088862e-07, "logits/chosen": -18.812335968017578, "logits/rejected": -17.213319778442383, "logps/chosen": -248.59487915039062, "logps/rejected": -128.77684020996094, "loss": 0.4464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2533643245697021, "rewards/margins": 0.7352879643440247, "rewards/rejected": 0.5180764198303223, "step": 10290 }, { "epoch": 0.4782023306560193, "grad_norm": 119.90021514892578, "learning_rate": 2.713329309624402e-07, "logits/chosen": -18.083269119262695, "logits/rejected": -17.48283576965332, "logps/chosen": -470.9566345214844, "logps/rejected": -325.76971435546875, "loss": 0.5554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.713557720184326, "rewards/margins": 0.7115022540092468, "rewards/rejected": 2.0020556449890137, "step": 10300 }, { "epoch": 0.47866660476345235, "grad_norm": 45.38672637939453, "learning_rate": 2.7130507451599424e-07, "logits/chosen": -18.463443756103516, "logits/rejected": -17.137895584106445, "logps/chosen": -381.8261413574219, "logps/rejected": -257.1811828613281, "loss": 0.5634, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.647979736328125, 
"rewards/margins": 0.48424944281578064, "rewards/rejected": 1.1637303829193115, "step": 10310 }, { "epoch": 0.47913087887088535, "grad_norm": 91.27469635009766, "learning_rate": 2.7127721806954823e-07, "logits/chosen": -18.479278564453125, "logits/rejected": -17.186351776123047, "logps/chosen": -417.08905029296875, "logps/rejected": -270.900390625, "loss": 0.5229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6180036067962646, "rewards/margins": 0.5081952810287476, "rewards/rejected": 1.1098084449768066, "step": 10320 }, { "epoch": 0.4795951529783184, "grad_norm": 28.889373779296875, "learning_rate": 2.7124936162310227e-07, "logits/chosen": -18.37420654296875, "logits/rejected": -17.20932388305664, "logps/chosen": -447.28118896484375, "logps/rejected": -376.25555419921875, "loss": 0.4565, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.466373920440674, "rewards/margins": 0.7595349550247192, "rewards/rejected": 1.7068389654159546, "step": 10330 }, { "epoch": 0.48005942708575144, "grad_norm": 109.48326110839844, "learning_rate": 2.7122150517665626e-07, "logits/chosen": -19.177433013916016, "logits/rejected": -17.94105339050293, "logps/chosen": -522.7312622070312, "logps/rejected": -387.8079833984375, "loss": 0.6581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.681481122970581, "rewards/margins": 0.583429217338562, "rewards/rejected": 2.0980517864227295, "step": 10340 }, { "epoch": 0.48052370119318444, "grad_norm": 22.584001541137695, "learning_rate": 2.711936487302103e-07, "logits/chosen": -18.513029098510742, "logits/rejected": -17.164749145507812, "logps/chosen": -381.9421691894531, "logps/rejected": -230.402099609375, "loss": 0.4815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0378499031066895, "rewards/margins": 0.7711586952209473, "rewards/rejected": 1.2666908502578735, "step": 10350 }, { "epoch": 0.4809879753006175, "grad_norm": 99.20584869384766, "learning_rate": 2.7116579228376434e-07, 
"logits/chosen": -18.688156127929688, "logits/rejected": -17.326692581176758, "logps/chosen": -489.09808349609375, "logps/rejected": -358.55963134765625, "loss": 0.5103, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.096022129058838, "rewards/margins": 0.6347746849060059, "rewards/rejected": 1.461247205734253, "step": 10360 }, { "epoch": 0.48145224940805054, "grad_norm": 23.685178756713867, "learning_rate": 2.7113793583731833e-07, "logits/chosen": -18.239391326904297, "logits/rejected": -17.6801815032959, "logps/chosen": -354.58062744140625, "logps/rejected": -316.2992248535156, "loss": 0.622, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8361297845840454, "rewards/margins": 0.4429861903190613, "rewards/rejected": 1.3931434154510498, "step": 10370 }, { "epoch": 0.48191652351548353, "grad_norm": 57.49217224121094, "learning_rate": 2.7111007939087237e-07, "logits/chosen": -18.95342445373535, "logits/rejected": -17.721529006958008, "logps/chosen": -429.8365783691406, "logps/rejected": -322.33123779296875, "loss": 0.5711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.089512825012207, "rewards/margins": 0.7406560778617859, "rewards/rejected": 1.348856806755066, "step": 10380 }, { "epoch": 0.4823807976229166, "grad_norm": 41.342926025390625, "learning_rate": 2.7108222294442636e-07, "logits/chosen": -18.696033477783203, "logits/rejected": -17.511484146118164, "logps/chosen": -474.49163818359375, "logps/rejected": -283.59124755859375, "loss": 0.401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.345820426940918, "rewards/margins": 1.0691381692886353, "rewards/rejected": 1.2766822576522827, "step": 10390 }, { "epoch": 0.4828450717303496, "grad_norm": 52.854366302490234, "learning_rate": 2.710543664979804e-07, "logits/chosen": -18.097068786621094, "logits/rejected": -18.0877742767334, "logps/chosen": -378.6811828613281, "logps/rejected": -317.81500244140625, "loss": 0.6658, "rewards/accuracies": 0.5, "rewards/chosen": 
1.881455659866333, "rewards/margins": 0.151948481798172, "rewards/rejected": 1.7295074462890625, "step": 10400 }, { "epoch": 0.4833093458377826, "grad_norm": 87.99474334716797, "learning_rate": 2.7102651005153444e-07, "logits/chosen": -18.607709884643555, "logits/rejected": -18.086835861206055, "logps/chosen": -393.59942626953125, "logps/rejected": -345.9241027832031, "loss": 0.7141, "rewards/accuracies": 0.5, "rewards/chosen": 1.9935235977172852, "rewards/margins": 0.11462777853012085, "rewards/rejected": 1.8788957595825195, "step": 10410 }, { "epoch": 0.48377361994521567, "grad_norm": 222.08511352539062, "learning_rate": 2.7099865360508843e-07, "logits/chosen": -17.722240447998047, "logits/rejected": -17.463382720947266, "logps/chosen": -380.0169982910156, "logps/rejected": -328.813232421875, "loss": 0.723, "rewards/accuracies": 0.5, "rewards/chosen": 1.787946343421936, "rewards/margins": 0.19235500693321228, "rewards/rejected": 1.5955910682678223, "step": 10420 }, { "epoch": 0.48423789405264867, "grad_norm": 205.32498168945312, "learning_rate": 2.7097079715864247e-07, "logits/chosen": -19.331178665161133, "logits/rejected": -18.951278686523438, "logps/chosen": -403.65972900390625, "logps/rejected": -383.9931945800781, "loss": 0.7351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9159505367279053, "rewards/margins": 0.01597486063838005, "rewards/rejected": 1.8999757766723633, "step": 10430 }, { "epoch": 0.4847021681600817, "grad_norm": 97.38232421875, "learning_rate": 2.7094294071219646e-07, "logits/chosen": -17.795597076416016, "logits/rejected": -18.10190200805664, "logps/chosen": -364.2850036621094, "logps/rejected": -399.596435546875, "loss": 0.9919, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5721931457519531, "rewards/margins": -0.2756287455558777, "rewards/rejected": 1.8478221893310547, "step": 10440 }, { "epoch": 0.48516644226751476, "grad_norm": 153.629150390625, "learning_rate": 2.7091508426575045e-07, "logits/chosen": 
-17.96319580078125, "logits/rejected": -18.298612594604492, "logps/chosen": -368.1172790527344, "logps/rejected": -347.7939758300781, "loss": 0.7301, "rewards/accuracies": 0.5, "rewards/chosen": 1.9042928218841553, "rewards/margins": 0.06195585057139397, "rewards/rejected": 1.8423372507095337, "step": 10450 }, { "epoch": 0.48563071637494776, "grad_norm": 44.66632843017578, "learning_rate": 2.708872278193045e-07, "logits/chosen": -18.839874267578125, "logits/rejected": -18.226211547851562, "logps/chosen": -441.76214599609375, "logps/rejected": -421.23016357421875, "loss": 0.7209, "rewards/accuracies": 0.5, "rewards/chosen": 2.2223589420318604, "rewards/margins": 0.2512141168117523, "rewards/rejected": 1.971144437789917, "step": 10460 }, { "epoch": 0.4860949904823808, "grad_norm": null, "learning_rate": 2.708621570175031e-07, "logits/chosen": -18.969242095947266, "logits/rejected": -19.535877227783203, "logps/chosen": -478.5315856933594, "logps/rejected": -415.07708740234375, "loss": 0.9369, "rewards/accuracies": 0.5, "rewards/chosen": 1.9483816623687744, "rewards/margins": -0.27695539593696594, "rewards/rejected": 2.225337028503418, "step": 10470 }, { "epoch": 0.4865592645898138, "grad_norm": 90.35462188720703, "learning_rate": 2.7083430057105715e-07, "logits/chosen": -18.757266998291016, "logits/rejected": -17.375715255737305, "logps/chosen": -547.8988037109375, "logps/rejected": -328.56732177734375, "loss": 0.4363, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.487215518951416, "rewards/margins": 0.8995487093925476, "rewards/rejected": 1.5876669883728027, "step": 10480 }, { "epoch": 0.48702353869724685, "grad_norm": 104.7829818725586, "learning_rate": 2.708064441246112e-07, "logits/chosen": -19.450395584106445, "logits/rejected": -18.50406265258789, "logps/chosen": -323.92108154296875, "logps/rejected": -307.3177795410156, "loss": 0.7456, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.7653801441192627, "rewards/margins": 
0.05299611762166023, "rewards/rejected": 1.7123838663101196, "step": 10490 }, { "epoch": 0.4874878128046799, "grad_norm": 69.4175033569336, "learning_rate": 2.707785876781652e-07, "logits/chosen": -18.13235855102539, "logits/rejected": -17.750198364257812, "logps/chosen": -295.6792907714844, "logps/rejected": -237.8922882080078, "loss": 0.6291, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.376446008682251, "rewards/margins": 0.1871090680360794, "rewards/rejected": 1.189336895942688, "step": 10500 }, { "epoch": 0.4879520869121129, "grad_norm": 31.039661407470703, "learning_rate": 2.7075073123171917e-07, "logits/chosen": -17.728891372680664, "logits/rejected": -17.75839614868164, "logps/chosen": -254.2573699951172, "logps/rejected": -248.7272491455078, "loss": 0.7319, "rewards/accuracies": 0.5, "rewards/chosen": 1.4440313577651978, "rewards/margins": 0.05270525813102722, "rewards/rejected": 1.3913259506225586, "step": 10510 }, { "epoch": 0.48841636101954594, "grad_norm": 111.7856216430664, "learning_rate": 2.707228747852732e-07, "logits/chosen": -17.792827606201172, "logits/rejected": -17.674514770507812, "logps/chosen": -399.8480224609375, "logps/rejected": -340.5395202636719, "loss": 0.5056, "rewards/accuracies": 0.5, "rewards/chosen": 2.7387301921844482, "rewards/margins": 1.0454003810882568, "rewards/rejected": 1.6933298110961914, "step": 10520 }, { "epoch": 0.488880635126979, "grad_norm": 124.60269165039062, "learning_rate": 2.7069501833882725e-07, "logits/chosen": -18.188129425048828, "logits/rejected": -17.70477867126465, "logps/chosen": -307.7221984863281, "logps/rejected": -264.0563049316406, "loss": 0.706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8176279067993164, "rewards/margins": 0.18765154480934143, "rewards/rejected": 1.6299762725830078, "step": 10530 }, { "epoch": 0.489344909234412, "grad_norm": 10.636322021484375, "learning_rate": 2.7066716189238124e-07, "logits/chosen": -18.12095069885254, "logits/rejected": 
-17.076730728149414, "logps/chosen": -476.5769958496094, "logps/rejected": -358.0860290527344, "loss": 0.8624, "rewards/accuracies": 0.5, "rewards/chosen": 2.0117108821868896, "rewards/margins": 0.34870094060897827, "rewards/rejected": 1.6630100011825562, "step": 10540 }, { "epoch": 0.48980918334184503, "grad_norm": 65.28040313720703, "learning_rate": 2.706393054459353e-07, "logits/chosen": -18.465038299560547, "logits/rejected": -17.56900405883789, "logps/chosen": -397.75006103515625, "logps/rejected": -323.7539978027344, "loss": 0.4887, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.101438045501709, "rewards/margins": 0.5656121969223022, "rewards/rejected": 1.5358259677886963, "step": 10550 }, { "epoch": 0.490273457449278, "grad_norm": 41.71070098876953, "learning_rate": 2.7061144899948927e-07, "logits/chosen": -18.192955017089844, "logits/rejected": -17.847265243530273, "logps/chosen": -400.5807189941406, "logps/rejected": -389.8558044433594, "loss": 0.6122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8938831090927124, "rewards/margins": 0.2938840389251709, "rewards/rejected": 1.5999990701675415, "step": 10560 }, { "epoch": 0.4907377315567111, "grad_norm": 69.70832061767578, "learning_rate": 2.705835925530433e-07, "logits/chosen": -17.928178787231445, "logits/rejected": -18.026874542236328, "logps/chosen": -372.2172546386719, "logps/rejected": -429.5626525878906, "loss": 0.9174, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7827459573745728, "rewards/margins": -0.1941857635974884, "rewards/rejected": 1.9769315719604492, "step": 10570 }, { "epoch": 0.4912020056641441, "grad_norm": 99.63617706298828, "learning_rate": 2.705557361065973e-07, "logits/chosen": -18.249897003173828, "logits/rejected": -17.131397247314453, "logps/chosen": -394.24566650390625, "logps/rejected": -246.631591796875, "loss": 0.4121, "rewards/accuracies": 1.0, "rewards/chosen": 1.7953557968139648, "rewards/margins": 0.7228013873100281, 
"rewards/rejected": 1.0725542306900024, "step": 10580 }, { "epoch": 0.4916662797715771, "grad_norm": 10.314653396606445, "learning_rate": 2.7052787966015134e-07, "logits/chosen": -17.4466552734375, "logits/rejected": -15.825424194335938, "logps/chosen": -367.9696350097656, "logps/rejected": -197.0930938720703, "loss": 0.4608, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9608337879180908, "rewards/margins": 0.9940741658210754, "rewards/rejected": 0.9667595624923706, "step": 10590 }, { "epoch": 0.49213055387901017, "grad_norm": 65.1380386352539, "learning_rate": 2.705000232137054e-07, "logits/chosen": -17.94511604309082, "logits/rejected": -17.75162696838379, "logps/chosen": -465.94525146484375, "logps/rejected": -427.50146484375, "loss": 0.7136, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.238553762435913, "rewards/margins": 0.08504756540060043, "rewards/rejected": 2.15350604057312, "step": 10600 }, { "epoch": 0.4925948279864432, "grad_norm": 26.15887451171875, "learning_rate": 2.7047216676725937e-07, "logits/chosen": -18.904165267944336, "logits/rejected": -17.59433364868164, "logps/chosen": -417.0069274902344, "logps/rejected": -272.08758544921875, "loss": 0.4957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.224061965942383, "rewards/margins": 0.7785617113113403, "rewards/rejected": 1.4455002546310425, "step": 10610 }, { "epoch": 0.4930591020938762, "grad_norm": 105.37947845458984, "learning_rate": 2.704443103208134e-07, "logits/chosen": -19.034305572509766, "logits/rejected": -19.51245880126953, "logps/chosen": -377.37841796875, "logps/rejected": -418.02716064453125, "loss": 0.7059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1751708984375, "rewards/margins": 0.09956948459148407, "rewards/rejected": 2.075601816177368, "step": 10620 }, { "epoch": 0.49352337620130926, "grad_norm": 130.76356506347656, "learning_rate": 2.704164538743674e-07, "logits/chosen": -19.56879997253418, "logits/rejected": 
-19.455524444580078, "logps/chosen": -355.4689636230469, "logps/rejected": -335.216796875, "loss": 0.7251, "rewards/accuracies": 0.5, "rewards/chosen": 1.8162002563476562, "rewards/margins": 0.1915329247713089, "rewards/rejected": 1.6246671676635742, "step": 10630 }, { "epoch": 0.49398765030874225, "grad_norm": 48.265663146972656, "learning_rate": 2.7038859742792144e-07, "logits/chosen": -18.298574447631836, "logits/rejected": -17.458621978759766, "logps/chosen": -475.3758850097656, "logps/rejected": -381.26092529296875, "loss": 0.586, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.364351749420166, "rewards/margins": 0.36131176352500916, "rewards/rejected": 2.003039836883545, "step": 10640 }, { "epoch": 0.4944519244161753, "grad_norm": 132.677001953125, "learning_rate": 2.703607409814755e-07, "logits/chosen": -18.462318420410156, "logits/rejected": -17.480838775634766, "logps/chosen": -473.3233947753906, "logps/rejected": -396.8298645019531, "loss": 0.708, "rewards/accuracies": 0.5, "rewards/chosen": 2.5982213020324707, "rewards/margins": 0.5624396204948425, "rewards/rejected": 2.0357818603515625, "step": 10650 }, { "epoch": 0.49491619852360835, "grad_norm": 206.2277374267578, "learning_rate": 2.7033288453502947e-07, "logits/chosen": -18.866222381591797, "logits/rejected": -18.216371536254883, "logps/chosen": -483.26861572265625, "logps/rejected": -462.7279357910156, "loss": 0.6634, "rewards/accuracies": 0.5, "rewards/chosen": 2.4160165786743164, "rewards/margins": 0.2486610859632492, "rewards/rejected": 2.167355537414551, "step": 10660 }, { "epoch": 0.49538047263104135, "grad_norm": 218.8392791748047, "learning_rate": 2.703050280885835e-07, "logits/chosen": -17.645030975341797, "logits/rejected": -18.172122955322266, "logps/chosen": -382.7893981933594, "logps/rejected": -404.458251953125, "loss": 0.8714, "rewards/accuracies": 0.5, "rewards/chosen": 1.522649884223938, "rewards/margins": -0.1540326178073883, "rewards/rejected": 1.6766822338104248, 
"step": 10670 }, { "epoch": 0.4958447467384744, "grad_norm": 241.5847625732422, "learning_rate": 2.702771716421375e-07, "logits/chosen": -18.234432220458984, "logits/rejected": -17.545001983642578, "logps/chosen": -418.7118225097656, "logps/rejected": -358.55059814453125, "loss": 0.7212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.28840708732605, "rewards/margins": 0.3843148350715637, "rewards/rejected": 1.9040921926498413, "step": 10680 }, { "epoch": 0.49630902084590744, "grad_norm": 70.19486236572266, "learning_rate": 2.7024931519569154e-07, "logits/chosen": -19.085697174072266, "logits/rejected": -18.782329559326172, "logps/chosen": -280.05804443359375, "logps/rejected": -306.55572509765625, "loss": 0.8656, "rewards/accuracies": 0.5, "rewards/chosen": 1.2757093906402588, "rewards/margins": -0.18527142703533173, "rewards/rejected": 1.460980772972107, "step": 10690 }, { "epoch": 0.49677329495334044, "grad_norm": 80.43486785888672, "learning_rate": 2.702214587492455e-07, "logits/chosen": -17.986623764038086, "logits/rejected": -17.769641876220703, "logps/chosen": -362.29449462890625, "logps/rejected": -255.7201690673828, "loss": 0.6554, "rewards/accuracies": 0.5, "rewards/chosen": 2.2218337059020996, "rewards/margins": 0.942770779132843, "rewards/rejected": 1.2790628671646118, "step": 10700 }, { "epoch": 0.4972375690607735, "grad_norm": 30.912485122680664, "learning_rate": 2.7019360230279956e-07, "logits/chosen": -18.464923858642578, "logits/rejected": -17.427932739257812, "logps/chosen": -467.1991271972656, "logps/rejected": -329.8550720214844, "loss": 0.4865, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2286415100097656, "rewards/margins": 0.7190823554992676, "rewards/rejected": 1.5095592737197876, "step": 10710 }, { "epoch": 0.4977018431682065, "grad_norm": 107.79532623291016, "learning_rate": 2.701657458563536e-07, "logits/chosen": -17.301166534423828, "logits/rejected": -16.271236419677734, "logps/chosen": -327.21356201171875, 
"logps/rejected": -273.14349365234375, "loss": 0.5535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7136476039886475, "rewards/margins": 0.6709271669387817, "rewards/rejected": 1.0427204370498657, "step": 10720 }, { "epoch": 0.49816611727563953, "grad_norm": 119.57954406738281, "learning_rate": 2.701378894099076e-07, "logits/chosen": -18.32387924194336, "logits/rejected": -17.794151306152344, "logps/chosen": -416.8692321777344, "logps/rejected": -347.52691650390625, "loss": 0.5763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9586896896362305, "rewards/margins": 0.5911042094230652, "rewards/rejected": 1.3675854206085205, "step": 10730 }, { "epoch": 0.4986303913830726, "grad_norm": 112.62014770507812, "learning_rate": 2.7011003296346163e-07, "logits/chosen": -17.994892120361328, "logits/rejected": -17.207813262939453, "logps/chosen": -456.6470642089844, "logps/rejected": -319.63409423828125, "loss": 0.6449, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.337333917617798, "rewards/margins": 0.447040319442749, "rewards/rejected": 1.8902934789657593, "step": 10740 }, { "epoch": 0.4990946654905056, "grad_norm": 74.5589599609375, "learning_rate": 2.700821765170156e-07, "logits/chosen": -18.65155792236328, "logits/rejected": -17.849769592285156, "logps/chosen": -425.1644592285156, "logps/rejected": -353.67510986328125, "loss": 0.5376, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3445615768432617, "rewards/margins": 0.5290120840072632, "rewards/rejected": 1.8155494928359985, "step": 10750 }, { "epoch": 0.4995589395979386, "grad_norm": 53.81948471069336, "learning_rate": 2.7005432007056966e-07, "logits/chosen": -18.45077133178711, "logits/rejected": -18.54867935180664, "logps/chosen": -342.00323486328125, "logps/rejected": -382.5155334472656, "loss": 0.9042, "rewards/accuracies": 0.5, "rewards/chosen": 1.6923294067382812, "rewards/margins": -0.27777037024497986, "rewards/rejected": 1.9700998067855835, "step": 
10760 }, { "epoch": 0.5000232137053716, "grad_norm": 205.82559204101562, "learning_rate": 2.7002646362412365e-07, "logits/chosen": -19.343643188476562, "logits/rejected": -18.734071731567383, "logps/chosen": -396.9552307128906, "logps/rejected": -369.8052062988281, "loss": 0.6879, "rewards/accuracies": 0.5, "rewards/chosen": 1.9138915538787842, "rewards/margins": 0.24111099541187286, "rewards/rejected": 1.6727807521820068, "step": 10770 }, { "epoch": 0.5004874878128047, "grad_norm": 113.64875793457031, "learning_rate": 2.699986071776777e-07, "logits/chosen": -18.153696060180664, "logits/rejected": -18.092288970947266, "logps/chosen": -488.9728088378906, "logps/rejected": -511.6886291503906, "loss": 1.0037, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.082430601119995, "rewards/margins": -0.4299392104148865, "rewards/rejected": 2.5123698711395264, "step": 10780 }, { "epoch": 0.5009517619202377, "grad_norm": 36.90016555786133, "learning_rate": 2.6997075073123173e-07, "logits/chosen": -19.26700210571289, "logits/rejected": -18.447284698486328, "logps/chosen": -502.9273376464844, "logps/rejected": -486.12213134765625, "loss": 0.7221, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8842055797576904, "rewards/margins": 0.15170757472515106, "rewards/rejected": 1.7324981689453125, "step": 10790 }, { "epoch": 0.5014160360276707, "grad_norm": 22.033781051635742, "learning_rate": 2.699428942847857e-07, "logits/chosen": -19.192676544189453, "logits/rejected": -16.891111373901367, "logps/chosen": -395.47564697265625, "logps/rejected": -242.35360717773438, "loss": 0.5192, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0982913970947266, "rewards/margins": 0.527858316898346, "rewards/rejected": 1.5704330205917358, "step": 10800 }, { "epoch": 0.5018803101351038, "grad_norm": 43.78809356689453, "learning_rate": 2.699150378383397e-07, "logits/chosen": -18.062442779541016, "logits/rejected": -16.98691749572754, "logps/chosen": -515.140625, 
"logps/rejected": -367.55487060546875, "loss": 0.6558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.500638484954834, "rewards/margins": 0.4101070463657379, "rewards/rejected": 2.090531349182129, "step": 10810 }, { "epoch": 0.5023445842425368, "grad_norm": 66.33341217041016, "learning_rate": 2.6988718139189375e-07, "logits/chosen": -18.863039016723633, "logits/rejected": -18.37906265258789, "logps/chosen": -492.53875732421875, "logps/rejected": -422.83685302734375, "loss": 0.7428, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3460752964019775, "rewards/margins": 0.058358334004879, "rewards/rejected": 2.287716865539551, "step": 10820 }, { "epoch": 0.5028088583499698, "grad_norm": 34.425819396972656, "learning_rate": 2.698593249454478e-07, "logits/chosen": -17.851917266845703, "logits/rejected": -18.274682998657227, "logps/chosen": -302.5444641113281, "logps/rejected": -316.3829650878906, "loss": 0.7427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5233378410339355, "rewards/margins": 0.0968930721282959, "rewards/rejected": 1.42644464969635, "step": 10830 }, { "epoch": 0.5032731324574029, "grad_norm": 21.73656463623047, "learning_rate": 2.6983146849900183e-07, "logits/chosen": -18.598234176635742, "logits/rejected": -18.06509780883789, "logps/chosen": -338.93170166015625, "logps/rejected": -272.18023681640625, "loss": 0.6688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.722519874572754, "rewards/margins": 0.45156651735305786, "rewards/rejected": 1.2709534168243408, "step": 10840 }, { "epoch": 0.5037374065648359, "grad_norm": 103.27101135253906, "learning_rate": 2.698036120525558e-07, "logits/chosen": -19.618206024169922, "logits/rejected": -18.695072174072266, "logps/chosen": -421.0043029785156, "logps/rejected": -319.91619873046875, "loss": 0.5013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5932774543762207, "rewards/margins": 0.7304292917251587, "rewards/rejected": 1.862847924232483, 
"step": 10850 }, { "epoch": 0.5042016806722689, "grad_norm": 48.23903274536133, "learning_rate": 2.697757556061098e-07, "logits/chosen": -18.900590896606445, "logits/rejected": -18.124042510986328, "logps/chosen": -442.44097900390625, "logps/rejected": -441.684814453125, "loss": 0.3869, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6657509803771973, "rewards/margins": 0.8237460255622864, "rewards/rejected": 1.8420050144195557, "step": 10860 }, { "epoch": 0.5046659547797019, "grad_norm": 52.29948043823242, "learning_rate": 2.6974789915966385e-07, "logits/chosen": -18.394445419311523, "logits/rejected": -18.66322898864746, "logps/chosen": -367.56927490234375, "logps/rejected": -363.90478515625, "loss": 0.8173, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7248451709747314, "rewards/margins": -0.1136668473482132, "rewards/rejected": 1.8385120630264282, "step": 10870 }, { "epoch": 0.505130228887135, "grad_norm": 100.25830078125, "learning_rate": 2.697200427132179e-07, "logits/chosen": -18.180118560791016, "logits/rejected": -17.267841339111328, "logps/chosen": -499.4407653808594, "logps/rejected": -332.29095458984375, "loss": 0.5123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.227595806121826, "rewards/margins": 0.6230031847953796, "rewards/rejected": 1.6045925617218018, "step": 10880 }, { "epoch": 0.505594502994568, "grad_norm": 42.7531852722168, "learning_rate": 2.696921862667719e-07, "logits/chosen": -18.541467666625977, "logits/rejected": -17.9490909576416, "logps/chosen": -335.093017578125, "logps/rejected": -210.61380004882812, "loss": 0.7264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5828088521957397, "rewards/margins": 0.3370400071144104, "rewards/rejected": 1.2457689046859741, "step": 10890 }, { "epoch": 0.506058777102001, "grad_norm": 77.91505432128906, "learning_rate": 2.696643298203259e-07, "logits/chosen": -19.78561782836914, "logits/rejected": -18.413516998291016, "logps/chosen": 
-455.4579162597656, "logps/rejected": -356.49139404296875, "loss": 0.3549, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.751749038696289, "rewards/margins": 1.0815070867538452, "rewards/rejected": 1.6702417135238647, "step": 10900 }, { "epoch": 0.5065230512094341, "grad_norm": 19.2868709564209, "learning_rate": 2.6963647337387996e-07, "logits/chosen": -18.423490524291992, "logits/rejected": -18.11307716369629, "logps/chosen": -296.6795349121094, "logps/rejected": -242.4468994140625, "loss": 0.6699, "rewards/accuracies": 0.5, "rewards/chosen": 1.6272952556610107, "rewards/margins": 0.19352075457572937, "rewards/rejected": 1.4337745904922485, "step": 10910 }, { "epoch": 0.5069873253168671, "grad_norm": 125.13935089111328, "learning_rate": 2.6960861692743395e-07, "logits/chosen": -18.19748878479004, "logits/rejected": -16.620441436767578, "logps/chosen": -290.5183410644531, "logps/rejected": -245.3545684814453, "loss": 0.5364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9154584407806396, "rewards/margins": 0.8994289636611938, "rewards/rejected": 1.0160292387008667, "step": 10920 }, { "epoch": 0.5074515994243001, "grad_norm": 21.128765106201172, "learning_rate": 2.6958076048098794e-07, "logits/chosen": -18.93963050842285, "logits/rejected": -17.845449447631836, "logps/chosen": -439.2738342285156, "logps/rejected": -344.35845947265625, "loss": 0.6894, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5208771228790283, "rewards/margins": 0.6551960706710815, "rewards/rejected": 1.8656810522079468, "step": 10930 }, { "epoch": 0.5079158735317332, "grad_norm": 47.420631408691406, "learning_rate": 2.69552904034542e-07, "logits/chosen": -18.74521827697754, "logits/rejected": -18.286596298217773, "logps/chosen": -349.48321533203125, "logps/rejected": -225.906982421875, "loss": 0.6767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8601970672607422, "rewards/margins": 0.6515612602233887, "rewards/rejected": 
1.2086360454559326, "step": 10940 }, { "epoch": 0.5083801476391662, "grad_norm": 49.53501510620117, "learning_rate": 2.69525047588096e-07, "logits/chosen": -17.632797241210938, "logits/rejected": -17.62114906311035, "logps/chosen": -309.87591552734375, "logps/rejected": -287.2602844238281, "loss": 0.7159, "rewards/accuracies": 0.5, "rewards/chosen": 1.7087719440460205, "rewards/margins": 0.2509225606918335, "rewards/rejected": 1.457849383354187, "step": 10950 }, { "epoch": 0.5088444217465992, "grad_norm": 52.58115768432617, "learning_rate": 2.6949719114165e-07, "logits/chosen": -18.735759735107422, "logits/rejected": -16.684154510498047, "logps/chosen": -484.418212890625, "logps/rejected": -308.99517822265625, "loss": 0.5271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3681721687316895, "rewards/margins": 0.8373233079910278, "rewards/rejected": 1.530848741531372, "step": 10960 }, { "epoch": 0.5093086958540323, "grad_norm": 93.5477523803711, "learning_rate": 2.6946933469520405e-07, "logits/chosen": -19.39506721496582, "logits/rejected": -18.374025344848633, "logps/chosen": -375.2210388183594, "logps/rejected": -335.6260986328125, "loss": 0.6196, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8369413614273071, "rewards/margins": 0.39254099130630493, "rewards/rejected": 1.4444005489349365, "step": 10970 }, { "epoch": 0.5097729699614653, "grad_norm": 44.46255111694336, "learning_rate": 2.6944147824875804e-07, "logits/chosen": -18.316631317138672, "logits/rejected": -17.31389808654785, "logps/chosen": -337.50177001953125, "logps/rejected": -323.80535888671875, "loss": 0.5861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.143510580062866, "rewards/margins": 0.6186485290527344, "rewards/rejected": 1.5248621702194214, "step": 10980 }, { "epoch": 0.5102372440688983, "grad_norm": 58.95661163330078, "learning_rate": 2.694136218023121e-07, "logits/chosen": -18.50503158569336, "logits/rejected": -16.567081451416016, "logps/chosen": 
-425.4295349121094, "logps/rejected": -220.15087890625, "loss": 0.5583, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9458720684051514, "rewards/margins": 0.7735204696655273, "rewards/rejected": 1.172351598739624, "step": 10990 }, { "epoch": 0.5107015181763314, "grad_norm": 155.0765838623047, "learning_rate": 2.6938576535586607e-07, "logits/chosen": -18.74072265625, "logits/rejected": -17.672643661499023, "logps/chosen": -445.27862548828125, "logps/rejected": -292.3271484375, "loss": 0.5459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2880091667175293, "rewards/margins": 0.8967008590698242, "rewards/rejected": 1.3913084268569946, "step": 11000 }, { "epoch": 0.5111657922837644, "grad_norm": 110.53240966796875, "learning_rate": 2.693579089094201e-07, "logits/chosen": -18.863704681396484, "logits/rejected": -18.866334915161133, "logps/chosen": -387.7653503417969, "logps/rejected": -418.607666015625, "loss": 0.711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9070039987564087, "rewards/margins": 0.1345432847738266, "rewards/rejected": 1.7724609375, "step": 11010 }, { "epoch": 0.5116300663911973, "grad_norm": 111.85739135742188, "learning_rate": 2.6933005246297415e-07, "logits/chosen": -18.101451873779297, "logits/rejected": -17.436397552490234, "logps/chosen": -411.57958984375, "logps/rejected": -319.82098388671875, "loss": 0.7351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8636842966079712, "rewards/margins": 0.25211650133132935, "rewards/rejected": 1.6115678548812866, "step": 11020 }, { "epoch": 0.5120943404986303, "grad_norm": 118.63958740234375, "learning_rate": 2.6930219601652814e-07, "logits/chosen": -19.13983726501465, "logits/rejected": -17.975696563720703, "logps/chosen": -412.21246337890625, "logps/rejected": -255.7384033203125, "loss": 0.5208, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.101750373840332, "rewards/margins": 0.8090909719467163, "rewards/rejected": 
1.2926591634750366, "step": 11030 }, { "epoch": 0.5125586146060634, "grad_norm": 122.01583099365234, "learning_rate": 2.692743395700822e-07, "logits/chosen": -17.98235511779785, "logits/rejected": -17.56993865966797, "logps/chosen": -451.1224670410156, "logps/rejected": -344.8827209472656, "loss": 0.6881, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9072660207748413, "rewards/margins": 0.10911880433559418, "rewards/rejected": 1.798147201538086, "step": 11040 }, { "epoch": 0.5130228887134964, "grad_norm": 116.7959976196289, "learning_rate": 2.6924648312363616e-07, "logits/chosen": -18.62238883972168, "logits/rejected": -18.09404945373535, "logps/chosen": -363.0958557128906, "logps/rejected": -280.82208251953125, "loss": 0.5708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1973507404327393, "rewards/margins": 0.6224614381790161, "rewards/rejected": 1.5748891830444336, "step": 11050 }, { "epoch": 0.5134871628209294, "grad_norm": 105.16748046875, "learning_rate": 2.692186266771902e-07, "logits/chosen": -18.504825592041016, "logits/rejected": -18.132064819335938, "logps/chosen": -423.68353271484375, "logps/rejected": -368.7646484375, "loss": 1.0568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0981545448303223, "rewards/margins": -0.12736985087394714, "rewards/rejected": 2.225524425506592, "step": 11060 }, { "epoch": 0.5139514369283625, "grad_norm": 150.250732421875, "learning_rate": 2.6919077023074425e-07, "logits/chosen": -18.157899856567383, "logits/rejected": -18.30799102783203, "logps/chosen": -496.51190185546875, "logps/rejected": -422.35980224609375, "loss": 0.7452, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8084064722061157, "rewards/margins": 0.020280320197343826, "rewards/rejected": 1.788125991821289, "step": 11070 }, { "epoch": 0.5144157110357955, "grad_norm": 18.833261489868164, "learning_rate": 2.6916291378429823e-07, "logits/chosen": -18.117443084716797, "logits/rejected": 
-18.28525733947754, "logps/chosen": -271.8843688964844, "logps/rejected": -244.00576782226562, "loss": 0.7246, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.554572343826294, "rewards/margins": 0.21817462146282196, "rewards/rejected": 1.336397647857666, "step": 11080 }, { "epoch": 0.5148799851432285, "grad_norm": 54.14725875854492, "learning_rate": 2.691350573378523e-07, "logits/chosen": -18.592546463012695, "logits/rejected": -16.855188369750977, "logps/chosen": -341.387451171875, "logps/rejected": -226.35971069335938, "loss": 0.4176, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1182990074157715, "rewards/margins": 0.892151951789856, "rewards/rejected": 1.226147174835205, "step": 11090 }, { "epoch": 0.5153442592506616, "grad_norm": 85.1968002319336, "learning_rate": 2.6910720089140626e-07, "logits/chosen": -17.675128936767578, "logits/rejected": -16.605154037475586, "logps/chosen": -410.1138610839844, "logps/rejected": -299.80999755859375, "loss": 0.5386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8442538976669312, "rewards/margins": 0.7123070359230042, "rewards/rejected": 1.1319469213485718, "step": 11100 }, { "epoch": 0.5158085333580946, "grad_norm": 93.0000228881836, "learning_rate": 2.6907934444496025e-07, "logits/chosen": -18.243366241455078, "logits/rejected": -18.105628967285156, "logps/chosen": -324.45941162109375, "logps/rejected": -341.59588623046875, "loss": 0.8954, "rewards/accuracies": 0.5, "rewards/chosen": 1.6461918354034424, "rewards/margins": -0.1706794798374176, "rewards/rejected": 1.8168710470199585, "step": 11110 }, { "epoch": 0.5162728074655276, "grad_norm": 171.9287872314453, "learning_rate": 2.690514879985143e-07, "logits/chosen": -17.642658233642578, "logits/rejected": -16.64297866821289, "logps/chosen": -421.9736328125, "logps/rejected": -283.4100646972656, "loss": 0.6062, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9648011922836304, "rewards/margins": 0.5288056135177612, 
"rewards/rejected": 1.4359958171844482, "step": 11120 }, { "epoch": 0.5167370815729607, "grad_norm": 136.5589599609375, "learning_rate": 2.6902363155206833e-07, "logits/chosen": -18.710580825805664, "logits/rejected": -18.489477157592773, "logps/chosen": -353.8975524902344, "logps/rejected": -315.9505310058594, "loss": 0.7037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9925098419189453, "rewards/margins": 0.12171880900859833, "rewards/rejected": 1.8707910776138306, "step": 11130 }, { "epoch": 0.5172013556803937, "grad_norm": 256.0687255859375, "learning_rate": 2.689957751056224e-07, "logits/chosen": -17.724952697753906, "logits/rejected": -17.266727447509766, "logps/chosen": -278.9027404785156, "logps/rejected": -313.16241455078125, "loss": 0.8052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.134258508682251, "rewards/margins": 0.46307873725891113, "rewards/rejected": 1.6711797714233398, "step": 11140 }, { "epoch": 0.5176656297878267, "grad_norm": 15.023738861083984, "learning_rate": 2.6896791865917636e-07, "logits/chosen": -18.886457443237305, "logits/rejected": -17.42477035522461, "logps/chosen": -538.1199340820312, "logps/rejected": -308.3028869628906, "loss": 0.4055, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.835172176361084, "rewards/margins": 1.2865631580352783, "rewards/rejected": 1.5486090183258057, "step": 11150 }, { "epoch": 0.5181299038952598, "grad_norm": 15.204364776611328, "learning_rate": 2.689400622127304e-07, "logits/chosen": -18.759992599487305, "logits/rejected": -17.76097297668457, "logps/chosen": -535.8468017578125, "logps/rejected": -379.5152893066406, "loss": 0.5393, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4105982780456543, "rewards/margins": 0.79056316614151, "rewards/rejected": 1.620035171508789, "step": 11160 }, { "epoch": 0.5185941780026928, "grad_norm": 4.909512042999268, "learning_rate": 2.689122057662844e-07, "logits/chosen": -17.832664489746094, 
"logits/rejected": -17.033811569213867, "logps/chosen": -423.03253173828125, "logps/rejected": -305.3677978515625, "loss": 0.6733, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.405043601989746, "rewards/margins": 0.6034191846847534, "rewards/rejected": 1.8016245365142822, "step": 11170 }, { "epoch": 0.5190584521101258, "grad_norm": 4.032334327697754, "learning_rate": 2.6888434931983843e-07, "logits/chosen": -18.322908401489258, "logits/rejected": -18.02114486694336, "logps/chosen": -434.52227783203125, "logps/rejected": -443.288330078125, "loss": 1.0207, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3802695274353027, "rewards/margins": -0.09263868629932404, "rewards/rejected": 2.4729082584381104, "step": 11180 }, { "epoch": 0.5195227262175588, "grad_norm": 125.54690551757812, "learning_rate": 2.688564928733924e-07, "logits/chosen": -18.161418914794922, "logits/rejected": -16.972230911254883, "logps/chosen": -423.216552734375, "logps/rejected": -294.6918640136719, "loss": 0.4178, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.533822536468506, "rewards/margins": 0.9919368624687195, "rewards/rejected": 1.5418856143951416, "step": 11190 }, { "epoch": 0.5199870003249919, "grad_norm": 53.754852294921875, "learning_rate": 2.6882863642694646e-07, "logits/chosen": -19.120771408081055, "logits/rejected": -17.984027862548828, "logps/chosen": -277.3172302246094, "logps/rejected": -252.22915649414062, "loss": 0.4627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.045030117034912, "rewards/margins": 0.8038300275802612, "rewards/rejected": 1.2412000894546509, "step": 11200 }, { "epoch": 0.5204512744324249, "grad_norm": 102.55821228027344, "learning_rate": 2.688007799805005e-07, "logits/chosen": -18.942333221435547, "logits/rejected": -17.547195434570312, "logps/chosen": -448.62841796875, "logps/rejected": -335.691650390625, "loss": 0.384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.903869152069092, 
"rewards/margins": 1.0295554399490356, "rewards/rejected": 1.8743140697479248, "step": 11210 }, { "epoch": 0.5209155485398579, "grad_norm": 122.26265716552734, "learning_rate": 2.687729235340545e-07, "logits/chosen": -18.234689712524414, "logits/rejected": -17.932575225830078, "logps/chosen": -453.38079833984375, "logps/rejected": -359.16748046875, "loss": 0.5833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.817474365234375, "rewards/margins": 0.3679336905479431, "rewards/rejected": 1.4495404958724976, "step": 11220 }, { "epoch": 0.521379822647291, "grad_norm": 66.97772216796875, "learning_rate": 2.687450670876085e-07, "logits/chosen": -16.94566535949707, "logits/rejected": -17.437681198120117, "logps/chosen": -308.3258972167969, "logps/rejected": -374.43963623046875, "loss": 1.1067, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.3413989543914795, "rewards/margins": -0.5115858316421509, "rewards/rejected": 1.8529846668243408, "step": 11230 }, { "epoch": 0.521844096754724, "grad_norm": 131.52841186523438, "learning_rate": 2.687172106411625e-07, "logits/chosen": -18.97399139404297, "logits/rejected": -17.420246124267578, "logps/chosen": -513.7825927734375, "logps/rejected": -305.29241943359375, "loss": 0.5304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0179197788238525, "rewards/margins": 0.5528510212898254, "rewards/rejected": 1.4650688171386719, "step": 11240 }, { "epoch": 0.522308370862157, "grad_norm": 69.33723449707031, "learning_rate": 2.6868935419471656e-07, "logits/chosen": -18.927886962890625, "logits/rejected": -17.845470428466797, "logps/chosen": -389.90484619140625, "logps/rejected": -378.22271728515625, "loss": 0.7162, "rewards/accuracies": 0.5, "rewards/chosen": 1.7738821506500244, "rewards/margins": 0.0879879742860794, "rewards/rejected": 1.685894250869751, "step": 11250 }, { "epoch": 0.5227726449695901, "grad_norm": 107.69627380371094, "learning_rate": 2.686614977482706e-07, "logits/chosen": 
-18.095277786254883, "logits/rejected": -18.024410247802734, "logps/chosen": -397.06451416015625, "logps/rejected": -439.380615234375, "loss": 0.7013, "rewards/accuracies": 0.5, "rewards/chosen": 2.3052830696105957, "rewards/margins": 0.21965882182121277, "rewards/rejected": 2.0856242179870605, "step": 11260 }, { "epoch": 0.5232369190770231, "grad_norm": 58.63901138305664, "learning_rate": 2.686336413018246e-07, "logits/chosen": -18.076507568359375, "logits/rejected": -17.714515686035156, "logps/chosen": -454.2098693847656, "logps/rejected": -341.8248596191406, "loss": 0.5842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3786861896514893, "rewards/margins": 0.5072106719017029, "rewards/rejected": 1.8714752197265625, "step": 11270 }, { "epoch": 0.5237011931844561, "grad_norm": 7.527753829956055, "learning_rate": 2.686057848553786e-07, "logits/chosen": -18.5732421875, "logits/rejected": -17.6231632232666, "logps/chosen": -474.81640625, "logps/rejected": -288.586669921875, "loss": 0.5379, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5197396278381348, "rewards/margins": 1.1127413511276245, "rewards/rejected": 1.4069983959197998, "step": 11280 }, { "epoch": 0.5241654672918892, "grad_norm": 51.67868423461914, "learning_rate": 2.685779284089326e-07, "logits/chosen": -17.66451644897461, "logits/rejected": -17.333999633789062, "logps/chosen": -363.6285400390625, "logps/rejected": -325.15313720703125, "loss": 0.5633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9096620082855225, "rewards/margins": 0.5484572649002075, "rewards/rejected": 1.3612048625946045, "step": 11290 }, { "epoch": 0.5246297413993222, "grad_norm": 26.69698715209961, "learning_rate": 2.685500719624866e-07, "logits/chosen": -17.783763885498047, "logits/rejected": -16.821842193603516, "logps/chosen": -416.64599609375, "logps/rejected": -258.5587158203125, "loss": 0.4613, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8321870565414429, 
"rewards/margins": 0.6803390383720398, "rewards/rejected": 1.1518479585647583, "step": 11300 }, { "epoch": 0.5250940155067552, "grad_norm": 65.4747543334961, "learning_rate": 2.6852221551604065e-07, "logits/chosen": -19.78066635131836, "logits/rejected": -19.94424057006836, "logps/chosen": -446.42852783203125, "logps/rejected": -399.93804931640625, "loss": 0.8657, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1381096839904785, "rewards/margins": -0.04495471715927124, "rewards/rejected": 2.1830644607543945, "step": 11310 }, { "epoch": 0.5255582896141883, "grad_norm": 76.8797836303711, "learning_rate": 2.684943590695947e-07, "logits/chosen": -19.399499893188477, "logits/rejected": -19.366165161132812, "logps/chosen": -448.91705322265625, "logps/rejected": -383.9502258300781, "loss": 0.7656, "rewards/accuracies": 0.5, "rewards/chosen": 2.4412338733673096, "rewards/margins": 0.16481366753578186, "rewards/rejected": 2.2764201164245605, "step": 11320 }, { "epoch": 0.5260225637216213, "grad_norm": 22.7825927734375, "learning_rate": 2.6846650262314873e-07, "logits/chosen": -18.058895111083984, "logits/rejected": -16.6812801361084, "logps/chosen": -315.13519287109375, "logps/rejected": -202.7754669189453, "loss": 0.4099, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.020369291305542, "rewards/margins": 0.8546380996704102, "rewards/rejected": 1.1657311916351318, "step": 11330 }, { "epoch": 0.5264868378290543, "grad_norm": 80.87014770507812, "learning_rate": 2.684386461767027e-07, "logits/chosen": -18.800382614135742, "logits/rejected": -17.822742462158203, "logps/chosen": -426.42645263671875, "logps/rejected": -265.9703674316406, "loss": 0.5456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2183666229248047, "rewards/margins": 0.6993997097015381, "rewards/rejected": 1.5189669132232666, "step": 11340 }, { "epoch": 0.5269511119364872, "grad_norm": 164.8067626953125, "learning_rate": 2.684107897302567e-07, "logits/chosen": 
-19.150590896606445, "logits/rejected": -18.386417388916016, "logps/chosen": -432.48828125, "logps/rejected": -417.1058044433594, "loss": 0.6964, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9938514232635498, "rewards/margins": 0.13876987993717194, "rewards/rejected": 1.855081558227539, "step": 11350 }, { "epoch": 0.5274153860439204, "grad_norm": 36.224365234375, "learning_rate": 2.6838293328381075e-07, "logits/chosen": -18.85818862915039, "logits/rejected": -18.548749923706055, "logps/chosen": -416.33026123046875, "logps/rejected": -412.6014709472656, "loss": 0.7024, "rewards/accuracies": 0.5, "rewards/chosen": 2.0287375450134277, "rewards/margins": 0.29508328437805176, "rewards/rejected": 1.733654260635376, "step": 11360 }, { "epoch": 0.5278796601513533, "grad_norm": 28.49726104736328, "learning_rate": 2.683550768373648e-07, "logits/chosen": -18.236289978027344, "logits/rejected": -17.533214569091797, "logps/chosen": -331.21527099609375, "logps/rejected": -346.8085632324219, "loss": 1.1804, "rewards/accuracies": 0.5, "rewards/chosen": 1.9238160848617554, "rewards/margins": -0.3033190667629242, "rewards/rejected": 2.227135181427002, "step": 11370 }, { "epoch": 0.5283439342587863, "grad_norm": 16.464826583862305, "learning_rate": 2.683272203909188e-07, "logits/chosen": -18.801124572753906, "logits/rejected": -17.41982078552246, "logps/chosen": -449.94610595703125, "logps/rejected": -295.1560974121094, "loss": 0.3231, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7787070274353027, "rewards/margins": 1.2586342096328735, "rewards/rejected": 1.5200728178024292, "step": 11380 }, { "epoch": 0.5288082083662194, "grad_norm": 116.10628509521484, "learning_rate": 2.682993639444728e-07, "logits/chosen": -18.839771270751953, "logits/rejected": -18.258724212646484, "logps/chosen": -400.03802490234375, "logps/rejected": -366.2218322753906, "loss": 0.6032, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9145631790161133, 
"rewards/margins": 0.2405504286289215, "rewards/rejected": 1.674012541770935, "step": 11390 }, { "epoch": 0.5292724824736524, "grad_norm": 74.79942321777344, "learning_rate": 2.682715074980268e-07, "logits/chosen": -17.89990997314453, "logits/rejected": -17.257558822631836, "logps/chosen": -386.40496826171875, "logps/rejected": -244.43930053710938, "loss": 0.5689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6533348560333252, "rewards/margins": 0.46654003858566284, "rewards/rejected": 1.1867947578430176, "step": 11400 }, { "epoch": 0.5297367565810854, "grad_norm": 106.9707260131836, "learning_rate": 2.6824365105158085e-07, "logits/chosen": -18.637380599975586, "logits/rejected": -17.41431999206543, "logps/chosen": -360.1783142089844, "logps/rejected": -274.02301025390625, "loss": 0.5621, "rewards/accuracies": 0.5, "rewards/chosen": 2.0969398021698, "rewards/margins": 0.6023260951042175, "rewards/rejected": 1.494613766670227, "step": 11410 }, { "epoch": 0.5302010306885185, "grad_norm": 25.224857330322266, "learning_rate": 2.6821579460513483e-07, "logits/chosen": -18.52560043334961, "logits/rejected": -17.818443298339844, "logps/chosen": -375.97821044921875, "logps/rejected": -336.8521423339844, "loss": 0.5278, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.224271535873413, "rewards/margins": 0.9415790438652039, "rewards/rejected": 1.2826924324035645, "step": 11420 }, { "epoch": 0.5306653047959515, "grad_norm": 4.654660701751709, "learning_rate": 2.681879381586889e-07, "logits/chosen": -18.34276580810547, "logits/rejected": -17.00031280517578, "logps/chosen": -393.6251525878906, "logps/rejected": -244.55050659179688, "loss": 0.3331, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0122640132904053, "rewards/margins": 1.575661540031433, "rewards/rejected": 1.4366023540496826, "step": 11430 }, { "epoch": 0.5311295789033845, "grad_norm": 96.2985610961914, "learning_rate": 2.681600817122429e-07, "logits/chosen": 
-19.18587875366211, "logits/rejected": -18.458911895751953, "logps/chosen": -422.46014404296875, "logps/rejected": -489.6683044433594, "loss": 0.7991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9909805059432983, "rewards/margins": 0.14619889855384827, "rewards/rejected": 1.8447821140289307, "step": 11440 }, { "epoch": 0.5315938530108176, "grad_norm": 80.27389526367188, "learning_rate": 2.681322252657969e-07, "logits/chosen": -18.379762649536133, "logits/rejected": -17.37792205810547, "logps/chosen": -436.674072265625, "logps/rejected": -329.49774169921875, "loss": 0.5167, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.957736611366272, "rewards/margins": 0.5609564185142517, "rewards/rejected": 1.3967801332473755, "step": 11450 }, { "epoch": 0.5320581271182506, "grad_norm": 95.8109130859375, "learning_rate": 2.6810436881935095e-07, "logits/chosen": -18.35079574584961, "logits/rejected": -17.4703369140625, "logps/chosen": -485.00323486328125, "logps/rejected": -340.84539794921875, "loss": 0.5883, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.061382293701172, "rewards/margins": 0.3300657868385315, "rewards/rejected": 1.731316328048706, "step": 11460 }, { "epoch": 0.5325224012256836, "grad_norm": 62.20834732055664, "learning_rate": 2.6807651237290493e-07, "logits/chosen": -18.012327194213867, "logits/rejected": -17.675966262817383, "logps/chosen": -331.64337158203125, "logps/rejected": -300.81097412109375, "loss": 0.8962, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.5982547998428345, "rewards/margins": -0.2210136204957962, "rewards/rejected": 1.8192684650421143, "step": 11470 }, { "epoch": 0.5329866753331167, "grad_norm": 42.42397689819336, "learning_rate": 2.68048655926459e-07, "logits/chosen": -18.46317481994629, "logits/rejected": -18.398202896118164, "logps/chosen": -386.18731689453125, "logps/rejected": -308.64263916015625, "loss": 0.7458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
1.5161426067352295, "rewards/margins": 0.1701192855834961, "rewards/rejected": 1.3460232019424438, "step": 11480 }, { "epoch": 0.5334509494405497, "grad_norm": 50.34992599487305, "learning_rate": 2.6802079948001296e-07, "logits/chosen": -18.426021575927734, "logits/rejected": -18.60181427001953, "logps/chosen": -376.97528076171875, "logps/rejected": -425.90325927734375, "loss": 0.9295, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.8652013540267944, "rewards/margins": -0.14909487962722778, "rewards/rejected": 2.014296054840088, "step": 11490 }, { "epoch": 0.5339152235479827, "grad_norm": 35.22206497192383, "learning_rate": 2.67992943033567e-07, "logits/chosen": -18.12508201599121, "logits/rejected": -17.94423484802246, "logps/chosen": -395.4211730957031, "logps/rejected": -350.4701232910156, "loss": 0.7749, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0856616497039795, "rewards/margins": 0.2397230565547943, "rewards/rejected": 1.8459386825561523, "step": 11500 }, { "epoch": 0.5343794976554157, "grad_norm": 113.14554595947266, "learning_rate": 2.6796508658712104e-07, "logits/chosen": -18.48554229736328, "logits/rejected": -17.773815155029297, "logps/chosen": -364.2929992675781, "logps/rejected": -287.47344970703125, "loss": 0.5109, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3615427017211914, "rewards/margins": 0.7258302569389343, "rewards/rejected": 1.6357122659683228, "step": 11510 }, { "epoch": 0.5348437717628488, "grad_norm": 15.795411109924316, "learning_rate": 2.6793723014067503e-07, "logits/chosen": -19.301231384277344, "logits/rejected": -18.32244873046875, "logps/chosen": -432.49200439453125, "logps/rejected": -349.0335388183594, "loss": 0.5828, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8397306203842163, "rewards/margins": 0.483186811208725, "rewards/rejected": 1.356543779373169, "step": 11520 }, { "epoch": 0.5353080458702818, "grad_norm": 47.91416549682617, "learning_rate": 
2.67909373694229e-07, "logits/chosen": -18.162322998046875, "logits/rejected": -16.573110580444336, "logps/chosen": -397.2292175292969, "logps/rejected": -206.57470703125, "loss": 0.471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8763716220855713, "rewards/margins": 0.7341434359550476, "rewards/rejected": 1.1422282457351685, "step": 11530 }, { "epoch": 0.5357723199777148, "grad_norm": 66.8283462524414, "learning_rate": 2.6788151724778306e-07, "logits/chosen": -19.36809539794922, "logits/rejected": -18.009716033935547, "logps/chosen": -531.2802734375, "logps/rejected": -323.740966796875, "loss": 0.4116, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.533395290374756, "rewards/margins": 0.9722551107406616, "rewards/rejected": 1.5611401796340942, "step": 11540 }, { "epoch": 0.5362365940851479, "grad_norm": 92.91083526611328, "learning_rate": 2.678536608013371e-07, "logits/chosen": -18.169708251953125, "logits/rejected": -17.973318099975586, "logps/chosen": -504.3047790527344, "logps/rejected": -445.800537109375, "loss": 0.5209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3398213386535645, "rewards/margins": 0.460421085357666, "rewards/rejected": 1.8794004917144775, "step": 11550 }, { "epoch": 0.5367008681925809, "grad_norm": 25.28363800048828, "learning_rate": 2.6782580435489114e-07, "logits/chosen": -18.879146575927734, "logits/rejected": -17.210338592529297, "logps/chosen": -444.473876953125, "logps/rejected": -258.6910095214844, "loss": 0.4911, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3255093097686768, "rewards/margins": 0.899514377117157, "rewards/rejected": 1.4259947538375854, "step": 11560 }, { "epoch": 0.5371651423000139, "grad_norm": 48.441200256347656, "learning_rate": 2.6779794790844513e-07, "logits/chosen": -18.461652755737305, "logits/rejected": -17.42885971069336, "logps/chosen": -421.1908264160156, "logps/rejected": -279.4378967285156, "loss": 0.4614, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.3026859760284424, "rewards/margins": 1.1018470525741577, "rewards/rejected": 1.2008390426635742, "step": 11570 }, { "epoch": 0.537629416407447, "grad_norm": 25.269115447998047, "learning_rate": 2.6777009146199917e-07, "logits/chosen": -18.608064651489258, "logits/rejected": -17.79775619506836, "logps/chosen": -421.60107421875, "logps/rejected": -344.98040771484375, "loss": 0.5264, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.378121852874756, "rewards/margins": 0.762948215007782, "rewards/rejected": 1.6151736974716187, "step": 11580 }, { "epoch": 0.53809369051488, "grad_norm": 60.39057540893555, "learning_rate": 2.6774223501555316e-07, "logits/chosen": -18.853851318359375, "logits/rejected": -17.785686492919922, "logps/chosen": -454.09600830078125, "logps/rejected": -325.45538330078125, "loss": 0.4473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.256324291229248, "rewards/margins": 0.7943547964096069, "rewards/rejected": 1.4619696140289307, "step": 11590 }, { "epoch": 0.538557964622313, "grad_norm": 73.33828735351562, "learning_rate": 2.677143785691072e-07, "logits/chosen": -18.021535873413086, "logits/rejected": -17.659523010253906, "logps/chosen": -351.31268310546875, "logps/rejected": -314.89849853515625, "loss": 0.6419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.195310115814209, "rewards/margins": 0.44660767912864685, "rewards/rejected": 1.7487022876739502, "step": 11600 }, { "epoch": 0.5390222387297461, "grad_norm": 81.2485580444336, "learning_rate": 2.676865221226612e-07, "logits/chosen": -18.495426177978516, "logits/rejected": -17.23122215270996, "logps/chosen": -366.8953857421875, "logps/rejected": -268.3719787597656, "loss": 0.4217, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.162632942199707, "rewards/margins": 0.9611708521842957, "rewards/rejected": 1.2014621496200562, "step": 11610 }, { "epoch": 0.5394865128371791, "grad_norm": 54.34618377685547, 
"learning_rate": 2.6765866567621523e-07, "logits/chosen": -18.94448471069336, "logits/rejected": -17.683856964111328, "logps/chosen": -427.56585693359375, "logps/rejected": -287.98956298828125, "loss": 0.5767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9150028228759766, "rewards/margins": 0.5968384742736816, "rewards/rejected": 1.3181644678115845, "step": 11620 }, { "epoch": 0.5399507869446121, "grad_norm": 43.9051628112793, "learning_rate": 2.6763080922976927e-07, "logits/chosen": -17.79154396057129, "logits/rejected": -18.064739227294922, "logps/chosen": -367.8309326171875, "logps/rejected": -388.4739990234375, "loss": 0.8712, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.977758765220642, "rewards/margins": -0.16744866967201233, "rewards/rejected": 2.145207405090332, "step": 11630 }, { "epoch": 0.5404150610520452, "grad_norm": 64.57372283935547, "learning_rate": 2.6760295278332326e-07, "logits/chosen": -18.465917587280273, "logits/rejected": -17.442977905273438, "logps/chosen": -481.8623046875, "logps/rejected": -331.7768249511719, "loss": 0.393, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.937107563018799, "rewards/margins": 1.106395959854126, "rewards/rejected": 1.8307117223739624, "step": 11640 }, { "epoch": 0.5408793351594782, "grad_norm": 19.312429428100586, "learning_rate": 2.6757509633687725e-07, "logits/chosen": -17.75623321533203, "logits/rejected": -16.552762985229492, "logps/chosen": -439.8954162597656, "logps/rejected": -307.24627685546875, "loss": 0.5068, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.257436990737915, "rewards/margins": 0.9473668932914734, "rewards/rejected": 1.3100701570510864, "step": 11650 }, { "epoch": 0.5413436092669112, "grad_norm": 48.682857513427734, "learning_rate": 2.675472398904313e-07, "logits/chosen": -18.228662490844727, "logits/rejected": -17.493057250976562, "logps/chosen": -479.936279296875, "logps/rejected": -377.99395751953125, "loss": 0.6163, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.292509078979492, "rewards/margins": 0.6176159381866455, "rewards/rejected": 1.6748931407928467, "step": 11660 }, { "epoch": 0.5418078833743443, "grad_norm": 27.426889419555664, "learning_rate": 2.6751938344398533e-07, "logits/chosen": -18.753555297851562, "logits/rejected": -18.024452209472656, "logps/chosen": -375.1817321777344, "logps/rejected": -261.2638244628906, "loss": 0.5749, "rewards/accuracies": 0.5, "rewards/chosen": 1.5995748043060303, "rewards/margins": 0.549771249294281, "rewards/rejected": 1.0498038530349731, "step": 11670 }, { "epoch": 0.5422721574817773, "grad_norm": 14.617643356323242, "learning_rate": 2.674915269975393e-07, "logits/chosen": -18.526552200317383, "logits/rejected": -17.894329071044922, "logps/chosen": -448.8882751464844, "logps/rejected": -382.6560974121094, "loss": 0.4608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0696475505828857, "rewards/margins": 0.7212823033332825, "rewards/rejected": 1.348365306854248, "step": 11680 }, { "epoch": 0.5427364315892103, "grad_norm": 15.330382347106934, "learning_rate": 2.6746367055109336e-07, "logits/chosen": -18.210886001586914, "logits/rejected": -16.54323959350586, "logps/chosen": -482.5023498535156, "logps/rejected": -268.1309814453125, "loss": 0.6307, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.141829252243042, "rewards/margins": 1.0013291835784912, "rewards/rejected": 1.1405000686645508, "step": 11690 }, { "epoch": 0.5432007056966432, "grad_norm": 40.66756820678711, "learning_rate": 2.6743581410464735e-07, "logits/chosen": -17.856054306030273, "logits/rejected": -17.103206634521484, "logps/chosen": -371.91876220703125, "logps/rejected": -309.3486328125, "loss": 0.7481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.98760187625885, "rewards/margins": 0.2824341356754303, "rewards/rejected": 1.7051677703857422, "step": 11700 }, { "epoch": 0.5436649798040764, "grad_norm": 
49.45347595214844, "learning_rate": 2.674079576582014e-07, "logits/chosen": -18.178295135498047, "logits/rejected": -18.431621551513672, "logps/chosen": -253.462646484375, "logps/rejected": -268.3277893066406, "loss": 1.0082, "rewards/accuracies": 0.5, "rewards/chosen": 1.5982189178466797, "rewards/margins": -0.3625188171863556, "rewards/rejected": 1.960737943649292, "step": 11710 }, { "epoch": 0.5441292539115093, "grad_norm": 57.32931900024414, "learning_rate": 2.673801012117554e-07, "logits/chosen": -18.742534637451172, "logits/rejected": -18.45769691467285, "logps/chosen": -421.5636291503906, "logps/rejected": -279.57147216796875, "loss": 0.7382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8690149784088135, "rewards/margins": 0.3986744284629822, "rewards/rejected": 1.4703404903411865, "step": 11720 }, { "epoch": 0.5445935280189423, "grad_norm": 79.14718627929688, "learning_rate": 2.673522447653094e-07, "logits/chosen": -18.213237762451172, "logits/rejected": -18.095882415771484, "logps/chosen": -365.9367370605469, "logps/rejected": -408.166015625, "loss": 0.8144, "rewards/accuracies": 0.5, "rewards/chosen": 2.1403231620788574, "rewards/margins": 0.006990480236709118, "rewards/rejected": 2.1333327293395996, "step": 11730 }, { "epoch": 0.5450578021263754, "grad_norm": 85.1968765258789, "learning_rate": 2.6732438831886346e-07, "logits/chosen": -18.42132568359375, "logits/rejected": -17.586549758911133, "logps/chosen": -368.61920166015625, "logps/rejected": -237.61129760742188, "loss": 0.4559, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1209444999694824, "rewards/margins": 0.9855837821960449, "rewards/rejected": 1.135360598564148, "step": 11740 }, { "epoch": 0.5455220762338084, "grad_norm": 119.2213134765625, "learning_rate": 2.672965318724175e-07, "logits/chosen": -18.094501495361328, "logits/rejected": -16.896135330200195, "logps/chosen": -521.2369384765625, "logps/rejected": -360.798828125, "loss": 0.7062, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.1210107803344727, "rewards/margins": 0.7085792422294617, "rewards/rejected": 2.412431240081787, "step": 11750 }, { "epoch": 0.5459863503412414, "grad_norm": 130.64466857910156, "learning_rate": 2.672686754259715e-07, "logits/chosen": -18.604230880737305, "logits/rejected": -18.996891021728516, "logps/chosen": -456.503173828125, "logps/rejected": -458.5978088378906, "loss": 0.8081, "rewards/accuracies": 0.5, "rewards/chosen": 2.277634382247925, "rewards/margins": -0.14284197986125946, "rewards/rejected": 2.4204764366149902, "step": 11760 }, { "epoch": 0.5464506244486745, "grad_norm": 193.04656982421875, "learning_rate": 2.672408189795255e-07, "logits/chosen": -17.248971939086914, "logits/rejected": -18.706478118896484, "logps/chosen": -332.0824890136719, "logps/rejected": -429.2379455566406, "loss": 1.1055, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7952120304107666, "rewards/margins": -0.588928759098053, "rewards/rejected": 2.384140729904175, "step": 11770 }, { "epoch": 0.5469148985561075, "grad_norm": 42.14503479003906, "learning_rate": 2.672129625330795e-07, "logits/chosen": -18.717985153198242, "logits/rejected": -17.672800064086914, "logps/chosen": -316.1011962890625, "logps/rejected": -239.0089874267578, "loss": 0.4796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8606176376342773, "rewards/margins": 0.6148918271064758, "rewards/rejected": 1.2457257509231567, "step": 11780 }, { "epoch": 0.5473791726635405, "grad_norm": 47.96302795410156, "learning_rate": 2.6718510608663356e-07, "logits/chosen": -19.18024253845215, "logits/rejected": -18.376232147216797, "logps/chosen": -335.93548583984375, "logps/rejected": -266.1518249511719, "loss": 0.5795, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6717075109481812, "rewards/margins": 0.45172223448753357, "rewards/rejected": 1.2199853658676147, "step": 11790 }, { "epoch": 0.5478434467709736, "grad_norm": 115.37519836425781, 
"learning_rate": 2.6715724964018755e-07, "logits/chosen": -19.5375919342041, "logits/rejected": -18.150684356689453, "logps/chosen": -440.1275329589844, "logps/rejected": -315.7803955078125, "loss": 0.6458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9295333623886108, "rewards/margins": 0.42747506499290466, "rewards/rejected": 1.5020582675933838, "step": 11800 }, { "epoch": 0.5483077208784066, "grad_norm": 82.13468170166016, "learning_rate": 2.671293931937416e-07, "logits/chosen": -18.799835205078125, "logits/rejected": -18.364736557006836, "logps/chosen": -487.269775390625, "logps/rejected": -457.25299072265625, "loss": 0.5593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8541505336761475, "rewards/margins": 0.44083571434020996, "rewards/rejected": 2.4133145809173584, "step": 11810 }, { "epoch": 0.5487719949858396, "grad_norm": 59.988494873046875, "learning_rate": 2.671015367472956e-07, "logits/chosen": -18.396114349365234, "logits/rejected": -17.500202178955078, "logps/chosen": -347.9141540527344, "logps/rejected": -297.75042724609375, "loss": 0.6393, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8684791326522827, "rewards/margins": 0.4357028901576996, "rewards/rejected": 1.4327760934829712, "step": 11820 }, { "epoch": 0.5492362690932727, "grad_norm": 57.26295852661133, "learning_rate": 2.670736803008496e-07, "logits/chosen": -18.809396743774414, "logits/rejected": -18.251049041748047, "logps/chosen": -468.56866455078125, "logps/rejected": -387.7340393066406, "loss": 0.7315, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.8635683059692383, "rewards/margins": -0.038564376533031464, "rewards/rejected": 1.9021327495574951, "step": 11830 }, { "epoch": 0.5497005432007057, "grad_norm": 63.23893356323242, "learning_rate": 2.670458238544036e-07, "logits/chosen": -18.020261764526367, "logits/rejected": -17.79218101501465, "logps/chosen": -381.9026184082031, "logps/rejected": -312.91912841796875, "loss": 
0.4625, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1071786880493164, "rewards/margins": 0.7101637721061707, "rewards/rejected": 1.397014856338501, "step": 11840 }, { "epoch": 0.5501648173081387, "grad_norm": 124.40662384033203, "learning_rate": 2.6701796740795764e-07, "logits/chosen": -18.09427261352539, "logits/rejected": -17.375728607177734, "logps/chosen": -473.5352478027344, "logps/rejected": -422.23541259765625, "loss": 0.517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.719136953353882, "rewards/margins": 0.7036765813827515, "rewards/rejected": 2.01546049118042, "step": 11850 }, { "epoch": 0.5506290914155717, "grad_norm": 41.03277587890625, "learning_rate": 2.669901109615117e-07, "logits/chosen": -18.193803787231445, "logits/rejected": -17.72792625427246, "logps/chosen": -272.7520446777344, "logps/rejected": -260.1134948730469, "loss": 0.723, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6292619705200195, "rewards/margins": 0.08919437229633331, "rewards/rejected": 1.5400677919387817, "step": 11860 }, { "epoch": 0.5510933655230048, "grad_norm": 51.9048957824707, "learning_rate": 2.669622545150657e-07, "logits/chosen": -17.543363571166992, "logits/rejected": -18.426000595092773, "logps/chosen": -332.3017578125, "logps/rejected": -404.8661804199219, "loss": 1.1642, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1115641593933105, "rewards/margins": -0.5655218362808228, "rewards/rejected": 2.677086353302002, "step": 11870 }, { "epoch": 0.5515576396304378, "grad_norm": 114.93079376220703, "learning_rate": 2.669343980686197e-07, "logits/chosen": -18.28158950805664, "logits/rejected": -18.810062408447266, "logps/chosen": -483.07135009765625, "logps/rejected": -470.7198181152344, "loss": 1.2103, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": 1.8011077642440796, "rewards/margins": -0.7535358667373657, "rewards/rejected": 2.5546436309814453, "step": 11880 }, { "epoch": 0.5520219137378708, 
"grad_norm": 255.9281005859375, "learning_rate": 2.669065416221737e-07, "logits/chosen": -18.504684448242188, "logits/rejected": -17.9859676361084, "logps/chosen": -372.6310119628906, "logps/rejected": -357.24481201171875, "loss": 0.8259, "rewards/accuracies": 0.5, "rewards/chosen": 1.6599041223526, "rewards/margins": -0.07777427136898041, "rewards/rejected": 1.7376785278320312, "step": 11890 }, { "epoch": 0.5524861878453039, "grad_norm": 35.06096649169922, "learning_rate": 2.6687868517572774e-07, "logits/chosen": -19.647563934326172, "logits/rejected": -18.074230194091797, "logps/chosen": -501.6493225097656, "logps/rejected": -328.0559387207031, "loss": 0.4773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.91660737991333, "rewards/margins": 1.00783371925354, "rewards/rejected": 1.9087737798690796, "step": 11900 }, { "epoch": 0.5529504619527369, "grad_norm": 43.97743225097656, "learning_rate": 2.6685082872928173e-07, "logits/chosen": -17.12654685974121, "logits/rejected": -16.942487716674805, "logps/chosen": -248.2146759033203, "logps/rejected": -240.23117065429688, "loss": 0.9293, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1511764526367188, "rewards/margins": -0.17461183667182922, "rewards/rejected": 1.3257882595062256, "step": 11910 }, { "epoch": 0.5534147360601699, "grad_norm": 52.372581481933594, "learning_rate": 2.6682297228283577e-07, "logits/chosen": -18.173419952392578, "logits/rejected": -17.375308990478516, "logps/chosen": -363.7801818847656, "logps/rejected": -245.7982635498047, "loss": 0.5044, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1894211769104004, "rewards/margins": 0.9049045443534851, "rewards/rejected": 1.284516453742981, "step": 11920 }, { "epoch": 0.553879010167603, "grad_norm": 64.5362777709961, "learning_rate": 2.667951158363898e-07, "logits/chosen": -18.370561599731445, "logits/rejected": -17.450021743774414, "logps/chosen": -377.9519348144531, "logps/rejected": -266.10980224609375, 
"loss": 0.6459, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8556087017059326, "rewards/margins": 0.2735968828201294, "rewards/rejected": 1.5820119380950928, "step": 11930 }, { "epoch": 0.554343284275036, "grad_norm": 59.757362365722656, "learning_rate": 2.667672593899438e-07, "logits/chosen": -18.849111557006836, "logits/rejected": -18.589122772216797, "logps/chosen": -544.9817504882812, "logps/rejected": -386.82501220703125, "loss": 0.8409, "rewards/accuracies": 0.5, "rewards/chosen": 2.187269449234009, "rewards/margins": 0.1640377938747406, "rewards/rejected": 2.0232319831848145, "step": 11940 }, { "epoch": 0.554807558382469, "grad_norm": 49.51484298706055, "learning_rate": 2.667394029434978e-07, "logits/chosen": -17.159399032592773, "logits/rejected": -17.520984649658203, "logps/chosen": -271.67303466796875, "logps/rejected": -254.6818084716797, "loss": 0.8901, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.2209947109222412, "rewards/margins": -0.28877896070480347, "rewards/rejected": 1.5097734928131104, "step": 11950 }, { "epoch": 0.5552718324899021, "grad_norm": 64.0462417602539, "learning_rate": 2.6671154649705183e-07, "logits/chosen": -19.858692169189453, "logits/rejected": -19.315353393554688, "logps/chosen": -379.7005310058594, "logps/rejected": -366.9212646484375, "loss": 0.7431, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9882396459579468, "rewards/margins": 0.10641038417816162, "rewards/rejected": 1.8818292617797852, "step": 11960 }, { "epoch": 0.5557361065973351, "grad_norm": 88.2206802368164, "learning_rate": 2.6668369005060587e-07, "logits/chosen": -19.530624389648438, "logits/rejected": -18.7314395904541, "logps/chosen": -370.8061218261719, "logps/rejected": -340.8643798828125, "loss": 0.8836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8309030532836914, "rewards/margins": 0.10136673599481583, "rewards/rejected": 1.7295360565185547, "step": 11970 }, { "epoch": 0.5562003807047681, 
"grad_norm": 168.95822143554688, "learning_rate": 2.666558336041599e-07, "logits/chosen": -17.77692985534668, "logits/rejected": -17.99579429626465, "logps/chosen": -262.05352783203125, "logps/rejected": -298.6200256347656, "loss": 0.8675, "rewards/accuracies": 0.5, "rewards/chosen": 1.4541596174240112, "rewards/margins": -0.04495656490325928, "rewards/rejected": 1.4991161823272705, "step": 11980 }, { "epoch": 0.5566646548122012, "grad_norm": 39.831241607666016, "learning_rate": 2.666279771577139e-07, "logits/chosen": -18.32111167907715, "logits/rejected": -18.13201141357422, "logps/chosen": -345.8668518066406, "logps/rejected": -326.732421875, "loss": 0.5592, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.194533586502075, "rewards/margins": 0.4986952841281891, "rewards/rejected": 1.6958383321762085, "step": 11990 }, { "epoch": 0.5571289289196342, "grad_norm": 70.7596435546875, "learning_rate": 2.6660012071126794e-07, "logits/chosen": -18.213085174560547, "logits/rejected": -17.477388381958008, "logps/chosen": -391.9312438964844, "logps/rejected": -321.24615478515625, "loss": 0.725, "rewards/accuracies": 0.5, "rewards/chosen": 1.7636430263519287, "rewards/margins": 0.004720914177596569, "rewards/rejected": 1.7589222192764282, "step": 12000 }, { "epoch": 0.5575932030270672, "grad_norm": 33.69586944580078, "learning_rate": 2.6657226426482193e-07, "logits/chosen": -17.490633010864258, "logits/rejected": -17.613866806030273, "logps/chosen": -252.67459106445312, "logps/rejected": -315.8766174316406, "loss": 0.9373, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.1247727870941162, "rewards/margins": -0.30080047249794006, "rewards/rejected": 1.4255733489990234, "step": 12010 }, { "epoch": 0.5580574771345002, "grad_norm": 9.32002067565918, "learning_rate": 2.6654440781837597e-07, "logits/chosen": -18.878524780273438, "logits/rejected": -18.366764068603516, "logps/chosen": -412.9029846191406, "logps/rejected": -402.9871826171875, "loss": 0.7222, 
"rewards/accuracies": 0.5, "rewards/chosen": 2.2210583686828613, "rewards/margins": 0.3307151198387146, "rewards/rejected": 1.8903430700302124, "step": 12020 }, { "epoch": 0.5585217512419333, "grad_norm": 125.24077606201172, "learning_rate": 2.6651655137192996e-07, "logits/chosen": -18.948448181152344, "logits/rejected": -17.93845558166504, "logps/chosen": -605.287353515625, "logps/rejected": -350.3583984375, "loss": 0.3514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6149470806121826, "rewards/margins": 1.1322208642959595, "rewards/rejected": 1.4827263355255127, "step": 12030 }, { "epoch": 0.5589860253493663, "grad_norm": 42.664710998535156, "learning_rate": 2.66488694925484e-07, "logits/chosen": -17.626354217529297, "logits/rejected": -16.74855613708496, "logps/chosen": -398.9732971191406, "logps/rejected": -257.943603515625, "loss": 0.406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4280447959899902, "rewards/margins": 1.0330543518066406, "rewards/rejected": 1.3949902057647705, "step": 12040 }, { "epoch": 0.5594502994567992, "grad_norm": 78.18126678466797, "learning_rate": 2.6646083847903804e-07, "logits/chosen": -18.947790145874023, "logits/rejected": -18.304977416992188, "logps/chosen": -384.75213623046875, "logps/rejected": -301.0356140136719, "loss": 0.5536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1621813774108887, "rewards/margins": 0.4460034966468811, "rewards/rejected": 1.7161781787872314, "step": 12050 }, { "epoch": 0.5599145735642324, "grad_norm": 117.26888275146484, "learning_rate": 2.6643298203259203e-07, "logits/chosen": -18.55638885498047, "logits/rejected": -17.1131591796875, "logps/chosen": -329.54010009765625, "logps/rejected": -200.62008666992188, "loss": 0.6189, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8337688446044922, "rewards/margins": 0.5866293907165527, "rewards/rejected": 1.2471394538879395, "step": 12060 }, { "epoch": 0.5603788476716653, "grad_norm": 
83.02647399902344, "learning_rate": 2.66405125586146e-07, "logits/chosen": -18.133747100830078, "logits/rejected": -17.736316680908203, "logps/chosen": -420.44091796875, "logps/rejected": -371.2409973144531, "loss": 0.772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.032853841781616, "rewards/margins": 0.03957364708185196, "rewards/rejected": 1.9932801723480225, "step": 12070 }, { "epoch": 0.5608431217790983, "grad_norm": 38.18897247314453, "learning_rate": 2.6637726913970006e-07, "logits/chosen": -17.698612213134766, "logits/rejected": -16.94989776611328, "logps/chosen": -325.3304748535156, "logps/rejected": -285.3477478027344, "loss": 0.9566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6389182806015015, "rewards/margins": 0.05081608146429062, "rewards/rejected": 1.5881023406982422, "step": 12080 }, { "epoch": 0.5613073958865314, "grad_norm": 41.27471160888672, "learning_rate": 2.663494126932541e-07, "logits/chosen": -19.169906616210938, "logits/rejected": -18.15829849243164, "logps/chosen": -368.8917236328125, "logps/rejected": -274.2706604003906, "loss": 0.5244, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.041714906692505, "rewards/margins": 0.49507492780685425, "rewards/rejected": 1.5466398000717163, "step": 12090 }, { "epoch": 0.5617716699939644, "grad_norm": 49.82801818847656, "learning_rate": 2.663215562468081e-07, "logits/chosen": -18.63461685180664, "logits/rejected": -17.888408660888672, "logps/chosen": -375.0310363769531, "logps/rejected": -377.57330322265625, "loss": 0.7996, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6734119653701782, "rewards/margins": -0.007943153381347656, "rewards/rejected": 1.6813548803329468, "step": 12100 }, { "epoch": 0.5622359441013974, "grad_norm": 83.64020538330078, "learning_rate": 2.6629369980036213e-07, "logits/chosen": -18.186626434326172, "logits/rejected": -18.537080764770508, "logps/chosen": -376.8611755371094, "logps/rejected": -430.51763916015625, 
"loss": 1.1596, "rewards/accuracies": 0.5, "rewards/chosen": 2.3428263664245605, "rewards/margins": -0.5231986045837402, "rewards/rejected": 2.8660247325897217, "step": 12110 }, { "epoch": 0.5627002182088305, "grad_norm": 93.0984878540039, "learning_rate": 2.662658433539161e-07, "logits/chosen": -18.72801971435547, "logits/rejected": -18.898820877075195, "logps/chosen": -308.9087219238281, "logps/rejected": -246.09518432617188, "loss": 0.5327, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9244375228881836, "rewards/margins": 0.5091174840927124, "rewards/rejected": 1.4153201580047607, "step": 12120 }, { "epoch": 0.5631644923162635, "grad_norm": 36.2384147644043, "learning_rate": 2.6623798690747016e-07, "logits/chosen": -18.59854507446289, "logits/rejected": -17.9345645904541, "logps/chosen": -355.5978088378906, "logps/rejected": -252.0246124267578, "loss": 0.3929, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0709261894226074, "rewards/margins": 1.2342536449432373, "rewards/rejected": 0.8366729021072388, "step": 12130 }, { "epoch": 0.5636287664236965, "grad_norm": 32.25520324707031, "learning_rate": 2.6621013046102415e-07, "logits/chosen": -19.47924041748047, "logits/rejected": -17.24740982055664, "logps/chosen": -439.5234375, "logps/rejected": -163.25637817382812, "loss": 0.4355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.36369252204895, "rewards/margins": 1.2551677227020264, "rewards/rejected": 1.108525037765503, "step": 12140 }, { "epoch": 0.5640930405311296, "grad_norm": 41.41872024536133, "learning_rate": 2.661822740145782e-07, "logits/chosen": -18.5991153717041, "logits/rejected": -18.24594497680664, "logps/chosen": -542.6485595703125, "logps/rejected": -467.915771484375, "loss": 0.4784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.946016788482666, "rewards/margins": 0.7175230383872986, "rewards/rejected": 2.2284939289093018, "step": 12150 }, { "epoch": 0.5645573146385626, "grad_norm": 
77.03783416748047, "learning_rate": 2.6615441756813223e-07, "logits/chosen": -18.36737632751465, "logits/rejected": -17.294063568115234, "logps/chosen": -438.8353576660156, "logps/rejected": -331.16741943359375, "loss": 0.4346, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3710522651672363, "rewards/margins": 0.8870233297348022, "rewards/rejected": 1.4840288162231445, "step": 12160 }, { "epoch": 0.5650215887459956, "grad_norm": 19.54596710205078, "learning_rate": 2.6612656112168627e-07, "logits/chosen": -19.31472396850586, "logits/rejected": -18.304597854614258, "logps/chosen": -444.1695251464844, "logps/rejected": -271.69781494140625, "loss": 0.3829, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4932878017425537, "rewards/margins": 1.0348279476165771, "rewards/rejected": 1.4584596157073975, "step": 12170 }, { "epoch": 0.5654858628534286, "grad_norm": 15.315469741821289, "learning_rate": 2.6609870467524026e-07, "logits/chosen": -17.702821731567383, "logits/rejected": -17.801877975463867, "logps/chosen": -310.36553955078125, "logps/rejected": -404.7042236328125, "loss": 1.1886, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9661279916763306, "rewards/margins": -0.37922272086143494, "rewards/rejected": 2.345350742340088, "step": 12180 }, { "epoch": 0.5659501369608617, "grad_norm": 85.82913970947266, "learning_rate": 2.6607084822879425e-07, "logits/chosen": -18.930410385131836, "logits/rejected": -18.055574417114258, "logps/chosen": -444.17608642578125, "logps/rejected": -410.69732666015625, "loss": 0.7541, "rewards/accuracies": 0.5, "rewards/chosen": 2.007589101791382, "rewards/margins": 0.15559478104114532, "rewards/rejected": 1.8519941568374634, "step": 12190 }, { "epoch": 0.5664144110682947, "grad_norm": 80.07551574707031, "learning_rate": 2.660429917823483e-07, "logits/chosen": -18.49392318725586, "logits/rejected": -17.814802169799805, "logps/chosen": -402.0981140136719, "logps/rejected": -275.6632995605469, "loss": 
0.5376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.49324893951416, "rewards/margins": 0.6911532878875732, "rewards/rejected": 1.8020957708358765, "step": 12200 }, { "epoch": 0.5668786851757277, "grad_norm": 69.64688873291016, "learning_rate": 2.6601513533590233e-07, "logits/chosen": -18.32483673095703, "logits/rejected": -18.31100082397461, "logps/chosen": -350.30078125, "logps/rejected": -356.453125, "loss": 0.5911, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.020369052886963, "rewards/margins": 0.31857675313949585, "rewards/rejected": 1.7017924785614014, "step": 12210 }, { "epoch": 0.5673429592831608, "grad_norm": 4.196617603302002, "learning_rate": 2.659872788894563e-07, "logits/chosen": -17.581235885620117, "logits/rejected": -17.328039169311523, "logps/chosen": -348.1463317871094, "logps/rejected": -280.84967041015625, "loss": 0.6854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.478980541229248, "rewards/margins": 0.9762727618217468, "rewards/rejected": 1.5027077198028564, "step": 12220 }, { "epoch": 0.5678072333905938, "grad_norm": 50.66926956176758, "learning_rate": 2.6595942244301036e-07, "logits/chosen": -17.775737762451172, "logits/rejected": -15.910761833190918, "logps/chosen": -487.56341552734375, "logps/rejected": -257.96881103515625, "loss": 0.2762, "rewards/accuracies": 1.0, "rewards/chosen": 2.517406702041626, "rewards/margins": 1.3827921152114868, "rewards/rejected": 1.1346147060394287, "step": 12230 }, { "epoch": 0.5682715074980268, "grad_norm": 69.61091613769531, "learning_rate": 2.6593156599656434e-07, "logits/chosen": -17.551633834838867, "logits/rejected": -17.133987426757812, "logps/chosen": -355.5244445800781, "logps/rejected": -316.8568420410156, "loss": 0.6962, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0112035274505615, "rewards/margins": 0.04677395895123482, "rewards/rejected": 1.9644296169281006, "step": 12240 }, { "epoch": 0.5687357816054599, "grad_norm": 
82.54253387451172, "learning_rate": 2.659037095501184e-07, "logits/chosen": -18.887683868408203, "logits/rejected": -18.665245056152344, "logps/chosen": -466.5708923339844, "logps/rejected": -493.1478576660156, "loss": 0.8903, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.237046003341675, "rewards/margins": -0.2582220733165741, "rewards/rejected": 2.4952681064605713, "step": 12250 }, { "epoch": 0.5692000557128929, "grad_norm": 144.96505737304688, "learning_rate": 2.6587585310367237e-07, "logits/chosen": -18.306636810302734, "logits/rejected": -18.637737274169922, "logps/chosen": -280.687255859375, "logps/rejected": -305.0877990722656, "loss": 0.9991, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.5588370561599731, "rewards/margins": -0.3590823709964752, "rewards/rejected": 1.917919397354126, "step": 12260 }, { "epoch": 0.5696643298203259, "grad_norm": 78.0849380493164, "learning_rate": 2.658479966572264e-07, "logits/chosen": -19.291244506835938, "logits/rejected": -18.753999710083008, "logps/chosen": -352.96270751953125, "logps/rejected": -254.8818817138672, "loss": 0.5689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9239912033081055, "rewards/margins": 0.5452044606208801, "rewards/rejected": 1.3787866830825806, "step": 12270 }, { "epoch": 0.570128603927759, "grad_norm": 223.10226440429688, "learning_rate": 2.6582014021078046e-07, "logits/chosen": -18.364351272583008, "logits/rejected": -17.689517974853516, "logps/chosen": -441.55670166015625, "logps/rejected": -362.92071533203125, "loss": 0.4827, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.56166934967041, "rewards/margins": 0.8614040613174438, "rewards/rejected": 1.7002652883529663, "step": 12280 }, { "epoch": 0.570592878035192, "grad_norm": 69.56698608398438, "learning_rate": 2.6579228376433444e-07, "logits/chosen": -19.98422622680664, "logits/rejected": -19.163015365600586, "logps/chosen": -444.51959228515625, "logps/rejected": -379.3284912109375, 
"loss": 0.5404, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2347989082336426, "rewards/margins": 0.4612864553928375, "rewards/rejected": 1.7735124826431274, "step": 12290 }, { "epoch": 0.571057152142625, "grad_norm": 5.9732513427734375, "learning_rate": 2.657644273178885e-07, "logits/chosen": -19.706016540527344, "logits/rejected": -18.12419319152832, "logps/chosen": -373.19927978515625, "logps/rejected": -305.73065185546875, "loss": 0.6453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3289237022399902, "rewards/margins": 0.5277485847473145, "rewards/rejected": 1.8011751174926758, "step": 12300 }, { "epoch": 0.5715214262500581, "grad_norm": 44.695655822753906, "learning_rate": 2.6573657087144247e-07, "logits/chosen": -18.411075592041016, "logits/rejected": -17.728818893432617, "logps/chosen": -428.1891174316406, "logps/rejected": -357.72503662109375, "loss": 0.5558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1627633571624756, "rewards/margins": 0.7188488841056824, "rewards/rejected": 1.443914532661438, "step": 12310 }, { "epoch": 0.5719857003574911, "grad_norm": 89.99002838134766, "learning_rate": 2.657087144249965e-07, "logits/chosen": -18.491226196289062, "logits/rejected": -17.772619247436523, "logps/chosen": -329.260498046875, "logps/rejected": -309.06072998046875, "loss": 0.6582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.283036470413208, "rewards/margins": 0.5162172317504883, "rewards/rejected": 1.7668187618255615, "step": 12320 }, { "epoch": 0.5724499744649241, "grad_norm": 58.3849983215332, "learning_rate": 2.656808579785505e-07, "logits/chosen": -18.7125301361084, "logits/rejected": -18.466999053955078, "logps/chosen": -407.5187072753906, "logps/rejected": -402.27947998046875, "loss": 0.7056, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3971848487854004, "rewards/margins": 0.24619755148887634, "rewards/rejected": 2.1509876251220703, "step": 12330 }, { "epoch": 
0.5729142485723571, "grad_norm": 109.06945037841797, "learning_rate": 2.6565300153210454e-07, "logits/chosen": -19.689329147338867, "logits/rejected": -17.882936477661133, "logps/chosen": -486.93157958984375, "logps/rejected": -321.1981506347656, "loss": 0.3677, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8128905296325684, "rewards/margins": 1.3211714029312134, "rewards/rejected": 1.491719126701355, "step": 12340 }, { "epoch": 0.5733785226797902, "grad_norm": 79.7154769897461, "learning_rate": 2.656251450856586e-07, "logits/chosen": -18.066268920898438, "logits/rejected": -17.75443458557129, "logps/chosen": -508.7518005371094, "logps/rejected": -436.79473876953125, "loss": 0.9329, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6042046546936035, "rewards/margins": -0.03481264039874077, "rewards/rejected": 2.639017105102539, "step": 12350 }, { "epoch": 0.5738427967872232, "grad_norm": 184.3318328857422, "learning_rate": 2.6559728863921257e-07, "logits/chosen": -19.54971694946289, "logits/rejected": -19.598007202148438, "logps/chosen": -474.4024353027344, "logps/rejected": -469.87274169921875, "loss": 0.5638, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.706979751586914, "rewards/margins": 0.36933690309524536, "rewards/rejected": 2.3376426696777344, "step": 12360 }, { "epoch": 0.5743070708946562, "grad_norm": 19.664966583251953, "learning_rate": 2.6556943219276656e-07, "logits/chosen": -19.203533172607422, "logits/rejected": -18.839914321899414, "logps/chosen": -428.42559814453125, "logps/rejected": -384.53118896484375, "loss": 0.8374, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2003672122955322, "rewards/margins": 0.026549244299530983, "rewards/rejected": 2.1738178730010986, "step": 12370 }, { "epoch": 0.5747713450020893, "grad_norm": 70.48260498046875, "learning_rate": 2.655415757463206e-07, "logits/chosen": -19.97379493713379, "logits/rejected": -18.832929611206055, "logps/chosen": -516.7210693359375, 
"logps/rejected": -375.95123291015625, "loss": 0.5064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4475362300872803, "rewards/margins": 0.6005433201789856, "rewards/rejected": 1.84699285030365, "step": 12380 }, { "epoch": 0.5752356191095223, "grad_norm": 44.90047073364258, "learning_rate": 2.6551371929987464e-07, "logits/chosen": -17.887981414794922, "logits/rejected": -17.010637283325195, "logps/chosen": -367.7540588378906, "logps/rejected": -263.29510498046875, "loss": 0.5569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9279139041900635, "rewards/margins": 0.5470678806304932, "rewards/rejected": 1.3808462619781494, "step": 12390 }, { "epoch": 0.5756998932169552, "grad_norm": 31.962175369262695, "learning_rate": 2.654858628534287e-07, "logits/chosen": -19.08402442932129, "logits/rejected": -17.27634048461914, "logps/chosen": -413.198486328125, "logps/rejected": -258.504150390625, "loss": 0.4125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2112107276916504, "rewards/margins": 1.043189525604248, "rewards/rejected": 1.1680214405059814, "step": 12400 }, { "epoch": 0.5761641673243884, "grad_norm": 22.861454010009766, "learning_rate": 2.6545800640698267e-07, "logits/chosen": -19.11958885192871, "logits/rejected": -17.801671981811523, "logps/chosen": -483.34942626953125, "logps/rejected": -339.2364807128906, "loss": 0.6698, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9409065246582031, "rewards/margins": 0.3939457833766937, "rewards/rejected": 1.5469605922698975, "step": 12410 }, { "epoch": 0.5766284414318213, "grad_norm": 184.7721710205078, "learning_rate": 2.654301499605367e-07, "logits/chosen": -18.408876419067383, "logits/rejected": -18.350017547607422, "logps/chosen": -349.7850341796875, "logps/rejected": -315.2781066894531, "loss": 0.8702, "rewards/accuracies": 0.5, "rewards/chosen": 2.1179652214050293, "rewards/margins": 0.017680037766695023, "rewards/rejected": 2.100285053253174, "step": 12420 
}, { "epoch": 0.5770927155392543, "grad_norm": 61.81156539916992, "learning_rate": 2.654022935140907e-07, "logits/chosen": -18.157560348510742, "logits/rejected": -17.327375411987305, "logps/chosen": -404.1091613769531, "logps/rejected": -363.8424987792969, "loss": 0.6577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8638759851455688, "rewards/margins": 0.31458884477615356, "rewards/rejected": 1.5492870807647705, "step": 12430 }, { "epoch": 0.5775569896466874, "grad_norm": 159.93759155273438, "learning_rate": 2.6537443706764474e-07, "logits/chosen": -18.355937957763672, "logits/rejected": -18.071414947509766, "logps/chosen": -388.6150207519531, "logps/rejected": -320.7958679199219, "loss": 0.7418, "rewards/accuracies": 0.5, "rewards/chosen": 1.8735727071762085, "rewards/margins": 0.19439572095870972, "rewards/rejected": 1.6791770458221436, "step": 12440 }, { "epoch": 0.5780212637541204, "grad_norm": 47.485172271728516, "learning_rate": 2.6534658062119873e-07, "logits/chosen": -19.584232330322266, "logits/rejected": -18.54676628112793, "logps/chosen": -452.50885009765625, "logps/rejected": -442.72882080078125, "loss": 0.4765, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.781151294708252, "rewards/margins": 0.7933472394943237, "rewards/rejected": 1.9878044128417969, "step": 12450 }, { "epoch": 0.5784855378615534, "grad_norm": 44.9210319519043, "learning_rate": 2.6531872417475277e-07, "logits/chosen": -18.972965240478516, "logits/rejected": -17.738706588745117, "logps/chosen": -515.6638793945312, "logps/rejected": -303.3658752441406, "loss": 0.3779, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1102709770202637, "rewards/margins": 1.3526713848114014, "rewards/rejected": 1.7575994729995728, "step": 12460 }, { "epoch": 0.5789498119689865, "grad_norm": 63.8904914855957, "learning_rate": 2.652908677283068e-07, "logits/chosen": -18.155521392822266, "logits/rejected": -17.85573959350586, "logps/chosen": -411.66778564453125, 
"logps/rejected": -361.1908264160156, "loss": 0.5445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0323920249938965, "rewards/margins": 0.46915197372436523, "rewards/rejected": 1.5632402896881104, "step": 12470 }, { "epoch": 0.5794140860764195, "grad_norm": 10.106801986694336, "learning_rate": 2.652630112818608e-07, "logits/chosen": -18.40157127380371, "logits/rejected": -17.459829330444336, "logps/chosen": -428.8436584472656, "logps/rejected": -317.198974609375, "loss": 0.4824, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4384264945983887, "rewards/margins": 0.9001708030700684, "rewards/rejected": 1.5382558107376099, "step": 12480 }, { "epoch": 0.5798783601838525, "grad_norm": 22.206621170043945, "learning_rate": 2.652351548354148e-07, "logits/chosen": -18.392175674438477, "logits/rejected": -17.306568145751953, "logps/chosen": -374.67852783203125, "logps/rejected": -227.489501953125, "loss": 0.4189, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8809293508529663, "rewards/margins": 0.9288301467895508, "rewards/rejected": 0.9520990252494812, "step": 12490 }, { "epoch": 0.5803426342912855, "grad_norm": 160.7908477783203, "learning_rate": 2.6520729838896883e-07, "logits/chosen": -17.79220199584961, "logits/rejected": -17.565147399902344, "logps/chosen": -236.36050415039062, "logps/rejected": -292.263671875, "loss": 0.6898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3415426015853882, "rewards/margins": 0.12760195136070251, "rewards/rejected": 1.2139407396316528, "step": 12500 }, { "epoch": 0.5808069083987186, "grad_norm": 143.4239044189453, "learning_rate": 2.6517944194252287e-07, "logits/chosen": -18.086437225341797, "logits/rejected": -17.917470932006836, "logps/chosen": -615.4963989257812, "logps/rejected": -490.90655517578125, "loss": 0.6122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1354408264160156, "rewards/margins": 0.39452701807022095, "rewards/rejected": 2.7409136295318604, 
"step": 12510 }, { "epoch": 0.5812711825061516, "grad_norm": 108.79698181152344, "learning_rate": 2.6515158549607686e-07, "logits/chosen": -18.38944435119629, "logits/rejected": -18.011699676513672, "logps/chosen": -465.6934509277344, "logps/rejected": -413.95654296875, "loss": 0.5449, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4090991020202637, "rewards/margins": 0.583681583404541, "rewards/rejected": 1.8254177570343018, "step": 12520 }, { "epoch": 0.5817354566135846, "grad_norm": 99.89503479003906, "learning_rate": 2.651237290496309e-07, "logits/chosen": -18.25686264038086, "logits/rejected": -17.390451431274414, "logps/chosen": -468.16094970703125, "logps/rejected": -404.44366455078125, "loss": 0.6885, "rewards/accuracies": 0.5, "rewards/chosen": 2.712703227996826, "rewards/margins": 0.4608820080757141, "rewards/rejected": 2.251821279525757, "step": 12530 }, { "epoch": 0.5821997307210177, "grad_norm": 86.22895812988281, "learning_rate": 2.650958726031849e-07, "logits/chosen": -18.210533142089844, "logits/rejected": -17.898853302001953, "logps/chosen": -434.99371337890625, "logps/rejected": -428.2061462402344, "loss": 1.0134, "rewards/accuracies": 0.5, "rewards/chosen": 1.9356778860092163, "rewards/margins": -0.3530900776386261, "rewards/rejected": 2.2887682914733887, "step": 12540 }, { "epoch": 0.5826640048284507, "grad_norm": 31.3911075592041, "learning_rate": 2.6506801615673893e-07, "logits/chosen": -18.093475341796875, "logits/rejected": -17.534526824951172, "logps/chosen": -480.4139709472656, "logps/rejected": -396.31475830078125, "loss": 0.498, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9864675998687744, "rewards/margins": 0.6819299459457397, "rewards/rejected": 2.304537773132324, "step": 12550 }, { "epoch": 0.5831282789358837, "grad_norm": 106.4724349975586, "learning_rate": 2.650401597102929e-07, "logits/chosen": -18.658666610717773, "logits/rejected": -17.375736236572266, "logps/chosen": -366.612548828125, 
"logps/rejected": -266.02996826171875, "loss": 0.5104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1816179752349854, "rewards/margins": 0.6505368947982788, "rewards/rejected": 1.531080961227417, "step": 12560 }, { "epoch": 0.5835925530433168, "grad_norm": 109.30895233154297, "learning_rate": 2.6501230326384696e-07, "logits/chosen": -18.090030670166016, "logits/rejected": -17.621089935302734, "logps/chosen": -361.87506103515625, "logps/rejected": -310.02288818359375, "loss": 0.7346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9652564525604248, "rewards/margins": 0.40762728452682495, "rewards/rejected": 1.5576292276382446, "step": 12570 }, { "epoch": 0.5840568271507498, "grad_norm": 66.300537109375, "learning_rate": 2.64984446817401e-07, "logits/chosen": -19.340608596801758, "logits/rejected": -18.653322219848633, "logps/chosen": -414.7408752441406, "logps/rejected": -322.1158752441406, "loss": 0.5048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9946715831756592, "rewards/margins": 0.6958451867103577, "rewards/rejected": 1.298826813697815, "step": 12580 }, { "epoch": 0.5845211012581828, "grad_norm": 125.71319580078125, "learning_rate": 2.6495659037095504e-07, "logits/chosen": -19.944896697998047, "logits/rejected": -19.378131866455078, "logps/chosen": -372.94940185546875, "logps/rejected": -381.19158935546875, "loss": 0.6034, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8909809589385986, "rewards/margins": 0.29461851716041565, "rewards/rejected": 1.5963621139526367, "step": 12590 }, { "epoch": 0.5849853753656159, "grad_norm": 70.37755584716797, "learning_rate": 2.64928733924509e-07, "logits/chosen": -17.404193878173828, "logits/rejected": -16.829219818115234, "logps/chosen": -256.03094482421875, "logps/rejected": -227.1318817138672, "loss": 0.6757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.411536693572998, "rewards/margins": 0.232045978307724, "rewards/rejected": 
1.1794906854629517, "step": 12600 }, { "epoch": 0.5854496494730489, "grad_norm": 59.42629623413086, "learning_rate": 2.64900877478063e-07, "logits/chosen": -19.349878311157227, "logits/rejected": -18.84526824951172, "logps/chosen": -437.36553955078125, "logps/rejected": -375.76055908203125, "loss": 0.5767, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6576669216156006, "rewards/margins": 0.3557928502559662, "rewards/rejected": 2.3018739223480225, "step": 12610 }, { "epoch": 0.5859139235804819, "grad_norm": 208.2921600341797, "learning_rate": 2.6487302103161706e-07, "logits/chosen": -19.168668746948242, "logits/rejected": -18.379634857177734, "logps/chosen": -415.57403564453125, "logps/rejected": -382.8130187988281, "loss": 0.8463, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.4136369228363037, "rewards/margins": -0.04863288253545761, "rewards/rejected": 2.4622700214385986, "step": 12620 }, { "epoch": 0.586378197687915, "grad_norm": 61.371063232421875, "learning_rate": 2.648451645851711e-07, "logits/chosen": -19.55426025390625, "logits/rejected": -18.636123657226562, "logps/chosen": -450.08306884765625, "logps/rejected": -369.7073059082031, "loss": 0.6976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.236666202545166, "rewards/margins": 0.17636337876319885, "rewards/rejected": 2.060302734375, "step": 12630 }, { "epoch": 0.586842471795348, "grad_norm": 75.33267974853516, "learning_rate": 2.648173081387251e-07, "logits/chosen": -17.774723052978516, "logits/rejected": -17.072214126586914, "logps/chosen": -312.71246337890625, "logps/rejected": -308.3934326171875, "loss": 0.7952, "rewards/accuracies": 0.5, "rewards/chosen": 1.858560562133789, "rewards/margins": 0.24671702086925507, "rewards/rejected": 1.611843466758728, "step": 12640 }, { "epoch": 0.587306745902781, "grad_norm": 65.25315856933594, "learning_rate": 2.647894516922791e-07, "logits/chosen": -18.32620620727539, "logits/rejected": -18.163074493408203, 
"logps/chosen": -390.0698547363281, "logps/rejected": -379.7672119140625, "loss": 0.6808, "rewards/accuracies": 0.5, "rewards/chosen": 1.671770691871643, "rewards/margins": 0.07708879560232162, "rewards/rejected": 1.594681978225708, "step": 12650 }, { "epoch": 0.5877710200102141, "grad_norm": 32.0077018737793, "learning_rate": 2.647615952458331e-07, "logits/chosen": -17.877857208251953, "logits/rejected": -17.725160598754883, "logps/chosen": -366.81005859375, "logps/rejected": -310.6327209472656, "loss": 0.7252, "rewards/accuracies": 0.5, "rewards/chosen": 1.776780128479004, "rewards/margins": 0.03001350536942482, "rewards/rejected": 1.746766448020935, "step": 12660 }, { "epoch": 0.5882352941176471, "grad_norm": 41.936607360839844, "learning_rate": 2.6473373879938715e-07, "logits/chosen": -17.807769775390625, "logits/rejected": -17.037616729736328, "logps/chosen": -379.3355712890625, "logps/rejected": -269.12152099609375, "loss": 0.5898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9038498401641846, "rewards/margins": 0.4161299765110016, "rewards/rejected": 1.4877197742462158, "step": 12670 }, { "epoch": 0.5886995682250801, "grad_norm": 97.58657836914062, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -18.174875259399414, "logits/rejected": -17.191526412963867, "logps/chosen": -414.83673095703125, "logps/rejected": -236.8614044189453, "loss": 0.439, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6434125900268555, "rewards/margins": 1.233949899673462, "rewards/rejected": 1.4094629287719727, "step": 12680 }, { "epoch": 0.5891638423325131, "grad_norm": 269.1921081542969, "learning_rate": 2.646780259064952e-07, "logits/chosen": -18.454082489013672, "logits/rejected": -18.941980361938477, "logps/chosen": -449.71246337890625, "logps/rejected": -506.237060546875, "loss": 0.9992, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.274414300918579, "rewards/margins": -0.1743333786725998, "rewards/rejected": 
2.4487478733062744, "step": 12690 }, { "epoch": 0.5896281164399462, "grad_norm": 12.321351051330566, "learning_rate": 2.646501694600492e-07, "logits/chosen": -19.259246826171875, "logits/rejected": -17.121944427490234, "logps/chosen": -507.75048828125, "logps/rejected": -265.32342529296875, "loss": 0.5101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5606560707092285, "rewards/margins": 1.1945898532867432, "rewards/rejected": 1.3660660982131958, "step": 12700 }, { "epoch": 0.5900923905473792, "grad_norm": 51.974308013916016, "learning_rate": 2.646223130136032e-07, "logits/chosen": -17.880504608154297, "logits/rejected": -17.538555145263672, "logps/chosen": -305.55206298828125, "logps/rejected": -212.677734375, "loss": 0.4841, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7296667098999023, "rewards/margins": 0.6058409214019775, "rewards/rejected": 1.1238259077072144, "step": 12710 }, { "epoch": 0.5905566646548122, "grad_norm": 72.77594757080078, "learning_rate": 2.6459445656715725e-07, "logits/chosen": -18.864839553833008, "logits/rejected": -17.625564575195312, "logps/chosen": -431.04241943359375, "logps/rejected": -348.18914794921875, "loss": 0.631, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4971604347229004, "rewards/margins": 0.6193024516105652, "rewards/rejected": 1.8778579235076904, "step": 12720 }, { "epoch": 0.5910209387622453, "grad_norm": 108.71076965332031, "learning_rate": 2.6456660012071124e-07, "logits/chosen": -18.50879669189453, "logits/rejected": -17.238712310791016, "logps/chosen": -517.5604248046875, "logps/rejected": -370.76531982421875, "loss": 0.4478, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8355612754821777, "rewards/margins": 0.699347972869873, "rewards/rejected": 2.1362133026123047, "step": 12730 }, { "epoch": 0.5914852128696783, "grad_norm": 50.39386749267578, "learning_rate": 2.645387436742653e-07, "logits/chosen": -18.97079086303711, "logits/rejected": 
-17.99037742614746, "logps/chosen": -514.0567626953125, "logps/rejected": -400.2442932128906, "loss": 0.666, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0385613441467285, "rewards/margins": 0.5848749876022339, "rewards/rejected": 2.453686237335205, "step": 12740 }, { "epoch": 0.5919494869771112, "grad_norm": 32.293792724609375, "learning_rate": 2.6451088722781927e-07, "logits/chosen": -18.090274810791016, "logits/rejected": -17.91207504272461, "logps/chosen": -235.9408721923828, "logps/rejected": -183.2360382080078, "loss": 0.6477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6214702129364014, "rewards/margins": 0.43100374937057495, "rewards/rejected": 1.1904665231704712, "step": 12750 }, { "epoch": 0.5924137610845444, "grad_norm": 272.2113952636719, "learning_rate": 2.644830307813733e-07, "logits/chosen": -19.195804595947266, "logits/rejected": -18.528690338134766, "logps/chosen": -424.7706604003906, "logps/rejected": -399.9718322753906, "loss": 1.0269, "rewards/accuracies": 0.5, "rewards/chosen": 2.0640816688537598, "rewards/margins": 0.007859563454985619, "rewards/rejected": 2.0562222003936768, "step": 12760 }, { "epoch": 0.5928780351919773, "grad_norm": 243.5167694091797, "learning_rate": 2.6445517433492735e-07, "logits/chosen": -19.277637481689453, "logits/rejected": -18.924785614013672, "logps/chosen": -432.10638427734375, "logps/rejected": -426.0345764160156, "loss": 0.8148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.464348316192627, "rewards/margins": -0.1211564764380455, "rewards/rejected": 2.5855047702789307, "step": 12770 }, { "epoch": 0.5933423092994103, "grad_norm": 25.405332565307617, "learning_rate": 2.6442731788848134e-07, "logits/chosen": -18.94034194946289, "logits/rejected": -18.56673812866211, "logps/chosen": -444.32513427734375, "logps/rejected": -409.7474060058594, "loss": 0.5605, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8309600353240967, "rewards/margins": 
0.5811794996261597, "rewards/rejected": 1.2497804164886475, "step": 12780 }, { "epoch": 0.5938065834068434, "grad_norm": 98.13770294189453, "learning_rate": 2.6439946144203533e-07, "logits/chosen": -18.166046142578125, "logits/rejected": -18.044546127319336, "logps/chosen": -319.7333984375, "logps/rejected": -335.6994323730469, "loss": 1.0551, "rewards/accuracies": 0.5, "rewards/chosen": 1.5703444480895996, "rewards/margins": -0.3248343765735626, "rewards/rejected": 1.8951787948608398, "step": 12790 }, { "epoch": 0.5942708575142764, "grad_norm": 51.464439392089844, "learning_rate": 2.6437160499558937e-07, "logits/chosen": -18.783924102783203, "logits/rejected": -18.699222564697266, "logps/chosen": -438.4542541503906, "logps/rejected": -389.3803405761719, "loss": 0.6726, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2777915000915527, "rewards/margins": 0.13380947709083557, "rewards/rejected": 2.143982172012329, "step": 12800 }, { "epoch": 0.5947351316217094, "grad_norm": 133.26791381835938, "learning_rate": 2.643437485491434e-07, "logits/chosen": -18.718358993530273, "logits/rejected": -17.961668014526367, "logps/chosen": -467.36669921875, "logps/rejected": -387.62506103515625, "loss": 0.529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5983500480651855, "rewards/margins": 0.5542842149734497, "rewards/rejected": 2.044065475463867, "step": 12810 }, { "epoch": 0.5951994057291425, "grad_norm": 103.11404418945312, "learning_rate": 2.6431589210269745e-07, "logits/chosen": -18.473848342895508, "logits/rejected": -17.55031967163086, "logps/chosen": -515.5858154296875, "logps/rejected": -402.6415710449219, "loss": 0.5637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2891228199005127, "rewards/margins": 0.5441257953643799, "rewards/rejected": 1.7449970245361328, "step": 12820 }, { "epoch": 0.5956636798365755, "grad_norm": 94.98777770996094, "learning_rate": 2.6428803565625144e-07, "logits/chosen": -17.93103790283203, 
"logits/rejected": -17.440038681030273, "logps/chosen": -341.04205322265625, "logps/rejected": -266.77459716796875, "loss": 0.7033, "rewards/accuracies": 0.5, "rewards/chosen": 2.404615640640259, "rewards/margins": 0.5936904549598694, "rewards/rejected": 1.8109248876571655, "step": 12830 }, { "epoch": 0.5961279539440085, "grad_norm": 29.945945739746094, "learning_rate": 2.642601792098055e-07, "logits/chosen": -17.551515579223633, "logits/rejected": -16.965087890625, "logps/chosen": -272.70831298828125, "logps/rejected": -260.00250244140625, "loss": 0.8002, "rewards/accuracies": 0.5, "rewards/chosen": 1.6156013011932373, "rewards/margins": 0.39476478099823, "rewards/rejected": 1.2208366394042969, "step": 12840 }, { "epoch": 0.5965922280514415, "grad_norm": 152.35914611816406, "learning_rate": 2.6423232276335947e-07, "logits/chosen": -18.14424705505371, "logits/rejected": -17.300212860107422, "logps/chosen": -500.821044921875, "logps/rejected": -310.626708984375, "loss": 0.5178, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3875792026519775, "rewards/margins": 0.776739776134491, "rewards/rejected": 1.61083984375, "step": 12850 }, { "epoch": 0.5970565021588746, "grad_norm": 25.02426528930664, "learning_rate": 2.642044663169135e-07, "logits/chosen": -19.02529525756836, "logits/rejected": -18.593751907348633, "logps/chosen": -323.1676330566406, "logps/rejected": -300.0190734863281, "loss": 0.6868, "rewards/accuracies": 0.5, "rewards/chosen": 2.0032670497894287, "rewards/margins": 0.30116984248161316, "rewards/rejected": 1.7020971775054932, "step": 12860 }, { "epoch": 0.5975207762663076, "grad_norm": 12.389142036437988, "learning_rate": 2.641766098704675e-07, "logits/chosen": -19.242815017700195, "logits/rejected": -18.590587615966797, "logps/chosen": -417.175048828125, "logps/rejected": -312.44549560546875, "loss": 0.4882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3273494243621826, "rewards/margins": 0.6318349838256836, 
"rewards/rejected": 1.695514440536499, "step": 12870 }, { "epoch": 0.5979850503737406, "grad_norm": 126.96218872070312, "learning_rate": 2.6414875342402154e-07, "logits/chosen": -19.608795166015625, "logits/rejected": -19.902616500854492, "logps/chosen": -398.1496276855469, "logps/rejected": -291.7096252441406, "loss": 0.5024, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3384971618652344, "rewards/margins": 0.6148746609687805, "rewards/rejected": 1.7236226797103882, "step": 12880 }, { "epoch": 0.5984493244811737, "grad_norm": 10.675202369689941, "learning_rate": 2.641208969775756e-07, "logits/chosen": -19.213218688964844, "logits/rejected": -18.359539031982422, "logps/chosen": -259.30804443359375, "logps/rejected": -186.7675018310547, "loss": 0.4517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5965080261230469, "rewards/margins": 0.7755575180053711, "rewards/rejected": 0.8209505081176758, "step": 12890 }, { "epoch": 0.5989135985886067, "grad_norm": 51.34479904174805, "learning_rate": 2.6409304053112957e-07, "logits/chosen": -17.902875900268555, "logits/rejected": -16.408187866210938, "logps/chosen": -378.63189697265625, "logps/rejected": -220.69967651367188, "loss": 0.4586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.155566692352295, "rewards/margins": 1.0621892213821411, "rewards/rejected": 1.093377709388733, "step": 12900 }, { "epoch": 0.5993778726960397, "grad_norm": 43.85335922241211, "learning_rate": 2.6406518408468356e-07, "logits/chosen": -18.888568878173828, "logits/rejected": -17.792579650878906, "logps/chosen": -330.5040588378906, "logps/rejected": -261.1601867675781, "loss": 0.494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3420791625976562, "rewards/margins": 0.6685099601745605, "rewards/rejected": 1.6735690832138062, "step": 12910 }, { "epoch": 0.5998421468034728, "grad_norm": 47.015254974365234, "learning_rate": 2.640373276382376e-07, "logits/chosen": -18.26906967163086, 
"logits/rejected": -16.984792709350586, "logps/chosen": -416.601318359375, "logps/rejected": -227.39028930664062, "loss": 0.4584, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4356844425201416, "rewards/margins": 1.389099359512329, "rewards/rejected": 1.0465853214263916, "step": 12920 }, { "epoch": 0.6003064209109058, "grad_norm": 61.45171356201172, "learning_rate": 2.6400947119179164e-07, "logits/chosen": -18.36017608642578, "logits/rejected": -18.3004150390625, "logps/chosen": -509.84326171875, "logps/rejected": -447.06219482421875, "loss": 0.5447, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.006490468978882, "rewards/margins": 0.8006714582443237, "rewards/rejected": 2.2058191299438477, "step": 12930 }, { "epoch": 0.6007706950183388, "grad_norm": 65.8664779663086, "learning_rate": 2.6398161474534563e-07, "logits/chosen": -18.967329025268555, "logits/rejected": -17.51144027709961, "logps/chosen": -521.544189453125, "logps/rejected": -384.02020263671875, "loss": 0.5935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.255370616912842, "rewards/margins": 0.5844085812568665, "rewards/rejected": 1.6709620952606201, "step": 12940 }, { "epoch": 0.6012349691257719, "grad_norm": 155.246337890625, "learning_rate": 2.6395375829889967e-07, "logits/chosen": -19.073179244995117, "logits/rejected": -18.71253204345703, "logps/chosen": -481.83612060546875, "logps/rejected": -465.09210205078125, "loss": 0.9455, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5568184852600098, "rewards/margins": -0.22743013501167297, "rewards/rejected": 2.7842485904693604, "step": 12950 }, { "epoch": 0.6016992432332049, "grad_norm": 143.99786376953125, "learning_rate": 2.6392590185245366e-07, "logits/chosen": -18.967090606689453, "logits/rejected": -18.80592155456543, "logps/chosen": -459.5677185058594, "logps/rejected": -404.24761962890625, "loss": 0.5982, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3425114154815674, 
"rewards/margins": 0.4526232182979584, "rewards/rejected": 1.889888048171997, "step": 12960 }, { "epoch": 0.6021635173406379, "grad_norm": 49.135440826416016, "learning_rate": 2.638980454060077e-07, "logits/chosen": -17.672863006591797, "logits/rejected": -17.097126007080078, "logps/chosen": -337.2572937011719, "logps/rejected": -311.03704833984375, "loss": 0.8785, "rewards/accuracies": 0.5, "rewards/chosen": 1.832720398902893, "rewards/margins": 0.04693252593278885, "rewards/rejected": 1.7857879400253296, "step": 12970 }, { "epoch": 0.602627791448071, "grad_norm": 72.70643615722656, "learning_rate": 2.638701889595617e-07, "logits/chosen": -19.75607681274414, "logits/rejected": -19.97768211364746, "logps/chosen": -351.787841796875, "logps/rejected": -291.5856018066406, "loss": 0.5742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3146982192993164, "rewards/margins": 0.5656999349594116, "rewards/rejected": 1.7489984035491943, "step": 12980 }, { "epoch": 0.603092065555504, "grad_norm": 39.021820068359375, "learning_rate": 2.638423325131157e-07, "logits/chosen": -17.934131622314453, "logits/rejected": -18.35260772705078, "logps/chosen": -344.9382629394531, "logps/rejected": -369.71392822265625, "loss": 1.2983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7340091466903687, "rewards/margins": -0.37604743242263794, "rewards/rejected": 2.1100564002990723, "step": 12990 }, { "epoch": 0.603556339662937, "grad_norm": 89.4130859375, "learning_rate": 2.6381447606666977e-07, "logits/chosen": -19.744985580444336, "logits/rejected": -17.123682022094727, "logps/chosen": -566.5950927734375, "logps/rejected": -291.9244384765625, "loss": 0.4192, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.740908145904541, "rewards/margins": 1.2519309520721436, "rewards/rejected": 1.4889769554138184, "step": 13000 }, { "epoch": 0.60402061377037, "grad_norm": 60.37932205200195, "learning_rate": 2.637866196202238e-07, "logits/chosen": 
-18.669109344482422, "logits/rejected": -17.486560821533203, "logps/chosen": -493.65118408203125, "logps/rejected": -322.32232666015625, "loss": 0.5186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.539618730545044, "rewards/margins": 0.7509413361549377, "rewards/rejected": 1.7886772155761719, "step": 13010 }, { "epoch": 0.6044848878778031, "grad_norm": 63.9084358215332, "learning_rate": 2.637587631737778e-07, "logits/chosen": -19.347593307495117, "logits/rejected": -17.47054672241211, "logps/chosen": -525.4000244140625, "logps/rejected": -314.65283203125, "loss": 0.4013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6847264766693115, "rewards/margins": 1.0342023372650146, "rewards/rejected": 1.6505239009857178, "step": 13020 }, { "epoch": 0.6049491619852361, "grad_norm": 32.61298370361328, "learning_rate": 2.637309067273318e-07, "logits/chosen": -19.128063201904297, "logits/rejected": -17.7126407623291, "logps/chosen": -422.45867919921875, "logps/rejected": -273.10003662109375, "loss": 0.6871, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.957005500793457, "rewards/margins": 0.3466084599494934, "rewards/rejected": 1.6103969812393188, "step": 13030 }, { "epoch": 0.6054134360926691, "grad_norm": 81.94168853759766, "learning_rate": 2.637030502808858e-07, "logits/chosen": -19.147850036621094, "logits/rejected": -19.15812873840332, "logps/chosen": -350.1402893066406, "logps/rejected": -339.36895751953125, "loss": 0.9886, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0096349716186523, "rewards/margins": -0.2320895940065384, "rewards/rejected": 2.241724729537964, "step": 13040 }, { "epoch": 0.6058777102001022, "grad_norm": 64.78239440917969, "learning_rate": 2.6367519383443987e-07, "logits/chosen": -18.947063446044922, "logits/rejected": -17.810997009277344, "logps/chosen": -428.5576171875, "logps/rejected": -315.88128662109375, "loss": 0.3692, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
2.9443068504333496, "rewards/margins": 1.0762512683868408, "rewards/rejected": 1.8680553436279297, "step": 13050 }, { "epoch": 0.6063419843075352, "grad_norm": 211.66162109375, "learning_rate": 2.6364733738799385e-07, "logits/chosen": -19.278369903564453, "logits/rejected": -18.76428985595703, "logps/chosen": -500.1260681152344, "logps/rejected": -393.1638488769531, "loss": 0.5707, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3501548767089844, "rewards/margins": 0.420586496591568, "rewards/rejected": 1.9295680522918701, "step": 13060 }, { "epoch": 0.6068062584149682, "grad_norm": 29.693805694580078, "learning_rate": 2.636194809415479e-07, "logits/chosen": -18.83360481262207, "logits/rejected": -18.568164825439453, "logps/chosen": -263.1925964355469, "logps/rejected": -275.67193603515625, "loss": 0.7922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6755914688110352, "rewards/margins": 0.11287359148263931, "rewards/rejected": 1.5627180337905884, "step": 13070 }, { "epoch": 0.6072705325224013, "grad_norm": 96.5412368774414, "learning_rate": 2.635916244951019e-07, "logits/chosen": -18.955501556396484, "logits/rejected": -17.899978637695312, "logps/chosen": -396.78668212890625, "logps/rejected": -296.8570251464844, "loss": 0.5014, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7785420417785645, "rewards/margins": 1.0068609714508057, "rewards/rejected": 1.7716814279556274, "step": 13080 }, { "epoch": 0.6077348066298343, "grad_norm": 1.9952069520950317, "learning_rate": 2.635637680486559e-07, "logits/chosen": -17.848125457763672, "logits/rejected": -17.48050308227539, "logps/chosen": -423.46923828125, "logps/rejected": -418.0420837402344, "loss": 1.3988, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.2467851638793945, "rewards/margins": -0.45538797974586487, "rewards/rejected": 2.7021729946136475, "step": 13090 }, { "epoch": 0.6081990807372673, "grad_norm": 169.31289672851562, "learning_rate": 
2.635359116022099e-07, "logits/chosen": -19.097858428955078, "logits/rejected": -18.39598274230957, "logps/chosen": -380.1739501953125, "logps/rejected": -219.3114013671875, "loss": 0.583, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3979015350341797, "rewards/margins": 0.9712969660758972, "rewards/rejected": 1.4266046285629272, "step": 13100 }, { "epoch": 0.6086633548447004, "grad_norm": 17.364850997924805, "learning_rate": 2.6350805515576395e-07, "logits/chosen": -19.241443634033203, "logits/rejected": -17.780841827392578, "logps/chosen": -458.28948974609375, "logps/rejected": -340.40185546875, "loss": 0.4039, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.475766658782959, "rewards/margins": 0.9324444532394409, "rewards/rejected": 1.5433218479156494, "step": 13110 }, { "epoch": 0.6091276289521333, "grad_norm": 28.878171920776367, "learning_rate": 2.63480198709318e-07, "logits/chosen": -18.538314819335938, "logits/rejected": -18.889760971069336, "logps/chosen": -446.03253173828125, "logps/rejected": -365.015869140625, "loss": 0.7546, "rewards/accuracies": 0.5, "rewards/chosen": 2.344686508178711, "rewards/margins": 0.05078446865081787, "rewards/rejected": 2.2939021587371826, "step": 13120 }, { "epoch": 0.6095919030595663, "grad_norm": 56.72702407836914, "learning_rate": 2.63452342262872e-07, "logits/chosen": -18.679569244384766, "logits/rejected": -17.30844497680664, "logps/chosen": -626.74853515625, "logps/rejected": -355.2677307128906, "loss": 0.4353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9051082134246826, "rewards/margins": 1.0433790683746338, "rewards/rejected": 1.8617292642593384, "step": 13130 }, { "epoch": 0.6100561771669994, "grad_norm": 41.3481559753418, "learning_rate": 2.63424485816426e-07, "logits/chosen": -19.043216705322266, "logits/rejected": -19.337627410888672, "logps/chosen": -470.7357482910156, "logps/rejected": -448.0284729003906, "loss": 0.8704, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.6635375022888184, "rewards/margins": -0.024569403380155563, "rewards/rejected": 2.6881070137023926, "step": 13140 }, { "epoch": 0.6105204512744324, "grad_norm": 99.23085021972656, "learning_rate": 2.6339662936998e-07, "logits/chosen": -19.589130401611328, "logits/rejected": -19.354265213012695, "logps/chosen": -401.73260498046875, "logps/rejected": -410.25177001953125, "loss": 0.5763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.363950490951538, "rewards/margins": 0.371208131313324, "rewards/rejected": 1.9927421808242798, "step": 13150 }, { "epoch": 0.6109847253818654, "grad_norm": 31.292755126953125, "learning_rate": 2.6336877292353405e-07, "logits/chosen": -18.247838973999023, "logits/rejected": -17.81715202331543, "logps/chosen": -474.4222106933594, "logps/rejected": -449.3673400878906, "loss": 0.645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5288407802581787, "rewards/margins": 0.3455328345298767, "rewards/rejected": 2.183307647705078, "step": 13160 }, { "epoch": 0.6114489994892984, "grad_norm": 54.15705490112305, "learning_rate": 2.6334091647708804e-07, "logits/chosen": -19.874645233154297, "logits/rejected": -19.055322647094727, "logps/chosen": -393.45208740234375, "logps/rejected": -360.4461364746094, "loss": 0.6941, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.149453639984131, "rewards/margins": 0.10652372986078262, "rewards/rejected": 2.0429301261901855, "step": 13170 }, { "epoch": 0.6119132735967315, "grad_norm": 6.067714691162109, "learning_rate": 2.633130600306421e-07, "logits/chosen": -19.822175979614258, "logits/rejected": -19.42279815673828, "logps/chosen": -420.46881103515625, "logps/rejected": -415.5087890625, "loss": 0.9857, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.40506911277771, "rewards/margins": -0.07832145690917969, "rewards/rejected": 2.4833905696868896, "step": 13180 }, { "epoch": 0.6123775477041645, "grad_norm": 262.9471130371094, 
"learning_rate": 2.632852035841961e-07, "logits/chosen": -17.858686447143555, "logits/rejected": -18.06400489807129, "logps/chosen": -317.72381591796875, "logps/rejected": -370.50408935546875, "loss": 1.0574, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.007206439971924, "rewards/margins": -0.2815369665622711, "rewards/rejected": 2.288743495941162, "step": 13190 }, { "epoch": 0.6128418218115975, "grad_norm": 87.41482543945312, "learning_rate": 2.632573471377501e-07, "logits/chosen": -18.11399269104004, "logits/rejected": -17.43736457824707, "logps/chosen": -411.130859375, "logps/rejected": -315.5787658691406, "loss": 0.4562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6644039154052734, "rewards/margins": 0.7385458946228027, "rewards/rejected": 1.9258581399917603, "step": 13200 }, { "epoch": 0.6133060959190306, "grad_norm": 93.23387908935547, "learning_rate": 2.632294906913041e-07, "logits/chosen": -20.1156063079834, "logits/rejected": -18.36751937866211, "logps/chosen": -532.9071044921875, "logps/rejected": -338.2699890136719, "loss": 0.3815, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.882981777191162, "rewards/margins": 1.019958257675171, "rewards/rejected": 1.8630231618881226, "step": 13210 }, { "epoch": 0.6137703700264636, "grad_norm": 16.17082977294922, "learning_rate": 2.6320163424485814e-07, "logits/chosen": -18.022598266601562, "logits/rejected": -16.510536193847656, "logps/chosen": -451.99774169921875, "logps/rejected": -271.837158203125, "loss": 0.5248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8647987842559814, "rewards/margins": 1.0300425291061401, "rewards/rejected": 1.8347558975219727, "step": 13220 }, { "epoch": 0.6142346441338966, "grad_norm": 80.6930160522461, "learning_rate": 2.631737777984122e-07, "logits/chosen": -18.242172241210938, "logits/rejected": -17.865957260131836, "logps/chosen": -353.1449279785156, "logps/rejected": -345.89794921875, "loss": 0.6774, 
"rewards/accuracies": 0.5, "rewards/chosen": 2.3821730613708496, "rewards/margins": 0.39702194929122925, "rewards/rejected": 1.9851510524749756, "step": 13230 }, { "epoch": 0.6146989182413297, "grad_norm": 34.15947341918945, "learning_rate": 2.631459213519662e-07, "logits/chosen": -19.174198150634766, "logits/rejected": -19.673664093017578, "logps/chosen": -316.3692932128906, "logps/rejected": -259.6044006347656, "loss": 0.7232, "rewards/accuracies": 0.5, "rewards/chosen": 1.8101812601089478, "rewards/margins": 0.20280952751636505, "rewards/rejected": 1.6073715686798096, "step": 13240 }, { "epoch": 0.6151631923487627, "grad_norm": 103.16583251953125, "learning_rate": 2.631180649055202e-07, "logits/chosen": -18.43588638305664, "logits/rejected": -17.08953094482422, "logps/chosen": -430.59844970703125, "logps/rejected": -287.7836608886719, "loss": 0.3939, "rewards/accuracies": 1.0, "rewards/chosen": 2.027329683303833, "rewards/margins": 0.8230869174003601, "rewards/rejected": 1.2042427062988281, "step": 13250 }, { "epoch": 0.6156274664561957, "grad_norm": 21.082963943481445, "learning_rate": 2.6309020845907425e-07, "logits/chosen": -20.7379150390625, "logits/rejected": -20.37299919128418, "logps/chosen": -369.59088134765625, "logps/rejected": -341.87518310546875, "loss": 0.5872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.293732166290283, "rewards/margins": 0.40441733598709106, "rewards/rejected": 1.8893150091171265, "step": 13260 }, { "epoch": 0.6160917405636288, "grad_norm": 79.30343627929688, "learning_rate": 2.6306235201262824e-07, "logits/chosen": -18.067657470703125, "logits/rejected": -18.42984962463379, "logps/chosen": -350.87347412109375, "logps/rejected": -354.4564514160156, "loss": 0.8952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8323609828948975, "rewards/margins": -0.1371416449546814, "rewards/rejected": 1.9695026874542236, "step": 13270 }, { "epoch": 0.6165560146710618, "grad_norm": 238.5487518310547, 
"learning_rate": 2.6303449556618223e-07, "logits/chosen": -17.934280395507812, "logits/rejected": -17.574535369873047, "logps/chosen": -350.15765380859375, "logps/rejected": -288.66650390625, "loss": 0.7306, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2707009315490723, "rewards/margins": 0.5707963705062866, "rewards/rejected": 1.699904441833496, "step": 13280 }, { "epoch": 0.6170202887784948, "grad_norm": 120.7573013305664, "learning_rate": 2.6300663911973627e-07, "logits/chosen": -18.10540771484375, "logits/rejected": -18.41972541809082, "logps/chosen": -409.9669494628906, "logps/rejected": -432.5502014160156, "loss": 0.9022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.904693603515625, "rewards/margins": -0.040759701281785965, "rewards/rejected": 1.9454530477523804, "step": 13290 }, { "epoch": 0.6174845628859279, "grad_norm": 50.95083999633789, "learning_rate": 2.629787826732903e-07, "logits/chosen": -18.50467300415039, "logits/rejected": -16.621807098388672, "logps/chosen": -407.8534851074219, "logps/rejected": -207.3098602294922, "loss": 0.4186, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3659353256225586, "rewards/margins": 1.1509090662002563, "rewards/rejected": 1.2150261402130127, "step": 13300 }, { "epoch": 0.6179488369933609, "grad_norm": 13.591483116149902, "learning_rate": 2.6295092622684435e-07, "logits/chosen": -17.920886993408203, "logits/rejected": -17.477880477905273, "logps/chosen": -564.926513671875, "logps/rejected": -442.20599365234375, "loss": 0.7529, "rewards/accuracies": 0.5, "rewards/chosen": 3.0778605937957764, "rewards/margins": 0.48728227615356445, "rewards/rejected": 2.590578556060791, "step": 13310 }, { "epoch": 0.6184131111007939, "grad_norm": 27.045869827270508, "learning_rate": 2.6292306978039834e-07, "logits/chosen": -18.122146606445312, "logits/rejected": -17.657733917236328, "logps/chosen": -491.56463623046875, "logps/rejected": -406.728759765625, "loss": 0.6407, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8238868713378906, "rewards/margins": 0.3028465211391449, "rewards/rejected": 2.521040678024292, "step": 13320 }, { "epoch": 0.6188773852082269, "grad_norm": 85.14446258544922, "learning_rate": 2.628952133339523e-07, "logits/chosen": -18.283597946166992, "logits/rejected": -18.093835830688477, "logps/chosen": -391.32275390625, "logps/rejected": -346.4471740722656, "loss": 0.7828, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5553138256073, "rewards/margins": 0.02210932970046997, "rewards/rejected": 2.5332043170928955, "step": 13330 }, { "epoch": 0.61934165931566, "grad_norm": 3.4250731468200684, "learning_rate": 2.6286735688750637e-07, "logits/chosen": -18.830259323120117, "logits/rejected": -18.248140335083008, "logps/chosen": -415.2518615722656, "logps/rejected": -333.1142883300781, "loss": 0.837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3762316703796387, "rewards/margins": 0.3886646330356598, "rewards/rejected": 1.9875673055648804, "step": 13340 }, { "epoch": 0.619805933423093, "grad_norm": 117.63409423828125, "learning_rate": 2.628395004410604e-07, "logits/chosen": -17.78813934326172, "logits/rejected": -17.759174346923828, "logps/chosen": -331.58892822265625, "logps/rejected": -383.8295593261719, "loss": 0.8602, "rewards/accuracies": 0.5, "rewards/chosen": 1.9091403484344482, "rewards/margins": -0.13075347244739532, "rewards/rejected": 2.039893865585327, "step": 13350 }, { "epoch": 0.620270207530526, "grad_norm": 117.04840087890625, "learning_rate": 2.628116439946144e-07, "logits/chosen": -17.928253173828125, "logits/rejected": -17.3698673248291, "logps/chosen": -464.8309631347656, "logps/rejected": -348.17791748046875, "loss": 0.6647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6616811752319336, "rewards/margins": 0.5727378129959106, "rewards/rejected": 2.0889437198638916, "step": 13360 }, { "epoch": 0.6207344816379591, "grad_norm": 
133.47850036621094, "learning_rate": 2.6278378754816844e-07, "logits/chosen": -18.92270278930664, "logits/rejected": -18.18783950805664, "logps/chosen": -414.3856506347656, "logps/rejected": -376.52252197265625, "loss": 0.6068, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.293057680130005, "rewards/margins": 0.4527760446071625, "rewards/rejected": 1.8402817249298096, "step": 13370 }, { "epoch": 0.6211987557453921, "grad_norm": 110.70036315917969, "learning_rate": 2.6275871674636706e-07, "logits/chosen": -18.22449493408203, "logits/rejected": -17.712894439697266, "logps/chosen": -468.49078369140625, "logps/rejected": -461.55535888671875, "loss": 0.8537, "rewards/accuracies": 0.5, "rewards/chosen": 2.5838658809661865, "rewards/margins": -0.05111845210194588, "rewards/rejected": 2.6349844932556152, "step": 13380 }, { "epoch": 0.6216630298528251, "grad_norm": 177.36434936523438, "learning_rate": 2.6273086029992105e-07, "logits/chosen": -18.199413299560547, "logits/rejected": -18.408344268798828, "logps/chosen": -383.92340087890625, "logps/rejected": -388.9569091796875, "loss": 1.0702, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.0303826332092285, "rewards/margins": -0.25090357661247253, "rewards/rejected": 2.2812867164611816, "step": 13390 }, { "epoch": 0.6221273039602582, "grad_norm": 148.5010528564453, "learning_rate": 2.627030038534751e-07, "logits/chosen": -19.049531936645508, "logits/rejected": -18.49509048461914, "logps/chosen": -433.0953063964844, "logps/rejected": -384.59674072265625, "loss": 0.4708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3894596099853516, "rewards/margins": 0.6630352139472961, "rewards/rejected": 1.7264244556427002, "step": 13400 }, { "epoch": 0.6225915780676912, "grad_norm": 56.988555908203125, "learning_rate": 2.626751474070291e-07, "logits/chosen": -17.649646759033203, "logits/rejected": -17.732803344726562, "logps/chosen": -343.0149230957031, "logps/rejected": -317.74725341796875, 
"loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": 1.7522178888320923, "rewards/margins": 0.0885748416185379, "rewards/rejected": 1.66364324092865, "step": 13410 }, { "epoch": 0.6230558521751242, "grad_norm": 85.88673400878906, "learning_rate": 2.626472909605831e-07, "logits/chosen": -18.281755447387695, "logits/rejected": -17.692354202270508, "logps/chosen": -484.08209228515625, "logps/rejected": -425.10205078125, "loss": 0.704, "rewards/accuracies": 0.5, "rewards/chosen": 2.5449092388153076, "rewards/margins": 0.30994555354118347, "rewards/rejected": 2.234963893890381, "step": 13420 }, { "epoch": 0.6235201262825573, "grad_norm": 13.371931076049805, "learning_rate": 2.6261943451413716e-07, "logits/chosen": -18.553312301635742, "logits/rejected": -18.009654998779297, "logps/chosen": -369.647705078125, "logps/rejected": -290.402587890625, "loss": 0.505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6285338401794434, "rewards/margins": 1.011216163635254, "rewards/rejected": 1.6173179149627686, "step": 13430 }, { "epoch": 0.6239844003899903, "grad_norm": 83.25929260253906, "learning_rate": 2.6259157806769115e-07, "logits/chosen": -18.264446258544922, "logits/rejected": -16.917142868041992, "logps/chosen": -416.36077880859375, "logps/rejected": -294.974853515625, "loss": 0.5987, "rewards/accuracies": 0.5, "rewards/chosen": 2.201374053955078, "rewards/margins": 0.5845522880554199, "rewards/rejected": 1.6168218851089478, "step": 13440 }, { "epoch": 0.6244486744974233, "grad_norm": 253.1593017578125, "learning_rate": 2.625637216212452e-07, "logits/chosen": -18.278667449951172, "logits/rejected": -17.68864631652832, "logps/chosen": -262.5963439941406, "logps/rejected": -269.9480895996094, "loss": 1.0083, "rewards/accuracies": 0.5, "rewards/chosen": 1.5743926763534546, "rewards/margins": -0.06959038227796555, "rewards/rejected": 1.6439831256866455, "step": 13450 }, { "epoch": 0.6249129486048564, "grad_norm": 116.08039855957031, "learning_rate": 
2.625358651747992e-07, "logits/chosen": -17.92576026916504, "logits/rejected": -17.936580657958984, "logps/chosen": -374.6109924316406, "logps/rejected": -377.8695068359375, "loss": 0.8457, "rewards/accuracies": 0.5, "rewards/chosen": 2.246901512145996, "rewards/margins": 0.015227735042572021, "rewards/rejected": 2.2316737174987793, "step": 13460 }, { "epoch": 0.6253772227122893, "grad_norm": 82.55338287353516, "learning_rate": 2.625080087283532e-07, "logits/chosen": -18.72536849975586, "logits/rejected": -18.711618423461914, "logps/chosen": -318.65179443359375, "logps/rejected": -332.43072509765625, "loss": 0.7914, "rewards/accuracies": 0.5, "rewards/chosen": 1.6575781106948853, "rewards/margins": 0.0088049890473485, "rewards/rejected": 1.648772954940796, "step": 13470 }, { "epoch": 0.6258414968197223, "grad_norm": 132.9109649658203, "learning_rate": 2.6248015228190726e-07, "logits/chosen": -19.126256942749023, "logits/rejected": -18.642786026000977, "logps/chosen": -370.17889404296875, "logps/rejected": -355.7380676269531, "loss": 0.7431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9877220392227173, "rewards/margins": 0.3347914218902588, "rewards/rejected": 1.6529306173324585, "step": 13480 }, { "epoch": 0.6263057709271554, "grad_norm": 110.36726379394531, "learning_rate": 2.6245229583546124e-07, "logits/chosen": -18.26546859741211, "logits/rejected": -17.234920501708984, "logps/chosen": -414.8937072753906, "logps/rejected": -305.7624206542969, "loss": 0.5445, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4513378143310547, "rewards/margins": 0.5140515565872192, "rewards/rejected": 1.937286376953125, "step": 13490 }, { "epoch": 0.6267700450345884, "grad_norm": 30.098188400268555, "learning_rate": 2.624244393890153e-07, "logits/chosen": -18.533096313476562, "logits/rejected": -17.46026611328125, "logps/chosen": -360.43328857421875, "logps/rejected": -271.3753356933594, "loss": 0.5381, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 2.482856512069702, "rewards/margins": 0.7438031435012817, "rewards/rejected": 1.7390533685684204, "step": 13500 }, { "epoch": 0.6272343191420214, "grad_norm": 154.3782958984375, "learning_rate": 2.6239658294256927e-07, "logits/chosen": -19.83015251159668, "logits/rejected": -19.09707260131836, "logps/chosen": -397.1098937988281, "logps/rejected": -369.5776062011719, "loss": 0.5647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.046626567840576, "rewards/margins": 0.4038986265659332, "rewards/rejected": 1.6427276134490967, "step": 13510 }, { "epoch": 0.6276985932494544, "grad_norm": 75.91329956054688, "learning_rate": 2.6236872649612326e-07, "logits/chosen": -17.891611099243164, "logits/rejected": -18.230113983154297, "logps/chosen": -377.4305725097656, "logps/rejected": -360.3865966796875, "loss": 0.5607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8559287786483765, "rewards/margins": 0.3983309864997864, "rewards/rejected": 1.4575976133346558, "step": 13520 }, { "epoch": 0.6281628673568875, "grad_norm": 207.80902099609375, "learning_rate": 2.623408700496773e-07, "logits/chosen": -18.765987396240234, "logits/rejected": -18.113834381103516, "logps/chosen": -531.5980224609375, "logps/rejected": -433.96173095703125, "loss": 0.939, "rewards/accuracies": 0.5, "rewards/chosen": 3.167088508605957, "rewards/margins": 0.2702533006668091, "rewards/rejected": 2.8968353271484375, "step": 13530 }, { "epoch": 0.6286271414643205, "grad_norm": 82.96099853515625, "learning_rate": 2.6231301360323134e-07, "logits/chosen": -18.94059944152832, "logits/rejected": -18.251161575317383, "logps/chosen": -437.56170654296875, "logps/rejected": -450.4771423339844, "loss": 0.6039, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7615952491760254, "rewards/margins": 0.3172522187232971, "rewards/rejected": 2.444342851638794, "step": 13540 }, { "epoch": 0.6290914155717535, "grad_norm": 76.63970184326172, "learning_rate": 
2.622851571567854e-07, "logits/chosen": -19.290863037109375, "logits/rejected": -19.019824981689453, "logps/chosen": -341.7861022949219, "logps/rejected": -280.28759765625, "loss": 0.5605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9566503763198853, "rewards/margins": 0.5545312762260437, "rewards/rejected": 1.4021189212799072, "step": 13550 }, { "epoch": 0.6295556896791866, "grad_norm": 195.7263641357422, "learning_rate": 2.6225730071033937e-07, "logits/chosen": -18.759241104125977, "logits/rejected": -17.343727111816406, "logps/chosen": -428.6773986816406, "logps/rejected": -262.21575927734375, "loss": 0.4249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4592044353485107, "rewards/margins": 0.9589556455612183, "rewards/rejected": 1.500248670578003, "step": 13560 }, { "epoch": 0.6300199637866196, "grad_norm": 39.2073974609375, "learning_rate": 2.6222944426389336e-07, "logits/chosen": -19.405460357666016, "logits/rejected": -18.39769744873047, "logps/chosen": -475.3699645996094, "logps/rejected": -325.4247131347656, "loss": 0.6765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5500521659851074, "rewards/margins": 0.6671972274780273, "rewards/rejected": 1.8828548192977905, "step": 13570 }, { "epoch": 0.6304842378940526, "grad_norm": 199.03579711914062, "learning_rate": 2.622015878174474e-07, "logits/chosen": -18.295560836791992, "logits/rejected": -17.543710708618164, "logps/chosen": -300.39422607421875, "logps/rejected": -309.26409912109375, "loss": 0.8496, "rewards/accuracies": 0.5, "rewards/chosen": 2.394608736038208, "rewards/margins": 0.3683885931968689, "rewards/rejected": 2.0262198448181152, "step": 13580 }, { "epoch": 0.6309485120014857, "grad_norm": 20.804765701293945, "learning_rate": 2.6217373137100144e-07, "logits/chosen": -18.645366668701172, "logits/rejected": -16.85481071472168, "logps/chosen": -459.7740783691406, "logps/rejected": -575.4232788085938, "loss": 0.3632, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.0748538970947266, "rewards/margins": 1.6626179218292236, "rewards/rejected": 1.412235975265503, "step": 13590 }, { "epoch": 0.6314127861089187, "grad_norm": 37.5512580871582, "learning_rate": 2.6214587492455543e-07, "logits/chosen": -18.79071807861328, "logits/rejected": -18.11105728149414, "logps/chosen": -450.6238708496094, "logps/rejected": -350.08477783203125, "loss": 0.5454, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3429317474365234, "rewards/margins": 0.46469181776046753, "rewards/rejected": 1.8782402276992798, "step": 13600 }, { "epoch": 0.6318770602163517, "grad_norm": 87.90632629394531, "learning_rate": 2.6211801847810947e-07, "logits/chosen": -18.357921600341797, "logits/rejected": -16.631563186645508, "logps/chosen": -381.64874267578125, "logps/rejected": -186.7884521484375, "loss": 0.3849, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9065736532211304, "rewards/margins": 1.0468295812606812, "rewards/rejected": 0.8597438931465149, "step": 13610 }, { "epoch": 0.6323413343237848, "grad_norm": 72.23379516601562, "learning_rate": 2.620901620316635e-07, "logits/chosen": -18.650615692138672, "logits/rejected": -17.27165412902832, "logps/chosen": -371.39068603515625, "logps/rejected": -249.9835968017578, "loss": 0.5781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.448336362838745, "rewards/margins": 0.8113778233528137, "rewards/rejected": 1.6369584798812866, "step": 13620 }, { "epoch": 0.6328056084312178, "grad_norm": 67.80197143554688, "learning_rate": 2.620623055852175e-07, "logits/chosen": -19.10577964782715, "logits/rejected": -19.63304328918457, "logps/chosen": -434.25054931640625, "logps/rejected": -456.4810485839844, "loss": 0.7359, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.487274169921875, "rewards/margins": 0.13371174037456512, "rewards/rejected": 2.353562593460083, "step": 13630 }, { "epoch": 0.6332698825386508, "grad_norm": 
94.04488372802734, "learning_rate": 2.620344491387715e-07, "logits/chosen": -18.43670082092285, "logits/rejected": -17.564056396484375, "logps/chosen": -322.2832336425781, "logps/rejected": -271.1398620605469, "loss": 0.5544, "rewards/accuracies": 0.5, "rewards/chosen": 2.4312281608581543, "rewards/margins": 0.8122478723526001, "rewards/rejected": 1.6189801692962646, "step": 13640 }, { "epoch": 0.6337341566460839, "grad_norm": 21.415666580200195, "learning_rate": 2.6200659269232553e-07, "logits/chosen": -19.194992065429688, "logits/rejected": -17.699800491333008, "logps/chosen": -412.69097900390625, "logps/rejected": -279.7787170410156, "loss": 0.4554, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.522165298461914, "rewards/margins": 0.8991045951843262, "rewards/rejected": 1.623060941696167, "step": 13650 }, { "epoch": 0.6341984307535169, "grad_norm": 49.76481628417969, "learning_rate": 2.6197873624587957e-07, "logits/chosen": -18.07620620727539, "logits/rejected": -17.870885848999023, "logps/chosen": -378.0863342285156, "logps/rejected": -333.39569091796875, "loss": 0.5038, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6093459129333496, "rewards/margins": 0.4817514419555664, "rewards/rejected": 2.127594470977783, "step": 13660 }, { "epoch": 0.6346627048609499, "grad_norm": 78.9715805053711, "learning_rate": 2.619508797994336e-07, "logits/chosen": -18.344823837280273, "logits/rejected": -17.588043212890625, "logps/chosen": -433.35357666015625, "logps/rejected": -385.6650390625, "loss": 0.7754, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.289534330368042, "rewards/margins": 0.033161282539367676, "rewards/rejected": 2.256373167037964, "step": 13670 }, { "epoch": 0.6351269789683829, "grad_norm": 247.06234741210938, "learning_rate": 2.619230233529876e-07, "logits/chosen": -18.040340423583984, "logits/rejected": -16.8889102935791, "logps/chosen": -466.38922119140625, "logps/rejected": -339.57769775390625, "loss": 0.6089, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0289711952209473, "rewards/margins": 1.1645543575286865, "rewards/rejected": 1.8644167184829712, "step": 13680 }, { "epoch": 0.635591253075816, "grad_norm": 162.4391632080078, "learning_rate": 2.618951669065416e-07, "logits/chosen": -18.99612045288086, "logits/rejected": -17.565107345581055, "logps/chosen": -490.62066650390625, "logps/rejected": -308.72918701171875, "loss": 0.3486, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2578320503234863, "rewards/margins": 1.1906678676605225, "rewards/rejected": 2.067164182662964, "step": 13690 }, { "epoch": 0.636055527183249, "grad_norm": 99.51836395263672, "learning_rate": 2.6186731046009563e-07, "logits/chosen": -18.11532974243164, "logits/rejected": -17.59493064880371, "logps/chosen": -405.3373107910156, "logps/rejected": -385.97698974609375, "loss": 0.6898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4869585037231445, "rewards/margins": 0.4412098824977875, "rewards/rejected": 2.045748710632324, "step": 13700 }, { "epoch": 0.636519801290682, "grad_norm": 102.95259094238281, "learning_rate": 2.6183945401364967e-07, "logits/chosen": -18.664981842041016, "logits/rejected": -17.72393798828125, "logps/chosen": -343.0154724121094, "logps/rejected": -301.9432373046875, "loss": 0.503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.50060772895813, "rewards/margins": 0.5564509630203247, "rewards/rejected": 1.9441566467285156, "step": 13710 }, { "epoch": 0.6369840753981151, "grad_norm": 22.752172470092773, "learning_rate": 2.6181159756720366e-07, "logits/chosen": -18.691593170166016, "logits/rejected": -17.618375778198242, "logps/chosen": -430.6878967285156, "logps/rejected": -343.6043395996094, "loss": 0.5316, "rewards/accuracies": 0.5, "rewards/chosen": 2.566746473312378, "rewards/margins": 0.7373267412185669, "rewards/rejected": 1.829419732093811, "step": 13720 }, { "epoch": 0.6374483495055481, "grad_norm": 
136.35325622558594, "learning_rate": 2.617837411207577e-07, "logits/chosen": -18.042804718017578, "logits/rejected": -17.247085571289062, "logps/chosen": -495.1798400878906, "logps/rejected": -344.4765930175781, "loss": 0.6869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.843122720718384, "rewards/margins": 0.8984594345092773, "rewards/rejected": 1.944663405418396, "step": 13730 }, { "epoch": 0.6379126236129811, "grad_norm": 96.01766967773438, "learning_rate": 2.617558846743117e-07, "logits/chosen": -19.01613998413086, "logits/rejected": -18.330089569091797, "logps/chosen": -440.9002380371094, "logps/rejected": -354.1145935058594, "loss": 0.548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.223745584487915, "rewards/margins": 0.6652420163154602, "rewards/rejected": 1.5585033893585205, "step": 13740 }, { "epoch": 0.6383768977204142, "grad_norm": 29.837636947631836, "learning_rate": 2.6172802822786573e-07, "logits/chosen": -18.30539894104004, "logits/rejected": -17.28229331970215, "logps/chosen": -423.43975830078125, "logps/rejected": -282.2229919433594, "loss": 0.3703, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.739189624786377, "rewards/margins": 1.2086832523345947, "rewards/rejected": 1.5305067300796509, "step": 13750 }, { "epoch": 0.6388411718278472, "grad_norm": 155.0910186767578, "learning_rate": 2.617001717814197e-07, "logits/chosen": -18.74734878540039, "logits/rejected": -17.33160972595215, "logps/chosen": -365.2032470703125, "logps/rejected": -224.18310546875, "loss": 0.505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4281840324401855, "rewards/margins": 0.9366401433944702, "rewards/rejected": 1.4915438890457153, "step": 13760 }, { "epoch": 0.6393054459352802, "grad_norm": 163.5692596435547, "learning_rate": 2.6167231533497376e-07, "logits/chosen": -18.7005615234375, "logits/rejected": -18.785663604736328, "logps/chosen": -475.5650329589844, "logps/rejected": -445.98565673828125, "loss": 
1.1349, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.563565731048584, "rewards/margins": -0.15091319382190704, "rewards/rejected": 2.7144789695739746, "step": 13770 }, { "epoch": 0.6397697200427133, "grad_norm": 125.90689086914062, "learning_rate": 2.616444588885278e-07, "logits/chosen": -17.93436050415039, "logits/rejected": -17.60276985168457, "logps/chosen": -343.4067077636719, "logps/rejected": -309.41656494140625, "loss": 0.8884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7131961584091187, "rewards/margins": 0.0033851980697363615, "rewards/rejected": 1.7098109722137451, "step": 13780 }, { "epoch": 0.6402339941501463, "grad_norm": 123.28801727294922, "learning_rate": 2.616166024420818e-07, "logits/chosen": -18.24038314819336, "logits/rejected": -17.570369720458984, "logps/chosen": -423.96466064453125, "logps/rejected": -314.9632263183594, "loss": 0.8105, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.184206962585449, "rewards/margins": 0.054614268243312836, "rewards/rejected": 2.1295926570892334, "step": 13790 }, { "epoch": 0.6406982682575793, "grad_norm": 24.4547176361084, "learning_rate": 2.6158874599563583e-07, "logits/chosen": -18.934419631958008, "logits/rejected": -18.774980545043945, "logps/chosen": -296.96270751953125, "logps/rejected": -244.85009765625, "loss": 0.5569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.026486873626709, "rewards/margins": 0.4546060562133789, "rewards/rejected": 1.57188081741333, "step": 13800 }, { "epoch": 0.6411625423650124, "grad_norm": 79.47291564941406, "learning_rate": 2.615608895491898e-07, "logits/chosen": -18.088756561279297, "logits/rejected": -17.070232391357422, "logps/chosen": -291.60577392578125, "logps/rejected": -211.3678436279297, "loss": 0.4714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9114608764648438, "rewards/margins": 0.7654212713241577, "rewards/rejected": 1.1460397243499756, "step": 13810 }, { "epoch": 
0.6416268164724453, "grad_norm": 24.239274978637695, "learning_rate": 2.6153303310274386e-07, "logits/chosen": -17.32105827331543, "logits/rejected": -17.165544509887695, "logps/chosen": -325.0032043457031, "logps/rejected": -254.8848419189453, "loss": 0.7016, "rewards/accuracies": 0.5, "rewards/chosen": 1.6275465488433838, "rewards/margins": 0.34273475408554077, "rewards/rejected": 1.2848117351531982, "step": 13820 }, { "epoch": 0.6420910905798783, "grad_norm": 14.148999214172363, "learning_rate": 2.6150517665629784e-07, "logits/chosen": -17.892221450805664, "logits/rejected": -16.622209548950195, "logps/chosen": -360.79534912109375, "logps/rejected": -195.18673706054688, "loss": 0.4697, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8998746871948242, "rewards/margins": 0.910937488079071, "rewards/rejected": 0.988937497138977, "step": 13830 }, { "epoch": 0.6425553646873113, "grad_norm": 149.0481414794922, "learning_rate": 2.614773202098519e-07, "logits/chosen": -18.814697265625, "logits/rejected": -19.003141403198242, "logps/chosen": -388.0157775878906, "logps/rejected": -373.0256652832031, "loss": 0.8319, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5534379482269287, "rewards/margins": -0.00657044630497694, "rewards/rejected": 2.5600082874298096, "step": 13840 }, { "epoch": 0.6430196387947444, "grad_norm": 222.702880859375, "learning_rate": 2.614494637634059e-07, "logits/chosen": -18.448482513427734, "logits/rejected": -17.65798568725586, "logps/chosen": -399.5433654785156, "logps/rejected": -359.0203552246094, "loss": 0.8175, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4256949424743652, "rewards/margins": 0.13517527282238007, "rewards/rejected": 2.2905197143554688, "step": 13850 }, { "epoch": 0.6434839129021774, "grad_norm": 65.5867919921875, "learning_rate": 2.614216073169599e-07, "logits/chosen": -18.655324935913086, "logits/rejected": -17.101303100585938, "logps/chosen": -399.6647033691406, "logps/rejected": 
-275.3705139160156, "loss": 0.5195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5805041790008545, "rewards/margins": 0.8896891474723816, "rewards/rejected": 1.6908149719238281, "step": 13860 }, { "epoch": 0.6439481870096104, "grad_norm": 80.51487731933594, "learning_rate": 2.6139375087051396e-07, "logits/chosen": -18.325754165649414, "logits/rejected": -16.71295738220215, "logps/chosen": -385.3268127441406, "logps/rejected": -251.50051879882812, "loss": 0.4786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0863051414489746, "rewards/margins": 1.2885009050369263, "rewards/rejected": 1.7978041172027588, "step": 13870 }, { "epoch": 0.6444124611170435, "grad_norm": 16.961515426635742, "learning_rate": 2.6136589442406794e-07, "logits/chosen": -17.839632034301758, "logits/rejected": -16.464242935180664, "logps/chosen": -387.4429016113281, "logps/rejected": -222.63119506835938, "loss": 0.5151, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.016449451446533, "rewards/margins": 0.9330196380615234, "rewards/rejected": 1.0834296941757202, "step": 13880 }, { "epoch": 0.6448767352244765, "grad_norm": 188.88958740234375, "learning_rate": 2.61338037977622e-07, "logits/chosen": -17.379009246826172, "logits/rejected": -16.672849655151367, "logps/chosen": -293.526123046875, "logps/rejected": -231.2664794921875, "loss": 0.5895, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1830906867980957, "rewards/margins": 1.0529377460479736, "rewards/rejected": 1.1301532983779907, "step": 13890 }, { "epoch": 0.6453410093319095, "grad_norm": 57.80997848510742, "learning_rate": 2.61310181531176e-07, "logits/chosen": -18.469335556030273, "logits/rejected": -18.792388916015625, "logps/chosen": -352.69757080078125, "logps/rejected": -320.9561462402344, "loss": 0.7264, "rewards/accuracies": 0.5, "rewards/chosen": 1.9114539623260498, "rewards/margins": 0.03797201067209244, "rewards/rejected": 1.8734819889068604, "step": 13900 }, { "epoch": 
0.6458052834393426, "grad_norm": 153.6401824951172, "learning_rate": 2.6128232508473e-07, "logits/chosen": -18.08609962463379, "logits/rejected": -17.377164840698242, "logps/chosen": -388.9622802734375, "logps/rejected": -328.00531005859375, "loss": 0.6675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2605602741241455, "rewards/margins": 0.2767408490180969, "rewards/rejected": 1.9838192462921143, "step": 13910 }, { "epoch": 0.6462695575467756, "grad_norm": 120.37042999267578, "learning_rate": 2.6125446863828405e-07, "logits/chosen": -19.1605224609375, "logits/rejected": -19.067195892333984, "logps/chosen": -455.6632385253906, "logps/rejected": -405.57110595703125, "loss": 0.6132, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4093539714813232, "rewards/margins": 0.3509199619293213, "rewards/rejected": 2.058434009552002, "step": 13920 }, { "epoch": 0.6467338316542086, "grad_norm": 7.258584499359131, "learning_rate": 2.6122661219183804e-07, "logits/chosen": -18.04560089111328, "logits/rejected": -17.58612060546875, "logps/chosen": -362.20465087890625, "logps/rejected": -323.2598876953125, "loss": 0.6431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8447303771972656, "rewards/margins": 0.3955624997615814, "rewards/rejected": 1.4491679668426514, "step": 13930 }, { "epoch": 0.6471981057616417, "grad_norm": 21.938623428344727, "learning_rate": 2.6119875574539203e-07, "logits/chosen": -18.412853240966797, "logits/rejected": -17.78252601623535, "logps/chosen": -430.2112731933594, "logps/rejected": -378.9376525878906, "loss": 0.7817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8012282848358154, "rewards/margins": 0.5556203126907349, "rewards/rejected": 2.24560809135437, "step": 13940 }, { "epoch": 0.6476623798690747, "grad_norm": 78.70970153808594, "learning_rate": 2.6117089929894607e-07, "logits/chosen": -18.817602157592773, "logits/rejected": -17.508056640625, "logps/chosen": -402.56341552734375, 
"logps/rejected": -257.49603271484375, "loss": 0.5452, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1787760257720947, "rewards/margins": 0.5730606317520142, "rewards/rejected": 1.6057153940200806, "step": 13950 }, { "epoch": 0.6481266539765077, "grad_norm": 96.236572265625, "learning_rate": 2.611430428525001e-07, "logits/chosen": -18.62655258178711, "logits/rejected": -17.84115219116211, "logps/chosen": -408.64422607421875, "logps/rejected": -360.8636779785156, "loss": 0.5744, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1923580169677734, "rewards/margins": 0.642852783203125, "rewards/rejected": 2.5495052337646484, "step": 13960 }, { "epoch": 0.6485909280839408, "grad_norm": 104.95927429199219, "learning_rate": 2.6111518640605415e-07, "logits/chosen": -19.14811134338379, "logits/rejected": -18.359975814819336, "logps/chosen": -407.9607238769531, "logps/rejected": -379.056396484375, "loss": 0.6773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8229377269744873, "rewards/margins": 0.30708375573158264, "rewards/rejected": 2.5158541202545166, "step": 13970 }, { "epoch": 0.6490552021913738, "grad_norm": 53.43663787841797, "learning_rate": 2.6108732995960814e-07, "logits/chosen": -18.460634231567383, "logits/rejected": -17.49732780456543, "logps/chosen": -347.4915771484375, "logps/rejected": -241.32986450195312, "loss": 0.5932, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2900519371032715, "rewards/margins": 0.7711428999900818, "rewards/rejected": 1.518908977508545, "step": 13980 }, { "epoch": 0.6495194762988068, "grad_norm": 57.90388107299805, "learning_rate": 2.6105947351316213e-07, "logits/chosen": -18.407772064208984, "logits/rejected": -18.17156219482422, "logps/chosen": -457.6895446777344, "logps/rejected": -390.0555725097656, "loss": 1.0826, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.244044065475464, "rewards/margins": -0.356015682220459, "rewards/rejected": 2.600059986114502, 
"step": 13990 }, { "epoch": 0.6499837504062398, "grad_norm": 123.4892349243164, "learning_rate": 2.6103161706671617e-07, "logits/chosen": -18.23565101623535, "logits/rejected": -18.42336654663086, "logps/chosen": -441.6689453125, "logps/rejected": -460.9717712402344, "loss": 0.7644, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.586176633834839, "rewards/margins": 0.07209710031747818, "rewards/rejected": 2.5140795707702637, "step": 14000 }, { "epoch": 0.6504480245136729, "grad_norm": 109.08653259277344, "learning_rate": 2.610037606202702e-07, "logits/chosen": -18.458370208740234, "logits/rejected": -18.254796981811523, "logps/chosen": -379.4212341308594, "logps/rejected": -289.3758239746094, "loss": 0.6669, "rewards/accuracies": 0.5, "rewards/chosen": 2.500828504562378, "rewards/margins": 0.2709060311317444, "rewards/rejected": 2.229922294616699, "step": 14010 }, { "epoch": 0.6509122986211059, "grad_norm": 8.993896484375, "learning_rate": 2.609759041738242e-07, "logits/chosen": -19.213422775268555, "logits/rejected": -17.81302833557129, "logps/chosen": -369.1484375, "logps/rejected": -239.0540008544922, "loss": 0.5643, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4961042404174805, "rewards/margins": 0.8971740007400513, "rewards/rejected": 1.5989301204681396, "step": 14020 }, { "epoch": 0.6513765727285389, "grad_norm": 74.78120422363281, "learning_rate": 2.6094804772737824e-07, "logits/chosen": -19.152965545654297, "logits/rejected": -17.20040512084961, "logps/chosen": -520.416259765625, "logps/rejected": -311.93280029296875, "loss": 0.5267, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.278148651123047, "rewards/margins": 1.3870490789413452, "rewards/rejected": 1.8910995721817017, "step": 14030 }, { "epoch": 0.651840846835972, "grad_norm": 17.366586685180664, "learning_rate": 2.609201912809323e-07, "logits/chosen": -18.418766021728516, "logits/rejected": -18.063335418701172, "logps/chosen": -434.76751708984375, 
"logps/rejected": -339.3798828125, "loss": 0.7428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.008880853652954, "rewards/margins": 0.2356283962726593, "rewards/rejected": 1.773252248764038, "step": 14040 }, { "epoch": 0.652305120943405, "grad_norm": 54.305824279785156, "learning_rate": 2.6089233483448627e-07, "logits/chosen": -18.348108291625977, "logits/rejected": -17.532527923583984, "logps/chosen": -433.37493896484375, "logps/rejected": -364.296630859375, "loss": 0.6409, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.748547315597534, "rewards/margins": 0.9310499429702759, "rewards/rejected": 1.8174974918365479, "step": 14050 }, { "epoch": 0.652769395050838, "grad_norm": 39.63496017456055, "learning_rate": 2.6086447838804026e-07, "logits/chosen": -18.384159088134766, "logits/rejected": -17.59640121459961, "logps/chosen": -346.83624267578125, "logps/rejected": -310.5010986328125, "loss": 0.8094, "rewards/accuracies": 0.5, "rewards/chosen": 2.0764310359954834, "rewards/margins": 0.09058712422847748, "rewards/rejected": 1.9858436584472656, "step": 14060 }, { "epoch": 0.6532336691582711, "grad_norm": 30.026247024536133, "learning_rate": 2.608366219415943e-07, "logits/chosen": -18.540760040283203, "logits/rejected": -17.381061553955078, "logps/chosen": -422.14385986328125, "logps/rejected": -272.33551025390625, "loss": 0.3965, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.49885892868042, "rewards/margins": 1.0299386978149414, "rewards/rejected": 1.4689202308654785, "step": 14070 }, { "epoch": 0.6536979432657041, "grad_norm": 25.196399688720703, "learning_rate": 2.6080876549514834e-07, "logits/chosen": -18.434104919433594, "logits/rejected": -17.308137893676758, "logps/chosen": -350.1795959472656, "logps/rejected": -243.83731079101562, "loss": 0.7151, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8899635076522827, "rewards/margins": 0.5377585887908936, "rewards/rejected": 1.3522050380706787, "step": 14080 }, 
{ "epoch": 0.6541622173731371, "grad_norm": 56.815731048583984, "learning_rate": 2.607809090487024e-07, "logits/chosen": -19.846559524536133, "logits/rejected": -18.67323875427246, "logps/chosen": -401.45843505859375, "logps/rejected": -271.89739990234375, "loss": 0.4768, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3545942306518555, "rewards/margins": 0.786182701587677, "rewards/rejected": 1.5684115886688232, "step": 14090 }, { "epoch": 0.6546264914805702, "grad_norm": 147.10552978515625, "learning_rate": 2.6075305260225637e-07, "logits/chosen": -18.08289337158203, "logits/rejected": -17.836698532104492, "logps/chosen": -403.0438537597656, "logps/rejected": -374.1336975097656, "loss": 0.8659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.589170455932617, "rewards/margins": 0.4879682660102844, "rewards/rejected": 2.1012017726898193, "step": 14100 }, { "epoch": 0.6550907655880032, "grad_norm": 48.981781005859375, "learning_rate": 2.6072519615581036e-07, "logits/chosen": -19.276782989501953, "logits/rejected": -18.232919692993164, "logps/chosen": -420.30950927734375, "logps/rejected": -306.2165832519531, "loss": 0.4956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.284855842590332, "rewards/margins": 0.7266570329666138, "rewards/rejected": 1.5581989288330078, "step": 14110 }, { "epoch": 0.6555550396954362, "grad_norm": 26.42806053161621, "learning_rate": 2.606973397093644e-07, "logits/chosen": -18.421504974365234, "logits/rejected": -17.782981872558594, "logps/chosen": -237.6007537841797, "logps/rejected": -217.6804656982422, "loss": 0.5567, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3381067514419556, "rewards/margins": 0.3882443308830261, "rewards/rejected": 0.9498625993728638, "step": 14120 }, { "epoch": 0.6560193138028693, "grad_norm": 14.032709121704102, "learning_rate": 2.606694832629184e-07, "logits/chosen": -18.88724136352539, "logits/rejected": -17.74992561340332, "logps/chosen": 
-412.32281494140625, "logps/rejected": -309.35888671875, "loss": 0.5408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0255799293518066, "rewards/margins": 0.5670917630195618, "rewards/rejected": 1.4584882259368896, "step": 14130 }, { "epoch": 0.6564835879103023, "grad_norm": 19.059553146362305, "learning_rate": 2.6064162681647243e-07, "logits/chosen": -18.44723129272461, "logits/rejected": -17.506925582885742, "logps/chosen": -347.439453125, "logps/rejected": -290.5750427246094, "loss": 0.4077, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.477383852005005, "rewards/margins": 0.8965632319450378, "rewards/rejected": 1.5808206796646118, "step": 14140 }, { "epoch": 0.6569478620177353, "grad_norm": 95.6689453125, "learning_rate": 2.6061377037002647e-07, "logits/chosen": -18.860992431640625, "logits/rejected": -17.728336334228516, "logps/chosen": -435.13134765625, "logps/rejected": -359.2800598144531, "loss": 0.5843, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.666141986846924, "rewards/margins": 0.7034586071968079, "rewards/rejected": 1.9626833200454712, "step": 14150 }, { "epoch": 0.6574121361251682, "grad_norm": 40.53849792480469, "learning_rate": 2.6058591392358046e-07, "logits/chosen": -18.90114402770996, "logits/rejected": -18.890771865844727, "logps/chosen": -364.45391845703125, "logps/rejected": -388.6430969238281, "loss": 0.6585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.217205047607422, "rewards/margins": 0.20773717761039734, "rewards/rejected": 2.009467601776123, "step": 14160 }, { "epoch": 0.6578764102326013, "grad_norm": 39.999046325683594, "learning_rate": 2.605580574771345e-07, "logits/chosen": -19.03630256652832, "logits/rejected": -17.39179801940918, "logps/chosen": -352.67706298828125, "logps/rejected": -215.7442169189453, "loss": 0.4818, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2443127632141113, "rewards/margins": 1.1834564208984375, "rewards/rejected": 
1.0608562231063843, "step": 14170 }, { "epoch": 0.6583406843400343, "grad_norm": 6.251531600952148, "learning_rate": 2.605302010306885e-07, "logits/chosen": -18.53708267211914, "logits/rejected": -17.671276092529297, "logps/chosen": -506.5619201660156, "logps/rejected": -409.3047790527344, "loss": 0.5695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.674015522003174, "rewards/margins": 0.6560854911804199, "rewards/rejected": 2.017929792404175, "step": 14180 }, { "epoch": 0.6588049584474673, "grad_norm": 193.25416564941406, "learning_rate": 2.6050234458424253e-07, "logits/chosen": -18.358707427978516, "logits/rejected": -17.72328758239746, "logps/chosen": -480.302734375, "logps/rejected": -382.96881103515625, "loss": 0.623, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.950335741043091, "rewards/margins": 0.6820840835571289, "rewards/rejected": 2.268251657485962, "step": 14190 }, { "epoch": 0.6592692325549004, "grad_norm": 57.6225471496582, "learning_rate": 2.6047448813779657e-07, "logits/chosen": -18.83196449279785, "logits/rejected": -17.851404190063477, "logps/chosen": -349.0439758300781, "logps/rejected": -249.7931365966797, "loss": 0.504, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1410484313964844, "rewards/margins": 0.746303141117096, "rewards/rejected": 1.3947453498840332, "step": 14200 }, { "epoch": 0.6597335066623334, "grad_norm": 226.26107788085938, "learning_rate": 2.6044663169135056e-07, "logits/chosen": -18.803401947021484, "logits/rejected": -18.36468505859375, "logps/chosen": -358.6358642578125, "logps/rejected": -361.0525817871094, "loss": 0.8723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.901235818862915, "rewards/margins": 0.028040587902069092, "rewards/rejected": 1.8731950521469116, "step": 14210 }, { "epoch": 0.6601977807697664, "grad_norm": 59.60431671142578, "learning_rate": 2.604187752449046e-07, "logits/chosen": -18.055856704711914, "logits/rejected": -18.210922241210938, 
"logps/chosen": -340.8406677246094, "logps/rejected": -376.73834228515625, "loss": 0.7138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8676210641860962, "rewards/margins": 0.0017682015895843506, "rewards/rejected": 1.865852952003479, "step": 14220 }, { "epoch": 0.6606620548771995, "grad_norm": 173.4868621826172, "learning_rate": 2.603909187984586e-07, "logits/chosen": -18.900876998901367, "logits/rejected": -17.80840492248535, "logps/chosen": -551.249267578125, "logps/rejected": -415.1490173339844, "loss": 0.5375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.973593235015869, "rewards/margins": 0.4391966760158539, "rewards/rejected": 2.5343966484069824, "step": 14230 }, { "epoch": 0.6611263289846325, "grad_norm": 27.2500057220459, "learning_rate": 2.603630623520126e-07, "logits/chosen": -18.118106842041016, "logits/rejected": -17.132787704467773, "logps/chosen": -368.18316650390625, "logps/rejected": -270.80902099609375, "loss": 0.5605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5310657024383545, "rewards/margins": 1.0525398254394531, "rewards/rejected": 1.4785257577896118, "step": 14240 }, { "epoch": 0.6615906030920655, "grad_norm": 75.64907836914062, "learning_rate": 2.603352059055666e-07, "logits/chosen": -19.047405242919922, "logits/rejected": -18.578266143798828, "logps/chosen": -461.0476989746094, "logps/rejected": -308.56353759765625, "loss": 0.8126, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.944824695587158, "rewards/margins": 0.19393223524093628, "rewards/rejected": 2.750892162322998, "step": 14250 }, { "epoch": 0.6620548771994986, "grad_norm": 48.29677200317383, "learning_rate": 2.6030734945912065e-07, "logits/chosen": -19.33269691467285, "logits/rejected": -17.749881744384766, "logps/chosen": -422.80267333984375, "logps/rejected": -262.37908935546875, "loss": 0.391, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7740485668182373, "rewards/margins": 1.2429805994033813, 
"rewards/rejected": 1.531067967414856, "step": 14260 }, { "epoch": 0.6625191513069316, "grad_norm": 103.08689880371094, "learning_rate": 2.602794930126747e-07, "logits/chosen": -18.987747192382812, "logits/rejected": -18.237567901611328, "logps/chosen": -408.98370361328125, "logps/rejected": -317.15118408203125, "loss": 0.5623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3839287757873535, "rewards/margins": 0.5836323499679565, "rewards/rejected": 1.8002960681915283, "step": 14270 }, { "epoch": 0.6629834254143646, "grad_norm": 46.23683166503906, "learning_rate": 2.602516365662287e-07, "logits/chosen": -19.568470001220703, "logits/rejected": -19.149593353271484, "logps/chosen": -443.29132080078125, "logps/rejected": -383.8904724121094, "loss": 0.6423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8810195922851562, "rewards/margins": 0.4111095070838928, "rewards/rejected": 2.4699103832244873, "step": 14280 }, { "epoch": 0.6634476995217977, "grad_norm": 5.542292594909668, "learning_rate": 2.602237801197827e-07, "logits/chosen": -19.053329467773438, "logits/rejected": -17.481441497802734, "logps/chosen": -425.7159729003906, "logps/rejected": -256.6299743652344, "loss": 0.4076, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7593235969543457, "rewards/margins": 1.2087558507919312, "rewards/rejected": 1.550567626953125, "step": 14290 }, { "epoch": 0.6639119736292307, "grad_norm": 20.67959213256836, "learning_rate": 2.601959236733367e-07, "logits/chosen": -19.287914276123047, "logits/rejected": -18.651582717895508, "logps/chosen": -415.52801513671875, "logps/rejected": -320.80645751953125, "loss": 0.5425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.535635471343994, "rewards/margins": 0.4690801501274109, "rewards/rejected": 2.0665552616119385, "step": 14300 }, { "epoch": 0.6643762477366637, "grad_norm": 17.452577590942383, "learning_rate": 2.6016806722689075e-07, "logits/chosen": -18.235210418701172, 
"logits/rejected": -16.86187171936035, "logps/chosen": -530.5712890625, "logps/rejected": -384.0062255859375, "loss": 0.4874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4153149127960205, "rewards/margins": 1.0634825229644775, "rewards/rejected": 2.3518319129943848, "step": 14310 }, { "epoch": 0.6648405218440968, "grad_norm": 56.1185302734375, "learning_rate": 2.6014021078044474e-07, "logits/chosen": -19.155109405517578, "logits/rejected": -18.52114486694336, "logps/chosen": -372.20355224609375, "logps/rejected": -296.0704040527344, "loss": 0.8122, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7067859172821045, "rewards/margins": -0.10588064044713974, "rewards/rejected": 1.8126665353775024, "step": 14320 }, { "epoch": 0.6653047959515298, "grad_norm": 139.68072509765625, "learning_rate": 2.601123543339988e-07, "logits/chosen": -18.29166030883789, "logits/rejected": -17.559385299682617, "logps/chosen": -420.64013671875, "logps/rejected": -327.3028564453125, "loss": 0.5462, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7324483394622803, "rewards/margins": 0.9791037440299988, "rewards/rejected": 1.7533447742462158, "step": 14330 }, { "epoch": 0.6657690700589628, "grad_norm": 88.78142547607422, "learning_rate": 2.600844978875528e-07, "logits/chosen": -18.324705123901367, "logits/rejected": -18.04181671142578, "logps/chosen": -310.08636474609375, "logps/rejected": -297.91571044921875, "loss": 0.5875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7686665058135986, "rewards/margins": 0.2939060628414154, "rewards/rejected": 1.4747604131698608, "step": 14340 }, { "epoch": 0.6662333441663958, "grad_norm": 59.8984260559082, "learning_rate": 2.600566414411068e-07, "logits/chosen": -18.159025192260742, "logits/rejected": -16.619535446166992, "logps/chosen": -401.145751953125, "logps/rejected": -259.53668212890625, "loss": 0.3827, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.681074857711792, 
"rewards/margins": 1.208875060081482, "rewards/rejected": 1.4721999168395996, "step": 14350 }, { "epoch": 0.6666976182738289, "grad_norm": 133.3522186279297, "learning_rate": 2.600287849946608e-07, "logits/chosen": -18.786714553833008, "logits/rejected": -17.926462173461914, "logps/chosen": -408.43658447265625, "logps/rejected": -350.6014099121094, "loss": 0.6439, "rewards/accuracies": 0.5, "rewards/chosen": 2.4877848625183105, "rewards/margins": 0.5268948674201965, "rewards/rejected": 1.9608900547027588, "step": 14360 }, { "epoch": 0.6671618923812619, "grad_norm": 59.07412338256836, "learning_rate": 2.6000092854821484e-07, "logits/chosen": -17.848007202148438, "logits/rejected": -17.642282485961914, "logps/chosen": -336.8330993652344, "logps/rejected": -267.67852783203125, "loss": 0.6707, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3725650310516357, "rewards/margins": 0.5119763016700745, "rewards/rejected": 1.8605890274047852, "step": 14370 }, { "epoch": 0.6676261664886949, "grad_norm": 173.9022674560547, "learning_rate": 2.599730721017689e-07, "logits/chosen": -17.878915786743164, "logits/rejected": -18.054401397705078, "logps/chosen": -429.21917724609375, "logps/rejected": -476.1846618652344, "loss": 0.7369, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4131064414978027, "rewards/margins": 0.200283482670784, "rewards/rejected": 2.2128233909606934, "step": 14380 }, { "epoch": 0.668090440596128, "grad_norm": 67.15686798095703, "learning_rate": 2.599452156553229e-07, "logits/chosen": -17.219058990478516, "logits/rejected": -18.307579040527344, "logps/chosen": -323.0881042480469, "logps/rejected": -370.53094482421875, "loss": 1.0432, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.6754028797149658, "rewards/margins": -0.5164424180984497, "rewards/rejected": 2.191845417022705, "step": 14390 }, { "epoch": 0.668554714703561, "grad_norm": 68.97955322265625, "learning_rate": 2.599173592088769e-07, "logits/chosen": 
-19.495094299316406, "logits/rejected": -18.28290557861328, "logps/chosen": -400.76953125, "logps/rejected": -245.23123168945312, "loss": 0.3808, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7662041187286377, "rewards/margins": 1.1897907257080078, "rewards/rejected": 1.5764133930206299, "step": 14400 }, { "epoch": 0.669018988810994, "grad_norm": 9.108837127685547, "learning_rate": 2.598895027624309e-07, "logits/chosen": -17.57837677001953, "logits/rejected": -17.344467163085938, "logps/chosen": -300.15509033203125, "logps/rejected": -257.2615051269531, "loss": 0.6318, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3766183853149414, "rewards/margins": 0.6860202550888062, "rewards/rejected": 1.6905982494354248, "step": 14410 }, { "epoch": 0.6694832629184271, "grad_norm": 73.48458099365234, "learning_rate": 2.5986164631598494e-07, "logits/chosen": -18.115324020385742, "logits/rejected": -17.433881759643555, "logps/chosen": -280.5016174316406, "logps/rejected": -242.1476287841797, "loss": 0.6056, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8111873865127563, "rewards/margins": 0.4045644700527191, "rewards/rejected": 1.406623125076294, "step": 14420 }, { "epoch": 0.6699475370258601, "grad_norm": 38.391265869140625, "learning_rate": 2.59833789869539e-07, "logits/chosen": -19.428726196289062, "logits/rejected": -18.081462860107422, "logps/chosen": -407.74505615234375, "logps/rejected": -293.827392578125, "loss": 0.488, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.953850507736206, "rewards/margins": 0.9984580874443054, "rewards/rejected": 1.955392837524414, "step": 14430 }, { "epoch": 0.6704118111332931, "grad_norm": 218.9901885986328, "learning_rate": 2.5980593342309297e-07, "logits/chosen": -18.843881607055664, "logits/rejected": -18.242111206054688, "logps/chosen": -473.00079345703125, "logps/rejected": -427.8575134277344, "loss": 0.8209, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
2.548048734664917, "rewards/margins": 0.15986216068267822, "rewards/rejected": 2.3881869316101074, "step": 14440 }, { "epoch": 0.6708760852407262, "grad_norm": 145.2386016845703, "learning_rate": 2.59778076976647e-07, "logits/chosen": -18.163387298583984, "logits/rejected": -17.98796272277832, "logps/chosen": -431.116943359375, "logps/rejected": -415.35321044921875, "loss": 0.782, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.180393934249878, "rewards/margins": 0.04510922357439995, "rewards/rejected": 2.135284662246704, "step": 14450 }, { "epoch": 0.6713403593481592, "grad_norm": 107.54997253417969, "learning_rate": 2.5975022053020105e-07, "logits/chosen": -18.373920440673828, "logits/rejected": -17.749034881591797, "logps/chosen": -350.6304931640625, "logps/rejected": -250.34500122070312, "loss": 0.5042, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2371749877929688, "rewards/margins": 0.7223132848739624, "rewards/rejected": 1.5148615837097168, "step": 14460 }, { "epoch": 0.6718046334555922, "grad_norm": 70.60714721679688, "learning_rate": 2.5972236408375504e-07, "logits/chosen": -18.43004608154297, "logits/rejected": -16.872575759887695, "logps/chosen": -421.2264709472656, "logps/rejected": -236.03860473632812, "loss": 0.5562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.264214038848877, "rewards/margins": 0.8433204889297485, "rewards/rejected": 1.420893907546997, "step": 14470 }, { "epoch": 0.6722689075630253, "grad_norm": 159.58860778808594, "learning_rate": 2.5969450763730903e-07, "logits/chosen": -18.40471839904785, "logits/rejected": -18.76013946533203, "logps/chosen": -386.65374755859375, "logps/rejected": -368.5183410644531, "loss": 0.8094, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.176185369491577, "rewards/margins": -0.055425237864255905, "rewards/rejected": 2.2316107749938965, "step": 14480 }, { "epoch": 0.6727331816704583, "grad_norm": 89.38944244384766, "learning_rate": 
2.5966665119086307e-07, "logits/chosen": -17.808839797973633, "logits/rejected": -17.426069259643555, "logps/chosen": -417.5494079589844, "logps/rejected": -284.78863525390625, "loss": 0.6677, "rewards/accuracies": 0.5, "rewards/chosen": 2.30556321144104, "rewards/margins": 0.6063214540481567, "rewards/rejected": 1.6992416381835938, "step": 14490 }, { "epoch": 0.6731974557778913, "grad_norm": 269.3271789550781, "learning_rate": 2.596387947444171e-07, "logits/chosen": -17.98241424560547, "logits/rejected": -17.52787208557129, "logps/chosen": -397.0584716796875, "logps/rejected": -345.80560302734375, "loss": 0.6252, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.659759044647217, "rewards/margins": 0.7043096423149109, "rewards/rejected": 1.9554493427276611, "step": 14500 }, { "epoch": 0.6736617298853242, "grad_norm": 15.749408721923828, "learning_rate": 2.596109382979711e-07, "logits/chosen": -18.432952880859375, "logits/rejected": -17.76283073425293, "logps/chosen": -436.79620361328125, "logps/rejected": -360.162841796875, "loss": 0.5888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.338270664215088, "rewards/margins": 0.5283696055412292, "rewards/rejected": 1.8099009990692139, "step": 14510 }, { "epoch": 0.6741260039927573, "grad_norm": 68.42239379882812, "learning_rate": 2.5958308185152514e-07, "logits/chosen": -19.25347137451172, "logits/rejected": -18.111051559448242, "logps/chosen": -446.6595764160156, "logps/rejected": -360.86285400390625, "loss": 0.5171, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7547614574432373, "rewards/margins": 0.6817244291305542, "rewards/rejected": 2.0730373859405518, "step": 14520 }, { "epoch": 0.6745902781001903, "grad_norm": 57.241249084472656, "learning_rate": 2.5955522540507913e-07, "logits/chosen": -19.706514358520508, "logits/rejected": -18.59341049194336, "logps/chosen": -342.9632873535156, "logps/rejected": -298.21099853515625, "loss": 0.4812, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.170809745788574, "rewards/margins": 0.6432555913925171, "rewards/rejected": 1.5275542736053467, "step": 14530 }, { "epoch": 0.6750545522076233, "grad_norm": 19.003219604492188, "learning_rate": 2.5952736895863317e-07, "logits/chosen": -18.707035064697266, "logits/rejected": -17.65414810180664, "logps/chosen": -471.84326171875, "logps/rejected": -269.08917236328125, "loss": 0.4128, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5396888256073, "rewards/margins": 0.9291170239448547, "rewards/rejected": 1.6105716228485107, "step": 14540 }, { "epoch": 0.6755188263150564, "grad_norm": 221.60691833496094, "learning_rate": 2.5949951251218716e-07, "logits/chosen": -18.611799240112305, "logits/rejected": -17.975183486938477, "logps/chosen": -402.5376892089844, "logps/rejected": -391.1629333496094, "loss": 0.7738, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.0481791496276855, "rewards/margins": 0.07752052694559097, "rewards/rejected": 1.9706586599349976, "step": 14550 }, { "epoch": 0.6759831004224894, "grad_norm": 158.36801147460938, "learning_rate": 2.594716560657412e-07, "logits/chosen": -18.750438690185547, "logits/rejected": -18.284423828125, "logps/chosen": -399.79736328125, "logps/rejected": -336.0236511230469, "loss": 0.6258, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4433846473693848, "rewards/margins": 0.3026999831199646, "rewards/rejected": 2.1406846046447754, "step": 14560 }, { "epoch": 0.6764473745299224, "grad_norm": 7.1938090324401855, "learning_rate": 2.5944379961929524e-07, "logits/chosen": -17.97174072265625, "logits/rejected": -16.9613094329834, "logps/chosen": -408.3375549316406, "logps/rejected": -261.47198486328125, "loss": 0.4867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.870103120803833, "rewards/margins": 1.4127379655838013, "rewards/rejected": 1.4573653936386108, "step": 14570 }, { "epoch": 0.6769116486373555, "grad_norm": 142.641357421875, 
"learning_rate": 2.594159431728492e-07, "logits/chosen": -19.140380859375, "logits/rejected": -18.202899932861328, "logps/chosen": -454.3133239746094, "logps/rejected": -279.1556396484375, "loss": 0.4766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.665656566619873, "rewards/margins": 0.8195323944091797, "rewards/rejected": 1.8461240530014038, "step": 14580 }, { "epoch": 0.6773759227447885, "grad_norm": 50.98454284667969, "learning_rate": 2.5938808672640327e-07, "logits/chosen": -19.045791625976562, "logits/rejected": -17.457202911376953, "logps/chosen": -433.3394470214844, "logps/rejected": -306.676025390625, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 2.737039089202881, "rewards/margins": 1.104773998260498, "rewards/rejected": 1.6322650909423828, "step": 14590 }, { "epoch": 0.6778401968522215, "grad_norm": 42.632720947265625, "learning_rate": 2.5936023027995725e-07, "logits/chosen": -17.838226318359375, "logits/rejected": -17.707124710083008, "logps/chosen": -217.39810180664062, "logps/rejected": -202.2642822265625, "loss": 0.7, "rewards/accuracies": 0.5, "rewards/chosen": 1.1266839504241943, "rewards/margins": 0.03949253633618355, "rewards/rejected": 1.0871914625167847, "step": 14600 }, { "epoch": 0.6783044709596546, "grad_norm": 45.517269134521484, "learning_rate": 2.593323738335113e-07, "logits/chosen": -18.727741241455078, "logits/rejected": -18.386463165283203, "logps/chosen": -306.421630859375, "logps/rejected": -255.8956298828125, "loss": 0.6741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0004770755767822, "rewards/margins": 0.20212197303771973, "rewards/rejected": 1.7983548641204834, "step": 14610 }, { "epoch": 0.6787687450670876, "grad_norm": 14.436305046081543, "learning_rate": 2.5930451738706534e-07, "logits/chosen": -18.932083129882812, "logits/rejected": -18.164140701293945, "logps/chosen": -369.27081298828125, "logps/rejected": -263.1763000488281, "loss": 0.6377, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 1.9987398386001587, "rewards/margins": 0.7010045051574707, "rewards/rejected": 1.2977354526519775, "step": 14620 }, { "epoch": 0.6792330191745206, "grad_norm": 111.79026794433594, "learning_rate": 2.592766609406193e-07, "logits/chosen": -19.852100372314453, "logits/rejected": -17.74478530883789, "logps/chosen": -405.11151123046875, "logps/rejected": -258.54278564453125, "loss": 0.3784, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.872633695602417, "rewards/margins": 1.1913855075836182, "rewards/rejected": 1.681248664855957, "step": 14630 }, { "epoch": 0.6796972932819537, "grad_norm": 19.082122802734375, "learning_rate": 2.5924880449417337e-07, "logits/chosen": -18.66078758239746, "logits/rejected": -18.896228790283203, "logps/chosen": -363.1207580566406, "logps/rejected": -347.075439453125, "loss": 0.6192, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9960496425628662, "rewards/margins": 0.2517058253288269, "rewards/rejected": 1.7443437576293945, "step": 14640 }, { "epoch": 0.6801615673893867, "grad_norm": 66.5438461303711, "learning_rate": 2.5922094804772735e-07, "logits/chosen": -18.826244354248047, "logits/rejected": -18.56644058227539, "logps/chosen": -397.42010498046875, "logps/rejected": -346.9493408203125, "loss": 0.6464, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2857518196105957, "rewards/margins": 0.4876762926578522, "rewards/rejected": 1.7980754375457764, "step": 14650 }, { "epoch": 0.6806258414968197, "grad_norm": 43.906131744384766, "learning_rate": 2.591930916012814e-07, "logits/chosen": -18.118085861206055, "logits/rejected": -17.78627586364746, "logps/chosen": -360.02410888671875, "logps/rejected": -341.370361328125, "loss": 1.0514, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.9804527759552002, "rewards/margins": -0.3798372745513916, "rewards/rejected": 2.360290050506592, "step": 14660 }, { "epoch": 0.6810901156042527, "grad_norm": 
14.616087913513184, "learning_rate": 2.591652351548354e-07, "logits/chosen": -18.185314178466797, "logits/rejected": -16.85137367248535, "logps/chosen": -357.2429504394531, "logps/rejected": -274.12103271484375, "loss": 0.4085, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.134138584136963, "rewards/margins": 0.9834100604057312, "rewards/rejected": 1.1507282257080078, "step": 14670 }, { "epoch": 0.6815543897116858, "grad_norm": 45.577606201171875, "learning_rate": 2.591373787083894e-07, "logits/chosen": -18.459238052368164, "logits/rejected": -17.792200088500977, "logps/chosen": -433.20562744140625, "logps/rejected": -324.6001281738281, "loss": 0.5626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.299402952194214, "rewards/margins": 0.42013779282569885, "rewards/rejected": 1.8792651891708374, "step": 14680 }, { "epoch": 0.6820186638191188, "grad_norm": 43.364620208740234, "learning_rate": 2.5910952226194347e-07, "logits/chosen": -18.41446876525879, "logits/rejected": -18.578441619873047, "logps/chosen": -412.3165588378906, "logps/rejected": -388.131591796875, "loss": 0.7378, "rewards/accuracies": 0.5, "rewards/chosen": 2.3656063079833984, "rewards/margins": 0.12209298461675644, "rewards/rejected": 2.243513584136963, "step": 14690 }, { "epoch": 0.6824829379265518, "grad_norm": 11.662203788757324, "learning_rate": 2.5908166581549745e-07, "logits/chosen": -18.354522705078125, "logits/rejected": -18.40561294555664, "logps/chosen": -406.80352783203125, "logps/rejected": -335.4165954589844, "loss": 0.7888, "rewards/accuracies": 0.5, "rewards/chosen": 1.9579699039459229, "rewards/margins": 0.12753772735595703, "rewards/rejected": 1.8304322957992554, "step": 14700 }, { "epoch": 0.6829472120339849, "grad_norm": 36.64363098144531, "learning_rate": 2.590538093690515e-07, "logits/chosen": -19.134258270263672, "logits/rejected": -18.516372680664062, "logps/chosen": -405.54022216796875, "logps/rejected": -322.9195251464844, "loss": 0.6404, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.910400629043579, "rewards/margins": 0.28963571786880493, "rewards/rejected": 1.6207647323608398, "step": 14710 }, { "epoch": 0.6834114861414179, "grad_norm": 40.12162780761719, "learning_rate": 2.590259529226055e-07, "logits/chosen": -17.681827545166016, "logits/rejected": -17.881431579589844, "logps/chosen": -331.610107421875, "logps/rejected": -363.8981628417969, "loss": 0.6115, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2035446166992188, "rewards/margins": 0.30192384123802185, "rewards/rejected": 1.901620864868164, "step": 14720 }, { "epoch": 0.6838757602488509, "grad_norm": 94.48839569091797, "learning_rate": 2.589980964761595e-07, "logits/chosen": -19.205808639526367, "logits/rejected": -18.188819885253906, "logps/chosen": -461.8021545410156, "logps/rejected": -389.748291015625, "loss": 0.6776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8471620082855225, "rewards/margins": 0.23431432247161865, "rewards/rejected": 2.6128478050231934, "step": 14730 }, { "epoch": 0.684340034356284, "grad_norm": 42.642887115478516, "learning_rate": 2.589702400297135e-07, "logits/chosen": -18.612123489379883, "logits/rejected": -18.608312606811523, "logps/chosen": -447.67303466796875, "logps/rejected": -431.4585876464844, "loss": 0.6195, "rewards/accuracies": 0.5, "rewards/chosen": 2.045959234237671, "rewards/margins": 0.24253304302692413, "rewards/rejected": 1.8034263849258423, "step": 14740 }, { "epoch": 0.684804308463717, "grad_norm": 49.61751937866211, "learning_rate": 2.5894238358326755e-07, "logits/chosen": -18.396068572998047, "logits/rejected": -17.858671188354492, "logps/chosen": -412.52239990234375, "logps/rejected": -409.74700927734375, "loss": 0.6387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.506324052810669, "rewards/margins": 0.2909148633480072, "rewards/rejected": 2.215409278869629, "step": 14750 }, { "epoch": 0.68526858257115, "grad_norm": 
57.36947250366211, "learning_rate": 2.589145271368216e-07, "logits/chosen": -18.13321304321289, "logits/rejected": -17.101459503173828, "logps/chosen": -461.33392333984375, "logps/rejected": -373.23040771484375, "loss": 0.4815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.985691547393799, "rewards/margins": 0.8805230259895325, "rewards/rejected": 2.1051688194274902, "step": 14760 }, { "epoch": 0.6857328566785831, "grad_norm": 190.5449981689453, "learning_rate": 2.588866706903756e-07, "logits/chosen": -18.241313934326172, "logits/rejected": -17.28030014038086, "logps/chosen": -441.47210693359375, "logps/rejected": -326.8228759765625, "loss": 0.4958, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.580183506011963, "rewards/margins": 0.8100013732910156, "rewards/rejected": 1.7701820135116577, "step": 14770 }, { "epoch": 0.6861971307860161, "grad_norm": 87.8952865600586, "learning_rate": 2.5885881424392957e-07, "logits/chosen": -19.030641555786133, "logits/rejected": -18.152633666992188, "logps/chosen": -303.4242248535156, "logps/rejected": -305.0023193359375, "loss": 0.7186, "rewards/accuracies": 0.5, "rewards/chosen": 2.402965545654297, "rewards/margins": 0.6009393334388733, "rewards/rejected": 1.8020265102386475, "step": 14780 }, { "epoch": 0.6866614048934491, "grad_norm": 75.95572662353516, "learning_rate": 2.588309577974836e-07, "logits/chosen": -17.453182220458984, "logits/rejected": -17.209081649780273, "logps/chosen": -377.00567626953125, "logps/rejected": -308.54534912109375, "loss": 0.7695, "rewards/accuracies": 0.5, "rewards/chosen": 1.8643678426742554, "rewards/margins": -0.0003834724484477192, "rewards/rejected": 1.8647512197494507, "step": 14790 }, { "epoch": 0.6871256790008822, "grad_norm": 83.17738342285156, "learning_rate": 2.5880310135103765e-07, "logits/chosen": -18.371837615966797, "logits/rejected": -18.095163345336914, "logps/chosen": -375.3080139160156, "logps/rejected": -349.3548889160156, "loss": 0.673, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.779850959777832, "rewards/margins": 0.4883800446987152, "rewards/rejected": 2.291471242904663, "step": 14800 }, { "epoch": 0.6875899531083152, "grad_norm": 30.008176803588867, "learning_rate": 2.587752449045917e-07, "logits/chosen": -18.244403839111328, "logits/rejected": -17.039175033569336, "logps/chosen": -395.4510498046875, "logps/rejected": -297.94580078125, "loss": 0.6529, "rewards/accuracies": 0.5, "rewards/chosen": 2.6861648559570312, "rewards/margins": 0.35341373085975647, "rewards/rejected": 2.3327507972717285, "step": 14810 }, { "epoch": 0.6880542272157482, "grad_norm": 101.33118438720703, "learning_rate": 2.587473884581457e-07, "logits/chosen": -17.2148380279541, "logits/rejected": -16.869630813598633, "logps/chosen": -279.7679138183594, "logps/rejected": -246.88662719726562, "loss": 0.7898, "rewards/accuracies": 0.5, "rewards/chosen": 1.5789215564727783, "rewards/margins": 0.03428538888692856, "rewards/rejected": 1.5446362495422363, "step": 14820 }, { "epoch": 0.6885185013231812, "grad_norm": 80.74662017822266, "learning_rate": 2.5871953201169967e-07, "logits/chosen": -18.69586181640625, "logits/rejected": -17.612804412841797, "logps/chosen": -277.5350036621094, "logps/rejected": -246.10482788085938, "loss": 0.8646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3895970582962036, "rewards/margins": 0.11419206857681274, "rewards/rejected": 1.2754050493240356, "step": 14830 }, { "epoch": 0.6889827754306143, "grad_norm": 47.27116012573242, "learning_rate": 2.586916755652537e-07, "logits/chosen": -19.63747787475586, "logits/rejected": -19.501256942749023, "logps/chosen": -433.8834533691406, "logps/rejected": -457.61859130859375, "loss": 0.8835, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.108901262283325, "rewards/margins": -0.24660997092723846, "rewards/rejected": 2.355510950088501, "step": 14840 }, { "epoch": 0.6894470495380473, "grad_norm": 160.12147521972656, 
"learning_rate": 2.5866381911880775e-07, "logits/chosen": -19.065948486328125, "logits/rejected": -18.825761795043945, "logps/chosen": -369.5048522949219, "logps/rejected": -370.174072265625, "loss": 0.7399, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.700739622116089, "rewards/margins": 0.3416876792907715, "rewards/rejected": 2.3590521812438965, "step": 14850 }, { "epoch": 0.6899113236454802, "grad_norm": 46.36777877807617, "learning_rate": 2.5863596267236174e-07, "logits/chosen": -19.08905792236328, "logits/rejected": -17.98282814025879, "logps/chosen": -451.8106384277344, "logps/rejected": -329.535888671875, "loss": 0.4918, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0442395210266113, "rewards/margins": 0.81275475025177, "rewards/rejected": 2.231484889984131, "step": 14860 }, { "epoch": 0.6903755977529134, "grad_norm": 276.23651123046875, "learning_rate": 2.586081062259158e-07, "logits/chosen": -18.088754653930664, "logits/rejected": -16.999277114868164, "logps/chosen": -423.4562072753906, "logps/rejected": -307.2756042480469, "loss": 0.4925, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.513960361480713, "rewards/margins": 1.1451936960220337, "rewards/rejected": 1.3687670230865479, "step": 14870 }, { "epoch": 0.6908398718603463, "grad_norm": 56.36841583251953, "learning_rate": 2.585802497794698e-07, "logits/chosen": -17.69057846069336, "logits/rejected": -18.013704299926758, "logps/chosen": -329.92340087890625, "logps/rejected": -355.7355041503906, "loss": 1.0148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.146149158477783, "rewards/margins": -0.29098841547966003, "rewards/rejected": 2.4371378421783447, "step": 14880 }, { "epoch": 0.6913041459677793, "grad_norm": 16.710664749145508, "learning_rate": 2.585523933330238e-07, "logits/chosen": -19.25457000732422, "logits/rejected": -17.731128692626953, "logps/chosen": -307.76727294921875, "logps/rejected": -172.61151123046875, "loss": 0.5041, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.020132064819336, "rewards/margins": 0.902147650718689, "rewards/rejected": 1.1179845333099365, "step": 14890 }, { "epoch": 0.6917684200752124, "grad_norm": 141.59678649902344, "learning_rate": 2.585245368865778e-07, "logits/chosen": -18.13597869873047, "logits/rejected": -18.314502716064453, "logps/chosen": -377.5732421875, "logps/rejected": -349.2835998535156, "loss": 1.1595, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.8787858486175537, "rewards/margins": -0.579521119594574, "rewards/rejected": 2.4583067893981934, "step": 14900 }, { "epoch": 0.6922326941826454, "grad_norm": 177.24395751953125, "learning_rate": 2.5849668044013184e-07, "logits/chosen": -18.065853118896484, "logits/rejected": -17.899465560913086, "logps/chosen": -502.58441162109375, "logps/rejected": -391.2371520996094, "loss": 0.8225, "rewards/accuracies": 0.5, "rewards/chosen": 2.244863986968994, "rewards/margins": 0.07091186195611954, "rewards/rejected": 2.173952102661133, "step": 14910 }, { "epoch": 0.6926969682900784, "grad_norm": 27.00899314880371, "learning_rate": 2.584688239936859e-07, "logits/chosen": -18.684398651123047, "logits/rejected": -17.093130111694336, "logps/chosen": -450.0508728027344, "logps/rejected": -368.47802734375, "loss": 0.8456, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6784167289733887, "rewards/margins": 0.45196160674095154, "rewards/rejected": 2.2264552116394043, "step": 14920 }, { "epoch": 0.6931612423975115, "grad_norm": 35.04024124145508, "learning_rate": 2.5844096754723987e-07, "logits/chosen": -19.265377044677734, "logits/rejected": -18.3966064453125, "logps/chosen": -482.4954528808594, "logps/rejected": -353.8456726074219, "loss": 0.495, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.781062602996826, "rewards/margins": 0.6680440902709961, "rewards/rejected": 2.113018274307251, "step": 14930 }, { "epoch": 0.6936255165049445, "grad_norm": 
209.60523986816406, "learning_rate": 2.584131111007939e-07, "logits/chosen": -18.07288932800293, "logits/rejected": -17.001766204833984, "logps/chosen": -468.0785217285156, "logps/rejected": -344.58038330078125, "loss": 0.6659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.408994674682617, "rewards/margins": 0.313355952501297, "rewards/rejected": 2.0956387519836426, "step": 14940 }, { "epoch": 0.6940897906123775, "grad_norm": 60.74957275390625, "learning_rate": 2.583852546543479e-07, "logits/chosen": -18.108806610107422, "logits/rejected": -18.03012466430664, "logps/chosen": -350.7191162109375, "logps/rejected": -361.6189270019531, "loss": 0.869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.468851089477539, "rewards/margins": 0.2224956750869751, "rewards/rejected": 2.2463555335998535, "step": 14950 }, { "epoch": 0.6945540647198106, "grad_norm": 0.9580501317977905, "learning_rate": 2.5835739820790194e-07, "logits/chosen": -17.746952056884766, "logits/rejected": -16.57653045654297, "logps/chosen": -481.20257568359375, "logps/rejected": -299.7729187011719, "loss": 0.5014, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0928564071655273, "rewards/margins": 1.086230754852295, "rewards/rejected": 2.0066258907318115, "step": 14960 }, { "epoch": 0.6950183388272436, "grad_norm": 100.66624450683594, "learning_rate": 2.583295417614559e-07, "logits/chosen": -19.670753479003906, "logits/rejected": -18.546039581298828, "logps/chosen": -533.1820068359375, "logps/rejected": -387.5909729003906, "loss": 0.5995, "rewards/accuracies": 0.5, "rewards/chosen": 3.2310423851013184, "rewards/margins": 0.6210626363754272, "rewards/rejected": 2.6099798679351807, "step": 14970 }, { "epoch": 0.6954826129346766, "grad_norm": 81.26729583740234, "learning_rate": 2.5830168531500997e-07, "logits/chosen": -17.892080307006836, "logits/rejected": -17.740394592285156, "logps/chosen": -302.5127258300781, "logps/rejected": -324.3413391113281, "loss": 0.6008, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8480708599090576, "rewards/margins": 0.2695433497428894, "rewards/rejected": 1.5785276889801025, "step": 14980 }, { "epoch": 0.6959468870421096, "grad_norm": 182.0566864013672, "learning_rate": 2.58273828868564e-07, "logits/chosen": -18.879310607910156, "logits/rejected": -17.42035484313965, "logps/chosen": -480.52557373046875, "logps/rejected": -361.7396240234375, "loss": 0.3709, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9397997856140137, "rewards/margins": 1.0570846796035767, "rewards/rejected": 1.882714867591858, "step": 14990 }, { "epoch": 0.6964111611495427, "grad_norm": 33.54416275024414, "learning_rate": 2.58245972422118e-07, "logits/chosen": -17.336400985717773, "logits/rejected": -18.27261734008789, "logps/chosen": -324.74066162109375, "logps/rejected": -462.55255126953125, "loss": 1.3412, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9364064931869507, "rewards/margins": -0.6585407853126526, "rewards/rejected": 2.594947099685669, "step": 15000 }, { "epoch": 0.6968754352569757, "grad_norm": 127.23167419433594, "learning_rate": 2.5821811597567204e-07, "logits/chosen": -18.13827896118164, "logits/rejected": -17.907787322998047, "logps/chosen": -388.3667297363281, "logps/rejected": -354.8091735839844, "loss": 0.9452, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7873634099960327, "rewards/margins": -0.17338725924491882, "rewards/rejected": 1.9607505798339844, "step": 15010 }, { "epoch": 0.6973397093644087, "grad_norm": 113.2035903930664, "learning_rate": 2.58190259529226e-07, "logits/chosen": -19.165157318115234, "logits/rejected": -19.001436233520508, "logps/chosen": -387.7070007324219, "logps/rejected": -360.2164611816406, "loss": 0.7792, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2671544551849365, "rewards/margins": 0.35063794255256653, "rewards/rejected": 1.916516661643982, "step": 15020 }, { "epoch": 0.6978039834718418, 
"grad_norm": 28.720115661621094, "learning_rate": 2.5816240308278007e-07, "logits/chosen": -18.11000633239746, "logits/rejected": -17.154369354248047, "logps/chosen": -440.3507385253906, "logps/rejected": -308.7139587402344, "loss": 0.4906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5880980491638184, "rewards/margins": 0.6639102697372437, "rewards/rejected": 1.9241876602172852, "step": 15030 }, { "epoch": 0.6982682575792748, "grad_norm": 22.879863739013672, "learning_rate": 2.581345466363341e-07, "logits/chosen": -18.34830665588379, "logits/rejected": -17.06411361694336, "logps/chosen": -327.84075927734375, "logps/rejected": -220.2678680419922, "loss": 0.4399, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5273656845092773, "rewards/margins": 0.9424979090690613, "rewards/rejected": 1.5848678350448608, "step": 15040 }, { "epoch": 0.6987325316867078, "grad_norm": 56.97555923461914, "learning_rate": 2.581066901898881e-07, "logits/chosen": -18.63327980041504, "logits/rejected": -17.707881927490234, "logps/chosen": -402.98944091796875, "logps/rejected": -287.61968994140625, "loss": 0.4283, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.479689836502075, "rewards/margins": 0.863043487071991, "rewards/rejected": 1.61664617061615, "step": 15050 }, { "epoch": 0.6991968057941409, "grad_norm": 28.6662540435791, "learning_rate": 2.5807883374344214e-07, "logits/chosen": -20.118942260742188, "logits/rejected": -19.06305503845215, "logps/chosen": -436.60626220703125, "logps/rejected": -353.06427001953125, "loss": 0.6032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.724092721939087, "rewards/margins": 0.3770723044872284, "rewards/rejected": 2.3470206260681152, "step": 15060 }, { "epoch": 0.6996610799015739, "grad_norm": 104.74388122558594, "learning_rate": 2.580509772969961e-07, "logits/chosen": -18.35064697265625, "logits/rejected": -17.91935157775879, "logps/chosen": -402.22747802734375, "logps/rejected": 
-273.9425048828125, "loss": 0.4213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0258121490478516, "rewards/margins": 1.122788429260254, "rewards/rejected": 1.9030239582061768, "step": 15070 }, { "epoch": 0.7001253540090069, "grad_norm": 64.13005828857422, "learning_rate": 2.5802312085055016e-07, "logits/chosen": -18.160337448120117, "logits/rejected": -17.852815628051758, "logps/chosen": -513.775146484375, "logps/rejected": -396.0306701660156, "loss": 0.6421, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.91558837890625, "rewards/margins": 0.48051881790161133, "rewards/rejected": 2.4350695610046387, "step": 15080 }, { "epoch": 0.70058962811644, "grad_norm": 35.0758056640625, "learning_rate": 2.5799526440410415e-07, "logits/chosen": -18.93985366821289, "logits/rejected": -19.039247512817383, "logps/chosen": -309.9839782714844, "logps/rejected": -323.76416015625, "loss": 0.7147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9841253757476807, "rewards/margins": 0.1133476048707962, "rewards/rejected": 1.8707777261734009, "step": 15090 }, { "epoch": 0.701053902223873, "grad_norm": 128.269287109375, "learning_rate": 2.579674079576582e-07, "logits/chosen": -19.638032913208008, "logits/rejected": -18.83876609802246, "logps/chosen": -342.46978759765625, "logps/rejected": -266.94757080078125, "loss": 0.5995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.301226854324341, "rewards/margins": 0.8897630572319031, "rewards/rejected": 1.411463737487793, "step": 15100 }, { "epoch": 0.701518176331306, "grad_norm": 167.26968383789062, "learning_rate": 2.5793955151121223e-07, "logits/chosen": -18.737531661987305, "logits/rejected": -17.040241241455078, "logps/chosen": -560.5887451171875, "logps/rejected": -337.01373291015625, "loss": 0.4121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.974256992340088, "rewards/margins": 1.5112377405166626, "rewards/rejected": 1.4630193710327148, "step": 15110 }, { "epoch": 
0.7019824504387391, "grad_norm": 51.19053649902344, "learning_rate": 2.579116950647662e-07, "logits/chosen": -18.364704132080078, "logits/rejected": -18.07223129272461, "logps/chosen": -321.5555419921875, "logps/rejected": -284.92266845703125, "loss": 0.7412, "rewards/accuracies": 0.5, "rewards/chosen": 2.2278411388397217, "rewards/margins": 0.1283295899629593, "rewards/rejected": 2.0995113849639893, "step": 15120 }, { "epoch": 0.7024467245461721, "grad_norm": 5.323837757110596, "learning_rate": 2.5788383861832026e-07, "logits/chosen": -18.794055938720703, "logits/rejected": -17.86052894592285, "logps/chosen": -384.6407165527344, "logps/rejected": -322.545654296875, "loss": 0.6013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.544116735458374, "rewards/margins": 0.7339708805084229, "rewards/rejected": 1.8101457357406616, "step": 15130 }, { "epoch": 0.7029109986536051, "grad_norm": 96.84346771240234, "learning_rate": 2.5785598217187425e-07, "logits/chosen": -19.16727638244629, "logits/rejected": -18.38973617553711, "logps/chosen": -440.5284729003906, "logps/rejected": -304.35430908203125, "loss": 0.5098, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5129473209381104, "rewards/margins": 0.7936462163925171, "rewards/rejected": 1.7193009853363037, "step": 15140 }, { "epoch": 0.7033752727610381, "grad_norm": 117.09784698486328, "learning_rate": 2.578281257254283e-07, "logits/chosen": -18.862173080444336, "logits/rejected": -18.047000885009766, "logps/chosen": -420.628173828125, "logps/rejected": -376.9472351074219, "loss": 0.7692, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.415001392364502, "rewards/margins": 0.11637966334819794, "rewards/rejected": 2.298621654510498, "step": 15150 }, { "epoch": 0.7038395468684712, "grad_norm": 58.887168884277344, "learning_rate": 2.578002692789823e-07, "logits/chosen": -18.98210334777832, "logits/rejected": -18.271869659423828, "logps/chosen": -435.2603454589844, "logps/rejected": 
-276.58416748046875, "loss": 0.5202, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.627089500427246, "rewards/margins": 0.8752420544624329, "rewards/rejected": 1.7518478631973267, "step": 15160 }, { "epoch": 0.7043038209759042, "grad_norm": 8.345998764038086, "learning_rate": 2.577724128325363e-07, "logits/chosen": -18.202503204345703, "logits/rejected": -17.55138397216797, "logps/chosen": -426.8097229003906, "logps/rejected": -322.6794128417969, "loss": 0.5923, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1646759510040283, "rewards/margins": 1.1620368957519531, "rewards/rejected": 2.002638816833496, "step": 15170 }, { "epoch": 0.7047680950833372, "grad_norm": 40.832794189453125, "learning_rate": 2.5774455638609036e-07, "logits/chosen": -17.674243927001953, "logits/rejected": -16.612106323242188, "logps/chosen": -435.15472412109375, "logps/rejected": -271.9411315917969, "loss": 0.5514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5251400470733643, "rewards/margins": 0.736548125743866, "rewards/rejected": 1.788591980934143, "step": 15180 }, { "epoch": 0.7052323691907703, "grad_norm": 101.04936218261719, "learning_rate": 2.5771669993964435e-07, "logits/chosen": -17.661819458007812, "logits/rejected": -17.92197036743164, "logps/chosen": -424.58447265625, "logps/rejected": -351.55194091796875, "loss": 0.9408, "rewards/accuracies": 0.5, "rewards/chosen": 2.673403263092041, "rewards/margins": 0.16625405848026276, "rewards/rejected": 2.5071492195129395, "step": 15190 }, { "epoch": 0.7056966432982033, "grad_norm": 220.45045471191406, "learning_rate": 2.5768884349319834e-07, "logits/chosen": -18.462417602539062, "logits/rejected": -18.303054809570312, "logps/chosen": -338.1781921386719, "logps/rejected": -378.3452453613281, "loss": 0.7737, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4484195709228516, "rewards/margins": 0.45933112502098083, "rewards/rejected": 1.9890884160995483, "step": 15200 }, { "epoch": 
0.7061609174056362, "grad_norm": 31.123291015625, "learning_rate": 2.576609870467524e-07, "logits/chosen": -17.92028045654297, "logits/rejected": -17.420625686645508, "logps/chosen": -448.1995544433594, "logps/rejected": -386.7388610839844, "loss": 0.6052, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.878033399581909, "rewards/margins": 0.747234046459198, "rewards/rejected": 2.1307992935180664, "step": 15210 }, { "epoch": 0.7066251915130694, "grad_norm": 33.457801818847656, "learning_rate": 2.576331306003064e-07, "logits/chosen": -17.556657791137695, "logits/rejected": -17.06082534790039, "logps/chosen": -375.8511657714844, "logps/rejected": -295.3192443847656, "loss": 0.6423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2990264892578125, "rewards/margins": 0.4748404622077942, "rewards/rejected": 1.8241857290267944, "step": 15220 }, { "epoch": 0.7070894656205023, "grad_norm": 74.68594360351562, "learning_rate": 2.5760527415386046e-07, "logits/chosen": -17.80000877380371, "logits/rejected": -17.80027961730957, "logps/chosen": -293.79974365234375, "logps/rejected": -271.6967468261719, "loss": 0.996, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.9689871072769165, "rewards/margins": -0.20287008583545685, "rewards/rejected": 2.1718571186065674, "step": 15230 }, { "epoch": 0.7075537397279353, "grad_norm": 208.004638671875, "learning_rate": 2.5757741770741445e-07, "logits/chosen": -18.676164627075195, "logits/rejected": -18.58183479309082, "logps/chosen": -409.8710021972656, "logps/rejected": -416.77020263671875, "loss": 1.0194, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.369065523147583, "rewards/margins": -0.4047682285308838, "rewards/rejected": 2.773833751678467, "step": 15240 }, { "epoch": 0.7080180138353684, "grad_norm": 142.51268005371094, "learning_rate": 2.5754956126096844e-07, "logits/chosen": -18.297183990478516, "logits/rejected": -17.701940536499023, "logps/chosen": -317.22845458984375, 
"logps/rejected": -320.31695556640625, "loss": 0.968, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.9933979511260986, "rewards/margins": -0.2564055919647217, "rewards/rejected": 2.2498037815093994, "step": 15250 }, { "epoch": 0.7084822879428014, "grad_norm": 11.934377670288086, "learning_rate": 2.575217048145225e-07, "logits/chosen": -18.404024124145508, "logits/rejected": -17.059370040893555, "logps/chosen": -392.11346435546875, "logps/rejected": -338.93817138671875, "loss": 0.9919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.409780263900757, "rewards/margins": 0.18074734508991241, "rewards/rejected": 2.2290327548980713, "step": 15260 }, { "epoch": 0.7089465620502344, "grad_norm": 205.65057373046875, "learning_rate": 2.574938483680765e-07, "logits/chosen": -19.324186325073242, "logits/rejected": -18.952415466308594, "logps/chosen": -420.0935974121094, "logps/rejected": -376.32769775390625, "loss": 0.6284, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.754453659057617, "rewards/margins": 0.6365318894386292, "rewards/rejected": 2.117922067642212, "step": 15270 }, { "epoch": 0.7094108361576675, "grad_norm": 30.43585777282715, "learning_rate": 2.574659919216305e-07, "logits/chosen": -19.041622161865234, "logits/rejected": -17.38463592529297, "logps/chosen": -428.69580078125, "logps/rejected": -234.86373901367188, "loss": 0.2967, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.311864137649536, "rewards/margins": 2.063993453979492, "rewards/rejected": 1.2478708028793335, "step": 15280 }, { "epoch": 0.7098751102651005, "grad_norm": 91.02122497558594, "learning_rate": 2.5743813547518455e-07, "logits/chosen": -18.007823944091797, "logits/rejected": -17.873281478881836, "logps/chosen": -418.67974853515625, "logps/rejected": -406.2182922363281, "loss": 0.5732, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6430585384368896, "rewards/margins": 0.3967207670211792, "rewards/rejected": 2.246338129043579, 
"step": 15290 }, { "epoch": 0.7103393843725335, "grad_norm": 72.3141860961914, "learning_rate": 2.574102790287386e-07, "logits/chosen": -19.168912887573242, "logits/rejected": -18.17534065246582, "logps/chosen": -331.9685974121094, "logps/rejected": -283.1922607421875, "loss": 0.4503, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.504049777984619, "rewards/margins": 0.7105748057365417, "rewards/rejected": 1.7934753894805908, "step": 15300 }, { "epoch": 0.7108036584799666, "grad_norm": 32.03382110595703, "learning_rate": 2.573824225822926e-07, "logits/chosen": -18.84709358215332, "logits/rejected": -17.416648864746094, "logps/chosen": -379.0397033691406, "logps/rejected": -209.86428833007812, "loss": 0.4244, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.690734386444092, "rewards/margins": 1.0766209363937378, "rewards/rejected": 1.614113211631775, "step": 15310 }, { "epoch": 0.7112679325873996, "grad_norm": 47.342411041259766, "learning_rate": 2.5735456613584657e-07, "logits/chosen": -18.301538467407227, "logits/rejected": -17.904373168945312, "logps/chosen": -340.97052001953125, "logps/rejected": -354.4121398925781, "loss": 1.0495, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.710020661354065, "rewards/margins": -0.30118194222450256, "rewards/rejected": 2.011202335357666, "step": 15320 }, { "epoch": 0.7117322066948326, "grad_norm": 35.955345153808594, "learning_rate": 2.573267096894006e-07, "logits/chosen": -17.8895320892334, "logits/rejected": -17.271480560302734, "logps/chosen": -354.89703369140625, "logps/rejected": -296.39501953125, "loss": 1.2433, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3286516666412354, "rewards/margins": -0.023433923721313477, "rewards/rejected": 2.3520853519439697, "step": 15330 }, { "epoch": 0.7121964808022656, "grad_norm": 36.95586395263672, "learning_rate": 2.5729885324295465e-07, "logits/chosen": -17.65865707397461, "logits/rejected": -17.504817962646484, 
"logps/chosen": -336.16162109375, "logps/rejected": -249.1575164794922, "loss": 0.7472, "rewards/accuracies": 0.5, "rewards/chosen": 1.5577795505523682, "rewards/margins": 0.13041844964027405, "rewards/rejected": 1.427361249923706, "step": 15340 }, { "epoch": 0.7126607549096987, "grad_norm": 77.4592514038086, "learning_rate": 2.5727099679650864e-07, "logits/chosen": -17.961645126342773, "logits/rejected": -17.14314842224121, "logps/chosen": -462.3841857910156, "logps/rejected": -338.0121765136719, "loss": 0.4122, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.726148843765259, "rewards/margins": 0.9407864809036255, "rewards/rejected": 1.7853622436523438, "step": 15350 }, { "epoch": 0.7131250290171317, "grad_norm": 108.43489074707031, "learning_rate": 2.572431403500627e-07, "logits/chosen": -18.32879638671875, "logits/rejected": -17.187170028686523, "logps/chosen": -434.037109375, "logps/rejected": -259.52349853515625, "loss": 0.551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1543362140655518, "rewards/margins": 1.0073868036270142, "rewards/rejected": 2.146949529647827, "step": 15360 }, { "epoch": 0.7135893031245647, "grad_norm": 58.00227355957031, "learning_rate": 2.5721528390361667e-07, "logits/chosen": -18.572406768798828, "logits/rejected": -18.048921585083008, "logps/chosen": -501.73260498046875, "logps/rejected": -416.39599609375, "loss": 0.5653, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4046030044555664, "rewards/margins": 0.6010950803756714, "rewards/rejected": 1.8035074472427368, "step": 15370 }, { "epoch": 0.7140535772319978, "grad_norm": 116.82589721679688, "learning_rate": 2.571874274571707e-07, "logits/chosen": -18.621013641357422, "logits/rejected": -17.92981719970703, "logps/chosen": -414.4234924316406, "logps/rejected": -317.55364990234375, "loss": 0.5438, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7589614391326904, "rewards/margins": 1.0826170444488525, "rewards/rejected": 
1.6763442754745483, "step": 15380 }, { "epoch": 0.7145178513394308, "grad_norm": 155.39698791503906, "learning_rate": 2.571595710107247e-07, "logits/chosen": -18.9091739654541, "logits/rejected": -17.80539894104004, "logps/chosen": -436.8642578125, "logps/rejected": -328.20233154296875, "loss": 0.7255, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6885900497436523, "rewards/margins": 0.08393395692110062, "rewards/rejected": 2.604656457901001, "step": 15390 }, { "epoch": 0.7149821254468638, "grad_norm": 1.4152556657791138, "learning_rate": 2.5713171456427874e-07, "logits/chosen": -18.559799194335938, "logits/rejected": -16.872278213500977, "logps/chosen": -461.89630126953125, "logps/rejected": -304.21600341796875, "loss": 0.36, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.880141258239746, "rewards/margins": 1.2808114290237427, "rewards/rejected": 1.5993300676345825, "step": 15400 }, { "epoch": 0.7154463995542969, "grad_norm": 150.04908752441406, "learning_rate": 2.571038581178328e-07, "logits/chosen": -17.943912506103516, "logits/rejected": -17.75796127319336, "logps/chosen": -478.08416748046875, "logps/rejected": -446.84381103515625, "loss": 0.9825, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.5146734714508057, "rewards/margins": -0.10292589664459229, "rewards/rejected": 2.6175992488861084, "step": 15410 }, { "epoch": 0.7159106736617299, "grad_norm": 36.25785827636719, "learning_rate": 2.5707600167138676e-07, "logits/chosen": -18.682735443115234, "logits/rejected": -17.849557876586914, "logps/chosen": -394.1512451171875, "logps/rejected": -341.16351318359375, "loss": 0.4524, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.80495285987854, "rewards/margins": 0.8173182606697083, "rewards/rejected": 1.9876344203948975, "step": 15420 }, { "epoch": 0.7163749477691629, "grad_norm": 217.2212677001953, "learning_rate": 2.570481452249408e-07, "logits/chosen": -18.297319412231445, "logits/rejected": 
-17.938236236572266, "logps/chosen": -461.1259765625, "logps/rejected": -375.31915283203125, "loss": 0.968, "rewards/accuracies": 0.5, "rewards/chosen": 2.489077091217041, "rewards/margins": 0.012110757641494274, "rewards/rejected": 2.476966381072998, "step": 15430 }, { "epoch": 0.716839221876596, "grad_norm": 183.03924560546875, "learning_rate": 2.570202887784948e-07, "logits/chosen": -19.26890754699707, "logits/rejected": -18.732772827148438, "logps/chosen": -433.8262634277344, "logps/rejected": -312.501708984375, "loss": 0.7639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.714554786682129, "rewards/margins": 0.22871167957782745, "rewards/rejected": 2.4858431816101074, "step": 15440 }, { "epoch": 0.717303495984029, "grad_norm": 27.797086715698242, "learning_rate": 2.5699243233204883e-07, "logits/chosen": -18.895267486572266, "logits/rejected": -18.18893051147461, "logps/chosen": -399.00897216796875, "logps/rejected": -312.9290466308594, "loss": 0.6579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8857536315917969, "rewards/margins": 0.4204958379268646, "rewards/rejected": 1.4652576446533203, "step": 15450 }, { "epoch": 0.717767770091462, "grad_norm": 57.14860534667969, "learning_rate": 2.569645758856029e-07, "logits/chosen": -18.04910659790039, "logits/rejected": -17.786239624023438, "logps/chosen": -376.8703918457031, "logps/rejected": -336.2587890625, "loss": 0.6797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2822446823120117, "rewards/margins": 0.16081103682518005, "rewards/rejected": 2.121433734893799, "step": 15460 }, { "epoch": 0.7182320441988951, "grad_norm": 82.34961700439453, "learning_rate": 2.5693671943915686e-07, "logits/chosen": -17.8268985748291, "logits/rejected": -16.768306732177734, "logps/chosen": -382.26385498046875, "logps/rejected": -254.8914794921875, "loss": 0.5852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.141378879547119, "rewards/margins": 0.5678380131721497, 
"rewards/rejected": 1.5735406875610352, "step": 15470 }, { "epoch": 0.7186963183063281, "grad_norm": 62.675601959228516, "learning_rate": 2.569088629927109e-07, "logits/chosen": -18.367671966552734, "logits/rejected": -17.209491729736328, "logps/chosen": -314.6733703613281, "logps/rejected": -200.37734985351562, "loss": 0.4339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.084683895111084, "rewards/margins": 1.0659939050674438, "rewards/rejected": 1.0186899900436401, "step": 15480 }, { "epoch": 0.7191605924137611, "grad_norm": 43.59312438964844, "learning_rate": 2.568810065462649e-07, "logits/chosen": -18.837806701660156, "logits/rejected": -17.44292640686035, "logps/chosen": -536.4896240234375, "logps/rejected": -313.77911376953125, "loss": 0.4298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9629220962524414, "rewards/margins": 1.016074538230896, "rewards/rejected": 1.9468475580215454, "step": 15490 }, { "epoch": 0.7196248665211941, "grad_norm": 38.53394317626953, "learning_rate": 2.568531500998189e-07, "logits/chosen": -18.379901885986328, "logits/rejected": -17.799592971801758, "logps/chosen": -363.0445251464844, "logps/rejected": -326.9077453613281, "loss": 0.5072, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.264263391494751, "rewards/margins": 0.5584259033203125, "rewards/rejected": 1.7058372497558594, "step": 15500 }, { "epoch": 0.7200891406286272, "grad_norm": 146.57797241210938, "learning_rate": 2.568252936533729e-07, "logits/chosen": -19.35104751586914, "logits/rejected": -19.12948989868164, "logps/chosen": -439.68853759765625, "logps/rejected": -437.29144287109375, "loss": 0.9297, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6841483116149902, "rewards/margins": -0.22133441269397736, "rewards/rejected": 2.905482530593872, "step": 15510 }, { "epoch": 0.7205534147360602, "grad_norm": 163.37680053710938, "learning_rate": 2.5679743720692696e-07, "logits/chosen": -18.832012176513672, 
"logits/rejected": -18.3865909576416, "logps/chosen": -499.6903381347656, "logps/rejected": -420.0361328125, "loss": 0.4647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.573260545730591, "rewards/margins": 1.2680132389068604, "rewards/rejected": 2.3052470684051514, "step": 15520 }, { "epoch": 0.7210176888434932, "grad_norm": 68.8577880859375, "learning_rate": 2.56769580760481e-07, "logits/chosen": -19.768083572387695, "logits/rejected": -19.425081253051758, "logps/chosen": -364.8058776855469, "logps/rejected": -330.80474853515625, "loss": 0.5866, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.524759292602539, "rewards/margins": 0.5480177402496338, "rewards/rejected": 1.9767415523529053, "step": 15530 }, { "epoch": 0.7214819629509263, "grad_norm": 38.48572540283203, "learning_rate": 2.56741724314035e-07, "logits/chosen": -19.80022430419922, "logits/rejected": -18.812631607055664, "logps/chosen": -348.6405334472656, "logps/rejected": -306.9290466308594, "loss": 0.5854, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.444199323654175, "rewards/margins": 0.5156022310256958, "rewards/rejected": 1.9285972118377686, "step": 15540 }, { "epoch": 0.7219462370583593, "grad_norm": 150.8932647705078, "learning_rate": 2.5671386786758903e-07, "logits/chosen": -17.772676467895508, "logits/rejected": -17.258325576782227, "logps/chosen": -374.2459411621094, "logps/rejected": -341.6507263183594, "loss": 0.8468, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.533618450164795, "rewards/margins": 0.18921758234500885, "rewards/rejected": 2.3444008827209473, "step": 15550 }, { "epoch": 0.7224105111657922, "grad_norm": 152.584716796875, "learning_rate": 2.56686011421143e-07, "logits/chosen": -17.880556106567383, "logits/rejected": -17.599454879760742, "logps/chosen": -397.03192138671875, "logps/rejected": -377.307861328125, "loss": 0.7584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1689140796661377, 
"rewards/margins": 0.06772563606500626, "rewards/rejected": 2.1011881828308105, "step": 15560 }, { "epoch": 0.7228747852732254, "grad_norm": 38.591026306152344, "learning_rate": 2.5665815497469706e-07, "logits/chosen": -19.093494415283203, "logits/rejected": -18.844165802001953, "logps/chosen": -442.48895263671875, "logps/rejected": -386.82501220703125, "loss": 0.4216, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.533756732940674, "rewards/margins": 0.9778517484664917, "rewards/rejected": 1.5559051036834717, "step": 15570 }, { "epoch": 0.7233390593806583, "grad_norm": 55.28741455078125, "learning_rate": 2.5663029852825105e-07, "logits/chosen": -18.876983642578125, "logits/rejected": -18.286712646484375, "logps/chosen": -489.62677001953125, "logps/rejected": -424.64959716796875, "loss": 0.7527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.990363597869873, "rewards/margins": 0.17773418128490448, "rewards/rejected": 2.812629222869873, "step": 15580 }, { "epoch": 0.7238033334880913, "grad_norm": 95.28441619873047, "learning_rate": 2.566024420818051e-07, "logits/chosen": -18.902223587036133, "logits/rejected": -18.632442474365234, "logps/chosen": -371.875, "logps/rejected": -412.7633361816406, "loss": 0.8331, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4996299743652344, "rewards/margins": 0.013834243640303612, "rewards/rejected": 2.485795497894287, "step": 15590 }, { "epoch": 0.7242676075955244, "grad_norm": 21.159000396728516, "learning_rate": 2.5657458563535913e-07, "logits/chosen": -18.39861297607422, "logits/rejected": -17.794198989868164, "logps/chosen": -411.2022399902344, "logps/rejected": -310.89105224609375, "loss": 0.5498, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.332878828048706, "rewards/margins": 0.7103937864303589, "rewards/rejected": 1.6224853992462158, "step": 15600 }, { "epoch": 0.7247318817029574, "grad_norm": 69.84664154052734, "learning_rate": 2.565467291889131e-07, 
"logits/chosen": -18.474103927612305, "logits/rejected": -17.7040958404541, "logps/chosen": -431.78192138671875, "logps/rejected": -328.86016845703125, "loss": 0.4761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2207953929901123, "rewards/margins": 0.9597026109695435, "rewards/rejected": 2.2610926628112793, "step": 15610 }, { "epoch": 0.7251961558103904, "grad_norm": 60.68447494506836, "learning_rate": 2.565188727424671e-07, "logits/chosen": -19.033109664916992, "logits/rejected": -17.80522346496582, "logps/chosen": -546.2651977539062, "logps/rejected": -436.733642578125, "loss": 0.7124, "rewards/accuracies": 0.5, "rewards/chosen": 3.2478995323181152, "rewards/margins": 0.46796417236328125, "rewards/rejected": 2.779935836791992, "step": 15620 }, { "epoch": 0.7256604299178235, "grad_norm": 86.45008850097656, "learning_rate": 2.5649101629602115e-07, "logits/chosen": -19.078699111938477, "logits/rejected": -18.778749465942383, "logps/chosen": -387.258056640625, "logps/rejected": -320.0406799316406, "loss": 0.5425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.060185432434082, "rewards/margins": 0.815061092376709, "rewards/rejected": 2.245124340057373, "step": 15630 }, { "epoch": 0.7261247040252565, "grad_norm": 3.890199899673462, "learning_rate": 2.564631598495752e-07, "logits/chosen": -17.953258514404297, "logits/rejected": -17.803789138793945, "logps/chosen": -292.7861022949219, "logps/rejected": -233.60427856445312, "loss": 0.6182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.228046417236328, "rewards/margins": 0.6490617990493774, "rewards/rejected": 1.5789846181869507, "step": 15640 }, { "epoch": 0.7265889781326895, "grad_norm": 27.78183937072754, "learning_rate": 2.5643530340312923e-07, "logits/chosen": -18.667293548583984, "logits/rejected": -18.035076141357422, "logps/chosen": -427.68682861328125, "logps/rejected": -253.11245727539062, "loss": 0.4713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
2.4839072227478027, "rewards/margins": 1.0591869354248047, "rewards/rejected": 1.4247201681137085, "step": 15650 }, { "epoch": 0.7270532522401225, "grad_norm": 60.124961853027344, "learning_rate": 2.564074469566832e-07, "logits/chosen": -18.05531120300293, "logits/rejected": -17.155513763427734, "logps/chosen": -370.1578063964844, "logps/rejected": -356.7317199707031, "loss": 0.6426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.405637741088867, "rewards/margins": 0.49046340584754944, "rewards/rejected": 1.915174126625061, "step": 15660 }, { "epoch": 0.7275175263475556, "grad_norm": 12.49948501586914, "learning_rate": 2.563795905102372e-07, "logits/chosen": -19.15667152404785, "logits/rejected": -18.28342056274414, "logps/chosen": -457.2030334472656, "logps/rejected": -303.39971923828125, "loss": 0.587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.582289695739746, "rewards/margins": 0.5589069128036499, "rewards/rejected": 2.023383140563965, "step": 15670 }, { "epoch": 0.7279818004549886, "grad_norm": 193.68077087402344, "learning_rate": 2.5635173406379125e-07, "logits/chosen": -18.160409927368164, "logits/rejected": -17.926036834716797, "logps/chosen": -415.00421142578125, "logps/rejected": -389.5047302246094, "loss": 0.6627, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6450066566467285, "rewards/margins": 0.4759007394313812, "rewards/rejected": 2.1691060066223145, "step": 15680 }, { "epoch": 0.7284460745624216, "grad_norm": 42.26495361328125, "learning_rate": 2.563238776173453e-07, "logits/chosen": -19.0175838470459, "logits/rejected": -17.232690811157227, "logps/chosen": -469.03240966796875, "logps/rejected": -234.96017456054688, "loss": 0.3787, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2769503593444824, "rewards/margins": 1.646451711654663, "rewards/rejected": 1.6304988861083984, "step": 15690 }, { "epoch": 0.7289103486698547, "grad_norm": 59.97084426879883, "learning_rate": 
2.562960211708993e-07, "logits/chosen": -18.293962478637695, "logits/rejected": -17.643306732177734, "logps/chosen": -425.4649353027344, "logps/rejected": -285.7955627441406, "loss": 0.6594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.379279613494873, "rewards/margins": 0.4864484667778015, "rewards/rejected": 1.8928312063217163, "step": 15700 }, { "epoch": 0.7293746227772877, "grad_norm": 156.486328125, "learning_rate": 2.562681647244533e-07, "logits/chosen": -19.129343032836914, "logits/rejected": -18.831270217895508, "logps/chosen": -298.45404052734375, "logps/rejected": -284.9842834472656, "loss": 0.8624, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8794448375701904, "rewards/margins": -0.03193390369415283, "rewards/rejected": 1.9113788604736328, "step": 15710 }, { "epoch": 0.7298388968847207, "grad_norm": 184.5902557373047, "learning_rate": 2.5624030827800736e-07, "logits/chosen": -18.11294937133789, "logits/rejected": -18.052440643310547, "logps/chosen": -320.12689208984375, "logps/rejected": -305.3054504394531, "loss": 1.1934, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1035315990448, "rewards/margins": 0.03631547838449478, "rewards/rejected": 2.067216396331787, "step": 15720 }, { "epoch": 0.7303031709921538, "grad_norm": 105.85315704345703, "learning_rate": 2.5621245183156135e-07, "logits/chosen": -17.679689407348633, "logits/rejected": -17.739276885986328, "logps/chosen": -350.49981689453125, "logps/rejected": -332.5029602050781, "loss": 1.073, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8317493200302124, "rewards/margins": -0.1856134682893753, "rewards/rejected": 2.017362594604492, "step": 15730 }, { "epoch": 0.7307674450995868, "grad_norm": 156.55911254882812, "learning_rate": 2.5618459538511534e-07, "logits/chosen": -18.277179718017578, "logits/rejected": -17.179065704345703, "logps/chosen": -368.34564208984375, "logps/rejected": -296.11322021484375, "loss": 0.7301, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2247023582458496, "rewards/margins": 0.5900747179985046, "rewards/rejected": 1.634627342224121, "step": 15740 }, { "epoch": 0.7312317192070198, "grad_norm": 98.081298828125, "learning_rate": 2.561567389386694e-07, "logits/chosen": -18.736013412475586, "logits/rejected": -18.342134475708008, "logps/chosen": -507.51910400390625, "logps/rejected": -442.43603515625, "loss": 0.6049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.349914073944092, "rewards/margins": 0.4470750689506531, "rewards/rejected": 2.902838945388794, "step": 15750 }, { "epoch": 0.7316959933144529, "grad_norm": 110.16187286376953, "learning_rate": 2.561288824922234e-07, "logits/chosen": -18.764589309692383, "logits/rejected": -18.395105361938477, "logps/chosen": -481.38433837890625, "logps/rejected": -411.19873046875, "loss": 0.547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.966062068939209, "rewards/margins": 0.5547736287117004, "rewards/rejected": 2.4112884998321533, "step": 15760 }, { "epoch": 0.7321602674218859, "grad_norm": 121.14203643798828, "learning_rate": 2.561010260457774e-07, "logits/chosen": -19.646936416625977, "logits/rejected": -18.66512680053711, "logps/chosen": -461.70367431640625, "logps/rejected": -351.2685852050781, "loss": 0.5165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0601806640625, "rewards/margins": 0.6758030652999878, "rewards/rejected": 2.3843777179718018, "step": 15770 }, { "epoch": 0.7326245415293189, "grad_norm": 61.031639099121094, "learning_rate": 2.5607316959933145e-07, "logits/chosen": -19.366573333740234, "logits/rejected": -18.068002700805664, "logps/chosen": -374.88677978515625, "logps/rejected": -231.4895477294922, "loss": 0.3983, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.594907283782959, "rewards/margins": 1.1347448825836182, "rewards/rejected": 1.46016263961792, "step": 15780 }, { "epoch": 0.733088815636752, "grad_norm": 
66.51070404052734, "learning_rate": 2.5604531315288543e-07, "logits/chosen": -18.267004013061523, "logits/rejected": -17.550384521484375, "logps/chosen": -378.8968811035156, "logps/rejected": -237.9747772216797, "loss": 0.5701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4815359115600586, "rewards/margins": 0.8734281659126282, "rewards/rejected": 1.608107566833496, "step": 15790 }, { "epoch": 0.733553089744185, "grad_norm": 88.65013122558594, "learning_rate": 2.560174567064395e-07, "logits/chosen": -18.532806396484375, "logits/rejected": -17.946455001831055, "logps/chosen": -411.11962890625, "logps/rejected": -323.9721374511719, "loss": 0.6514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2555038928985596, "rewards/margins": 0.553149402141571, "rewards/rejected": 1.7023544311523438, "step": 15800 }, { "epoch": 0.734017363851618, "grad_norm": 158.5780487060547, "learning_rate": 2.5598960025999346e-07, "logits/chosen": -18.569900512695312, "logits/rejected": -17.825485229492188, "logps/chosen": -441.2510681152344, "logps/rejected": -371.5365905761719, "loss": 0.7031, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.552161455154419, "rewards/margins": 0.32185691595077515, "rewards/rejected": 2.23030424118042, "step": 15810 }, { "epoch": 0.734481637959051, "grad_norm": 97.21556854248047, "learning_rate": 2.559617438135475e-07, "logits/chosen": -18.689075469970703, "logits/rejected": -16.847583770751953, "logps/chosen": -484.52130126953125, "logps/rejected": -286.1214294433594, "loss": 0.5384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1898422241210938, "rewards/margins": 0.9931581616401672, "rewards/rejected": 2.1966843605041504, "step": 15820 }, { "epoch": 0.7349459120664841, "grad_norm": 69.3401107788086, "learning_rate": 2.5593388736710155e-07, "logits/chosen": -19.29296112060547, "logits/rejected": -18.369895935058594, "logps/chosen": -414.60699462890625, "logps/rejected": -321.84356689453125, "loss": 
0.7442, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4471564292907715, "rewards/margins": 0.46063584089279175, "rewards/rejected": 1.9865201711654663, "step": 15830 }, { "epoch": 0.7354101861739171, "grad_norm": 179.69357299804688, "learning_rate": 2.5590603092065553e-07, "logits/chosen": -17.966304779052734, "logits/rejected": -17.177446365356445, "logps/chosen": -360.33123779296875, "logps/rejected": -271.09088134765625, "loss": 0.6714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2267580032348633, "rewards/margins": 0.9071432948112488, "rewards/rejected": 1.3196146488189697, "step": 15840 }, { "epoch": 0.7358744602813501, "grad_norm": 207.28878784179688, "learning_rate": 2.558781744742096e-07, "logits/chosen": -18.019872665405273, "logits/rejected": -17.685504913330078, "logps/chosen": -379.9744873046875, "logps/rejected": -374.8552551269531, "loss": 0.7503, "rewards/accuracies": 0.5, "rewards/chosen": 2.5762157440185547, "rewards/margins": 0.1236886978149414, "rewards/rejected": 2.4525270462036133, "step": 15850 }, { "epoch": 0.7363387343887832, "grad_norm": 104.75152587890625, "learning_rate": 2.5585031802776356e-07, "logits/chosen": -20.006622314453125, "logits/rejected": -18.77371597290039, "logps/chosen": -442.0685119628906, "logps/rejected": -347.98760986328125, "loss": 0.7388, "rewards/accuracies": 0.5, "rewards/chosen": 2.563875436782837, "rewards/margins": 0.20025677978992462, "rewards/rejected": 2.3636186122894287, "step": 15860 }, { "epoch": 0.7368030084962162, "grad_norm": 120.3694839477539, "learning_rate": 2.558224615813176e-07, "logits/chosen": -19.620601654052734, "logits/rejected": -19.38593101501465, "logps/chosen": -336.7098693847656, "logps/rejected": -365.73724365234375, "loss": 0.8714, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.214282274246216, "rewards/margins": -0.06738158315420151, "rewards/rejected": 2.2816638946533203, "step": 15870 }, { "epoch": 0.7372672826036492, "grad_norm": 
65.9699935913086, "learning_rate": 2.5579460513487165e-07, "logits/chosen": -18.64558219909668, "logits/rejected": -18.6070499420166, "logps/chosen": -474.63104248046875, "logps/rejected": -442.1314392089844, "loss": 0.6309, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0454790592193604, "rewards/margins": 0.35996803641319275, "rewards/rejected": 2.6855106353759766, "step": 15880 }, { "epoch": 0.7377315567110823, "grad_norm": 61.613075256347656, "learning_rate": 2.5576674868842563e-07, "logits/chosen": -20.05425262451172, "logits/rejected": -17.59316635131836, "logps/chosen": -508.77117919921875, "logps/rejected": -294.5186462402344, "loss": 0.4627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9588944911956787, "rewards/margins": 1.0349347591400146, "rewards/rejected": 1.9239599704742432, "step": 15890 }, { "epoch": 0.7381958308185153, "grad_norm": 24.623332977294922, "learning_rate": 2.557388922419797e-07, "logits/chosen": -19.15335464477539, "logits/rejected": -19.0787296295166, "logps/chosen": -382.68438720703125, "logps/rejected": -385.3946228027344, "loss": 1.0394, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1532468795776367, "rewards/margins": -0.22322773933410645, "rewards/rejected": 2.3764748573303223, "step": 15900 }, { "epoch": 0.7386601049259482, "grad_norm": 56.288719177246094, "learning_rate": 2.5571103579553366e-07, "logits/chosen": -18.75417709350586, "logits/rejected": -18.537975311279297, "logps/chosen": -362.68939208984375, "logps/rejected": -395.08978271484375, "loss": 0.9763, "rewards/accuracies": 0.5, "rewards/chosen": 2.3406662940979004, "rewards/margins": -0.27760809659957886, "rewards/rejected": 2.618274211883545, "step": 15910 }, { "epoch": 0.7391243790333814, "grad_norm": 11.144481658935547, "learning_rate": 2.5568317934908765e-07, "logits/chosen": -19.004589080810547, "logits/rejected": -17.45686149597168, "logps/chosen": -494.33062744140625, "logps/rejected": -332.3485412597656, "loss": 
0.6771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.499671220779419, "rewards/margins": 0.7222381234169006, "rewards/rejected": 1.7774333953857422, "step": 15920 }, { "epoch": 0.7395886531408143, "grad_norm": 96.81707000732422, "learning_rate": 2.556553229026417e-07, "logits/chosen": -19.082563400268555, "logits/rejected": -18.186153411865234, "logps/chosen": -381.6147155761719, "logps/rejected": -294.19586181640625, "loss": 0.483, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.60932993888855, "rewards/margins": 0.7550191879272461, "rewards/rejected": 1.8543107509613037, "step": 15930 }, { "epoch": 0.7400529272482473, "grad_norm": 207.79869079589844, "learning_rate": 2.5562746645619573e-07, "logits/chosen": -17.215829849243164, "logits/rejected": -16.107208251953125, "logps/chosen": -490.90606689453125, "logps/rejected": -385.2262878417969, "loss": 0.5073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.133279800415039, "rewards/margins": 0.9878731966018677, "rewards/rejected": 2.145406723022461, "step": 15940 }, { "epoch": 0.7405172013556804, "grad_norm": 248.73602294921875, "learning_rate": 2.5559961000974977e-07, "logits/chosen": -18.4277400970459, "logits/rejected": -18.156784057617188, "logps/chosen": -412.75347900390625, "logps/rejected": -400.66009521484375, "loss": 0.9413, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.49322509765625, "rewards/margins": -0.09050872176885605, "rewards/rejected": 2.583733558654785, "step": 15950 }, { "epoch": 0.7409814754631134, "grad_norm": 249.8312530517578, "learning_rate": 2.5557175356330376e-07, "logits/chosen": -17.617233276367188, "logits/rejected": -17.87925148010254, "logps/chosen": -418.15155029296875, "logps/rejected": -422.40191650390625, "loss": 1.1601, "rewards/accuracies": 0.5, "rewards/chosen": 2.5109968185424805, "rewards/margins": -0.31019967794418335, "rewards/rejected": 2.8211963176727295, "step": 15960 }, { "epoch": 0.7414457495705464, 
"grad_norm": 124.56487274169922, "learning_rate": 2.555438971168578e-07, "logits/chosen": -18.518383026123047, "logits/rejected": -18.826133728027344, "logps/chosen": -468.0257873535156, "logps/rejected": -447.63934326171875, "loss": 0.5476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0433099269866943, "rewards/margins": 0.6921356320381165, "rewards/rejected": 2.3511741161346436, "step": 15970 }, { "epoch": 0.7419100236779794, "grad_norm": 9.935558319091797, "learning_rate": 2.555160406704118e-07, "logits/chosen": -18.607017517089844, "logits/rejected": -17.121562957763672, "logps/chosen": -384.67962646484375, "logps/rejected": -297.2042541503906, "loss": 0.5424, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.589926242828369, "rewards/margins": 0.8980365991592407, "rewards/rejected": 1.6918895244598389, "step": 15980 }, { "epoch": 0.7423742977854125, "grad_norm": 83.89319610595703, "learning_rate": 2.5548818422396583e-07, "logits/chosen": -19.025394439697266, "logits/rejected": -18.03605842590332, "logps/chosen": -292.216552734375, "logps/rejected": -243.1363067626953, "loss": 0.561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2969062328338623, "rewards/margins": 0.5178915858268738, "rewards/rejected": 1.7790145874023438, "step": 15990 }, { "epoch": 0.7428385718928455, "grad_norm": 132.47744750976562, "learning_rate": 2.554603277775198e-07, "logits/chosen": -19.03727912902832, "logits/rejected": -18.32314682006836, "logps/chosen": -435.181640625, "logps/rejected": -393.10369873046875, "loss": 0.551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.992159843444824, "rewards/margins": 0.9059656262397766, "rewards/rejected": 2.0861942768096924, "step": 16000 }, { "epoch": 0.7433028460002785, "grad_norm": 23.614118576049805, "learning_rate": 2.5543247133107386e-07, "logits/chosen": -18.289859771728516, "logits/rejected": -17.273752212524414, "logps/chosen": -433.37042236328125, "logps/rejected": 
-298.3884582519531, "loss": 0.5871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7962632179260254, "rewards/margins": 0.7475590109825134, "rewards/rejected": 2.048704147338867, "step": 16010 }, { "epoch": 0.7437671201077116, "grad_norm": 50.622257232666016, "learning_rate": 2.554046148846279e-07, "logits/chosen": -19.030641555786133, "logits/rejected": -17.853551864624023, "logps/chosen": -368.76910400390625, "logps/rejected": -263.50006103515625, "loss": 0.4688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8894457817077637, "rewards/margins": 0.6702525019645691, "rewards/rejected": 2.2191929817199707, "step": 16020 }, { "epoch": 0.7442313942151446, "grad_norm": 145.3035888671875, "learning_rate": 2.553767584381819e-07, "logits/chosen": -18.151063919067383, "logits/rejected": -18.070270538330078, "logps/chosen": -308.50665283203125, "logps/rejected": -262.40069580078125, "loss": 0.6122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.530123710632324, "rewards/margins": 0.8231012225151062, "rewards/rejected": 1.7070226669311523, "step": 16030 }, { "epoch": 0.7446956683225776, "grad_norm": 2.2329964637756348, "learning_rate": 2.553489019917359e-07, "logits/chosen": -18.938459396362305, "logits/rejected": -18.212770462036133, "logps/chosen": -323.4518737792969, "logps/rejected": -258.94281005859375, "loss": 0.5414, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2932982444763184, "rewards/margins": 0.6545493602752686, "rewards/rejected": 1.6387488842010498, "step": 16040 }, { "epoch": 0.7451599424300107, "grad_norm": 0.6190788149833679, "learning_rate": 2.553210455452899e-07, "logits/chosen": -17.950130462646484, "logits/rejected": -17.393651962280273, "logps/chosen": -305.87506103515625, "logps/rejected": -253.948974609375, "loss": 0.6328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.323244333267212, "rewards/margins": 0.6512290835380554, "rewards/rejected": 1.6720151901245117, "step": 16050 }, 
{ "epoch": 0.7456242165374437, "grad_norm": 15.86921215057373, "learning_rate": 2.5529318909884396e-07, "logits/chosen": -18.392898559570312, "logits/rejected": -18.397253036499023, "logps/chosen": -417.39068603515625, "logps/rejected": -442.1070251464844, "loss": 0.6489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.813662528991699, "rewards/margins": 0.4551362097263336, "rewards/rejected": 2.3585262298583984, "step": 16060 }, { "epoch": 0.7460884906448767, "grad_norm": 114.96328735351562, "learning_rate": 2.55265332652398e-07, "logits/chosen": -18.86119270324707, "logits/rejected": -18.315135955810547, "logps/chosen": -386.97247314453125, "logps/rejected": -342.73577880859375, "loss": 0.6361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7329020500183105, "rewards/margins": 0.5769966244697571, "rewards/rejected": 2.1559054851531982, "step": 16070 }, { "epoch": 0.7465527647523098, "grad_norm": 14.29543685913086, "learning_rate": 2.55237476205952e-07, "logits/chosen": -18.183258056640625, "logits/rejected": -18.07705307006836, "logps/chosen": -417.13299560546875, "logps/rejected": -328.8596496582031, "loss": 0.7505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5410187244415283, "rewards/margins": 0.6134496331214905, "rewards/rejected": 1.927569031715393, "step": 16080 }, { "epoch": 0.7470170388597428, "grad_norm": 47.315834045410156, "learning_rate": 2.55209619759506e-07, "logits/chosen": -18.98259925842285, "logits/rejected": -17.776302337646484, "logps/chosen": -443.59161376953125, "logps/rejected": -329.53155517578125, "loss": 0.437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.293024778366089, "rewards/margins": 0.9690984487533569, "rewards/rejected": 2.3239264488220215, "step": 16090 }, { "epoch": 0.7474813129671758, "grad_norm": 135.5561981201172, "learning_rate": 2.5518176331306e-07, "logits/chosen": -18.15098762512207, "logits/rejected": -17.32550048828125, "logps/chosen": -470.19970703125, 
"logps/rejected": -419.377685546875, "loss": 0.8123, "rewards/accuracies": 0.5, "rewards/chosen": 2.791268825531006, "rewards/margins": 0.41341885924339294, "rewards/rejected": 2.37785005569458, "step": 16100 }, { "epoch": 0.7479455870746089, "grad_norm": 42.687171936035156, "learning_rate": 2.55153906866614e-07, "logits/chosen": -18.92837905883789, "logits/rejected": -17.84553337097168, "logps/chosen": -404.9485168457031, "logps/rejected": -379.7091064453125, "loss": 0.9926, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.087724208831787, "rewards/margins": 0.4650351405143738, "rewards/rejected": 2.6226894855499268, "step": 16110 }, { "epoch": 0.7484098611820419, "grad_norm": 78.33258819580078, "learning_rate": 2.5512605042016805e-07, "logits/chosen": -17.965681076049805, "logits/rejected": -17.58993911743164, "logps/chosen": -457.2889709472656, "logps/rejected": -451.23797607421875, "loss": 0.9109, "rewards/accuracies": 0.5, "rewards/chosen": 2.403125047683716, "rewards/margins": -0.11529765278100967, "rewards/rejected": 2.518422842025757, "step": 16120 }, { "epoch": 0.7488741352894749, "grad_norm": 156.97755432128906, "learning_rate": 2.550981939737221e-07, "logits/chosen": -17.6632080078125, "logits/rejected": -17.37684440612793, "logps/chosen": -406.66888427734375, "logps/rejected": -365.0013427734375, "loss": 1.0691, "rewards/accuracies": 0.5, "rewards/chosen": 2.3687593936920166, "rewards/margins": 0.16179366409778595, "rewards/rejected": 2.206965923309326, "step": 16130 }, { "epoch": 0.749338409396908, "grad_norm": 104.53511047363281, "learning_rate": 2.5507033752727613e-07, "logits/chosen": -19.446504592895508, "logits/rejected": -19.196264266967773, "logps/chosen": -398.17791748046875, "logps/rejected": -313.839111328125, "loss": 0.6677, "rewards/accuracies": 0.5, "rewards/chosen": 2.677833080291748, "rewards/margins": 0.6476699113845825, "rewards/rejected": 2.030163288116455, "step": 16140 }, { "epoch": 0.749802683504341, "grad_norm": 
134.26046752929688, "learning_rate": 2.550424810808301e-07, "logits/chosen": -18.227005004882812, "logits/rejected": -18.111501693725586, "logps/chosen": -494.46905517578125, "logps/rejected": -473.9961853027344, "loss": 0.8346, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.641352415084839, "rewards/margins": -0.0478692464530468, "rewards/rejected": 2.6892216205596924, "step": 16150 }, { "epoch": 0.750266957611774, "grad_norm": 102.3143310546875, "learning_rate": 2.550146246343841e-07, "logits/chosen": -18.441524505615234, "logits/rejected": -18.050081253051758, "logps/chosen": -530.2279052734375, "logps/rejected": -484.0579528808594, "loss": 0.4857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.624359607696533, "rewards/margins": 0.8393077850341797, "rewards/rejected": 2.785051107406616, "step": 16160 }, { "epoch": 0.750731231719207, "grad_norm": 39.32597351074219, "learning_rate": 2.5498676818793815e-07, "logits/chosen": -19.043598175048828, "logits/rejected": -19.32428550720215, "logps/chosen": -430.60174560546875, "logps/rejected": -425.36279296875, "loss": 0.6038, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1827588081359863, "rewards/margins": 0.3979312777519226, "rewards/rejected": 2.78482723236084, "step": 16170 }, { "epoch": 0.7511955058266401, "grad_norm": 18.28505516052246, "learning_rate": 2.549589117414922e-07, "logits/chosen": -18.619054794311523, "logits/rejected": -18.240026473999023, "logps/chosen": -355.2491149902344, "logps/rejected": -319.036865234375, "loss": 1.113, "rewards/accuracies": 0.5, "rewards/chosen": 1.8763577938079834, "rewards/margins": 0.11728501319885254, "rewards/rejected": 1.7590728998184204, "step": 16180 }, { "epoch": 0.7516597799340731, "grad_norm": 3.4139657020568848, "learning_rate": 2.549310552950462e-07, "logits/chosen": -18.271059036254883, "logits/rejected": -17.612842559814453, "logps/chosen": -378.7451171875, "logps/rejected": -320.2887268066406, "loss": 0.8728, 
"rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.387690782546997, "rewards/margins": 0.07530611753463745, "rewards/rejected": 2.312384843826294, "step": 16190 }, { "epoch": 0.7521240540415061, "grad_norm": 83.93643951416016, "learning_rate": 2.549031988486002e-07, "logits/chosen": -17.232786178588867, "logits/rejected": -17.110523223876953, "logps/chosen": -339.56805419921875, "logps/rejected": -271.85467529296875, "loss": 0.9064, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.020798444747925, "rewards/margins": -0.11770696938037872, "rewards/rejected": 2.138505458831787, "step": 16200 }, { "epoch": 0.7525883281489392, "grad_norm": 144.77102661132812, "learning_rate": 2.548753424021542e-07, "logits/chosen": -17.860820770263672, "logits/rejected": -17.383907318115234, "logps/chosen": -355.8192138671875, "logps/rejected": -296.5852966308594, "loss": 0.68, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.658968448638916, "rewards/margins": 0.7497783303260803, "rewards/rejected": 1.9091901779174805, "step": 16210 }, { "epoch": 0.7530526022563722, "grad_norm": 63.08860778808594, "learning_rate": 2.5484748595570825e-07, "logits/chosen": -17.609973907470703, "logits/rejected": -17.6845703125, "logps/chosen": -348.76556396484375, "logps/rejected": -340.142578125, "loss": 0.8006, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.079216480255127, "rewards/margins": 0.09673140943050385, "rewards/rejected": 1.9824848175048828, "step": 16220 }, { "epoch": 0.7535168763638052, "grad_norm": 8.035149574279785, "learning_rate": 2.5481962950926223e-07, "logits/chosen": -18.55206871032715, "logits/rejected": -17.987747192382812, "logps/chosen": -428.008056640625, "logps/rejected": -291.54608154296875, "loss": 0.5449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.149843215942383, "rewards/margins": 1.1916157007217407, "rewards/rejected": 1.9582273960113525, "step": 16230 }, { "epoch": 0.7539811504712383, "grad_norm": 
121.29615020751953, "learning_rate": 2.547917730628163e-07, "logits/chosen": -17.898311614990234, "logits/rejected": -17.01166343688965, "logps/chosen": -441.5956115722656, "logps/rejected": -318.96844482421875, "loss": 0.6075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.33896803855896, "rewards/margins": 0.5332751274108887, "rewards/rejected": 1.8056930303573608, "step": 16240 }, { "epoch": 0.7544454245786713, "grad_norm": 36.680946350097656, "learning_rate": 2.547639166163703e-07, "logits/chosen": -18.5052547454834, "logits/rejected": -17.5084285736084, "logps/chosen": -303.51727294921875, "logps/rejected": -229.9956512451172, "loss": 0.4948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8782249689102173, "rewards/margins": 0.7677608728408813, "rewards/rejected": 1.110464096069336, "step": 16250 }, { "epoch": 0.7549096986861042, "grad_norm": 30.503864288330078, "learning_rate": 2.547360601699243e-07, "logits/chosen": -18.39933967590332, "logits/rejected": -18.07063865661621, "logps/chosen": -400.2071228027344, "logps/rejected": -342.6182556152344, "loss": 0.6645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8477327823638916, "rewards/margins": 0.5502988696098328, "rewards/rejected": 2.297434091567993, "step": 16260 }, { "epoch": 0.7553739727935374, "grad_norm": 17.9056339263916, "learning_rate": 2.5470820372347834e-07, "logits/chosen": -17.527294158935547, "logits/rejected": -18.298723220825195, "logps/chosen": -395.51513671875, "logps/rejected": -416.3946228027344, "loss": 1.1468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7370033264160156, "rewards/margins": -0.29239019751548767, "rewards/rejected": 3.029393434524536, "step": 16270 }, { "epoch": 0.7558382469009703, "grad_norm": 49.666053771972656, "learning_rate": 2.5468034727703233e-07, "logits/chosen": -19.337848663330078, "logits/rejected": -18.40226173400879, "logps/chosen": -403.91510009765625, "logps/rejected": -271.05889892578125, "loss": 
0.4643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9008939266204834, "rewards/margins": 1.1064616441726685, "rewards/rejected": 1.794432282447815, "step": 16280 }, { "epoch": 0.7563025210084033, "grad_norm": 54.63804626464844, "learning_rate": 2.5465249083058637e-07, "logits/chosen": -18.942302703857422, "logits/rejected": -18.830333709716797, "logps/chosen": -421.2701721191406, "logps/rejected": -376.87359619140625, "loss": 0.7725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.301311731338501, "rewards/margins": 0.08333752304315567, "rewards/rejected": 2.2179741859436035, "step": 16290 }, { "epoch": 0.7567667951158364, "grad_norm": 95.79203796386719, "learning_rate": 2.5462463438414036e-07, "logits/chosen": -18.51106071472168, "logits/rejected": -17.84912109375, "logps/chosen": -291.4202575683594, "logps/rejected": -274.97955322265625, "loss": 0.7422, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.9329429864883423, "rewards/margins": 0.23573437333106995, "rewards/rejected": 1.6972087621688843, "step": 16300 }, { "epoch": 0.7572310692232694, "grad_norm": 23.505050659179688, "learning_rate": 2.545967779376944e-07, "logits/chosen": -18.336122512817383, "logits/rejected": -17.35568618774414, "logps/chosen": -421.501220703125, "logps/rejected": -256.71197509765625, "loss": 0.2571, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.126729965209961, "rewards/margins": 1.5293474197387695, "rewards/rejected": 1.5973827838897705, "step": 16310 }, { "epoch": 0.7576953433307024, "grad_norm": 108.82577514648438, "learning_rate": 2.5456892149124844e-07, "logits/chosen": -18.61983871459961, "logits/rejected": -19.06268882751465, "logps/chosen": -419.0768127441406, "logps/rejected": -406.56085205078125, "loss": 1.0554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.53601336479187, "rewards/margins": -0.19648131728172302, "rewards/rejected": 2.732494592666626, "step": 16320 }, { "epoch": 0.7581596174381354, 
"grad_norm": 60.21142578125, "learning_rate": 2.5454106504480243e-07, "logits/chosen": -18.063322067260742, "logits/rejected": -17.688297271728516, "logps/chosen": -343.7793273925781, "logps/rejected": -279.544677734375, "loss": 0.6988, "rewards/accuracies": 0.5, "rewards/chosen": 2.0236735343933105, "rewards/margins": 0.2774263620376587, "rewards/rejected": 1.7462472915649414, "step": 16330 }, { "epoch": 0.7586238915455685, "grad_norm": 126.31130981445312, "learning_rate": 2.545132085983564e-07, "logits/chosen": -18.612428665161133, "logits/rejected": -18.127674102783203, "logps/chosen": -351.4813537597656, "logps/rejected": -312.48187255859375, "loss": 0.6709, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.278961181640625, "rewards/margins": 0.5647881627082825, "rewards/rejected": 1.7141729593276978, "step": 16340 }, { "epoch": 0.7590881656530015, "grad_norm": 59.84050369262695, "learning_rate": 2.5448535215191046e-07, "logits/chosen": -18.42252540588379, "logits/rejected": -19.447290420532227, "logps/chosen": -411.14398193359375, "logps/rejected": -436.25714111328125, "loss": 1.028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.088015079498291, "rewards/margins": 0.08571696281433105, "rewards/rejected": 2.00229811668396, "step": 16350 }, { "epoch": 0.7595524397604345, "grad_norm": 36.7324333190918, "learning_rate": 2.544574957054645e-07, "logits/chosen": -19.35929298400879, "logits/rejected": -18.825420379638672, "logps/chosen": -455.1165466308594, "logps/rejected": -342.3982238769531, "loss": 0.5773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.095310688018799, "rewards/margins": 0.6700271368026733, "rewards/rejected": 2.425283432006836, "step": 16360 }, { "epoch": 0.7600167138678676, "grad_norm": 30.509809494018555, "learning_rate": 2.5442963925901854e-07, "logits/chosen": -18.575353622436523, "logits/rejected": -18.422964096069336, "logps/chosen": -333.29620361328125, "logps/rejected": -367.19708251953125, "loss": 
0.6102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1348280906677246, "rewards/margins": 0.4454224705696106, "rewards/rejected": 1.6894056797027588, "step": 16370 }, { "epoch": 0.7604809879753006, "grad_norm": 163.27638244628906, "learning_rate": 2.5440178281257253e-07, "logits/chosen": -18.147911071777344, "logits/rejected": -17.775726318359375, "logps/chosen": -316.79046630859375, "logps/rejected": -339.557861328125, "loss": 0.7999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.212261199951172, "rewards/margins": 0.021511757746338844, "rewards/rejected": 2.190749406814575, "step": 16380 }, { "epoch": 0.7609452620827336, "grad_norm": 35.36598205566406, "learning_rate": 2.5437392636612657e-07, "logits/chosen": -18.989009857177734, "logits/rejected": -18.163227081298828, "logps/chosen": -386.73846435546875, "logps/rejected": -372.6816101074219, "loss": 0.7461, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7724814414978027, "rewards/margins": 0.3323458135128021, "rewards/rejected": 2.4401354789733887, "step": 16390 }, { "epoch": 0.7614095361901667, "grad_norm": 49.88683319091797, "learning_rate": 2.5434606991968056e-07, "logits/chosen": -18.43052864074707, "logits/rejected": -18.360370635986328, "logps/chosen": -338.1928405761719, "logps/rejected": -255.232177734375, "loss": 0.6655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4073755741119385, "rewards/margins": 0.5800169706344604, "rewards/rejected": 1.8273588418960571, "step": 16400 }, { "epoch": 0.7618738102975997, "grad_norm": 191.02117919921875, "learning_rate": 2.543182134732346e-07, "logits/chosen": -17.892261505126953, "logits/rejected": -17.494930267333984, "logps/chosen": -535.89794921875, "logps/rejected": -478.88482666015625, "loss": 0.7592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.941284656524658, "rewards/margins": 0.2348586767911911, "rewards/rejected": 2.706425905227661, "step": 16410 }, { "epoch": 
0.7623380844050327, "grad_norm": 81.0503158569336, "learning_rate": 2.542903570267886e-07, "logits/chosen": -16.60776710510254, "logits/rejected": -16.714759826660156, "logps/chosen": -251.3863067626953, "logps/rejected": -270.4551086425781, "loss": 1.1452, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.377060890197754, "rewards/margins": -0.3321317732334137, "rewards/rejected": 1.7091926336288452, "step": 16420 }, { "epoch": 0.7628023585124658, "grad_norm": 34.18557357788086, "learning_rate": 2.5426250058034263e-07, "logits/chosen": -19.675891876220703, "logits/rejected": -18.630151748657227, "logps/chosen": -491.4349670410156, "logps/rejected": -376.298828125, "loss": 0.6435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.044368028640747, "rewards/margins": 0.32874828577041626, "rewards/rejected": 2.715620279312134, "step": 16430 }, { "epoch": 0.7632666326198988, "grad_norm": 73.79036712646484, "learning_rate": 2.5423464413389667e-07, "logits/chosen": -18.509334564208984, "logits/rejected": -17.703826904296875, "logps/chosen": -357.2140197753906, "logps/rejected": -265.5331115722656, "loss": 0.7073, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.344104528427124, "rewards/margins": 0.7547965049743652, "rewards/rejected": 1.5893081426620483, "step": 16440 }, { "epoch": 0.7637309067273318, "grad_norm": 47.404544830322266, "learning_rate": 2.5420678768745066e-07, "logits/chosen": -18.949831008911133, "logits/rejected": -17.42843246459961, "logps/chosen": -441.47772216796875, "logps/rejected": -299.0196838378906, "loss": 0.5721, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8113739490509033, "rewards/margins": 0.883194088935852, "rewards/rejected": 1.9281800985336304, "step": 16450 }, { "epoch": 0.7641951808347649, "grad_norm": 108.2694091796875, "learning_rate": 2.5417893124100465e-07, "logits/chosen": -19.28909683227539, "logits/rejected": -18.910602569580078, "logps/chosen": -503.67999267578125, 
"logps/rejected": -542.6241455078125, "loss": 0.6557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1626617908477783, "rewards/margins": 0.4431779980659485, "rewards/rejected": 2.7194838523864746, "step": 16460 }, { "epoch": 0.7646594549421979, "grad_norm": 96.17232513427734, "learning_rate": 2.541510747945587e-07, "logits/chosen": -18.6724910736084, "logits/rejected": -18.151411056518555, "logps/chosen": -401.7986755371094, "logps/rejected": -394.18804931640625, "loss": 0.694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5930495262145996, "rewards/margins": 0.21757133305072784, "rewards/rejected": 2.3754780292510986, "step": 16470 }, { "epoch": 0.7651237290496309, "grad_norm": 111.63461303710938, "learning_rate": 2.5412321834811273e-07, "logits/chosen": -18.33052635192871, "logits/rejected": -18.237783432006836, "logps/chosen": -435.6622619628906, "logps/rejected": -416.0594177246094, "loss": 0.7998, "rewards/accuracies": 0.5, "rewards/chosen": 2.844780683517456, "rewards/margins": 0.3489716649055481, "rewards/rejected": 2.4958088397979736, "step": 16480 }, { "epoch": 0.7655880031570639, "grad_norm": 64.7467269897461, "learning_rate": 2.540953619016667e-07, "logits/chosen": -19.14669418334961, "logits/rejected": -18.90494155883789, "logps/chosen": -385.62396240234375, "logps/rejected": -290.8526916503906, "loss": 0.5221, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.697399377822876, "rewards/margins": 0.8457517623901367, "rewards/rejected": 1.8516477346420288, "step": 16490 }, { "epoch": 0.766052277264497, "grad_norm": 63.145240783691406, "learning_rate": 2.5407029109986534e-07, "logits/chosen": -17.8979434967041, "logits/rejected": -17.166339874267578, "logps/chosen": -427.55340576171875, "logps/rejected": -334.7711181640625, "loss": 0.984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1149423122406006, "rewards/margins": -0.06595874577760696, "rewards/rejected": 2.180901050567627, "step": 16500 }, { 
"epoch": 0.76651655137193, "grad_norm": 76.34349822998047, "learning_rate": 2.540424346534194e-07, "logits/chosen": -18.8200626373291, "logits/rejected": -17.560522079467773, "logps/chosen": -489.99664306640625, "logps/rejected": -419.548583984375, "loss": 0.5875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2739510536193848, "rewards/margins": 0.6970870494842529, "rewards/rejected": 2.576864004135132, "step": 16510 }, { "epoch": 0.766980825479363, "grad_norm": 6.424755096435547, "learning_rate": 2.5401457820697337e-07, "logits/chosen": -18.236083984375, "logits/rejected": -16.56634521484375, "logps/chosen": -413.821044921875, "logps/rejected": -244.36764526367188, "loss": 0.4245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.579277992248535, "rewards/margins": 0.9673298597335815, "rewards/rejected": 1.611948013305664, "step": 16520 }, { "epoch": 0.7674450995867961, "grad_norm": 39.47862243652344, "learning_rate": 2.539867217605274e-07, "logits/chosen": -18.257854461669922, "logits/rejected": -18.36752700805664, "logps/chosen": -417.4767150878906, "logps/rejected": -338.86285400390625, "loss": 0.9429, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1466569900512695, "rewards/margins": -0.16857045888900757, "rewards/rejected": 2.3152272701263428, "step": 16530 }, { "epoch": 0.7679093736942291, "grad_norm": 27.354076385498047, "learning_rate": 2.5395886531408145e-07, "logits/chosen": -18.529359817504883, "logits/rejected": -17.516197204589844, "logps/chosen": -402.7590637207031, "logps/rejected": -304.10540771484375, "loss": 0.6552, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.068991184234619, "rewards/margins": 1.002820611000061, "rewards/rejected": 2.0661706924438477, "step": 16540 }, { "epoch": 0.7683736478016621, "grad_norm": 45.4425048828125, "learning_rate": 2.5393100886763544e-07, "logits/chosen": -19.329936981201172, "logits/rejected": -17.8759708404541, "logps/chosen": -439.33917236328125, 
"logps/rejected": -295.5068664550781, "loss": 0.3985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9401097297668457, "rewards/margins": 1.0811302661895752, "rewards/rejected": 1.85897958278656, "step": 16550 }, { "epoch": 0.7688379219090952, "grad_norm": 145.8253173828125, "learning_rate": 2.539031524211895e-07, "logits/chosen": -18.433483123779297, "logits/rejected": -16.92330551147461, "logps/chosen": -521.2639770507812, "logps/rejected": -316.9432067871094, "loss": 0.3966, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6121420860290527, "rewards/margins": 1.8730007410049438, "rewards/rejected": 1.7391412258148193, "step": 16560 }, { "epoch": 0.7693021960165282, "grad_norm": 70.19710540771484, "learning_rate": 2.5387529597474347e-07, "logits/chosen": -19.673206329345703, "logits/rejected": -17.79239273071289, "logps/chosen": -474.259765625, "logps/rejected": -386.4837646484375, "loss": 0.6064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.91640305519104, "rewards/margins": 0.42237311601638794, "rewards/rejected": 2.494029998779297, "step": 16570 }, { "epoch": 0.7697664701239612, "grad_norm": 92.55513763427734, "learning_rate": 2.5384743952829745e-07, "logits/chosen": -19.223430633544922, "logits/rejected": -18.350271224975586, "logps/chosen": -453.039794921875, "logps/rejected": -347.2677917480469, "loss": 0.5986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9851808547973633, "rewards/margins": 0.4315153956413269, "rewards/rejected": 2.5536653995513916, "step": 16580 }, { "epoch": 0.7702307442313943, "grad_norm": 49.55934143066406, "learning_rate": 2.538195830818515e-07, "logits/chosen": -18.257131576538086, "logits/rejected": -18.537256240844727, "logps/chosen": -322.37188720703125, "logps/rejected": -337.0528564453125, "loss": 0.664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5161070823669434, "rewards/margins": 0.2944023609161377, "rewards/rejected": 2.2217044830322266, "step": 
16590 }, { "epoch": 0.7706950183388273, "grad_norm": 80.05570983886719, "learning_rate": 2.5379172663540554e-07, "logits/chosen": -19.327531814575195, "logits/rejected": -19.208690643310547, "logps/chosen": -404.73126220703125, "logps/rejected": -400.4141540527344, "loss": 0.9113, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.400050640106201, "rewards/margins": -0.1608913689851761, "rewards/rejected": 2.5609419345855713, "step": 16600 }, { "epoch": 0.7711592924462602, "grad_norm": 5.1227498054504395, "learning_rate": 2.537638701889596e-07, "logits/chosen": -20.13288116455078, "logits/rejected": -18.948328018188477, "logps/chosen": -500.6852111816406, "logps/rejected": -328.8721008300781, "loss": 0.421, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.062236785888672, "rewards/margins": 1.3891923427581787, "rewards/rejected": 1.6730444431304932, "step": 16610 }, { "epoch": 0.7716235665536934, "grad_norm": 128.5662384033203, "learning_rate": 2.5373601374251357e-07, "logits/chosen": -19.476558685302734, "logits/rejected": -17.8719482421875, "logps/chosen": -349.97760009765625, "logps/rejected": -242.36386108398438, "loss": 0.4464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4446489810943604, "rewards/margins": 1.102329969406128, "rewards/rejected": 1.342319130897522, "step": 16620 }, { "epoch": 0.7720878406611263, "grad_norm": 28.126617431640625, "learning_rate": 2.537081572960676e-07, "logits/chosen": -18.44106101989746, "logits/rejected": -17.780590057373047, "logps/chosen": -402.21942138671875, "logps/rejected": -325.36492919921875, "loss": 0.6759, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4935715198516846, "rewards/margins": 0.36991602182388306, "rewards/rejected": 2.123655080795288, "step": 16630 }, { "epoch": 0.7725521147685593, "grad_norm": 100.38699340820312, "learning_rate": 2.536803008496216e-07, "logits/chosen": -18.51235580444336, "logits/rejected": -17.552331924438477, "logps/chosen": 
-511.30572509765625, "logps/rejected": -404.61187744140625, "loss": 0.3579, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.343104124069214, "rewards/margins": 1.182787537574768, "rewards/rejected": 2.1603169441223145, "step": 16640 }, { "epoch": 0.7730163888759923, "grad_norm": 32.33974075317383, "learning_rate": 2.5365244440317564e-07, "logits/chosen": -18.302898406982422, "logits/rejected": -17.957820892333984, "logps/chosen": -421.7867736816406, "logps/rejected": -339.48321533203125, "loss": 0.6556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9437546730041504, "rewards/margins": 0.7257295846939087, "rewards/rejected": 2.2180252075195312, "step": 16650 }, { "epoch": 0.7734806629834254, "grad_norm": 47.99268341064453, "learning_rate": 2.536245879567296e-07, "logits/chosen": -18.478105545043945, "logits/rejected": -17.75741958618164, "logps/chosen": -418.42108154296875, "logps/rejected": -366.66851806640625, "loss": 0.567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5982635021209717, "rewards/margins": 0.6841856837272644, "rewards/rejected": 1.9140777587890625, "step": 16660 }, { "epoch": 0.7739449370908584, "grad_norm": 141.4029083251953, "learning_rate": 2.5359673151028366e-07, "logits/chosen": -17.89651870727539, "logits/rejected": -17.53569221496582, "logps/chosen": -443.9959411621094, "logps/rejected": -311.8573913574219, "loss": 0.4891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.371025562286377, "rewards/margins": 0.6587442755699158, "rewards/rejected": 1.7122812271118164, "step": 16670 }, { "epoch": 0.7744092111982914, "grad_norm": 69.83184051513672, "learning_rate": 2.535688750638377e-07, "logits/chosen": -19.244956970214844, "logits/rejected": -18.100618362426758, "logps/chosen": -506.8809509277344, "logps/rejected": -324.3634033203125, "loss": 0.5308, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.995183229446411, "rewards/margins": 0.8023874163627625, "rewards/rejected": 
2.192795515060425, "step": 16680 }, { "epoch": 0.7748734853057245, "grad_norm": 224.66990661621094, "learning_rate": 2.535410186173917e-07, "logits/chosen": -18.12306785583496, "logits/rejected": -17.216415405273438, "logps/chosen": -384.04718017578125, "logps/rejected": -274.7686767578125, "loss": 0.4975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.528662919998169, "rewards/margins": 0.9240372776985168, "rewards/rejected": 1.6046257019042969, "step": 16690 }, { "epoch": 0.7753377594131575, "grad_norm": 234.097900390625, "learning_rate": 2.535131621709457e-07, "logits/chosen": -18.284225463867188, "logits/rejected": -18.30617332458496, "logps/chosen": -343.61163330078125, "logps/rejected": -377.89324951171875, "loss": 1.395, "rewards/accuracies": 0.5, "rewards/chosen": 1.9493963718414307, "rewards/margins": -0.5609989762306213, "rewards/rejected": 2.5103952884674072, "step": 16700 }, { "epoch": 0.7758020335205905, "grad_norm": 73.82722473144531, "learning_rate": 2.534853057244997e-07, "logits/chosen": -19.353809356689453, "logits/rejected": -18.99365997314453, "logps/chosen": -507.3585510253906, "logps/rejected": -510.201171875, "loss": 0.7586, "rewards/accuracies": 0.5, "rewards/chosen": 3.152927875518799, "rewards/margins": -0.038293324410915375, "rewards/rejected": 3.191220998764038, "step": 16710 }, { "epoch": 0.7762663076280236, "grad_norm": 10.016443252563477, "learning_rate": 2.5345744927805376e-07, "logits/chosen": -18.674335479736328, "logits/rejected": -17.988086700439453, "logps/chosen": -439.750244140625, "logps/rejected": -351.44384765625, "loss": 0.57, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7646427154541016, "rewards/margins": 0.6971651315689087, "rewards/rejected": 2.0674777030944824, "step": 16720 }, { "epoch": 0.7767305817354566, "grad_norm": 254.14768981933594, "learning_rate": 2.534295928316078e-07, "logits/chosen": -18.530345916748047, "logits/rejected": -17.540225982666016, "logps/chosen": 
-474.0038146972656, "logps/rejected": -354.4456481933594, "loss": 0.6206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.991319179534912, "rewards/margins": 0.7234889268875122, "rewards/rejected": 2.2678303718566895, "step": 16730 }, { "epoch": 0.7771948558428896, "grad_norm": 52.67941665649414, "learning_rate": 2.534017363851618e-07, "logits/chosen": -19.810958862304688, "logits/rejected": -17.842300415039062, "logps/chosen": -391.5279846191406, "logps/rejected": -259.62152099609375, "loss": 0.3825, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.225212812423706, "rewards/margins": 1.3477897644042969, "rewards/rejected": 1.8774230480194092, "step": 16740 }, { "epoch": 0.7776591299503227, "grad_norm": 138.15029907226562, "learning_rate": 2.533738799387158e-07, "logits/chosen": -18.830577850341797, "logits/rejected": -18.336030960083008, "logps/chosen": -430.76531982421875, "logps/rejected": -427.28533935546875, "loss": 0.5568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3566770553588867, "rewards/margins": 0.5074421167373657, "rewards/rejected": 1.849234938621521, "step": 16750 }, { "epoch": 0.7781234040577557, "grad_norm": 45.65188980102539, "learning_rate": 2.533460234922698e-07, "logits/chosen": -17.78731918334961, "logits/rejected": -17.6064395904541, "logps/chosen": -315.7518005371094, "logps/rejected": -310.60968017578125, "loss": 0.9099, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0911202430725098, "rewards/margins": -0.0524408221244812, "rewards/rejected": 2.1435611248016357, "step": 16760 }, { "epoch": 0.7785876781651887, "grad_norm": 205.4661407470703, "learning_rate": 2.533181670458238e-07, "logits/chosen": -19.23689079284668, "logits/rejected": -18.864261627197266, "logps/chosen": -507.58319091796875, "logps/rejected": -463.2852478027344, "loss": 0.8022, "rewards/accuracies": 0.5, "rewards/chosen": 2.661689519882202, "rewards/margins": -0.06315630674362183, "rewards/rejected": 
2.724846124649048, "step": 16770 }, { "epoch": 0.7790519522726218, "grad_norm": 67.48579406738281, "learning_rate": 2.5329031059937785e-07, "logits/chosen": -18.918203353881836, "logits/rejected": -17.976608276367188, "logps/chosen": -332.65380859375, "logps/rejected": -327.05926513671875, "loss": 0.8535, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0583720207214355, "rewards/margins": 0.026395510882139206, "rewards/rejected": 2.0319764614105225, "step": 16780 }, { "epoch": 0.7795162263800548, "grad_norm": 18.853628158569336, "learning_rate": 2.532624541529319e-07, "logits/chosen": -18.45835304260254, "logits/rejected": -18.23749542236328, "logps/chosen": -388.1525573730469, "logps/rejected": -352.0625, "loss": 0.6349, "rewards/accuracies": 0.5, "rewards/chosen": 2.4473369121551514, "rewards/margins": 0.27855220437049866, "rewards/rejected": 2.1687848567962646, "step": 16790 }, { "epoch": 0.7799805004874878, "grad_norm": 8.760897636413574, "learning_rate": 2.5323459770648593e-07, "logits/chosen": -17.94304084777832, "logits/rejected": -16.67520523071289, "logps/chosen": -373.9747314453125, "logps/rejected": -276.61273193359375, "loss": 0.5752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.804140567779541, "rewards/margins": 1.4051934480667114, "rewards/rejected": 1.3989471197128296, "step": 16800 }, { "epoch": 0.7804447745949208, "grad_norm": 143.78549194335938, "learning_rate": 2.532067412600399e-07, "logits/chosen": -18.26334571838379, "logits/rejected": -17.930728912353516, "logps/chosen": -414.3984375, "logps/rejected": -421.41680908203125, "loss": 1.0021, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6140408515930176, "rewards/margins": -0.14558503031730652, "rewards/rejected": 2.7596256732940674, "step": 16810 }, { "epoch": 0.7809090487023539, "grad_norm": 154.35963439941406, "learning_rate": 2.531788848135939e-07, "logits/chosen": -18.978445053100586, "logits/rejected": -18.89323616027832, "logps/chosen": 
-399.30645751953125, "logps/rejected": -408.41162109375, "loss": 0.5429, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0940897464752197, "rewards/margins": 0.5375241041183472, "rewards/rejected": 2.556565761566162, "step": 16820 }, { "epoch": 0.7813733228097869, "grad_norm": 15.522088050842285, "learning_rate": 2.5315102836714795e-07, "logits/chosen": -19.28099250793457, "logits/rejected": -18.702383041381836, "logps/chosen": -379.09344482421875, "logps/rejected": -320.2213439941406, "loss": 0.7221, "rewards/accuracies": 0.5, "rewards/chosen": 2.61344575881958, "rewards/margins": 0.4177113473415375, "rewards/rejected": 2.1957345008850098, "step": 16830 }, { "epoch": 0.7818375969172199, "grad_norm": 163.27394104003906, "learning_rate": 2.53123171920702e-07, "logits/chosen": -18.783069610595703, "logits/rejected": -17.600482940673828, "logps/chosen": -415.9734802246094, "logps/rejected": -294.645263671875, "loss": 0.4424, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7822983264923096, "rewards/margins": 0.9220563173294067, "rewards/rejected": 1.8602418899536133, "step": 16840 }, { "epoch": 0.782301871024653, "grad_norm": 169.88613891601562, "learning_rate": 2.53095315474256e-07, "logits/chosen": -19.36866569519043, "logits/rejected": -19.205968856811523, "logps/chosen": -516.8294677734375, "logps/rejected": -461.4952697753906, "loss": 0.7664, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.500295639038086, "rewards/margins": 0.31599777936935425, "rewards/rejected": 3.184298038482666, "step": 16850 }, { "epoch": 0.782766145132086, "grad_norm": 42.14687728881836, "learning_rate": 2.5306745902781e-07, "logits/chosen": -17.700645446777344, "logits/rejected": -17.475841522216797, "logps/chosen": -358.6661682128906, "logps/rejected": -281.304443359375, "loss": 0.6758, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9051148891448975, "rewards/margins": 0.25238990783691406, "rewards/rejected": 1.6527248620986938, 
"step": 16860 }, { "epoch": 0.783230419239519, "grad_norm": 163.8141632080078, "learning_rate": 2.53039602581364e-07, "logits/chosen": -19.257890701293945, "logits/rejected": -18.176301956176758, "logps/chosen": -457.74169921875, "logps/rejected": -377.98065185546875, "loss": 0.7172, "rewards/accuracies": 0.5, "rewards/chosen": 2.409015655517578, "rewards/margins": 0.16239148378372192, "rewards/rejected": 2.24662446975708, "step": 16870 }, { "epoch": 0.7836946933469521, "grad_norm": 18.771223068237305, "learning_rate": 2.5301174613491805e-07, "logits/chosen": -18.968278884887695, "logits/rejected": -18.307376861572266, "logps/chosen": -593.5001831054688, "logps/rejected": -507.5292053222656, "loss": 0.9084, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.391040325164795, "rewards/margins": 0.18895170092582703, "rewards/rejected": 3.2020888328552246, "step": 16880 }, { "epoch": 0.7841589674543851, "grad_norm": 110.72969055175781, "learning_rate": 2.5298388968847204e-07, "logits/chosen": -18.524423599243164, "logits/rejected": -17.93537139892578, "logps/chosen": -429.13653564453125, "logps/rejected": -306.0528869628906, "loss": 0.5161, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8603546619415283, "rewards/margins": 0.5553810596466064, "rewards/rejected": 2.304973602294922, "step": 16890 }, { "epoch": 0.7846232415618181, "grad_norm": 43.395904541015625, "learning_rate": 2.529560332420261e-07, "logits/chosen": -20.256378173828125, "logits/rejected": -18.766937255859375, "logps/chosen": -367.9552307128906, "logps/rejected": -359.67724609375, "loss": 0.5699, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7035179138183594, "rewards/margins": 0.6402917504310608, "rewards/rejected": 2.0632262229919434, "step": 16900 }, { "epoch": 0.7850875156692512, "grad_norm": 229.56155395507812, "learning_rate": 2.529281767955801e-07, "logits/chosen": -18.493803024291992, "logits/rejected": -18.643308639526367, "logps/chosen": 
-454.501220703125, "logps/rejected": -476.232666015625, "loss": 1.2543, "rewards/accuracies": 0.5, "rewards/chosen": 2.9935357570648193, "rewards/margins": -0.5259791612625122, "rewards/rejected": 3.519514799118042, "step": 16910 }, { "epoch": 0.7855517897766842, "grad_norm": 47.265193939208984, "learning_rate": 2.529003203491341e-07, "logits/chosen": -18.533830642700195, "logits/rejected": -18.21332550048828, "logps/chosen": -309.2423400878906, "logps/rejected": -332.9449157714844, "loss": 0.8795, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0070598125457764, "rewards/margins": 0.16925020515918732, "rewards/rejected": 1.8378095626831055, "step": 16920 }, { "epoch": 0.7860160638841172, "grad_norm": 72.92996978759766, "learning_rate": 2.5287246390268815e-07, "logits/chosen": -18.361425399780273, "logits/rejected": -18.33749771118164, "logps/chosen": -335.0798034667969, "logps/rejected": -345.58099365234375, "loss": 0.8746, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.9694583415985107, "rewards/margins": -0.1700439751148224, "rewards/rejected": 2.139502763748169, "step": 16930 }, { "epoch": 0.7864803379915503, "grad_norm": 69.63986206054688, "learning_rate": 2.5284460745624214e-07, "logits/chosen": -19.042991638183594, "logits/rejected": -17.656856536865234, "logps/chosen": -359.90374755859375, "logps/rejected": -245.01943969726562, "loss": 0.6272, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9473596811294556, "rewards/margins": 0.41507387161254883, "rewards/rejected": 1.5322858095169067, "step": 16940 }, { "epoch": 0.7869446120989833, "grad_norm": 22.420957565307617, "learning_rate": 2.528167510097962e-07, "logits/chosen": -18.610546112060547, "logits/rejected": -17.310178756713867, "logps/chosen": -498.8877868652344, "logps/rejected": -347.9459228515625, "loss": 0.513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6835668087005615, "rewards/margins": 0.8276697397232056, "rewards/rejected": 
1.855897307395935, "step": 16950 }, { "epoch": 0.7874088862064162, "grad_norm": 88.50859069824219, "learning_rate": 2.5278889456335017e-07, "logits/chosen": -18.53578758239746, "logits/rejected": -17.977487564086914, "logps/chosen": -492.8140563964844, "logps/rejected": -391.078369140625, "loss": 0.6695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5079240798950195, "rewards/margins": 0.3422653079032898, "rewards/rejected": 2.165658473968506, "step": 16960 }, { "epoch": 0.7878731603138492, "grad_norm": 11.238288879394531, "learning_rate": 2.527610381169042e-07, "logits/chosen": -18.587017059326172, "logits/rejected": -17.086685180664062, "logps/chosen": -445.2876892089844, "logps/rejected": -308.05889892578125, "loss": 0.3719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7178092002868652, "rewards/margins": 1.1480425596237183, "rewards/rejected": 1.5697667598724365, "step": 16970 }, { "epoch": 0.7883374344212823, "grad_norm": 47.17942810058594, "learning_rate": 2.5273318167045825e-07, "logits/chosen": -19.33914566040039, "logits/rejected": -18.84391212463379, "logps/chosen": -456.93658447265625, "logps/rejected": -428.9764709472656, "loss": 0.5847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.328838586807251, "rewards/margins": 0.43857908248901367, "rewards/rejected": 1.8902593851089478, "step": 16980 }, { "epoch": 0.7888017085287153, "grad_norm": 11.38517951965332, "learning_rate": 2.5270532522401224e-07, "logits/chosen": -17.882423400878906, "logits/rejected": -18.042682647705078, "logps/chosen": -403.85699462890625, "logps/rejected": -464.3377990722656, "loss": 1.0766, "rewards/accuracies": 0.5, "rewards/chosen": 2.9255871772766113, "rewards/margins": 0.09777633100748062, "rewards/rejected": 2.8278110027313232, "step": 16990 }, { "epoch": 0.7892659826361483, "grad_norm": 46.85162353515625, "learning_rate": 2.526774687775662e-07, "logits/chosen": -18.9101505279541, "logits/rejected": -18.34634780883789, 
"logps/chosen": -367.4072265625, "logps/rejected": -353.69171142578125, "loss": 0.5945, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1950221061706543, "rewards/margins": 0.31924712657928467, "rewards/rejected": 1.8757750988006592, "step": 17000 }, { "epoch": 0.7897302567435814, "grad_norm": 24.316566467285156, "learning_rate": 2.5264961233112026e-07, "logits/chosen": -18.1845645904541, "logits/rejected": -18.51995086669922, "logps/chosen": -305.61285400390625, "logps/rejected": -364.14813232421875, "loss": 0.8195, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8019685745239258, "rewards/margins": -0.14316998422145844, "rewards/rejected": 1.9451385736465454, "step": 17010 }, { "epoch": 0.7901945308510144, "grad_norm": 156.10801696777344, "learning_rate": 2.526217558846743e-07, "logits/chosen": -18.503894805908203, "logits/rejected": -17.76202964782715, "logps/chosen": -484.544677734375, "logps/rejected": -398.54315185546875, "loss": 0.59, "rewards/accuracies": 0.5, "rewards/chosen": 2.790517807006836, "rewards/margins": 0.5047593116760254, "rewards/rejected": 2.2857584953308105, "step": 17020 }, { "epoch": 0.7906588049584474, "grad_norm": 29.43488311767578, "learning_rate": 2.5259389943822835e-07, "logits/chosen": -19.12405014038086, "logits/rejected": -17.06003189086914, "logps/chosen": -400.25732421875, "logps/rejected": -257.5472106933594, "loss": 0.5271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0386271476745605, "rewards/margins": 1.0852720737457275, "rewards/rejected": 1.9533551931381226, "step": 17030 }, { "epoch": 0.7911230790658805, "grad_norm": 149.3070068359375, "learning_rate": 2.5256604299178233e-07, "logits/chosen": -17.15953254699707, "logits/rejected": -17.955692291259766, "logps/chosen": -281.8736267089844, "logps/rejected": -345.4233703613281, "loss": 1.243, "rewards/accuracies": 0.5, "rewards/chosen": 2.2165980339050293, "rewards/margins": -0.5953888893127441, "rewards/rejected": 
2.8119869232177734, "step": 17040 }, { "epoch": 0.7915873531733135, "grad_norm": 6.046469211578369, "learning_rate": 2.525381865453364e-07, "logits/chosen": -18.536388397216797, "logits/rejected": -17.81052589416504, "logps/chosen": -342.261474609375, "logps/rejected": -219.16268920898438, "loss": 0.5181, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.668121814727783, "rewards/margins": 0.9072310328483582, "rewards/rejected": 1.7608907222747803, "step": 17050 }, { "epoch": 0.7920516272807465, "grad_norm": 82.21556091308594, "learning_rate": 2.5251033009889036e-07, "logits/chosen": -18.659427642822266, "logits/rejected": -17.297487258911133, "logps/chosen": -394.5480651855469, "logps/rejected": -288.5545654296875, "loss": 0.6689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0559446811676025, "rewards/margins": 0.28378668427467346, "rewards/rejected": 1.772157907485962, "step": 17060 }, { "epoch": 0.7925159013881796, "grad_norm": 203.12042236328125, "learning_rate": 2.524824736524444e-07, "logits/chosen": -17.543691635131836, "logits/rejected": -17.85891342163086, "logps/chosen": -376.9192810058594, "logps/rejected": -358.6008605957031, "loss": 0.8958, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.682833671569824, "rewards/margins": -0.011042570695281029, "rewards/rejected": 2.693876266479492, "step": 17070 }, { "epoch": 0.7929801754956126, "grad_norm": 75.64604187011719, "learning_rate": 2.524546172059984e-07, "logits/chosen": -18.12704849243164, "logits/rejected": -17.14989471435547, "logps/chosen": -396.8795166015625, "logps/rejected": -219.36343383789062, "loss": 0.5861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6196532249450684, "rewards/margins": 0.4810921251773834, "rewards/rejected": 2.1385607719421387, "step": 17080 }, { "epoch": 0.7934444496030456, "grad_norm": 63.32790756225586, "learning_rate": 2.5242676075955243e-07, "logits/chosen": -18.356922149658203, "logits/rejected": 
-17.519323348999023, "logps/chosen": -473.2032775878906, "logps/rejected": -346.32550048828125, "loss": 0.7387, "rewards/accuracies": 0.5, "rewards/chosen": 2.611083507537842, "rewards/margins": 0.412191241979599, "rewards/rejected": 2.198892116546631, "step": 17090 }, { "epoch": 0.7939087237104787, "grad_norm": 47.454444885253906, "learning_rate": 2.523989043131065e-07, "logits/chosen": -18.553552627563477, "logits/rejected": -17.54033088684082, "logps/chosen": -329.88983154296875, "logps/rejected": -244.9658966064453, "loss": 0.5068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7855095863342285, "rewards/margins": 0.9396324157714844, "rewards/rejected": 1.8458770513534546, "step": 17100 }, { "epoch": 0.7943729978179117, "grad_norm": 103.65926361083984, "learning_rate": 2.5237104786666046e-07, "logits/chosen": -18.866714477539062, "logits/rejected": -18.73966407775879, "logps/chosen": -413.0594177246094, "logps/rejected": -284.61663818359375, "loss": 0.8831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.954885959625244, "rewards/margins": 0.39245858788490295, "rewards/rejected": 2.562427043914795, "step": 17110 }, { "epoch": 0.7948372719253447, "grad_norm": 130.03421020507812, "learning_rate": 2.5234319142021445e-07, "logits/chosen": -19.000598907470703, "logits/rejected": -17.600933074951172, "logps/chosen": -450.6319885253906, "logps/rejected": -312.6732482910156, "loss": 0.6286, "rewards/accuracies": 0.5, "rewards/chosen": 3.0499234199523926, "rewards/margins": 1.161903738975525, "rewards/rejected": 1.8880199193954468, "step": 17120 }, { "epoch": 0.7953015460327778, "grad_norm": 56.984439849853516, "learning_rate": 2.523153349737685e-07, "logits/chosen": -19.150203704833984, "logits/rejected": -17.508617401123047, "logps/chosen": -375.1892395019531, "logps/rejected": -230.09823608398438, "loss": 0.3376, "rewards/accuracies": 1.0, "rewards/chosen": 2.378462553024292, "rewards/margins": 1.2207677364349365, "rewards/rejected": 
1.157694697380066, "step": 17130 }, { "epoch": 0.7957658201402108, "grad_norm": 148.546875, "learning_rate": 2.5228747852732253e-07, "logits/chosen": -18.384307861328125, "logits/rejected": -17.94798469543457, "logps/chosen": -451.3123474121094, "logps/rejected": -373.20159912109375, "loss": 0.685, "rewards/accuracies": 0.5, "rewards/chosen": 2.4379830360412598, "rewards/margins": 0.39587023854255676, "rewards/rejected": 2.0421128273010254, "step": 17140 }, { "epoch": 0.7962300942476438, "grad_norm": 232.41990661621094, "learning_rate": 2.522596220808765e-07, "logits/chosen": -20.05956268310547, "logits/rejected": -18.81704330444336, "logps/chosen": -383.59466552734375, "logps/rejected": -290.11175537109375, "loss": 0.4998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.696521282196045, "rewards/margins": 0.7995851635932922, "rewards/rejected": 1.896936058998108, "step": 17150 }, { "epoch": 0.7966943683550768, "grad_norm": 162.1693878173828, "learning_rate": 2.5223176563443056e-07, "logits/chosen": -17.829252243041992, "logits/rejected": -18.00713348388672, "logps/chosen": -439.1268005371094, "logps/rejected": -402.42022705078125, "loss": 0.836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6195342540740967, "rewards/margins": 0.09026376903057098, "rewards/rejected": 2.5292704105377197, "step": 17160 }, { "epoch": 0.7971586424625099, "grad_norm": 158.96437072753906, "learning_rate": 2.5220390918798455e-07, "logits/chosen": -20.29356575012207, "logits/rejected": -19.03325843811035, "logps/chosen": -464.84228515625, "logps/rejected": -333.4862060546875, "loss": 0.6689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.708890199661255, "rewards/margins": 0.44717058539390564, "rewards/rejected": 2.2617194652557373, "step": 17170 }, { "epoch": 0.7976229165699429, "grad_norm": 129.5843505859375, "learning_rate": 2.521760527415386e-07, "logits/chosen": -18.612634658813477, "logits/rejected": -17.21304702758789, "logps/chosen": 
-424.1065979003906, "logps/rejected": -282.4654846191406, "loss": 0.671, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.702632427215576, "rewards/margins": 0.6450002193450928, "rewards/rejected": 2.0576322078704834, "step": 17180 }, { "epoch": 0.7980871906773759, "grad_norm": 43.197853088378906, "learning_rate": 2.521481962950926e-07, "logits/chosen": -18.130367279052734, "logits/rejected": -17.708484649658203, "logps/chosen": -289.58453369140625, "logps/rejected": -262.68377685546875, "loss": 0.8176, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8247836828231812, "rewards/margins": -0.08563891053199768, "rewards/rejected": 1.9104225635528564, "step": 17190 }, { "epoch": 0.798551464784809, "grad_norm": 27.478437423706055, "learning_rate": 2.521203398486466e-07, "logits/chosen": -18.900829315185547, "logits/rejected": -18.64583969116211, "logps/chosen": -397.42803955078125, "logps/rejected": -320.58270263671875, "loss": 0.6608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.053144931793213, "rewards/margins": 0.5193568468093872, "rewards/rejected": 2.533787965774536, "step": 17200 }, { "epoch": 0.799015738892242, "grad_norm": 135.9906463623047, "learning_rate": 2.5209248340220066e-07, "logits/chosen": -18.720001220703125, "logits/rejected": -17.82590103149414, "logps/chosen": -565.754638671875, "logps/rejected": -450.83392333984375, "loss": 0.7378, "rewards/accuracies": 0.5, "rewards/chosen": 3.390716075897217, "rewards/margins": 0.2389732152223587, "rewards/rejected": 3.151742696762085, "step": 17210 }, { "epoch": 0.799480012999675, "grad_norm": 64.92018127441406, "learning_rate": 2.520646269557547e-07, "logits/chosen": -19.112255096435547, "logits/rejected": -18.603076934814453, "logps/chosen": -296.3211669921875, "logps/rejected": -286.12060546875, "loss": 0.6957, "rewards/accuracies": 0.5, "rewards/chosen": 2.1990418434143066, "rewards/margins": 0.11618509143590927, "rewards/rejected": 2.0828568935394287, "step": 17220 
}, { "epoch": 0.7999442871071081, "grad_norm": 138.58338928222656, "learning_rate": 2.520367705093087e-07, "logits/chosen": -18.818864822387695, "logits/rejected": -18.43708038330078, "logps/chosen": -318.45550537109375, "logps/rejected": -275.8882751464844, "loss": 0.9562, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5043225288391113, "rewards/margins": 0.24116010963916779, "rewards/rejected": 2.26316237449646, "step": 17230 }, { "epoch": 0.8004085612145411, "grad_norm": 78.34424591064453, "learning_rate": 2.520089140628627e-07, "logits/chosen": -17.608280181884766, "logits/rejected": -17.924108505249023, "logps/chosen": -330.27679443359375, "logps/rejected": -371.8657531738281, "loss": 0.6829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.463041305541992, "rewards/margins": 0.3037847578525543, "rewards/rejected": 2.1592564582824707, "step": 17240 }, { "epoch": 0.8008728353219741, "grad_norm": 133.74761962890625, "learning_rate": 2.519810576164167e-07, "logits/chosen": -19.49606704711914, "logits/rejected": -18.035091400146484, "logps/chosen": -481.093505859375, "logps/rejected": -371.25543212890625, "loss": 0.7072, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0241081714630127, "rewards/margins": 0.31582847237586975, "rewards/rejected": 2.708279848098755, "step": 17250 }, { "epoch": 0.8013371094294072, "grad_norm": 179.093017578125, "learning_rate": 2.5195320116997076e-07, "logits/chosen": -19.439573287963867, "logits/rejected": -19.000465393066406, "logps/chosen": -398.0167541503906, "logps/rejected": -345.99737548828125, "loss": 0.7659, "rewards/accuracies": 0.5, "rewards/chosen": 2.356595277786255, "rewards/margins": 0.11409483104944229, "rewards/rejected": 2.2425005435943604, "step": 17260 }, { "epoch": 0.8018013835368402, "grad_norm": 7.930450916290283, "learning_rate": 2.5192534472352475e-07, "logits/chosen": -18.457155227661133, "logits/rejected": -16.87991714477539, "logps/chosen": -575.3521118164062, 
"logps/rejected": -353.4749755859375, "loss": 0.4436, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.069427728652954, "rewards/margins": 0.9863060712814331, "rewards/rejected": 2.0831215381622314, "step": 17270 }, { "epoch": 0.8022656576442732, "grad_norm": 202.1439971923828, "learning_rate": 2.518974882770788e-07, "logits/chosen": -19.547077178955078, "logits/rejected": -18.414043426513672, "logps/chosen": -452.1675720214844, "logps/rejected": -317.7792663574219, "loss": 0.7619, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6565020084381104, "rewards/margins": 0.3569941520690918, "rewards/rejected": 2.2995076179504395, "step": 17280 }, { "epoch": 0.8027299317517063, "grad_norm": 244.37242126464844, "learning_rate": 2.518696318306328e-07, "logits/chosen": -19.487157821655273, "logits/rejected": -19.62744140625, "logps/chosen": -455.1953125, "logps/rejected": -516.4512939453125, "loss": 0.7751, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2495312690734863, "rewards/margins": 0.1677340567111969, "rewards/rejected": 3.0817971229553223, "step": 17290 }, { "epoch": 0.8031942058591393, "grad_norm": 121.95244598388672, "learning_rate": 2.518417753841868e-07, "logits/chosen": -18.5681095123291, "logits/rejected": -17.726497650146484, "logps/chosen": -436.2350158691406, "logps/rejected": -367.7584533691406, "loss": 0.5474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5425994396209717, "rewards/margins": 0.9478266835212708, "rewards/rejected": 2.5947728157043457, "step": 17300 }, { "epoch": 0.8036584799665722, "grad_norm": 13.991044998168945, "learning_rate": 2.518139189377408e-07, "logits/chosen": -19.18096351623535, "logits/rejected": -17.328594207763672, "logps/chosen": -397.5848083496094, "logps/rejected": -233.8650665283203, "loss": 0.4526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0895612239837646, "rewards/margins": 0.8241814374923706, "rewards/rejected": 1.265379786491394, "step": 17310 
}, { "epoch": 0.8041227540740052, "grad_norm": 72.719482421875, "learning_rate": 2.5178606249129485e-07, "logits/chosen": -18.93440818786621, "logits/rejected": -19.246572494506836, "logps/chosen": -396.26458740234375, "logps/rejected": -409.3282165527344, "loss": 1.1508, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.2208805084228516, "rewards/margins": -0.4948016107082367, "rewards/rejected": 2.715682029724121, "step": 17320 }, { "epoch": 0.8045870281814383, "grad_norm": 182.17063903808594, "learning_rate": 2.517582060448489e-07, "logits/chosen": -18.84786605834961, "logits/rejected": -18.265220642089844, "logps/chosen": -359.4782409667969, "logps/rejected": -292.43377685546875, "loss": 0.4511, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9652421474456787, "rewards/margins": 1.0230525732040405, "rewards/rejected": 1.9421895742416382, "step": 17330 }, { "epoch": 0.8050513022888713, "grad_norm": 110.37034606933594, "learning_rate": 2.517303495984029e-07, "logits/chosen": -18.46314239501953, "logits/rejected": -17.230175018310547, "logps/chosen": -446.0581970214844, "logps/rejected": -288.2370910644531, "loss": 0.5041, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.904724359512329, "rewards/margins": 1.197499394416809, "rewards/rejected": 1.7072248458862305, "step": 17340 }, { "epoch": 0.8055155763963043, "grad_norm": 208.74989318847656, "learning_rate": 2.517024931519569e-07, "logits/chosen": -18.7908935546875, "logits/rejected": -17.952064514160156, "logps/chosen": -434.6337890625, "logps/rejected": -338.120849609375, "loss": 0.6211, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.225119113922119, "rewards/margins": 0.3227485120296478, "rewards/rejected": 1.9023706912994385, "step": 17350 }, { "epoch": 0.8059798505037374, "grad_norm": 105.41705322265625, "learning_rate": 2.516746367055109e-07, "logits/chosen": -19.538925170898438, "logits/rejected": -18.852397918701172, "logps/chosen": -340.3604736328125, 
"logps/rejected": -275.56390380859375, "loss": 0.6769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.400101661682129, "rewards/margins": 0.48424214124679565, "rewards/rejected": 1.9158594608306885, "step": 17360 }, { "epoch": 0.8064441246111704, "grad_norm": 162.968017578125, "learning_rate": 2.5164678025906495e-07, "logits/chosen": -19.23628044128418, "logits/rejected": -19.821704864501953, "logps/chosen": -381.1248474121094, "logps/rejected": -411.45135498046875, "loss": 0.9177, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.496715784072876, "rewards/margins": -0.315039724111557, "rewards/rejected": 2.811755657196045, "step": 17370 }, { "epoch": 0.8069083987186034, "grad_norm": 132.24107360839844, "learning_rate": 2.5161892381261893e-07, "logits/chosen": -18.669851303100586, "logits/rejected": -17.73546600341797, "logps/chosen": -409.29290771484375, "logps/rejected": -296.81036376953125, "loss": 0.5617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6418025493621826, "rewards/margins": 0.9713737368583679, "rewards/rejected": 1.6704288721084595, "step": 17380 }, { "epoch": 0.8073726728260365, "grad_norm": 29.66823959350586, "learning_rate": 2.51591067366173e-07, "logits/chosen": -19.580955505371094, "logits/rejected": -18.15192222595215, "logps/chosen": -527.7702026367188, "logps/rejected": -389.7735900878906, "loss": 0.4491, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4078917503356934, "rewards/margins": 0.8696355819702148, "rewards/rejected": 2.5382561683654785, "step": 17390 }, { "epoch": 0.8078369469334695, "grad_norm": 39.989044189453125, "learning_rate": 2.51563210919727e-07, "logits/chosen": -18.766475677490234, "logits/rejected": -17.84591293334961, "logps/chosen": -509.7068786621094, "logps/rejected": -400.2294616699219, "loss": 0.5344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6522622108459473, "rewards/margins": 1.0371421575546265, "rewards/rejected": 2.615119695663452, 
"step": 17400 }, { "epoch": 0.8083012210409025, "grad_norm": 52.581275939941406, "learning_rate": 2.51535354473281e-07, "logits/chosen": -18.78257179260254, "logits/rejected": -17.407913208007812, "logps/chosen": -490.1095275878906, "logps/rejected": -328.3633728027344, "loss": 0.4287, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.392117738723755, "rewards/margins": 1.3659873008728027, "rewards/rejected": 2.026130437850952, "step": 17410 }, { "epoch": 0.8087654951483356, "grad_norm": 93.82474517822266, "learning_rate": 2.51507498026835e-07, "logits/chosen": -19.12240219116211, "logits/rejected": -18.666921615600586, "logps/chosen": -305.58892822265625, "logps/rejected": -234.233642578125, "loss": 0.5917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1163148880004883, "rewards/margins": 0.7706499099731445, "rewards/rejected": 1.3456650972366333, "step": 17420 }, { "epoch": 0.8092297692557686, "grad_norm": 97.99439239501953, "learning_rate": 2.5147964158038903e-07, "logits/chosen": -19.698970794677734, "logits/rejected": -19.316177368164062, "logps/chosen": -423.92236328125, "logps/rejected": -409.142578125, "loss": 0.719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.74566650390625, "rewards/margins": 0.10959452390670776, "rewards/rejected": 2.6360719203948975, "step": 17430 }, { "epoch": 0.8096940433632016, "grad_norm": 104.78720092773438, "learning_rate": 2.514517851339431e-07, "logits/chosen": -18.433330535888672, "logits/rejected": -17.93431282043457, "logps/chosen": -324.623291015625, "logps/rejected": -288.7090759277344, "loss": 0.8651, "rewards/accuracies": 0.5, "rewards/chosen": 2.1408934593200684, "rewards/margins": 0.08443725854158401, "rewards/rejected": 2.0564560890197754, "step": 17440 }, { "epoch": 0.8101583174706347, "grad_norm": 35.79730224609375, "learning_rate": 2.514239286874971e-07, "logits/chosen": -19.146739959716797, "logits/rejected": -17.583913803100586, "logps/chosen": -349.6225280761719, 
"logps/rejected": -164.66470336914062, "loss": 0.4929, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8092349767684937, "rewards/margins": 0.7384979724884033, "rewards/rejected": 1.0707371234893799, "step": 17450 }, { "epoch": 0.8106225915780677, "grad_norm": 116.91084289550781, "learning_rate": 2.513960722410511e-07, "logits/chosen": -19.881175994873047, "logits/rejected": -18.800643920898438, "logps/chosen": -426.59381103515625, "logps/rejected": -354.4383239746094, "loss": 0.7185, "rewards/accuracies": 0.5, "rewards/chosen": 2.4932034015655518, "rewards/margins": 0.472585529088974, "rewards/rejected": 2.020617961883545, "step": 17460 }, { "epoch": 0.8110868656855007, "grad_norm": 67.1536865234375, "learning_rate": 2.5136821579460515e-07, "logits/chosen": -19.647594451904297, "logits/rejected": -18.67894172668457, "logps/chosen": -465.3858337402344, "logps/rejected": -314.37945556640625, "loss": 0.5552, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.391669511795044, "rewards/margins": 0.7995541095733643, "rewards/rejected": 2.5921154022216797, "step": 17470 }, { "epoch": 0.8115511397929337, "grad_norm": 43.70121383666992, "learning_rate": 2.5134035934815913e-07, "logits/chosen": -17.773534774780273, "logits/rejected": -17.87558364868164, "logps/chosen": -270.7781982421875, "logps/rejected": -220.3686065673828, "loss": 0.7607, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6637319326400757, "rewards/margins": 0.5195194482803345, "rewards/rejected": 1.1442124843597412, "step": 17480 }, { "epoch": 0.8120154139003668, "grad_norm": 7.998778343200684, "learning_rate": 2.513125029017132e-07, "logits/chosen": -17.875268936157227, "logits/rejected": -16.89371681213379, "logps/chosen": -443.0186462402344, "logps/rejected": -321.9942321777344, "loss": 0.7432, "rewards/accuracies": 0.5, "rewards/chosen": 2.7554800510406494, "rewards/margins": 1.212853193283081, "rewards/rejected": 1.5426268577575684, "step": 17490 }, { "epoch": 
0.8124796880077998, "grad_norm": 1.2133535146713257, "learning_rate": 2.5128464645526716e-07, "logits/chosen": -18.91796875, "logits/rejected": -17.514720916748047, "logps/chosen": -325.579833984375, "logps/rejected": -255.71389770507812, "loss": 0.3907, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.254326343536377, "rewards/margins": 1.6539827585220337, "rewards/rejected": 1.6003437042236328, "step": 17500 }, { "epoch": 0.8129439621152328, "grad_norm": 9.65401554107666, "learning_rate": 2.512567900088212e-07, "logits/chosen": -19.88138198852539, "logits/rejected": -18.784770965576172, "logps/chosen": -403.6920166015625, "logps/rejected": -297.39215087890625, "loss": 0.7149, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9894530773162842, "rewards/margins": 0.43435248732566833, "rewards/rejected": 1.555100679397583, "step": 17510 }, { "epoch": 0.8134082362226659, "grad_norm": 98.49745178222656, "learning_rate": 2.5122893356237524e-07, "logits/chosen": -19.033302307128906, "logits/rejected": -19.360246658325195, "logps/chosen": -446.17449951171875, "logps/rejected": -491.59210205078125, "loss": 0.7485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.310629367828369, "rewards/margins": 0.1345420628786087, "rewards/rejected": 2.1760871410369873, "step": 17520 }, { "epoch": 0.8138725103300989, "grad_norm": 119.41771697998047, "learning_rate": 2.5120107711592923e-07, "logits/chosen": -19.39974594116211, "logits/rejected": -18.01629066467285, "logps/chosen": -539.2460327148438, "logps/rejected": -306.0157775878906, "loss": 0.5072, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.341129779815674, "rewards/margins": 1.2083185911178589, "rewards/rejected": 2.1328113079071045, "step": 17530 }, { "epoch": 0.8143367844375319, "grad_norm": 33.162296295166016, "learning_rate": 2.511732206694832e-07, "logits/chosen": -18.137283325195312, "logits/rejected": -17.517436981201172, "logps/chosen": -253.2960968017578, 
"logps/rejected": -247.48519897460938, "loss": 0.5593, "rewards/accuracies": 0.5, "rewards/chosen": 1.9793758392333984, "rewards/margins": 0.4422038197517395, "rewards/rejected": 1.5371721982955933, "step": 17540 }, { "epoch": 0.814801058544965, "grad_norm": 112.81346130371094, "learning_rate": 2.5114536422303726e-07, "logits/chosen": -18.71072006225586, "logits/rejected": -18.107454299926758, "logps/chosen": -414.90765380859375, "logps/rejected": -343.7562561035156, "loss": 0.9247, "rewards/accuracies": 0.5, "rewards/chosen": 2.735830068588257, "rewards/margins": 0.404425710439682, "rewards/rejected": 2.331404209136963, "step": 17550 }, { "epoch": 0.815265332652398, "grad_norm": 16.494991302490234, "learning_rate": 2.511175077765913e-07, "logits/chosen": -18.181259155273438, "logits/rejected": -17.4749698638916, "logps/chosen": -332.7610168457031, "logps/rejected": -203.4962158203125, "loss": 0.4005, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.521859884262085, "rewards/margins": 1.2639405727386475, "rewards/rejected": 1.2579195499420166, "step": 17560 }, { "epoch": 0.815729606759831, "grad_norm": 171.8759765625, "learning_rate": 2.510896513301453e-07, "logits/chosen": -18.93397331237793, "logits/rejected": -18.296667098999023, "logps/chosen": -414.71649169921875, "logps/rejected": -357.3134765625, "loss": 0.7897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.432492256164551, "rewards/margins": 0.3362688720226288, "rewards/rejected": 2.0962235927581787, "step": 17570 }, { "epoch": 0.8161938808672641, "grad_norm": 31.244342803955078, "learning_rate": 2.5106179488369933e-07, "logits/chosen": -19.076757431030273, "logits/rejected": -18.01120948791504, "logps/chosen": -422.6993713378906, "logps/rejected": -324.19403076171875, "loss": 0.464, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.90201735496521, "rewards/margins": 0.8169681429862976, "rewards/rejected": 2.0850491523742676, "step": 17580 }, { "epoch": 
0.8166581549746971, "grad_norm": 32.72859191894531, "learning_rate": 2.510339384372533e-07, "logits/chosen": -18.577112197875977, "logits/rejected": -17.655132293701172, "logps/chosen": -443.8636169433594, "logps/rejected": -321.077392578125, "loss": 0.6057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.292388439178467, "rewards/margins": 0.5170490145683289, "rewards/rejected": 1.775339126586914, "step": 17590 }, { "epoch": 0.8171224290821301, "grad_norm": 216.23696899414062, "learning_rate": 2.5100608199080736e-07, "logits/chosen": -19.346351623535156, "logits/rejected": -18.366270065307617, "logps/chosen": -443.88397216796875, "logps/rejected": -364.54205322265625, "loss": 0.9585, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.675384998321533, "rewards/margins": 0.21411359310150146, "rewards/rejected": 2.4612715244293213, "step": 17600 }, { "epoch": 0.8175867031895632, "grad_norm": 28.65806770324707, "learning_rate": 2.5097822554436135e-07, "logits/chosen": -19.7849063873291, "logits/rejected": -18.25356674194336, "logps/chosen": -446.7940368652344, "logps/rejected": -285.02777099609375, "loss": 0.3911, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9779980182647705, "rewards/margins": 1.056766152381897, "rewards/rejected": 1.9212318658828735, "step": 17610 }, { "epoch": 0.8180509772969962, "grad_norm": 32.788570404052734, "learning_rate": 2.509503690979154e-07, "logits/chosen": -19.503080368041992, "logits/rejected": -19.22190284729004, "logps/chosen": -423.6830139160156, "logps/rejected": -403.92230224609375, "loss": 0.6093, "rewards/accuracies": 0.5, "rewards/chosen": 2.910892963409424, "rewards/margins": 0.6321876645088196, "rewards/rejected": 2.278705358505249, "step": 17620 }, { "epoch": 0.8185152514044292, "grad_norm": 14.612135887145996, "learning_rate": 2.5092251265146943e-07, "logits/chosen": -18.456859588623047, "logits/rejected": -18.001041412353516, "logps/chosen": -310.6936950683594, "logps/rejected": 
-316.9580993652344, "loss": 0.7333, "rewards/accuracies": 0.5, "rewards/chosen": 2.2163450717926025, "rewards/margins": 0.29524898529052734, "rewards/rejected": 1.9210960865020752, "step": 17630 }, { "epoch": 0.8189795255118621, "grad_norm": 3.7654778957366943, "learning_rate": 2.5089465620502347e-07, "logits/chosen": -18.837574005126953, "logits/rejected": -17.130889892578125, "logps/chosen": -385.4656066894531, "logps/rejected": -301.2513732910156, "loss": 0.6034, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7289443016052246, "rewards/margins": 0.7276820540428162, "rewards/rejected": 2.0012624263763428, "step": 17640 }, { "epoch": 0.8194437996192953, "grad_norm": 127.01386260986328, "learning_rate": 2.5086679975857746e-07, "logits/chosen": -19.03730010986328, "logits/rejected": -17.938976287841797, "logps/chosen": -473.82568359375, "logps/rejected": -313.29644775390625, "loss": 0.4148, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7646565437316895, "rewards/margins": 1.0000066757202148, "rewards/rejected": 1.7646501064300537, "step": 17650 }, { "epoch": 0.8199080737267282, "grad_norm": 136.3452911376953, "learning_rate": 2.5083894331213145e-07, "logits/chosen": -18.660614013671875, "logits/rejected": -18.241762161254883, "logps/chosen": -461.3248596191406, "logps/rejected": -409.61981201171875, "loss": 0.7071, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.269742488861084, "rewards/margins": 0.10835476219654083, "rewards/rejected": 3.1613879203796387, "step": 17660 }, { "epoch": 0.8203723478341612, "grad_norm": 126.39356231689453, "learning_rate": 2.508110868656855e-07, "logits/chosen": -18.403369903564453, "logits/rejected": -19.04531478881836, "logps/chosen": -308.59967041015625, "logps/rejected": -402.92364501953125, "loss": 1.157, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.8571882247924805, "rewards/margins": -0.6639207005500793, "rewards/rejected": 2.521109104156494, "step": 17670 }, { 
"epoch": 0.8208366219415943, "grad_norm": 215.08596801757812, "learning_rate": 2.5078323041923953e-07, "logits/chosen": -17.501718521118164, "logits/rejected": -17.106426239013672, "logps/chosen": -369.5341796875, "logps/rejected": -312.57098388671875, "loss": 0.7232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.080134630203247, "rewards/margins": 0.47085151076316833, "rewards/rejected": 1.609283208847046, "step": 17680 }, { "epoch": 0.8213008960490273, "grad_norm": 76.94708251953125, "learning_rate": 2.507553739727935e-07, "logits/chosen": -18.629703521728516, "logits/rejected": -17.763713836669922, "logps/chosen": -575.7366333007812, "logps/rejected": -411.65521240234375, "loss": 0.5302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.442213773727417, "rewards/margins": 0.8535364270210266, "rewards/rejected": 2.588677406311035, "step": 17690 }, { "epoch": 0.8217651701564603, "grad_norm": 72.35923767089844, "learning_rate": 2.5072751752634756e-07, "logits/chosen": -18.893123626708984, "logits/rejected": -18.82567596435547, "logps/chosen": -487.131591796875, "logps/rejected": -378.5166320800781, "loss": 0.7089, "rewards/accuracies": 0.5, "rewards/chosen": 3.2239537239074707, "rewards/margins": 0.7787803411483765, "rewards/rejected": 2.4451732635498047, "step": 17700 }, { "epoch": 0.8222294442638934, "grad_norm": 214.94195556640625, "learning_rate": 2.5069966107990155e-07, "logits/chosen": -18.888410568237305, "logits/rejected": -18.619903564453125, "logps/chosen": -418.9173889160156, "logps/rejected": -354.0546875, "loss": 0.973, "rewards/accuracies": 0.5, "rewards/chosen": 3.3118929862976074, "rewards/margins": 0.5394667387008667, "rewards/rejected": 2.772426128387451, "step": 17710 }, { "epoch": 0.8226937183713264, "grad_norm": 99.99842071533203, "learning_rate": 2.506718046334556e-07, "logits/chosen": -18.37268829345703, "logits/rejected": -17.351490020751953, "logps/chosen": -453.6353454589844, "logps/rejected": -353.4720764160156, 
"loss": 0.6143, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7791476249694824, "rewards/margins": 0.9821921586990356, "rewards/rejected": 1.7969557046890259, "step": 17720 }, { "epoch": 0.8231579924787594, "grad_norm": 91.91248321533203, "learning_rate": 2.506439481870096e-07, "logits/chosen": -19.55939292907715, "logits/rejected": -17.81584358215332, "logps/chosen": -479.07781982421875, "logps/rejected": -361.09234619140625, "loss": 0.462, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7192771434783936, "rewards/margins": 1.3371684551239014, "rewards/rejected": 2.3821091651916504, "step": 17730 }, { "epoch": 0.8236222665861925, "grad_norm": 72.46222686767578, "learning_rate": 2.506160917405636e-07, "logits/chosen": -19.09310531616211, "logits/rejected": -17.863418579101562, "logps/chosen": -396.01568603515625, "logps/rejected": -347.04937744140625, "loss": 0.5943, "rewards/accuracies": 0.5, "rewards/chosen": 2.353060483932495, "rewards/margins": 0.4156765937805176, "rewards/rejected": 1.937384009361267, "step": 17740 }, { "epoch": 0.8240865406936255, "grad_norm": 32.96444320678711, "learning_rate": 2.5058823529411766e-07, "logits/chosen": -18.984886169433594, "logits/rejected": -17.014450073242188, "logps/chosen": -478.5973205566406, "logps/rejected": -286.1038513183594, "loss": 0.4507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2421295642852783, "rewards/margins": 1.2403606176376343, "rewards/rejected": 2.0017685890197754, "step": 17750 }, { "epoch": 0.8245508148010585, "grad_norm": 58.835487365722656, "learning_rate": 2.5056037884767165e-07, "logits/chosen": -18.618877410888672, "logits/rejected": -18.675460815429688, "logps/chosen": -323.3620910644531, "logps/rejected": -316.63360595703125, "loss": 0.9851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8581968545913696, "rewards/margins": -0.16161270439624786, "rewards/rejected": 2.0198097229003906, "step": 17760 }, { "epoch": 0.8250150889084916, 
"grad_norm": 94.4964828491211, "learning_rate": 2.505325224012257e-07, "logits/chosen": -18.686792373657227, "logits/rejected": -17.658348083496094, "logps/chosen": -366.47296142578125, "logps/rejected": -225.4632110595703, "loss": 0.4685, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4369139671325684, "rewards/margins": 0.9338358640670776, "rewards/rejected": 1.503077745437622, "step": 17770 }, { "epoch": 0.8254793630159246, "grad_norm": 14.210713386535645, "learning_rate": 2.505046659547797e-07, "logits/chosen": -18.549890518188477, "logits/rejected": -17.772504806518555, "logps/chosen": -464.02618408203125, "logps/rejected": -309.54461669921875, "loss": 0.8035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.373720407485962, "rewards/margins": 0.4580724835395813, "rewards/rejected": 1.9156477451324463, "step": 17780 }, { "epoch": 0.8259436371233576, "grad_norm": 72.43745422363281, "learning_rate": 2.504768095083337e-07, "logits/chosen": -19.47443962097168, "logits/rejected": -18.123300552368164, "logps/chosen": -366.3122863769531, "logps/rejected": -265.13006591796875, "loss": 0.4206, "rewards/accuracies": 1.0, "rewards/chosen": 2.477114200592041, "rewards/margins": 1.0355781316757202, "rewards/rejected": 1.4415361881256104, "step": 17790 }, { "epoch": 0.8264079112307906, "grad_norm": 93.3531723022461, "learning_rate": 2.504489530618877e-07, "logits/chosen": -18.363340377807617, "logits/rejected": -17.332815170288086, "logps/chosen": -409.8623046875, "logps/rejected": -327.653076171875, "loss": 0.5116, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.567922592163086, "rewards/margins": 0.885059654712677, "rewards/rejected": 1.6828629970550537, "step": 17800 }, { "epoch": 0.8268721853382237, "grad_norm": 13.425718307495117, "learning_rate": 2.5042109661544175e-07, "logits/chosen": -19.00228500366211, "logits/rejected": -19.10033416748047, "logps/chosen": -315.1590881347656, "logps/rejected": -324.2253723144531, "loss": 
0.7529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2767436504364014, "rewards/margins": 0.1979464441537857, "rewards/rejected": 2.0787973403930664, "step": 17810 }, { "epoch": 0.8273364594456567, "grad_norm": 106.50415802001953, "learning_rate": 2.503932401689958e-07, "logits/chosen": -18.266712188720703, "logits/rejected": -18.383071899414062, "logps/chosen": -388.3122863769531, "logps/rejected": -337.40997314453125, "loss": 0.6903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.64341402053833, "rewards/margins": 0.2667580544948578, "rewards/rejected": 2.3766558170318604, "step": 17820 }, { "epoch": 0.8278007335530897, "grad_norm": 105.66062927246094, "learning_rate": 2.503653837225498e-07, "logits/chosen": -18.43939971923828, "logits/rejected": -18.599470138549805, "logps/chosen": -368.9979553222656, "logps/rejected": -405.80352783203125, "loss": 0.9513, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.235060691833496, "rewards/margins": -0.29906436800956726, "rewards/rejected": 2.5341250896453857, "step": 17830 }, { "epoch": 0.8282650076605228, "grad_norm": 160.3778076171875, "learning_rate": 2.5033752727610376e-07, "logits/chosen": -18.755329132080078, "logits/rejected": -17.430463790893555, "logps/chosen": -467.0029296875, "logps/rejected": -376.8683776855469, "loss": 0.3854, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8451926708221436, "rewards/margins": 1.1774470806121826, "rewards/rejected": 1.66774582862854, "step": 17840 }, { "epoch": 0.8287292817679558, "grad_norm": 60.90586471557617, "learning_rate": 2.503096708296578e-07, "logits/chosen": -19.19130516052246, "logits/rejected": -19.273120880126953, "logps/chosen": -519.0632934570312, "logps/rejected": -459.6611328125, "loss": 0.653, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1887941360473633, "rewards/margins": 0.17648252844810486, "rewards/rejected": 3.0123116970062256, "step": 17850 }, { "epoch": 0.8291935558753888, 
"grad_norm": 11.777382850646973, "learning_rate": 2.5028181438321184e-07, "logits/chosen": -18.74295997619629, "logits/rejected": -17.619873046875, "logps/chosen": -363.34454345703125, "logps/rejected": -294.64703369140625, "loss": 0.4552, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1276068687438965, "rewards/margins": 1.043795108795166, "rewards/rejected": 2.0838117599487305, "step": 17860 }, { "epoch": 0.8296578299828219, "grad_norm": 101.06513977050781, "learning_rate": 2.502539579367659e-07, "logits/chosen": -19.264789581298828, "logits/rejected": -17.781055450439453, "logps/chosen": -492.61346435546875, "logps/rejected": -326.9500427246094, "loss": 0.4037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.956721544265747, "rewards/margins": 1.1988937854766846, "rewards/rejected": 2.7578277587890625, "step": 17870 }, { "epoch": 0.8301221040902549, "grad_norm": 191.988037109375, "learning_rate": 2.502261014903199e-07, "logits/chosen": -18.58159828186035, "logits/rejected": -18.235567092895508, "logps/chosen": -372.4486083984375, "logps/rejected": -323.39337158203125, "loss": 0.6693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.746908664703369, "rewards/margins": 0.578882098197937, "rewards/rejected": 2.1680264472961426, "step": 17880 }, { "epoch": 0.8305863781976879, "grad_norm": 15.051148414611816, "learning_rate": 2.501982450438739e-07, "logits/chosen": -18.41069793701172, "logits/rejected": -17.917957305908203, "logps/chosen": -314.2330017089844, "logps/rejected": -268.37469482421875, "loss": 0.7515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.178297519683838, "rewards/margins": 0.3346598744392395, "rewards/rejected": 1.8436378240585327, "step": 17890 }, { "epoch": 0.831050652305121, "grad_norm": 71.98115539550781, "learning_rate": 2.501703885974279e-07, "logits/chosen": -19.10236167907715, "logits/rejected": -19.319849014282227, "logps/chosen": -520.5716552734375, "logps/rejected": 
-539.30908203125, "loss": 0.8732, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9508113861083984, "rewards/margins": -0.011416506953537464, "rewards/rejected": 2.962228298187256, "step": 17900 }, { "epoch": 0.831514926412554, "grad_norm": 131.76190185546875, "learning_rate": 2.5014253215098194e-07, "logits/chosen": -19.029813766479492, "logits/rejected": -18.496339797973633, "logps/chosen": -357.56951904296875, "logps/rejected": -299.67095947265625, "loss": 0.8108, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1607558727264404, "rewards/margins": 0.09905455261468887, "rewards/rejected": 2.061701536178589, "step": 17910 }, { "epoch": 0.831979200519987, "grad_norm": 64.92961883544922, "learning_rate": 2.5011467570453593e-07, "logits/chosen": -18.57326316833496, "logits/rejected": -18.0941219329834, "logps/chosen": -401.58660888671875, "logps/rejected": -310.2967834472656, "loss": 0.6035, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4941940307617188, "rewards/margins": 1.188953161239624, "rewards/rejected": 2.3052406311035156, "step": 17920 }, { "epoch": 0.8324434746274201, "grad_norm": 21.098190307617188, "learning_rate": 2.5008681925808997e-07, "logits/chosen": -18.85428237915039, "logits/rejected": -19.02986717224121, "logps/chosen": -314.64056396484375, "logps/rejected": -277.92816162109375, "loss": 0.6546, "rewards/accuracies": 0.5, "rewards/chosen": 1.9489805698394775, "rewards/margins": 0.19575437903404236, "rewards/rejected": 1.7532262802124023, "step": 17930 }, { "epoch": 0.8329077487348531, "grad_norm": 85.92500305175781, "learning_rate": 2.50058962811644e-07, "logits/chosen": -18.669071197509766, "logits/rejected": -18.012353897094727, "logps/chosen": -416.3436584472656, "logps/rejected": -378.8545837402344, "loss": 0.6218, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7093214988708496, "rewards/margins": 0.7201374173164368, "rewards/rejected": 1.9891841411590576, "step": 17940 }, { "epoch": 
0.8333720228422861, "grad_norm": 47.6193733215332, "learning_rate": 2.50031106365198e-07, "logits/chosen": -19.903949737548828, "logits/rejected": -18.92099952697754, "logps/chosen": -302.86907958984375, "logps/rejected": -275.1702575683594, "loss": 0.478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.202902317047119, "rewards/margins": 0.7109379768371582, "rewards/rejected": 1.49196457862854, "step": 17950 }, { "epoch": 0.8338362969497192, "grad_norm": 73.50524139404297, "learning_rate": 2.50003249918752e-07, "logits/chosen": -18.74513053894043, "logits/rejected": -17.85302734375, "logps/chosen": -491.67620849609375, "logps/rejected": -308.8819274902344, "loss": 0.6298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9556479454040527, "rewards/margins": 0.9589052200317383, "rewards/rejected": 1.996742606163025, "step": 17960 }, { "epoch": 0.8343005710571522, "grad_norm": 15.89479923248291, "learning_rate": 2.4997539347230603e-07, "logits/chosen": -18.78024673461914, "logits/rejected": -18.850744247436523, "logps/chosen": -359.5011901855469, "logps/rejected": -387.69415283203125, "loss": 0.7172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.390815258026123, "rewards/margins": 0.8862510919570923, "rewards/rejected": 2.5045647621154785, "step": 17970 }, { "epoch": 0.8347648451645852, "grad_norm": 125.25101470947266, "learning_rate": 2.4994753702586007e-07, "logits/chosen": -18.000608444213867, "logits/rejected": -17.050647735595703, "logps/chosen": -349.66168212890625, "logps/rejected": -247.02993774414062, "loss": 0.5605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1916632652282715, "rewards/margins": 0.8240232467651367, "rewards/rejected": 1.3676397800445557, "step": 17980 }, { "epoch": 0.8352291192720182, "grad_norm": 22.379119873046875, "learning_rate": 2.4991968057941406e-07, "logits/chosen": -19.33556365966797, "logits/rejected": -17.980167388916016, "logps/chosen": -317.6013488769531, 
"logps/rejected": -197.5337371826172, "loss": 0.4088, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.187623977661133, "rewards/margins": 1.0657601356506348, "rewards/rejected": 1.121863842010498, "step": 17990 }, { "epoch": 0.8356933933794513, "grad_norm": 51.01185989379883, "learning_rate": 2.498918241329681e-07, "logits/chosen": -19.86928939819336, "logits/rejected": -17.97640037536621, "logps/chosen": -569.9647216796875, "logps/rejected": -394.0472717285156, "loss": 0.5947, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3607254028320312, "rewards/margins": 0.6123026013374329, "rewards/rejected": 2.748422622680664, "step": 18000 }, { "epoch": 0.8361576674868842, "grad_norm": 22.015369415283203, "learning_rate": 2.498639676865221e-07, "logits/chosen": -18.401826858520508, "logits/rejected": -17.824636459350586, "logps/chosen": -505.3643493652344, "logps/rejected": -366.11212158203125, "loss": 0.4085, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.051738977432251, "rewards/margins": 1.2221486568450928, "rewards/rejected": 1.829590082168579, "step": 18010 }, { "epoch": 0.8366219415943172, "grad_norm": 67.35382843017578, "learning_rate": 2.4983611124007613e-07, "logits/chosen": -17.890605926513672, "logits/rejected": -17.555896759033203, "logps/chosen": -311.22564697265625, "logps/rejected": -333.49310302734375, "loss": 0.6715, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2306013107299805, "rewards/margins": 0.41805964708328247, "rewards/rejected": 1.8125417232513428, "step": 18020 }, { "epoch": 0.8370862157017503, "grad_norm": 17.83305549621582, "learning_rate": 2.498082547936301e-07, "logits/chosen": -18.31887435913086, "logits/rejected": -16.839399337768555, "logps/chosen": -398.1730651855469, "logps/rejected": -212.3279266357422, "loss": 0.4234, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9563615322113037, "rewards/margins": 1.0404037237167358, "rewards/rejected": 0.9159579277038574, 
"step": 18030 }, { "epoch": 0.8375504898091833, "grad_norm": 102.37715911865234, "learning_rate": 2.4978039834718416e-07, "logits/chosen": -19.11322784423828, "logits/rejected": -18.71962547302246, "logps/chosen": -384.1781921386719, "logps/rejected": -293.7996826171875, "loss": 0.6942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.584576368331909, "rewards/margins": 0.5393164753913879, "rewards/rejected": 2.045259952545166, "step": 18040 }, { "epoch": 0.8380147639166163, "grad_norm": 51.80786895751953, "learning_rate": 2.497525419007382e-07, "logits/chosen": -18.4822998046875, "logits/rejected": -17.692163467407227, "logps/chosen": -453.60479736328125, "logps/rejected": -345.9750061035156, "loss": 0.4334, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7429580688476562, "rewards/margins": 0.7637304067611694, "rewards/rejected": 1.9792276620864868, "step": 18050 }, { "epoch": 0.8384790380240494, "grad_norm": 68.8277359008789, "learning_rate": 2.4972468545429224e-07, "logits/chosen": -18.741601943969727, "logits/rejected": -17.182605743408203, "logps/chosen": -429.17706298828125, "logps/rejected": -278.61248779296875, "loss": 0.4536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9192023277282715, "rewards/margins": 1.1417934894561768, "rewards/rejected": 1.7774089574813843, "step": 18060 }, { "epoch": 0.8389433121314824, "grad_norm": 46.68790054321289, "learning_rate": 2.4969682900784623e-07, "logits/chosen": -18.901348114013672, "logits/rejected": -18.75476837158203, "logps/chosen": -286.1242980957031, "logps/rejected": -269.79547119140625, "loss": 0.5948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0696301460266113, "rewards/margins": 0.4128631055355072, "rewards/rejected": 1.6567668914794922, "step": 18070 }, { "epoch": 0.8394075862389154, "grad_norm": 22.880552291870117, "learning_rate": 2.496689725614002e-07, "logits/chosen": -19.361560821533203, "logits/rejected": -18.95659065246582, "logps/chosen": 
-374.025146484375, "logps/rejected": -280.19268798828125, "loss": 0.4807, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0822031497955322, "rewards/margins": 1.1194894313812256, "rewards/rejected": 1.962713599205017, "step": 18080 }, { "epoch": 0.8398718603463485, "grad_norm": 38.981712341308594, "learning_rate": 2.4964111611495426e-07, "logits/chosen": -17.88833999633789, "logits/rejected": -17.4693603515625, "logps/chosen": -336.02935791015625, "logps/rejected": -263.11859130859375, "loss": 0.5356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.063681125640869, "rewards/margins": 0.5223426222801208, "rewards/rejected": 1.5413384437561035, "step": 18090 }, { "epoch": 0.8403361344537815, "grad_norm": 30.480167388916016, "learning_rate": 2.496132596685083e-07, "logits/chosen": -18.292266845703125, "logits/rejected": -16.79515838623047, "logps/chosen": -435.8680725097656, "logps/rejected": -342.546875, "loss": 0.6478, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.891176462173462, "rewards/margins": 0.8520156741142273, "rewards/rejected": 2.039160966873169, "step": 18100 }, { "epoch": 0.8408004085612145, "grad_norm": 11.629594802856445, "learning_rate": 2.495854032220623e-07, "logits/chosen": -20.159090042114258, "logits/rejected": -19.26540756225586, "logps/chosen": -479.80615234375, "logps/rejected": -251.24276733398438, "loss": 0.4722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7030975818634033, "rewards/margins": 0.976254940032959, "rewards/rejected": 1.7268426418304443, "step": 18110 }, { "epoch": 0.8412646826686476, "grad_norm": 214.85372924804688, "learning_rate": 2.4955754677561633e-07, "logits/chosen": -19.108436584472656, "logits/rejected": -18.258880615234375, "logps/chosen": -451.3109436035156, "logps/rejected": -397.3547668457031, "loss": 0.5813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9003257751464844, "rewards/margins": 0.7771457433700562, "rewards/rejected": 
2.1231801509857178, "step": 18120 }, { "epoch": 0.8417289567760806, "grad_norm": 20.4392032623291, "learning_rate": 2.495296903291703e-07, "logits/chosen": -18.54358673095703, "logits/rejected": -17.712873458862305, "logps/chosen": -400.94525146484375, "logps/rejected": -359.0162353515625, "loss": 0.4365, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8199477195739746, "rewards/margins": 1.065752625465393, "rewards/rejected": 2.754195213317871, "step": 18130 }, { "epoch": 0.8421932308835136, "grad_norm": 61.424068450927734, "learning_rate": 2.4950183388272436e-07, "logits/chosen": -18.584674835205078, "logits/rejected": -17.89685821533203, "logps/chosen": -295.76385498046875, "logps/rejected": -205.15878295898438, "loss": 0.5853, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9398590326309204, "rewards/margins": 0.577083170413971, "rewards/rejected": 1.3627758026123047, "step": 18140 }, { "epoch": 0.8426575049909466, "grad_norm": 37.84703063964844, "learning_rate": 2.4947397743627835e-07, "logits/chosen": -18.0156192779541, "logits/rejected": -17.609363555908203, "logps/chosen": -466.28521728515625, "logps/rejected": -363.5050048828125, "loss": 0.7912, "rewards/accuracies": 0.5, "rewards/chosen": 2.5873029232025146, "rewards/margins": -0.01975433900952339, "rewards/rejected": 2.6070570945739746, "step": 18150 }, { "epoch": 0.8431217790983797, "grad_norm": 2.331163167953491, "learning_rate": 2.494461209898324e-07, "logits/chosen": -17.84568977355957, "logits/rejected": -17.254741668701172, "logps/chosen": -448.30255126953125, "logps/rejected": -340.528076171875, "loss": 0.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1600232124328613, "rewards/margins": 1.0385520458221436, "rewards/rejected": 2.1214711666107178, "step": 18160 }, { "epoch": 0.8435860532058127, "grad_norm": 89.66975402832031, "learning_rate": 2.4941826454338643e-07, "logits/chosen": -18.25079345703125, "logits/rejected": -17.494548797607422, 
"logps/chosen": -518.7348022460938, "logps/rejected": -465.449462890625, "loss": 0.7115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8444323539733887, "rewards/margins": 0.8106330633163452, "rewards/rejected": 3.033799648284912, "step": 18170 }, { "epoch": 0.8440503273132457, "grad_norm": 104.12405395507812, "learning_rate": 2.493904080969404e-07, "logits/chosen": -17.84905242919922, "logits/rejected": -17.10347557067871, "logps/chosen": -435.6293029785156, "logps/rejected": -339.65716552734375, "loss": 0.5681, "rewards/accuracies": 0.5, "rewards/chosen": 2.6082541942596436, "rewards/margins": 0.6736907958984375, "rewards/rejected": 1.9345636367797852, "step": 18180 }, { "epoch": 0.8445146014206788, "grad_norm": 41.54685592651367, "learning_rate": 2.4936255165049446e-07, "logits/chosen": -19.397497177124023, "logits/rejected": -18.958791732788086, "logps/chosen": -490.59503173828125, "logps/rejected": -609.31982421875, "loss": 0.5561, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3334288597106934, "rewards/margins": 0.7430599331855774, "rewards/rejected": 2.5903687477111816, "step": 18190 }, { "epoch": 0.8449788755281118, "grad_norm": 168.76930236816406, "learning_rate": 2.4933469520404844e-07, "logits/chosen": -18.585004806518555, "logits/rejected": -18.126384735107422, "logps/chosen": -461.64825439453125, "logps/rejected": -380.2384338378906, "loss": 0.9267, "rewards/accuracies": 0.5, "rewards/chosen": 3.3427186012268066, "rewards/margins": 0.12362043559551239, "rewards/rejected": 3.219097852706909, "step": 18200 }, { "epoch": 0.8454431496355448, "grad_norm": 3.5595686435699463, "learning_rate": 2.493068387576025e-07, "logits/chosen": -19.090097427368164, "logits/rejected": -16.693912506103516, "logps/chosen": -524.1007080078125, "logps/rejected": -337.51763916015625, "loss": 0.3074, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.492030620574951, "rewards/margins": 1.4117003679275513, "rewards/rejected": 
2.0803303718566895, "step": 18210 }, { "epoch": 0.8459074237429779, "grad_norm": 245.83355712890625, "learning_rate": 2.492789823111565e-07, "logits/chosen": -20.349056243896484, "logits/rejected": -18.484872817993164, "logps/chosen": -488.74737548828125, "logps/rejected": -430.6980895996094, "loss": 0.7288, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7592408657073975, "rewards/margins": 0.24058470129966736, "rewards/rejected": 2.5186562538146973, "step": 18220 }, { "epoch": 0.8463716978504109, "grad_norm": 7.043127536773682, "learning_rate": 2.492511258647105e-07, "logits/chosen": -18.728044509887695, "logits/rejected": -17.651044845581055, "logps/chosen": -491.41900634765625, "logps/rejected": -386.7389831542969, "loss": 0.7633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.271221160888672, "rewards/margins": 0.7600444555282593, "rewards/rejected": 2.5111773014068604, "step": 18230 }, { "epoch": 0.8468359719578439, "grad_norm": 31.796966552734375, "learning_rate": 2.4922326941826456e-07, "logits/chosen": -19.545927047729492, "logits/rejected": -17.942928314208984, "logps/chosen": -492.4690856933594, "logps/rejected": -324.9067077636719, "loss": 0.6059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.929111957550049, "rewards/margins": 0.44190359115600586, "rewards/rejected": 2.487208604812622, "step": 18240 }, { "epoch": 0.847300246065277, "grad_norm": 105.6241226196289, "learning_rate": 2.4919541297181854e-07, "logits/chosen": -18.249191284179688, "logits/rejected": -17.334495544433594, "logps/chosen": -297.43865966796875, "logps/rejected": -267.3506774902344, "loss": 0.8485, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7647294998168945, "rewards/margins": -0.20066149532794952, "rewards/rejected": 1.9653911590576172, "step": 18250 }, { "epoch": 0.84776452017271, "grad_norm": 32.186641693115234, "learning_rate": 2.4916755652537253e-07, "logits/chosen": -17.61643409729004, "logits/rejected": 
-17.302669525146484, "logps/chosen": -315.65606689453125, "logps/rejected": -302.5565185546875, "loss": 0.7724, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1663665771484375, "rewards/margins": 0.2315228432416916, "rewards/rejected": 1.9348437786102295, "step": 18260 }, { "epoch": 0.848228794280143, "grad_norm": 80.02580261230469, "learning_rate": 2.4913970007892657e-07, "logits/chosen": -19.31667709350586, "logits/rejected": -18.08453941345215, "logps/chosen": -467.2774353027344, "logps/rejected": -341.5486145019531, "loss": 0.8264, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2974376678466797, "rewards/margins": 0.6468304395675659, "rewards/rejected": 2.6506073474884033, "step": 18270 }, { "epoch": 0.8486930683875761, "grad_norm": 37.948307037353516, "learning_rate": 2.491118436324806e-07, "logits/chosen": -17.645559310913086, "logits/rejected": -17.137920379638672, "logps/chosen": -343.15032958984375, "logps/rejected": -260.92864990234375, "loss": 0.7667, "rewards/accuracies": 0.5, "rewards/chosen": 1.9202649593353271, "rewards/margins": 0.030217409133911133, "rewards/rejected": 1.8900476694107056, "step": 18280 }, { "epoch": 0.8491573424950091, "grad_norm": 102.60369873046875, "learning_rate": 2.4908398718603465e-07, "logits/chosen": -18.676240921020508, "logits/rejected": -18.580059051513672, "logps/chosen": -344.89105224609375, "logps/rejected": -331.1417236328125, "loss": 0.9824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7742435932159424, "rewards/margins": 0.1450522094964981, "rewards/rejected": 2.6291909217834473, "step": 18290 }, { "epoch": 0.8496216166024421, "grad_norm": 302.3721923828125, "learning_rate": 2.4905613073958864e-07, "logits/chosen": -18.709583282470703, "logits/rejected": -17.728116989135742, "logps/chosen": -422.0830993652344, "logps/rejected": -292.65234375, "loss": 0.8181, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2172977924346924, "rewards/margins": 
0.20442751049995422, "rewards/rejected": 2.0128700733184814, "step": 18300 }, { "epoch": 0.8500858907098751, "grad_norm": 91.58853912353516, "learning_rate": 2.490282742931427e-07, "logits/chosen": -18.278438568115234, "logits/rejected": -16.974796295166016, "logps/chosen": -346.5130310058594, "logps/rejected": -243.4837188720703, "loss": 0.4867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0596773624420166, "rewards/margins": 0.667998194694519, "rewards/rejected": 1.391679286956787, "step": 18310 }, { "epoch": 0.8505501648173082, "grad_norm": 24.133445739746094, "learning_rate": 2.4900041784669667e-07, "logits/chosen": -18.275360107421875, "logits/rejected": -17.826152801513672, "logps/chosen": -383.7561950683594, "logps/rejected": -324.19000244140625, "loss": 0.6512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2819042205810547, "rewards/margins": 0.45788460969924927, "rewards/rejected": 1.8240196704864502, "step": 18320 }, { "epoch": 0.8510144389247412, "grad_norm": 244.0522918701172, "learning_rate": 2.4897256140025066e-07, "logits/chosen": -18.568180084228516, "logits/rejected": -17.899459838867188, "logps/chosen": -351.45257568359375, "logps/rejected": -324.671630859375, "loss": 1.0671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1347548961639404, "rewards/margins": 0.12504935264587402, "rewards/rejected": 2.0097055435180664, "step": 18330 }, { "epoch": 0.8514787130321742, "grad_norm": 38.56053924560547, "learning_rate": 2.489447049538047e-07, "logits/chosen": -18.15070343017578, "logits/rejected": -17.460834503173828, "logps/chosen": -387.97637939453125, "logps/rejected": -275.6716613769531, "loss": 0.5542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6613638401031494, "rewards/margins": 0.8901824951171875, "rewards/rejected": 1.7711808681488037, "step": 18340 }, { "epoch": 0.8519429871396073, "grad_norm": 6.503035545349121, "learning_rate": 2.4891684850735874e-07, "logits/chosen": 
-18.274972915649414, "logits/rejected": -17.298114776611328, "logps/chosen": -352.5156555175781, "logps/rejected": -325.2076721191406, "loss": 0.7656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4138548374176025, "rewards/margins": 0.658295214176178, "rewards/rejected": 1.7555595636367798, "step": 18350 }, { "epoch": 0.8524072612470402, "grad_norm": 136.04708862304688, "learning_rate": 2.488889920609128e-07, "logits/chosen": -18.514606475830078, "logits/rejected": -18.450443267822266, "logps/chosen": -358.76739501953125, "logps/rejected": -369.55279541015625, "loss": 0.5878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3061373233795166, "rewards/margins": 0.2879238426685333, "rewards/rejected": 2.0182132720947266, "step": 18360 }, { "epoch": 0.8528715353544732, "grad_norm": 77.9776840209961, "learning_rate": 2.4886113561446677e-07, "logits/chosen": -18.252410888671875, "logits/rejected": -17.060230255126953, "logps/chosen": -469.18267822265625, "logps/rejected": -266.4277648925781, "loss": 0.5817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.571012020111084, "rewards/margins": 0.7651864290237427, "rewards/rejected": 1.8058255910873413, "step": 18370 }, { "epoch": 0.8533358094619063, "grad_norm": 11.792502403259277, "learning_rate": 2.4883327916802076e-07, "logits/chosen": -18.42239761352539, "logits/rejected": -16.81076431274414, "logps/chosen": -499.8779296875, "logps/rejected": -268.3787841796875, "loss": 0.3677, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.934403896331787, "rewards/margins": 1.1707711219787598, "rewards/rejected": 1.7636327743530273, "step": 18380 }, { "epoch": 0.8538000835693393, "grad_norm": 66.07332611083984, "learning_rate": 2.488054227215748e-07, "logits/chosen": -19.07036018371582, "logits/rejected": -17.832910537719727, "logps/chosen": -438.33154296875, "logps/rejected": -343.04742431640625, "loss": 0.4062, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
3.0710315704345703, "rewards/margins": 1.1105306148529053, "rewards/rejected": 1.9605014324188232, "step": 18390 }, { "epoch": 0.8542643576767723, "grad_norm": 39.12870788574219, "learning_rate": 2.4877756627512884e-07, "logits/chosen": -18.05976676940918, "logits/rejected": -18.087421417236328, "logps/chosen": -425.4840393066406, "logps/rejected": -382.1614685058594, "loss": 0.636, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.344191312789917, "rewards/margins": 1.059446096420288, "rewards/rejected": 2.284745454788208, "step": 18400 }, { "epoch": 0.8547286317842054, "grad_norm": 82.84429931640625, "learning_rate": 2.4874970982868283e-07, "logits/chosen": -19.08506202697754, "logits/rejected": -18.145437240600586, "logps/chosen": -342.28131103515625, "logps/rejected": -304.80718994140625, "loss": 0.6121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7235279083251953, "rewards/margins": 0.7181499600410461, "rewards/rejected": 2.005378246307373, "step": 18410 }, { "epoch": 0.8551929058916384, "grad_norm": 83.04500579833984, "learning_rate": 2.4872185338223687e-07, "logits/chosen": -18.60614776611328, "logits/rejected": -17.765727996826172, "logps/chosen": -479.3309020996094, "logps/rejected": -385.1153564453125, "loss": 0.5153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.696866512298584, "rewards/margins": 0.7606621980667114, "rewards/rejected": 1.9362043142318726, "step": 18420 }, { "epoch": 0.8556571799990714, "grad_norm": 37.666290283203125, "learning_rate": 2.4869399693579086e-07, "logits/chosen": -18.40671157836914, "logits/rejected": -17.698848724365234, "logps/chosen": -358.5804748535156, "logps/rejected": -271.11834716796875, "loss": 0.6288, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7089009284973145, "rewards/margins": 0.5398002862930298, "rewards/rejected": 2.1691009998321533, "step": 18430 }, { "epoch": 0.8561214541065045, "grad_norm": 44.239139556884766, "learning_rate": 
2.486661404893449e-07, "logits/chosen": -18.92119598388672, "logits/rejected": -18.260902404785156, "logps/chosen": -361.67108154296875, "logps/rejected": -261.6964416503906, "loss": 0.6431, "rewards/accuracies": 0.5, "rewards/chosen": 2.641756057739258, "rewards/margins": 0.8335281610488892, "rewards/rejected": 1.8082278966903687, "step": 18440 }, { "epoch": 0.8565857282139375, "grad_norm": 10.199455261230469, "learning_rate": 2.486382840428989e-07, "logits/chosen": -19.273923873901367, "logits/rejected": -18.17361068725586, "logps/chosen": -432.23529052734375, "logps/rejected": -314.64495849609375, "loss": 0.4212, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4535109996795654, "rewards/margins": 0.8977165222167969, "rewards/rejected": 2.5557942390441895, "step": 18450 }, { "epoch": 0.8570500023213705, "grad_norm": 48.10973358154297, "learning_rate": 2.4861042759645293e-07, "logits/chosen": -19.036575317382812, "logits/rejected": -17.927082061767578, "logps/chosen": -433.1642150878906, "logps/rejected": -340.0115051269531, "loss": 0.5056, "rewards/accuracies": 0.5, "rewards/chosen": 2.82210111618042, "rewards/margins": 1.0972789525985718, "rewards/rejected": 1.7248220443725586, "step": 18460 }, { "epoch": 0.8575142764288035, "grad_norm": 41.30629348754883, "learning_rate": 2.4858257115000697e-07, "logits/chosen": -19.92019271850586, "logits/rejected": -18.479887008666992, "logps/chosen": -502.7467346191406, "logps/rejected": -281.7110290527344, "loss": 0.4708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9934451580047607, "rewards/margins": 1.1269862651824951, "rewards/rejected": 1.8664591312408447, "step": 18470 }, { "epoch": 0.8579785505362366, "grad_norm": 17.710723876953125, "learning_rate": 2.48554714703561e-07, "logits/chosen": -17.761138916015625, "logits/rejected": -17.879045486450195, "logps/chosen": -477.83349609375, "logps/rejected": -469.6150817871094, "loss": 0.7381, "rewards/accuracies": 0.5, "rewards/chosen": 
3.109687328338623, "rewards/margins": 0.19219207763671875, "rewards/rejected": 2.9174952507019043, "step": 18480 }, { "epoch": 0.8584428246436696, "grad_norm": 74.66098022460938, "learning_rate": 2.48526858257115e-07, "logits/chosen": -18.166126251220703, "logits/rejected": -17.346492767333984, "logps/chosen": -425.5377502441406, "logps/rejected": -313.13909912109375, "loss": 0.4028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7619242668151855, "rewards/margins": 0.8686287999153137, "rewards/rejected": 1.8932956457138062, "step": 18490 }, { "epoch": 0.8589070987511026, "grad_norm": 113.86212921142578, "learning_rate": 2.48499001810669e-07, "logits/chosen": -18.629728317260742, "logits/rejected": -17.925573348999023, "logps/chosen": -410.24053955078125, "logps/rejected": -276.8771057128906, "loss": 0.5501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2454066276550293, "rewards/margins": 0.6491442322731018, "rewards/rejected": 1.5962622165679932, "step": 18500 }, { "epoch": 0.8593713728585357, "grad_norm": 114.9581527709961, "learning_rate": 2.4847114536422303e-07, "logits/chosen": -18.878101348876953, "logits/rejected": -18.22932243347168, "logps/chosen": -537.3740844726562, "logps/rejected": -475.1814880371094, "loss": 0.7044, "rewards/accuracies": 0.5, "rewards/chosen": 3.171130657196045, "rewards/margins": 0.09081493318080902, "rewards/rejected": 3.080315351486206, "step": 18510 }, { "epoch": 0.8598356469659687, "grad_norm": 63.18217849731445, "learning_rate": 2.4844328891777707e-07, "logits/chosen": -18.88350486755371, "logits/rejected": -18.311559677124023, "logps/chosen": -318.7665710449219, "logps/rejected": -297.1345520019531, "loss": 0.4948, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.0242533683776855, "rewards/margins": 0.7362058758735657, "rewards/rejected": 1.288047432899475, "step": 18520 }, { "epoch": 0.8602999210734017, "grad_norm": 124.52759552001953, "learning_rate": 2.4841543247133106e-07, 
"logits/chosen": -18.42184066772461, "logits/rejected": -18.331724166870117, "logps/chosen": -369.2012939453125, "logps/rejected": -345.8257141113281, "loss": 0.6938, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1188714504241943, "rewards/margins": 0.6689327955245972, "rewards/rejected": 2.4499387741088867, "step": 18530 }, { "epoch": 0.8607641951808348, "grad_norm": 110.10555267333984, "learning_rate": 2.483875760248851e-07, "logits/chosen": -18.450273513793945, "logits/rejected": -17.259103775024414, "logps/chosen": -462.37646484375, "logps/rejected": -335.6546325683594, "loss": 0.5522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.707547426223755, "rewards/margins": 1.3939170837402344, "rewards/rejected": 2.3136298656463623, "step": 18540 }, { "epoch": 0.8612284692882678, "grad_norm": 52.165897369384766, "learning_rate": 2.483597195784391e-07, "logits/chosen": -18.540369033813477, "logits/rejected": -17.806053161621094, "logps/chosen": -357.0494689941406, "logps/rejected": -287.92254638671875, "loss": 1.0266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.489142417907715, "rewards/margins": 0.1441139429807663, "rewards/rejected": 2.3450284004211426, "step": 18550 }, { "epoch": 0.8616927433957008, "grad_norm": 139.13385009765625, "learning_rate": 2.4833186313199313e-07, "logits/chosen": -19.548358917236328, "logits/rejected": -18.9748477935791, "logps/chosen": -351.56341552734375, "logps/rejected": -295.05987548828125, "loss": 0.6017, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8692612648010254, "rewards/margins": 0.8289119601249695, "rewards/rejected": 2.040349245071411, "step": 18560 }, { "epoch": 0.8621570175031339, "grad_norm": 39.629066467285156, "learning_rate": 2.483040066855471e-07, "logits/chosen": -19.05356216430664, "logits/rejected": -18.77669906616211, "logps/chosen": -350.06890869140625, "logps/rejected": -340.7501525878906, "loss": 0.7373, "rewards/accuracies": 0.5, "rewards/chosen": 
2.1769602298736572, "rewards/margins": 0.30243659019470215, "rewards/rejected": 1.874523401260376, "step": 18570 }, { "epoch": 0.8626212916105669, "grad_norm": 154.98072814941406, "learning_rate": 2.4827615023910116e-07, "logits/chosen": -18.13644027709961, "logits/rejected": -17.898258209228516, "logps/chosen": -473.7247009277344, "logps/rejected": -418.39752197265625, "loss": 0.8112, "rewards/accuracies": 0.5, "rewards/chosen": 2.8135836124420166, "rewards/margins": 0.09287073463201523, "rewards/rejected": 2.720712423324585, "step": 18580 }, { "epoch": 0.8630855657179999, "grad_norm": 136.1291046142578, "learning_rate": 2.482482937926552e-07, "logits/chosen": -19.189294815063477, "logits/rejected": -18.54874610900879, "logps/chosen": -392.17022705078125, "logps/rejected": -325.2735290527344, "loss": 0.7283, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7340731620788574, "rewards/margins": 0.3197513520717621, "rewards/rejected": 2.4143218994140625, "step": 18590 }, { "epoch": 0.863549839825433, "grad_norm": 20.14865493774414, "learning_rate": 2.482204373462092e-07, "logits/chosen": -18.473064422607422, "logits/rejected": -18.13193702697754, "logps/chosen": -390.07366943359375, "logps/rejected": -282.8013916015625, "loss": 0.4273, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.96051287651062, "rewards/margins": 1.1981918811798096, "rewards/rejected": 1.7623212337493896, "step": 18600 }, { "epoch": 0.864014113932866, "grad_norm": 147.27366638183594, "learning_rate": 2.481925808997632e-07, "logits/chosen": -17.56337547302246, "logits/rejected": -17.77007293701172, "logps/chosen": -489.49993896484375, "logps/rejected": -456.11248779296875, "loss": 1.2008, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.8081459999084473, "rewards/margins": -0.5689009428024292, "rewards/rejected": 3.377047061920166, "step": 18610 }, { "epoch": 0.864478388040299, "grad_norm": 30.600011825561523, "learning_rate": 2.481647244533172e-07, 
"logits/chosen": -18.55640983581543, "logits/rejected": -17.905412673950195, "logps/chosen": -390.9432067871094, "logps/rejected": -380.77130126953125, "loss": 1.1941, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7063844203948975, "rewards/margins": -0.3088272213935852, "rewards/rejected": 3.015211582183838, "step": 18620 }, { "epoch": 0.864942662147732, "grad_norm": 47.74150085449219, "learning_rate": 2.4813686800687126e-07, "logits/chosen": -18.7155818939209, "logits/rejected": -17.250099182128906, "logps/chosen": -413.0201721191406, "logps/rejected": -344.622314453125, "loss": 0.4867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4944963455200195, "rewards/margins": 0.6827253699302673, "rewards/rejected": 1.811771035194397, "step": 18630 }, { "epoch": 0.8654069362551651, "grad_norm": 197.4284210205078, "learning_rate": 2.4810901156042524e-07, "logits/chosen": -19.192859649658203, "logits/rejected": -18.571163177490234, "logps/chosen": -438.1678771972656, "logps/rejected": -489.94964599609375, "loss": 1.1595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.525205135345459, "rewards/margins": -0.2986679971218109, "rewards/rejected": 2.823873281478882, "step": 18640 }, { "epoch": 0.8658712103625981, "grad_norm": 48.64472198486328, "learning_rate": 2.480811551139793e-07, "logits/chosen": -18.4080810546875, "logits/rejected": -17.379741668701172, "logps/chosen": -411.85009765625, "logps/rejected": -334.97589111328125, "loss": 0.5499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3022713661193848, "rewards/margins": 0.6546109914779663, "rewards/rejected": 1.647660255432129, "step": 18650 }, { "epoch": 0.8663354844700311, "grad_norm": 142.2146453857422, "learning_rate": 2.480532986675333e-07, "logits/chosen": -18.93621826171875, "logits/rejected": -18.334400177001953, "logps/chosen": -298.4836120605469, "logps/rejected": -249.451904296875, "loss": 0.8388, "rewards/accuracies": 0.5, "rewards/chosen": 
2.164874792098999, "rewards/margins": 0.22615787386894226, "rewards/rejected": 1.9387168884277344, "step": 18660 }, { "epoch": 0.8667997585774642, "grad_norm": 13.445497512817383, "learning_rate": 2.480254422210873e-07, "logits/chosen": -18.86996841430664, "logits/rejected": -17.415908813476562, "logps/chosen": -402.2608337402344, "logps/rejected": -198.80618286132812, "loss": 0.5528, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.359809398651123, "rewards/margins": 0.887325644493103, "rewards/rejected": 1.47248375415802, "step": 18670 }, { "epoch": 0.8672640326848972, "grad_norm": 68.62779998779297, "learning_rate": 2.479975857746413e-07, "logits/chosen": -18.57921600341797, "logits/rejected": -18.30996322631836, "logps/chosen": -411.9371032714844, "logps/rejected": -369.5825500488281, "loss": 0.8189, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.023344039916992, "rewards/margins": 0.5918235778808594, "rewards/rejected": 2.4315202236175537, "step": 18680 }, { "epoch": 0.8677283067923302, "grad_norm": 84.27352142333984, "learning_rate": 2.4796972932819534e-07, "logits/chosen": -18.34189224243164, "logits/rejected": -17.876346588134766, "logps/chosen": -400.8858947753906, "logps/rejected": -391.9229736328125, "loss": 0.9919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9665886163711548, "rewards/margins": -0.1297672986984253, "rewards/rejected": 2.09635591506958, "step": 18690 }, { "epoch": 0.8681925808997633, "grad_norm": 78.228759765625, "learning_rate": 2.479418728817494e-07, "logits/chosen": -19.295785903930664, "logits/rejected": -18.92618179321289, "logps/chosen": -349.3095703125, "logps/rejected": -279.5113830566406, "loss": 0.7846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6060593128204346, "rewards/margins": 0.21534104645252228, "rewards/rejected": 2.390718460083008, "step": 18700 }, { "epoch": 0.8686568550071962, "grad_norm": 11.969575881958008, "learning_rate": 2.479140164353034e-07, 
"logits/chosen": -19.539867401123047, "logits/rejected": -18.710556030273438, "logps/chosen": -431.98974609375, "logps/rejected": -178.49148559570312, "loss": 0.3543, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8239357471466064, "rewards/margins": 1.7271610498428345, "rewards/rejected": 1.0967748165130615, "step": 18710 }, { "epoch": 0.8691211291146292, "grad_norm": 63.60881423950195, "learning_rate": 2.478861599888574e-07, "logits/chosen": -18.875295639038086, "logits/rejected": -17.488208770751953, "logps/chosen": -442.0821838378906, "logps/rejected": -305.3652038574219, "loss": 0.5355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.469937562942505, "rewards/margins": 0.6071762442588806, "rewards/rejected": 1.8627614974975586, "step": 18720 }, { "epoch": 0.8695854032220623, "grad_norm": 33.61241149902344, "learning_rate": 2.4785830354241145e-07, "logits/chosen": -18.808874130249023, "logits/rejected": -18.20012092590332, "logps/chosen": -297.95062255859375, "logps/rejected": -236.27542114257812, "loss": 0.641, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7180635929107666, "rewards/margins": 0.6260494589805603, "rewards/rejected": 2.0920138359069824, "step": 18730 }, { "epoch": 0.8700496773294953, "grad_norm": 40.97005844116211, "learning_rate": 2.4783044709596544e-07, "logits/chosen": -19.641572952270508, "logits/rejected": -18.365856170654297, "logps/chosen": -397.69580078125, "logps/rejected": -284.478759765625, "loss": 0.4738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6964995861053467, "rewards/margins": 0.8270435333251953, "rewards/rejected": 1.8694560527801514, "step": 18740 }, { "epoch": 0.8705139514369283, "grad_norm": 4.086805820465088, "learning_rate": 2.4780259064951943e-07, "logits/chosen": -20.003984451293945, "logits/rejected": -18.508146286010742, "logps/chosen": -403.73895263671875, "logps/rejected": -325.32550048828125, "loss": 0.4934, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 2.874216079711914, "rewards/margins": 0.9821955561637878, "rewards/rejected": 1.8920204639434814, "step": 18750 }, { "epoch": 0.8709782255443614, "grad_norm": 81.52867126464844, "learning_rate": 2.4777473420307347e-07, "logits/chosen": -18.315174102783203, "logits/rejected": -17.389385223388672, "logps/chosen": -488.6904296875, "logps/rejected": -393.591552734375, "loss": 0.6153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.839728832244873, "rewards/margins": 0.5059071779251099, "rewards/rejected": 2.3338217735290527, "step": 18760 }, { "epoch": 0.8714424996517944, "grad_norm": 202.90884399414062, "learning_rate": 2.477468777566275e-07, "logits/chosen": -18.264379501342773, "logits/rejected": -17.948484420776367, "logps/chosen": -486.1822814941406, "logps/rejected": -446.8390197753906, "loss": 0.8845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.04710054397583, "rewards/margins": -0.0034439205192029476, "rewards/rejected": 3.050544261932373, "step": 18770 }, { "epoch": 0.8719067737592274, "grad_norm": 5.6582231521606445, "learning_rate": 2.4771902131018155e-07, "logits/chosen": -18.63763427734375, "logits/rejected": -18.33415985107422, "logps/chosen": -329.108642578125, "logps/rejected": -262.58038330078125, "loss": 0.7921, "rewards/accuracies": 0.5, "rewards/chosen": 2.283684015274048, "rewards/margins": 0.41732969880104065, "rewards/rejected": 1.8663543462753296, "step": 18780 }, { "epoch": 0.8723710478666605, "grad_norm": 13.747422218322754, "learning_rate": 2.4769116486373554e-07, "logits/chosen": -18.63887596130371, "logits/rejected": -18.00027084350586, "logps/chosen": -321.9980773925781, "logps/rejected": -275.2978515625, "loss": 0.7363, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.087035655975342, "rewards/margins": 0.4442559778690338, "rewards/rejected": 1.6427793502807617, "step": 18790 }, { "epoch": 0.8728353219740935, "grad_norm": 51.043582916259766, "learning_rate": 
2.4766330841728953e-07, "logits/chosen": -17.965417861938477, "logits/rejected": -17.450824737548828, "logps/chosen": -328.59796142578125, "logps/rejected": -245.92678833007812, "loss": 0.6407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4438912868499756, "rewards/margins": 0.5694459676742554, "rewards/rejected": 1.8744451999664307, "step": 18800 }, { "epoch": 0.8732995960815265, "grad_norm": 13.079090118408203, "learning_rate": 2.4763545197084357e-07, "logits/chosen": -19.407318115234375, "logits/rejected": -18.444660186767578, "logps/chosen": -364.7237854003906, "logps/rejected": -299.1397399902344, "loss": 0.6381, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7919178009033203, "rewards/margins": 0.7297691702842712, "rewards/rejected": 2.0621485710144043, "step": 18810 }, { "epoch": 0.8737638701889595, "grad_norm": 1.174193263053894, "learning_rate": 2.476075955243976e-07, "logits/chosen": -18.486902236938477, "logits/rejected": -17.584205627441406, "logps/chosen": -455.7940368652344, "logps/rejected": -363.0209655761719, "loss": 0.3954, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.959691286087036, "rewards/margins": 1.3928172588348389, "rewards/rejected": 1.5668739080429077, "step": 18820 }, { "epoch": 0.8742281442963926, "grad_norm": 108.04254913330078, "learning_rate": 2.475797390779516e-07, "logits/chosen": -17.792325973510742, "logits/rejected": -17.85857582092285, "logps/chosen": -293.2518005371094, "logps/rejected": -320.91943359375, "loss": 0.8488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.011904001235962, "rewards/margins": -0.03402522951364517, "rewards/rejected": 2.045929193496704, "step": 18830 }, { "epoch": 0.8746924184038256, "grad_norm": 6.452847957611084, "learning_rate": 2.4755188263150564e-07, "logits/chosen": -19.201610565185547, "logits/rejected": -17.81216049194336, "logps/chosen": -380.3049621582031, "logps/rejected": -205.8688507080078, "loss": 0.517, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.3585238456726074, "rewards/margins": 1.3602440357208252, "rewards/rejected": 1.9982802867889404, "step": 18840 }, { "epoch": 0.8751566925112586, "grad_norm": 170.40013122558594, "learning_rate": 2.4752402618505963e-07, "logits/chosen": -18.972660064697266, "logits/rejected": -17.71993637084961, "logps/chosen": -435.118408203125, "logps/rejected": -282.21502685546875, "loss": 0.4867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0164217948913574, "rewards/margins": 1.142777681350708, "rewards/rejected": 1.873644232749939, "step": 18850 }, { "epoch": 0.8756209666186917, "grad_norm": 41.62741470336914, "learning_rate": 2.4749616973861367e-07, "logits/chosen": -19.31255340576172, "logits/rejected": -18.31282615661621, "logps/chosen": -450.1021423339844, "logps/rejected": -271.8124084472656, "loss": 0.3195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.02779221534729, "rewards/margins": 1.717780351638794, "rewards/rejected": 1.3100117444992065, "step": 18860 }, { "epoch": 0.8760852407261247, "grad_norm": 112.25687408447266, "learning_rate": 2.4746831329216766e-07, "logits/chosen": -18.90662384033203, "logits/rejected": -18.343137741088867, "logps/chosen": -356.153076171875, "logps/rejected": -299.898193359375, "loss": 0.8079, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1201486587524414, "rewards/margins": 0.07354015111923218, "rewards/rejected": 2.0466086864471436, "step": 18870 }, { "epoch": 0.8765495148335577, "grad_norm": 98.88165283203125, "learning_rate": 2.474404568457217e-07, "logits/chosen": -18.319408416748047, "logits/rejected": -17.914522171020508, "logps/chosen": -336.3918762207031, "logps/rejected": -301.6561584472656, "loss": 0.9728, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9191757440567017, "rewards/margins": -0.20722413063049316, "rewards/rejected": 2.1264002323150635, "step": 18880 }, { "epoch": 0.8770137889409908, "grad_norm": 111.10832214355469, 
"learning_rate": 2.4741260039927574e-07, "logits/chosen": -18.639400482177734, "logits/rejected": -18.789539337158203, "logps/chosen": -334.74591064453125, "logps/rejected": -381.3807067871094, "loss": 1.0357, "rewards/accuracies": 0.5, "rewards/chosen": 2.0249671936035156, "rewards/margins": -0.12249217927455902, "rewards/rejected": 2.1474592685699463, "step": 18890 }, { "epoch": 0.8774780630484238, "grad_norm": 32.29560470581055, "learning_rate": 2.473847439528298e-07, "logits/chosen": -19.079689025878906, "logits/rejected": -17.368000030517578, "logps/chosen": -457.11993408203125, "logps/rejected": -302.4842834472656, "loss": 0.4222, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.822556972503662, "rewards/margins": 1.1462454795837402, "rewards/rejected": 1.676311731338501, "step": 18900 }, { "epoch": 0.8779423371558568, "grad_norm": 82.14466857910156, "learning_rate": 2.4735688750638377e-07, "logits/chosen": -19.012210845947266, "logits/rejected": -18.39871597290039, "logps/chosen": -462.4676208496094, "logps/rejected": -352.43133544921875, "loss": 0.5979, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4071855545043945, "rewards/margins": 0.36748751997947693, "rewards/rejected": 2.0396978855133057, "step": 18910 }, { "epoch": 0.8784066112632899, "grad_norm": 31.509571075439453, "learning_rate": 2.4732903105993776e-07, "logits/chosen": -18.802494049072266, "logits/rejected": -17.854206085205078, "logps/chosen": -387.791748046875, "logps/rejected": -313.39703369140625, "loss": 0.5821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.471698045730591, "rewards/margins": 0.7542837262153625, "rewards/rejected": 1.7174142599105835, "step": 18920 }, { "epoch": 0.8788708853707229, "grad_norm": 179.9296112060547, "learning_rate": 2.473011746134918e-07, "logits/chosen": -18.704421997070312, "logits/rejected": -17.829227447509766, "logps/chosen": -343.6546325683594, "logps/rejected": -380.40179443359375, "loss": 0.9891, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.33805251121521, "rewards/margins": 0.1800020933151245, "rewards/rejected": 2.158050060272217, "step": 18930 }, { "epoch": 0.8793351594781559, "grad_norm": 71.82202911376953, "learning_rate": 2.472733181670458e-07, "logits/chosen": -19.213850021362305, "logits/rejected": -18.773487091064453, "logps/chosen": -422.7416076660156, "logps/rejected": -346.3523864746094, "loss": 0.5391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1529288291931152, "rewards/margins": 0.8587358593940735, "rewards/rejected": 2.2941927909851074, "step": 18940 }, { "epoch": 0.879799433585589, "grad_norm": 34.03224182128906, "learning_rate": 2.472454617205998e-07, "logits/chosen": -19.41802406311035, "logits/rejected": -18.845951080322266, "logps/chosen": -475.91485595703125, "logps/rejected": -343.5657043457031, "loss": 0.3871, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.527283191680908, "rewards/margins": 1.1951851844787598, "rewards/rejected": 2.3320982456207275, "step": 18950 }, { "epoch": 0.880263707693022, "grad_norm": 21.20468521118164, "learning_rate": 2.4721760527415387e-07, "logits/chosen": -18.35964584350586, "logits/rejected": -17.27231216430664, "logps/chosen": -474.947265625, "logps/rejected": -300.12847900390625, "loss": 0.5299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0986056327819824, "rewards/margins": 0.7571254372596741, "rewards/rejected": 2.3414804935455322, "step": 18960 }, { "epoch": 0.880727981800455, "grad_norm": 81.15801239013672, "learning_rate": 2.4718974882770786e-07, "logits/chosen": -18.694751739501953, "logits/rejected": -18.1735782623291, "logps/chosen": -407.28955078125, "logps/rejected": -366.93035888671875, "loss": 0.7226, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8111073970794678, "rewards/margins": 0.2639482915401459, "rewards/rejected": 2.54715895652771, "step": 18970 }, { "epoch": 0.881192255907888, "grad_norm": 
164.6442413330078, "learning_rate": 2.471618923812619e-07, "logits/chosen": -18.906078338623047, "logits/rejected": -18.712120056152344, "logps/chosen": -438.1941833496094, "logps/rejected": -425.33709716796875, "loss": 0.7097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2942709922790527, "rewards/margins": 0.267409086227417, "rewards/rejected": 3.026862382888794, "step": 18980 }, { "epoch": 0.8816565300153211, "grad_norm": 41.91046142578125, "learning_rate": 2.471340359348159e-07, "logits/chosen": -19.70257568359375, "logits/rejected": -18.437976837158203, "logps/chosen": -463.2330017089844, "logps/rejected": -329.6542663574219, "loss": 0.5626, "rewards/accuracies": 0.5, "rewards/chosen": 3.123774528503418, "rewards/margins": 1.1582047939300537, "rewards/rejected": 1.9655697345733643, "step": 18990 }, { "epoch": 0.8821208041227541, "grad_norm": 157.95721435546875, "learning_rate": 2.471061794883699e-07, "logits/chosen": -19.413604736328125, "logits/rejected": -19.935781478881836, "logps/chosen": -393.34027099609375, "logps/rejected": -404.0048828125, "loss": 0.8472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6834523677825928, "rewards/margins": -0.1943756639957428, "rewards/rejected": 2.877828359603882, "step": 19000 }, { "epoch": 0.8825850782301871, "grad_norm": 74.09590148925781, "learning_rate": 2.4707832304192397e-07, "logits/chosen": -18.69808578491211, "logits/rejected": -19.40215301513672, "logps/chosen": -398.662841796875, "logps/rejected": -379.1340026855469, "loss": 0.654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9023454189300537, "rewards/margins": 0.3655496835708618, "rewards/rejected": 2.5367956161499023, "step": 19010 }, { "epoch": 0.8830493523376202, "grad_norm": 104.47615051269531, "learning_rate": 2.4705046659547795e-07, "logits/chosen": -19.096277236938477, "logits/rejected": -18.253036499023438, "logps/chosen": -427.5545349121094, "logps/rejected": -346.53350830078125, "loss": 0.5937, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1791467666625977, "rewards/margins": 0.8425785899162292, "rewards/rejected": 2.3365683555603027, "step": 19020 }, { "epoch": 0.8835136264450532, "grad_norm": 49.62615966796875, "learning_rate": 2.47022610149032e-07, "logits/chosen": -18.679790496826172, "logits/rejected": -18.405376434326172, "logps/chosen": -343.6808776855469, "logps/rejected": -333.77398681640625, "loss": 0.7988, "rewards/accuracies": 0.5, "rewards/chosen": 2.612025737762451, "rewards/margins": 0.02843579091131687, "rewards/rejected": 2.583590030670166, "step": 19030 }, { "epoch": 0.8839779005524862, "grad_norm": 104.85027313232422, "learning_rate": 2.46994753702586e-07, "logits/chosen": -18.663562774658203, "logits/rejected": -18.23371124267578, "logps/chosen": -314.2206115722656, "logps/rejected": -288.5951232910156, "loss": 0.6095, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.082685947418213, "rewards/margins": 0.4442766606807709, "rewards/rejected": 1.6384090185165405, "step": 19040 }, { "epoch": 0.8844421746599193, "grad_norm": 28.874706268310547, "learning_rate": 2.4696689725614e-07, "logits/chosen": -19.547151565551758, "logits/rejected": -18.357702255249023, "logps/chosen": -449.78167724609375, "logps/rejected": -308.61859130859375, "loss": 0.5157, "rewards/accuracies": 0.5, "rewards/chosen": 3.682178497314453, "rewards/margins": 0.8910611271858215, "rewards/rejected": 2.7911171913146973, "step": 19050 }, { "epoch": 0.8849064487673522, "grad_norm": 50.646724700927734, "learning_rate": 2.46939040809694e-07, "logits/chosen": -19.093639373779297, "logits/rejected": -17.387008666992188, "logps/chosen": -464.74755859375, "logps/rejected": -272.46636962890625, "loss": 0.3629, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.046931505203247, "rewards/margins": 1.568655252456665, "rewards/rejected": 1.478276252746582, "step": 19060 }, { "epoch": 0.8853707228747852, "grad_norm": 17.66512107849121, 
"learning_rate": 2.4691118436324805e-07, "logits/chosen": -17.80507469177246, "logits/rejected": -17.646259307861328, "logps/chosen": -318.48358154296875, "logps/rejected": -337.0899963378906, "loss": 0.9903, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1574130058288574, "rewards/margins": -0.18887969851493835, "rewards/rejected": 2.346292734146118, "step": 19070 }, { "epoch": 0.8858349969822183, "grad_norm": 54.50902557373047, "learning_rate": 2.468861135614467e-07, "logits/chosen": -18.64103126525879, "logits/rejected": -18.223730087280273, "logps/chosen": -492.28192138671875, "logps/rejected": -402.1795349121094, "loss": 0.9316, "rewards/accuracies": 0.5, "rewards/chosen": 3.0077686309814453, "rewards/margins": 0.32299405336380005, "rewards/rejected": 2.684774875640869, "step": 19080 }, { "epoch": 0.8862992710896513, "grad_norm": 109.84673309326172, "learning_rate": 2.468582571150007e-07, "logits/chosen": -18.444150924682617, "logits/rejected": -17.65899085998535, "logps/chosen": -567.5748291015625, "logps/rejected": -439.68072509765625, "loss": 0.443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1851744651794434, "rewards/margins": 1.0451076030731201, "rewards/rejected": 2.140066623687744, "step": 19090 }, { "epoch": 0.8867635451970843, "grad_norm": 181.1037139892578, "learning_rate": 2.468304006685547e-07, "logits/chosen": -18.671682357788086, "logits/rejected": -18.069459915161133, "logps/chosen": -478.23187255859375, "logps/rejected": -388.4288024902344, "loss": 0.5624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3727195262908936, "rewards/margins": 0.5767335891723633, "rewards/rejected": 2.7959859371185303, "step": 19100 }, { "epoch": 0.8872278193045174, "grad_norm": 32.85224914550781, "learning_rate": 2.468025442221087e-07, "logits/chosen": -17.677026748657227, "logits/rejected": -18.353023529052734, "logps/chosen": -240.16525268554688, "logps/rejected": -289.5054931640625, "loss": 0.7737, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.178938388824463, "rewards/margins": 0.462171733379364, "rewards/rejected": 1.7167667150497437, "step": 19110 }, { "epoch": 0.8876920934119504, "grad_norm": 159.56838989257812, "learning_rate": 2.4677468777566273e-07, "logits/chosen": -18.625072479248047, "logits/rejected": -18.44638442993164, "logps/chosen": -457.23626708984375, "logps/rejected": -370.362548828125, "loss": 0.7414, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.946323871612549, "rewards/margins": 0.3298535943031311, "rewards/rejected": 2.6164703369140625, "step": 19120 }, { "epoch": 0.8881563675193834, "grad_norm": 136.54129028320312, "learning_rate": 2.467468313292168e-07, "logits/chosen": -18.179651260375977, "logits/rejected": -17.75514793395996, "logps/chosen": -403.51409912109375, "logps/rejected": -309.7485656738281, "loss": 0.75, "rewards/accuracies": 0.5, "rewards/chosen": 3.4615044593811035, "rewards/margins": 0.7964185476303101, "rewards/rejected": 2.665086269378662, "step": 19130 }, { "epoch": 0.8886206416268164, "grad_norm": 63.99048614501953, "learning_rate": 2.467189748827708e-07, "logits/chosen": -20.194995880126953, "logits/rejected": -19.33771514892578, "logps/chosen": -438.986083984375, "logps/rejected": -354.2431640625, "loss": 0.6948, "rewards/accuracies": 0.5, "rewards/chosen": 2.8097922801971436, "rewards/margins": 0.19167084991931915, "rewards/rejected": 2.61812162399292, "step": 19140 }, { "epoch": 0.8890849157342495, "grad_norm": 104.65191650390625, "learning_rate": 2.466911184363248e-07, "logits/chosen": -18.886592864990234, "logits/rejected": -18.348384857177734, "logps/chosen": -376.8203125, "logps/rejected": -348.9632873535156, "loss": 0.5869, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.976323366165161, "rewards/margins": 0.516711950302124, "rewards/rejected": 2.459611415863037, "step": 19150 }, { "epoch": 0.8895491898416825, "grad_norm": 91.47004699707031, "learning_rate": 
2.466632619898788e-07, "logits/chosen": -18.843379974365234, "logits/rejected": -18.246326446533203, "logps/chosen": -346.71405029296875, "logps/rejected": -271.4642028808594, "loss": 0.5541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3466570377349854, "rewards/margins": 0.4993467330932617, "rewards/rejected": 1.8473103046417236, "step": 19160 }, { "epoch": 0.8900134639491155, "grad_norm": 185.9527587890625, "learning_rate": 2.4663540554343283e-07, "logits/chosen": -17.808961868286133, "logits/rejected": -18.063146591186523, "logps/chosen": -356.1012878417969, "logps/rejected": -374.85693359375, "loss": 0.967, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.382049798965454, "rewards/margins": 0.047339487820863724, "rewards/rejected": 2.3347103595733643, "step": 19170 }, { "epoch": 0.8904777380565486, "grad_norm": 2.916531801223755, "learning_rate": 2.466075490969868e-07, "logits/chosen": -18.520639419555664, "logits/rejected": -17.758838653564453, "logps/chosen": -435.23577880859375, "logps/rejected": -301.8170166015625, "loss": 0.6697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6576592922210693, "rewards/margins": 0.5965583920478821, "rewards/rejected": 2.061100721359253, "step": 19180 }, { "epoch": 0.8909420121639816, "grad_norm": 95.83621215820312, "learning_rate": 2.4657969265054086e-07, "logits/chosen": -19.386842727661133, "logits/rejected": -18.993772506713867, "logps/chosen": -418.37591552734375, "logps/rejected": -415.6432189941406, "loss": 0.7057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.266069173812866, "rewards/margins": 0.1009560376405716, "rewards/rejected": 2.1651129722595215, "step": 19190 }, { "epoch": 0.8914062862714146, "grad_norm": 18.530153274536133, "learning_rate": 2.465518362040949e-07, "logits/chosen": -18.786190032958984, "logits/rejected": -19.09146499633789, "logps/chosen": -332.7565612792969, "logps/rejected": -299.1017761230469, "loss": 0.7608, "rewards/accuracies": 
0.5, "rewards/chosen": 2.609410285949707, "rewards/margins": 0.5060073137283325, "rewards/rejected": 2.103403091430664, "step": 19200 }, { "epoch": 0.8918705603788477, "grad_norm": 212.10218811035156, "learning_rate": 2.465239797576489e-07, "logits/chosen": -19.23763656616211, "logits/rejected": -18.585235595703125, "logps/chosen": -413.55975341796875, "logps/rejected": -332.6973876953125, "loss": 0.67, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.904832363128662, "rewards/margins": 0.44134148955345154, "rewards/rejected": 2.4634909629821777, "step": 19210 }, { "epoch": 0.8923348344862807, "grad_norm": 26.263565063476562, "learning_rate": 2.4649612331120293e-07, "logits/chosen": -19.131946563720703, "logits/rejected": -17.60745620727539, "logps/chosen": -324.90423583984375, "logps/rejected": -184.84207153320312, "loss": 0.3708, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.444636583328247, "rewards/margins": 1.2395813465118408, "rewards/rejected": 1.2050549983978271, "step": 19220 }, { "epoch": 0.8927991085937137, "grad_norm": 13.639985084533691, "learning_rate": 2.464682668647569e-07, "logits/chosen": -18.428852081298828, "logits/rejected": -17.730098724365234, "logps/chosen": -322.0133056640625, "logps/rejected": -321.1741638183594, "loss": 0.5498, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6060566902160645, "rewards/margins": 0.9759889841079712, "rewards/rejected": 1.6300678253173828, "step": 19230 }, { "epoch": 0.8932633827011468, "grad_norm": 52.97843933105469, "learning_rate": 2.4644041041831096e-07, "logits/chosen": -17.782812118530273, "logits/rejected": -17.311656951904297, "logps/chosen": -386.35760498046875, "logps/rejected": -347.70904541015625, "loss": 0.7599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4406630992889404, "rewards/margins": 0.6064267158508301, "rewards/rejected": 1.8342365026474, "step": 19240 }, { "epoch": 0.8937276568085798, "grad_norm": 207.12884521484375, 
"learning_rate": 2.46412553971865e-07, "logits/chosen": -19.1779727935791, "logits/rejected": -18.37129020690918, "logps/chosen": -465.01409912109375, "logps/rejected": -416.72955322265625, "loss": 0.5227, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.957075357437134, "rewards/margins": 0.7149376273155212, "rewards/rejected": 2.2421374320983887, "step": 19250 }, { "epoch": 0.8941919309160128, "grad_norm": 115.5775146484375, "learning_rate": 2.46384697525419e-07, "logits/chosen": -19.059816360473633, "logits/rejected": -18.426982879638672, "logps/chosen": -521.0096435546875, "logps/rejected": -360.8286437988281, "loss": 0.5045, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.824671983718872, "rewards/margins": 0.5549591183662415, "rewards/rejected": 2.2697126865386963, "step": 19260 }, { "epoch": 0.8946562050234459, "grad_norm": 33.8978157043457, "learning_rate": 2.4635684107897303e-07, "logits/chosen": -18.274898529052734, "logits/rejected": -17.8789119720459, "logps/chosen": -468.1568298339844, "logps/rejected": -394.1687927246094, "loss": 0.578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.113426446914673, "rewards/margins": 0.7373636960983276, "rewards/rejected": 2.3760628700256348, "step": 19270 }, { "epoch": 0.8951204791308789, "grad_norm": 61.40397644042969, "learning_rate": 2.46328984632527e-07, "logits/chosen": -18.692121505737305, "logits/rejected": -17.882837295532227, "logps/chosen": -404.41546630859375, "logps/rejected": -352.2626953125, "loss": 0.9059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0211052894592285, "rewards/margins": 0.45630764961242676, "rewards/rejected": 2.564797878265381, "step": 19280 }, { "epoch": 0.8955847532383119, "grad_norm": 31.839624404907227, "learning_rate": 2.4630112818608106e-07, "logits/chosen": -19.1946964263916, "logits/rejected": -17.768932342529297, "logps/chosen": -288.45770263671875, "logps/rejected": -194.15052795410156, "loss": 0.7122, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9720630645751953, "rewards/margins": 0.33052974939346313, "rewards/rejected": 1.6415332555770874, "step": 19290 }, { "epoch": 0.8960490273457449, "grad_norm": 154.95716857910156, "learning_rate": 2.4627327173963505e-07, "logits/chosen": -18.3420467376709, "logits/rejected": -17.46334457397461, "logps/chosen": -333.9317321777344, "logps/rejected": -240.40625, "loss": 0.5668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9408857822418213, "rewards/margins": 0.847431480884552, "rewards/rejected": 1.093454360961914, "step": 19300 }, { "epoch": 0.896513301453178, "grad_norm": 51.54572677612305, "learning_rate": 2.462454152931891e-07, "logits/chosen": -19.794132232666016, "logits/rejected": -18.325719833374023, "logps/chosen": -355.5751037597656, "logps/rejected": -310.614013671875, "loss": 0.5296, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7641079425811768, "rewards/margins": 0.631369948387146, "rewards/rejected": 2.132737874984741, "step": 19310 }, { "epoch": 0.896977575560611, "grad_norm": 73.1594467163086, "learning_rate": 2.4621755884674313e-07, "logits/chosen": -18.98370361328125, "logits/rejected": -18.024314880371094, "logps/chosen": -340.28759765625, "logps/rejected": -212.2308807373047, "loss": 0.562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.240175247192383, "rewards/margins": 0.7384241819381714, "rewards/rejected": 1.5017510652542114, "step": 19320 }, { "epoch": 0.897441849668044, "grad_norm": 9.390901565551758, "learning_rate": 2.461897024002971e-07, "logits/chosen": -19.328231811523438, "logits/rejected": -18.29763412475586, "logps/chosen": -372.6424255371094, "logps/rejected": -262.0857238769531, "loss": 0.3217, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0727434158325195, "rewards/margins": 1.4214050769805908, "rewards/rejected": 1.6513382196426392, "step": 19330 }, { "epoch": 0.8979061237754771, "grad_norm": 
27.341676712036133, "learning_rate": 2.4616184595385116e-07, "logits/chosen": -18.715045928955078, "logits/rejected": -17.655149459838867, "logps/chosen": -441.68353271484375, "logps/rejected": -368.5115661621094, "loss": 0.6849, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.720731735229492, "rewards/margins": 0.590562641620636, "rewards/rejected": 2.13016939163208, "step": 19340 }, { "epoch": 0.8983703978829101, "grad_norm": 127.06971740722656, "learning_rate": 2.4613398950740515e-07, "logits/chosen": -18.485557556152344, "logits/rejected": -19.459535598754883, "logps/chosen": -472.24676513671875, "logps/rejected": -403.22540283203125, "loss": 0.883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0728471279144287, "rewards/margins": -0.1406196802854538, "rewards/rejected": 3.2134673595428467, "step": 19350 }, { "epoch": 0.8988346719903431, "grad_norm": 79.49371337890625, "learning_rate": 2.461061330609592e-07, "logits/chosen": -17.830387115478516, "logits/rejected": -18.107097625732422, "logps/chosen": -303.7087707519531, "logps/rejected": -339.1022033691406, "loss": 0.8848, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4736781120300293, "rewards/margins": 0.19922590255737305, "rewards/rejected": 2.2744522094726562, "step": 19360 }, { "epoch": 0.8992989460977762, "grad_norm": 0.6886135339736938, "learning_rate": 2.4607827661451323e-07, "logits/chosen": -18.510324478149414, "logits/rejected": -17.844364166259766, "logps/chosen": -402.36883544921875, "logps/rejected": -320.103515625, "loss": 0.5976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6713919639587402, "rewards/margins": 0.7118555307388306, "rewards/rejected": 1.9595365524291992, "step": 19370 }, { "epoch": 0.8997632202052092, "grad_norm": 45.38221740722656, "learning_rate": 2.460504201680672e-07, "logits/chosen": -17.517253875732422, "logits/rejected": -17.329174041748047, "logps/chosen": -326.603271484375, "logps/rejected": -299.029052734375, 
"loss": 0.8426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1527435779571533, "rewards/margins": 0.14705157279968262, "rewards/rejected": 2.0056920051574707, "step": 19380 }, { "epoch": 0.9002274943126422, "grad_norm": 48.355064392089844, "learning_rate": 2.4602256372162126e-07, "logits/chosen": -18.04953384399414, "logits/rejected": -17.193340301513672, "logps/chosen": -400.8627014160156, "logps/rejected": -242.76467895507812, "loss": 0.4646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3846170902252197, "rewards/margins": 0.6651555299758911, "rewards/rejected": 1.7194616794586182, "step": 19390 }, { "epoch": 0.9006917684200753, "grad_norm": 10.666536331176758, "learning_rate": 2.4599470727517525e-07, "logits/chosen": -19.566970825195312, "logits/rejected": -17.84566307067871, "logps/chosen": -468.03961181640625, "logps/rejected": -262.7117614746094, "loss": 0.4021, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.587080717086792, "rewards/margins": 1.718534231185913, "rewards/rejected": 1.8685468435287476, "step": 19400 }, { "epoch": 0.9011560425275082, "grad_norm": 2.5137410163879395, "learning_rate": 2.4596685082872923e-07, "logits/chosen": -18.644821166992188, "logits/rejected": -18.572065353393555, "logps/chosen": -349.9165344238281, "logps/rejected": -338.4559631347656, "loss": 0.8581, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.7770416736602783, "rewards/margins": 0.20625007152557373, "rewards/rejected": 2.570791482925415, "step": 19410 }, { "epoch": 0.9016203166349412, "grad_norm": 151.41053771972656, "learning_rate": 2.459389943822833e-07, "logits/chosen": -18.198598861694336, "logits/rejected": -16.89386749267578, "logps/chosen": -320.3205261230469, "logps/rejected": -222.40414428710938, "loss": 0.5815, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.170311689376831, "rewards/margins": 0.6837798953056335, "rewards/rejected": 1.4865318536758423, "step": 19420 }, { "epoch": 
0.9020845907423743, "grad_norm": 49.96500778198242, "learning_rate": 2.459111379358373e-07, "logits/chosen": -18.5499267578125, "logits/rejected": -18.49227523803711, "logps/chosen": -497.7012634277344, "logps/rejected": -444.91278076171875, "loss": 0.7657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.119258165359497, "rewards/margins": 0.2789211869239807, "rewards/rejected": 2.840336799621582, "step": 19430 }, { "epoch": 0.9025488648498073, "grad_norm": 46.62080001831055, "learning_rate": 2.4588328148939136e-07, "logits/chosen": -19.314146041870117, "logits/rejected": -18.85537338256836, "logps/chosen": -398.6241455078125, "logps/rejected": -306.51519775390625, "loss": 0.5868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0072784423828125, "rewards/margins": 0.8099665641784668, "rewards/rejected": 2.1973118782043457, "step": 19440 }, { "epoch": 0.9030131389572403, "grad_norm": 56.880069732666016, "learning_rate": 2.4585542504294534e-07, "logits/chosen": -20.038227081298828, "logits/rejected": -18.727083206176758, "logps/chosen": -415.5597229003906, "logps/rejected": -267.5146789550781, "loss": 0.8148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7022595405578613, "rewards/margins": 0.4878525137901306, "rewards/rejected": 2.214406967163086, "step": 19450 }, { "epoch": 0.9034774130646733, "grad_norm": 105.47542572021484, "learning_rate": 2.4582756859649933e-07, "logits/chosen": -18.678211212158203, "logits/rejected": -18.158252716064453, "logps/chosen": -343.02203369140625, "logps/rejected": -308.7718811035156, "loss": 0.5999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4444165229797363, "rewards/margins": 0.5070927143096924, "rewards/rejected": 1.9373241662979126, "step": 19460 }, { "epoch": 0.9039416871721064, "grad_norm": 59.31681442260742, "learning_rate": 2.457997121500534e-07, "logits/chosen": -19.036033630371094, "logits/rejected": -17.74953269958496, "logps/chosen": -553.7599487304688, 
"logps/rejected": -394.44866943359375, "loss": 0.5078, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.484858751296997, "rewards/margins": 0.9947797656059265, "rewards/rejected": 2.4900784492492676, "step": 19470 }, { "epoch": 0.9044059612795394, "grad_norm": 1.9738447666168213, "learning_rate": 2.457718557036074e-07, "logits/chosen": -19.471004486083984, "logits/rejected": -19.304302215576172, "logps/chosen": -322.66363525390625, "logps/rejected": -383.9198303222656, "loss": 0.9545, "rewards/accuracies": 0.5, "rewards/chosen": 2.5643563270568848, "rewards/margins": 0.1973201036453247, "rewards/rejected": 2.3670363426208496, "step": 19480 }, { "epoch": 0.9048702353869724, "grad_norm": 34.39517593383789, "learning_rate": 2.457439992571614e-07, "logits/chosen": -18.18889617919922, "logits/rejected": -18.015785217285156, "logps/chosen": -382.5479431152344, "logps/rejected": -313.7739562988281, "loss": 0.6304, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.98899507522583, "rewards/margins": 0.7423128485679626, "rewards/rejected": 2.2466819286346436, "step": 19490 }, { "epoch": 0.9053345094944055, "grad_norm": 57.01348876953125, "learning_rate": 2.4571614281071544e-07, "logits/chosen": -18.77509307861328, "logits/rejected": -17.823040008544922, "logps/chosen": -331.95062255859375, "logps/rejected": -279.40887451171875, "loss": 0.7299, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5136609077453613, "rewards/margins": 0.33347633481025696, "rewards/rejected": 2.180184841156006, "step": 19500 }, { "epoch": 0.9057987836018385, "grad_norm": 72.2147216796875, "learning_rate": 2.456882863642695e-07, "logits/chosen": -19.20413589477539, "logits/rejected": -18.944805145263672, "logps/chosen": -282.2776184082031, "logps/rejected": -308.6517028808594, "loss": 0.975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.100399971008301, "rewards/margins": -0.2033948451280594, "rewards/rejected": 2.3037948608398438, "step": 19510 }, 
{ "epoch": 0.9062630577092715, "grad_norm": 16.746673583984375, "learning_rate": 2.4566042991782347e-07, "logits/chosen": -19.535303115844727, "logits/rejected": -18.345199584960938, "logps/chosen": -405.4429016113281, "logps/rejected": -318.93212890625, "loss": 0.5091, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.444549083709717, "rewards/margins": 0.8191843032836914, "rewards/rejected": 2.6253647804260254, "step": 19520 }, { "epoch": 0.9067273318167046, "grad_norm": 79.96434783935547, "learning_rate": 2.4563257347137746e-07, "logits/chosen": -18.84217071533203, "logits/rejected": -17.57337188720703, "logps/chosen": -315.996826171875, "logps/rejected": -236.9305877685547, "loss": 0.5724, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.458712339401245, "rewards/margins": 0.7593430280685425, "rewards/rejected": 1.6993694305419922, "step": 19530 }, { "epoch": 0.9071916059241376, "grad_norm": 90.9991226196289, "learning_rate": 2.456047170249315e-07, "logits/chosen": -19.314624786376953, "logits/rejected": -19.252819061279297, "logps/chosen": -305.83477783203125, "logps/rejected": -264.3274841308594, "loss": 0.7957, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.653311014175415, "rewards/margins": 0.12169674783945084, "rewards/rejected": 2.531614303588867, "step": 19540 }, { "epoch": 0.9076558800315706, "grad_norm": 176.89315795898438, "learning_rate": 2.4557686057848554e-07, "logits/chosen": -19.208553314208984, "logits/rejected": -18.212684631347656, "logps/chosen": -334.1274108886719, "logps/rejected": -292.23468017578125, "loss": 0.7912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4825212955474854, "rewards/margins": 0.8307539820671082, "rewards/rejected": 1.6517677307128906, "step": 19550 }, { "epoch": 0.9081201541390037, "grad_norm": 22.620256423950195, "learning_rate": 2.455490041320396e-07, "logits/chosen": -19.54452896118164, "logits/rejected": -18.66851806640625, "logps/chosen": -425.7566833496094, 
"logps/rejected": -330.6842956542969, "loss": 0.6579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2667152881622314, "rewards/margins": 0.9416066408157349, "rewards/rejected": 2.325108289718628, "step": 19560 }, { "epoch": 0.9085844282464367, "grad_norm": 103.7154312133789, "learning_rate": 2.4552114768559357e-07, "logits/chosen": -18.80924415588379, "logits/rejected": -18.20184326171875, "logps/chosen": -422.87872314453125, "logps/rejected": -386.38238525390625, "loss": 0.6421, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.450411796569824, "rewards/margins": 0.2325369119644165, "rewards/rejected": 2.2178750038146973, "step": 19570 }, { "epoch": 0.9090487023538697, "grad_norm": 48.56863784790039, "learning_rate": 2.4549329123914756e-07, "logits/chosen": -18.336780548095703, "logits/rejected": -17.980106353759766, "logps/chosen": -356.94683837890625, "logps/rejected": -299.0797119140625, "loss": 0.5024, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0744709968566895, "rewards/margins": 0.6922787427902222, "rewards/rejected": 2.3821921348571777, "step": 19580 }, { "epoch": 0.9095129764613028, "grad_norm": 176.51805114746094, "learning_rate": 2.454654347927016e-07, "logits/chosen": -18.9022216796875, "logits/rejected": -17.78563117980957, "logps/chosen": -459.0537109375, "logps/rejected": -360.50390625, "loss": 1.1862, "rewards/accuracies": 0.5, "rewards/chosen": 2.542729616165161, "rewards/margins": -0.08769150078296661, "rewards/rejected": 2.6304211616516113, "step": 19590 }, { "epoch": 0.9099772505687358, "grad_norm": 20.932344436645508, "learning_rate": 2.454375783462556e-07, "logits/chosen": -19.016273498535156, "logits/rejected": -18.56476402282715, "logps/chosen": -493.7728576660156, "logps/rejected": -391.65008544921875, "loss": 0.6021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8744421005249023, "rewards/margins": 0.6227295398712158, "rewards/rejected": 2.2517127990722656, "step": 19600 }, { 
"epoch": 0.9104415246761688, "grad_norm": 96.17900085449219, "learning_rate": 2.4540972189980963e-07, "logits/chosen": -19.38323974609375, "logits/rejected": -18.346817016601562, "logps/chosen": -389.8989562988281, "logps/rejected": -303.0863342285156, "loss": 0.4446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0320987701416016, "rewards/margins": 1.187819004058838, "rewards/rejected": 1.8442796468734741, "step": 19610 }, { "epoch": 0.9109057987836018, "grad_norm": 156.49838256835938, "learning_rate": 2.4538186545336367e-07, "logits/chosen": -18.160594940185547, "logits/rejected": -16.715457916259766, "logps/chosen": -423.9618225097656, "logps/rejected": -263.3130187988281, "loss": 0.559, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8929848670959473, "rewards/margins": 0.9261150360107422, "rewards/rejected": 1.9668699502944946, "step": 19620 }, { "epoch": 0.9113700728910349, "grad_norm": 27.52247428894043, "learning_rate": 2.4535400900691766e-07, "logits/chosen": -17.648088455200195, "logits/rejected": -17.549943923950195, "logps/chosen": -362.17669677734375, "logps/rejected": -334.2801208496094, "loss": 0.7162, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9950549602508545, "rewards/margins": 0.44084852933883667, "rewards/rejected": 1.5542064905166626, "step": 19630 }, { "epoch": 0.9118343469984679, "grad_norm": 94.11202239990234, "learning_rate": 2.453261525604717e-07, "logits/chosen": -18.58905601501465, "logits/rejected": -18.165660858154297, "logps/chosen": -432.71221923828125, "logps/rejected": -373.3029479980469, "loss": 0.8372, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3632073402404785, "rewards/margins": -0.0943467989563942, "rewards/rejected": 2.457554578781128, "step": 19640 }, { "epoch": 0.9122986211059009, "grad_norm": 12.124529838562012, "learning_rate": 2.452982961140257e-07, "logits/chosen": -19.362478256225586, "logits/rejected": -17.8686466217041, "logps/chosen": 
-455.72509765625, "logps/rejected": -288.92364501953125, "loss": 0.5044, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9688549041748047, "rewards/margins": 1.1611977815628052, "rewards/rejected": 1.807657241821289, "step": 19650 }, { "epoch": 0.912762895213334, "grad_norm": 54.240230560302734, "learning_rate": 2.4527043966757973e-07, "logits/chosen": -18.387996673583984, "logits/rejected": -17.937551498413086, "logps/chosen": -351.2356262207031, "logps/rejected": -326.07452392578125, "loss": 0.8287, "rewards/accuracies": 0.5, "rewards/chosen": 2.150602340698242, "rewards/margins": 0.04376993328332901, "rewards/rejected": 2.106832265853882, "step": 19660 }, { "epoch": 0.913227169320767, "grad_norm": 42.69696044921875, "learning_rate": 2.4524258322113377e-07, "logits/chosen": -18.739887237548828, "logits/rejected": -18.586137771606445, "logps/chosen": -323.4453430175781, "logps/rejected": -295.62493896484375, "loss": 0.601, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2917606830596924, "rewards/margins": 0.3008635938167572, "rewards/rejected": 1.9908968210220337, "step": 19670 }, { "epoch": 0.9136914434282, "grad_norm": 166.4817352294922, "learning_rate": 2.4521472677468776e-07, "logits/chosen": -18.716915130615234, "logits/rejected": -17.19251823425293, "logps/chosen": -321.2150573730469, "logps/rejected": -233.27603149414062, "loss": 0.3917, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4781832695007324, "rewards/margins": 1.4392117261886597, "rewards/rejected": 1.038971185684204, "step": 19680 }, { "epoch": 0.9141557175356331, "grad_norm": 8.274818420410156, "learning_rate": 2.451868703282418e-07, "logits/chosen": -18.764419555664062, "logits/rejected": -18.0365047454834, "logps/chosen": -413.3018493652344, "logps/rejected": -303.4322814941406, "loss": 0.6223, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.867255926132202, "rewards/margins": 0.9738706350326538, "rewards/rejected": 1.8933851718902588, 
"step": 19690 }, { "epoch": 0.9146199916430661, "grad_norm": 61.1889762878418, "learning_rate": 2.451590138817958e-07, "logits/chosen": -20.145450592041016, "logits/rejected": -18.610240936279297, "logps/chosen": -417.4895935058594, "logps/rejected": -344.0660400390625, "loss": 0.451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0188803672790527, "rewards/margins": 1.1197221279144287, "rewards/rejected": 1.899158239364624, "step": 19700 }, { "epoch": 0.9150842657504991, "grad_norm": 69.97624969482422, "learning_rate": 2.4513115743534983e-07, "logits/chosen": -18.171152114868164, "logits/rejected": -17.10379981994629, "logps/chosen": -388.640625, "logps/rejected": -327.6057434082031, "loss": 0.6155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.481358051300049, "rewards/margins": 0.6174275279045105, "rewards/rejected": 1.863930344581604, "step": 19710 }, { "epoch": 0.9155485398579322, "grad_norm": 64.65400695800781, "learning_rate": 2.451033009889038e-07, "logits/chosen": -18.763607025146484, "logits/rejected": -17.427127838134766, "logps/chosen": -406.28472900390625, "logps/rejected": -349.1229553222656, "loss": 0.4466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.992480516433716, "rewards/margins": 0.9919337034225464, "rewards/rejected": 2.0005462169647217, "step": 19720 }, { "epoch": 0.9160128139653652, "grad_norm": 169.1866455078125, "learning_rate": 2.4507544454245786e-07, "logits/chosen": -19.232803344726562, "logits/rejected": -18.515117645263672, "logps/chosen": -400.15032958984375, "logps/rejected": -360.75433349609375, "loss": 0.7224, "rewards/accuracies": 0.5, "rewards/chosen": 2.517432689666748, "rewards/margins": 0.1307174563407898, "rewards/rejected": 2.3867151737213135, "step": 19730 }, { "epoch": 0.9164770880727982, "grad_norm": 80.99140167236328, "learning_rate": 2.450475880960119e-07, "logits/chosen": -18.927623748779297, "logits/rejected": -17.739795684814453, "logps/chosen": -459.1819763183594, 
"logps/rejected": -346.5413818359375, "loss": 0.4523, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4608154296875, "rewards/margins": 1.095017433166504, "rewards/rejected": 2.365797996520996, "step": 19740 }, { "epoch": 0.9169413621802313, "grad_norm": 28.715200424194336, "learning_rate": 2.450197316495659e-07, "logits/chosen": -19.378917694091797, "logits/rejected": -18.851268768310547, "logps/chosen": -373.07037353515625, "logps/rejected": -291.2516174316406, "loss": 0.515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6975157260894775, "rewards/margins": 0.6853405833244324, "rewards/rejected": 2.0121750831604004, "step": 19750 }, { "epoch": 0.9174056362876643, "grad_norm": 1.789194107055664, "learning_rate": 2.4499187520311993e-07, "logits/chosen": -19.03136444091797, "logits/rejected": -18.096479415893555, "logps/chosen": -452.2952575683594, "logps/rejected": -365.6811218261719, "loss": 0.583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5506999492645264, "rewards/margins": 1.0465826988220215, "rewards/rejected": 2.504117012023926, "step": 19760 }, { "epoch": 0.9178699103950972, "grad_norm": 194.73385620117188, "learning_rate": 2.449640187566739e-07, "logits/chosen": -17.72372817993164, "logits/rejected": -17.653194427490234, "logps/chosen": -305.63739013671875, "logps/rejected": -271.6598815917969, "loss": 0.871, "rewards/accuracies": 0.5, "rewards/chosen": 2.185081958770752, "rewards/margins": 0.059113048017024994, "rewards/rejected": 2.1259684562683105, "step": 19770 }, { "epoch": 0.9183341845025303, "grad_norm": 4.131118297576904, "learning_rate": 2.4493616231022796e-07, "logits/chosen": -18.52714729309082, "logits/rejected": -17.532535552978516, "logps/chosen": -379.30364990234375, "logps/rejected": -293.3728942871094, "loss": 0.709, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.238983154296875, "rewards/margins": 1.0952064990997314, "rewards/rejected": 2.1437771320343018, "step": 19780 }, { 
"epoch": 0.9187984586099633, "grad_norm": 54.02349853515625, "learning_rate": 2.4490830586378194e-07, "logits/chosen": -18.8555908203125, "logits/rejected": -18.146575927734375, "logps/chosen": -379.35028076171875, "logps/rejected": -328.81805419921875, "loss": 0.6451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.885009527206421, "rewards/margins": 0.39596956968307495, "rewards/rejected": 2.489039897918701, "step": 19790 }, { "epoch": 0.9192627327173963, "grad_norm": 17.259037017822266, "learning_rate": 2.44880449417336e-07, "logits/chosen": -19.4715633392334, "logits/rejected": -18.404911041259766, "logps/chosen": -464.44195556640625, "logps/rejected": -368.9598083496094, "loss": 0.4849, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.064241647720337, "rewards/margins": 1.0210438966751099, "rewards/rejected": 2.0431976318359375, "step": 19800 }, { "epoch": 0.9197270068248293, "grad_norm": 39.461849212646484, "learning_rate": 2.4485259297089003e-07, "logits/chosen": -17.794261932373047, "logits/rejected": -17.242494583129883, "logps/chosen": -467.869873046875, "logps/rejected": -337.63665771484375, "loss": 0.6863, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3546109199523926, "rewards/margins": 0.5613322257995605, "rewards/rejected": 1.793278694152832, "step": 19810 }, { "epoch": 0.9201912809322624, "grad_norm": 172.9276580810547, "learning_rate": 2.44824736524444e-07, "logits/chosen": -17.949338912963867, "logits/rejected": -17.920459747314453, "logps/chosen": -355.1413269042969, "logps/rejected": -393.22991943359375, "loss": 1.0758, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.313676118850708, "rewards/margins": -0.37121379375457764, "rewards/rejected": 2.684889793395996, "step": 19820 }, { "epoch": 0.9206555550396954, "grad_norm": 37.34249496459961, "learning_rate": 2.44796880077998e-07, "logits/chosen": -18.547208786010742, "logits/rejected": -18.364551544189453, "logps/chosen": -272.765869140625, 
"logps/rejected": -269.54522705078125, "loss": 0.9633, "rewards/accuracies": 0.5, "rewards/chosen": 2.0209784507751465, "rewards/margins": -0.15870647132396698, "rewards/rejected": 2.179685115814209, "step": 19830 }, { "epoch": 0.9211198291471284, "grad_norm": 73.61318969726562, "learning_rate": 2.4476902363155204e-07, "logits/chosen": -18.536808013916016, "logits/rejected": -17.660547256469727, "logps/chosen": -473.96954345703125, "logps/rejected": -323.57012939453125, "loss": 0.652, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4666218757629395, "rewards/margins": 1.2131881713867188, "rewards/rejected": 2.2534337043762207, "step": 19840 }, { "epoch": 0.9215841032545615, "grad_norm": 198.07200622558594, "learning_rate": 2.447411671851061e-07, "logits/chosen": -17.290363311767578, "logits/rejected": -17.258358001708984, "logps/chosen": -426.1293029785156, "logps/rejected": -481.9009704589844, "loss": 1.5197, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3891396522521973, "rewards/margins": -0.5424455404281616, "rewards/rejected": 2.9315853118896484, "step": 19850 }, { "epoch": 0.9220483773619945, "grad_norm": 99.65678405761719, "learning_rate": 2.447133107386601e-07, "logits/chosen": -18.299144744873047, "logits/rejected": -17.25640869140625, "logps/chosen": -507.0938415527344, "logps/rejected": -378.6040954589844, "loss": 0.4329, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6232681274414062, "rewards/margins": 1.2478973865509033, "rewards/rejected": 2.375370979309082, "step": 19860 }, { "epoch": 0.9225126514694275, "grad_norm": 68.89532470703125, "learning_rate": 2.446854542922141e-07, "logits/chosen": -18.2239990234375, "logits/rejected": -18.723798751831055, "logps/chosen": -312.80780029296875, "logps/rejected": -385.8122863769531, "loss": 1.1197, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": 2.168891429901123, "rewards/margins": -0.6204087138175964, "rewards/rejected": 2.789299964904785, "step": 
19870 }, { "epoch": 0.9229769255768606, "grad_norm": 122.556396484375, "learning_rate": 2.446575978457681e-07, "logits/chosen": -18.056884765625, "logits/rejected": -17.445850372314453, "logps/chosen": -449.71710205078125, "logps/rejected": -380.2426452636719, "loss": 0.697, "rewards/accuracies": 0.5, "rewards/chosen": 3.0931718349456787, "rewards/margins": 0.33953341841697693, "rewards/rejected": 2.753638505935669, "step": 19880 }, { "epoch": 0.9234411996842936, "grad_norm": 36.67473220825195, "learning_rate": 2.4462974139932214e-07, "logits/chosen": -18.686447143554688, "logits/rejected": -18.36496353149414, "logps/chosen": -330.9744567871094, "logps/rejected": -178.0302734375, "loss": 0.4006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.316512107849121, "rewards/margins": 1.2926750183105469, "rewards/rejected": 1.0238368511199951, "step": 19890 }, { "epoch": 0.9239054737917266, "grad_norm": 145.83970642089844, "learning_rate": 2.446018849528762e-07, "logits/chosen": -19.063255310058594, "logits/rejected": -18.668827056884766, "logps/chosen": -348.4018249511719, "logps/rejected": -301.91473388671875, "loss": 0.7089, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.937498092651367, "rewards/margins": 0.1821984350681305, "rewards/rejected": 2.7552998065948486, "step": 19900 }, { "epoch": 0.9243697478991597, "grad_norm": 52.46497344970703, "learning_rate": 2.4457402850643017e-07, "logits/chosen": -18.65105628967285, "logits/rejected": -17.816884994506836, "logps/chosen": -445.01104736328125, "logps/rejected": -341.2037353515625, "loss": 0.4985, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6946821212768555, "rewards/margins": 0.6756223440170288, "rewards/rejected": 2.0190601348876953, "step": 19910 }, { "epoch": 0.9248340220065927, "grad_norm": 78.89923095703125, "learning_rate": 2.445461720599842e-07, "logits/chosen": -18.73893928527832, "logits/rejected": -18.037761688232422, "logps/chosen": -404.05694580078125, 
"logps/rejected": -324.62811279296875, "loss": 0.6021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8506646156311035, "rewards/margins": 0.8883504867553711, "rewards/rejected": 1.9623138904571533, "step": 19920 }, { "epoch": 0.9252982961140257, "grad_norm": 122.36552429199219, "learning_rate": 2.4451831561353825e-07, "logits/chosen": -18.778051376342773, "logits/rejected": -17.98902702331543, "logps/chosen": -521.4923095703125, "logps/rejected": -379.31573486328125, "loss": 0.5578, "rewards/accuracies": 0.5, "rewards/chosen": 3.4296703338623047, "rewards/margins": 0.9297984838485718, "rewards/rejected": 2.4998717308044434, "step": 19930 }, { "epoch": 0.9257625702214588, "grad_norm": 57.101505279541016, "learning_rate": 2.4449045916709224e-07, "logits/chosen": -18.630992889404297, "logits/rejected": -18.33799934387207, "logps/chosen": -399.54351806640625, "logps/rejected": -270.54156494140625, "loss": 0.3288, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0750412940979004, "rewards/margins": 1.3011614084243774, "rewards/rejected": 1.7738797664642334, "step": 19940 }, { "epoch": 0.9262268443288918, "grad_norm": 60.908653259277344, "learning_rate": 2.4446260272064623e-07, "logits/chosen": -18.25868034362793, "logits/rejected": -18.096769332885742, "logps/chosen": -393.14544677734375, "logps/rejected": -318.9289855957031, "loss": 0.8143, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.781991481781006, "rewards/margins": 0.6315956711769104, "rewards/rejected": 2.1503958702087402, "step": 19950 }, { "epoch": 0.9266911184363248, "grad_norm": 73.81555938720703, "learning_rate": 2.4443474627420027e-07, "logits/chosen": -17.99000358581543, "logits/rejected": -17.638124465942383, "logps/chosen": -333.91619873046875, "logps/rejected": -371.360595703125, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.879451274871826, "rewards/margins": 0.3756498396396637, "rewards/rejected": 2.503801107406616, "step": 
19960 }, { "epoch": 0.9271553925437578, "grad_norm": 87.8531494140625, "learning_rate": 2.444068898277543e-07, "logits/chosen": -19.60619354248047, "logits/rejected": -19.420690536499023, "logps/chosen": -410.62786865234375, "logps/rejected": -399.39581298828125, "loss": 0.4121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3392441272735596, "rewards/margins": 1.0753313302993774, "rewards/rejected": 2.2639126777648926, "step": 19970 }, { "epoch": 0.9276196666511909, "grad_norm": 208.78982543945312, "learning_rate": 2.443790333813083e-07, "logits/chosen": -18.657882690429688, "logits/rejected": -17.812580108642578, "logps/chosen": -436.1883850097656, "logps/rejected": -374.04791259765625, "loss": 0.5978, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.543506622314453, "rewards/margins": 1.1631925106048584, "rewards/rejected": 2.380314350128174, "step": 19980 }, { "epoch": 0.9280839407586239, "grad_norm": 107.3463134765625, "learning_rate": 2.4435117693486234e-07, "logits/chosen": -18.432235717773438, "logits/rejected": -18.04633140563965, "logps/chosen": -285.8485107421875, "logps/rejected": -273.9736633300781, "loss": 0.8606, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.71061110496521, "rewards/margins": -0.1066126823425293, "rewards/rejected": 1.8172237873077393, "step": 19990 }, { "epoch": 0.9285482148660569, "grad_norm": 84.76298522949219, "learning_rate": 2.4432332048841633e-07, "logits/chosen": -18.938207626342773, "logits/rejected": -18.10098648071289, "logps/chosen": -333.82891845703125, "logps/rejected": -279.13201904296875, "loss": 0.4954, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.470620632171631, "rewards/margins": 0.8586393594741821, "rewards/rejected": 1.6119813919067383, "step": 20000 }, { "epoch": 0.92901248897349, "grad_norm": 217.3573455810547, "learning_rate": 2.4429546404197037e-07, "logits/chosen": -19.28224754333496, "logits/rejected": -18.720428466796875, "logps/chosen": 
-463.81805419921875, "logps/rejected": -426.5409240722656, "loss": 0.8159, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.996239185333252, "rewards/margins": -0.035595983266830444, "rewards/rejected": 3.0318355560302734, "step": 20010 }, { "epoch": 0.929476763080923, "grad_norm": 26.156600952148438, "learning_rate": 2.4426760759552436e-07, "logits/chosen": -19.5154972076416, "logits/rejected": -18.86081886291504, "logps/chosen": -443.6253967285156, "logps/rejected": -381.38983154296875, "loss": 0.5316, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1469545364379883, "rewards/margins": 0.5374883413314819, "rewards/rejected": 2.609466075897217, "step": 20020 }, { "epoch": 0.929941037188356, "grad_norm": 70.62584686279297, "learning_rate": 2.442397511490784e-07, "logits/chosen": -19.274564743041992, "logits/rejected": -19.12702178955078, "logps/chosen": -396.29638671875, "logps/rejected": -415.967529296875, "loss": 0.6431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9052963256835938, "rewards/margins": 0.3453636169433594, "rewards/rejected": 2.5599324703216553, "step": 20030 }, { "epoch": 0.9304053112957891, "grad_norm": 83.9288558959961, "learning_rate": 2.4421189470263244e-07, "logits/chosen": -18.299238204956055, "logits/rejected": -17.13041877746582, "logps/chosen": -412.5948791503906, "logps/rejected": -217.7850799560547, "loss": 0.3818, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1066877841949463, "rewards/margins": 1.6516135931015015, "rewards/rejected": 1.4550740718841553, "step": 20040 }, { "epoch": 0.9308695854032221, "grad_norm": 71.14289855957031, "learning_rate": 2.4418403825618643e-07, "logits/chosen": -18.45238494873047, "logits/rejected": -18.40727996826172, "logps/chosen": -389.14691162109375, "logps/rejected": -414.11663818359375, "loss": 1.1577, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.960731029510498, "rewards/margins": -0.18256230652332306, "rewards/rejected": 
3.1432933807373047, "step": 20050 }, { "epoch": 0.9313338595106551, "grad_norm": 88.23921203613281, "learning_rate": 2.4415618180974047e-07, "logits/chosen": -18.81261444091797, "logits/rejected": -18.05331039428711, "logps/chosen": -432.339599609375, "logps/rejected": -355.59368896484375, "loss": 0.5954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2788853645324707, "rewards/margins": 0.3973059058189392, "rewards/rejected": 2.8815791606903076, "step": 20060 }, { "epoch": 0.9317981336180882, "grad_norm": 33.172794342041016, "learning_rate": 2.4412832536329446e-07, "logits/chosen": -18.92972183227539, "logits/rejected": -18.259098052978516, "logps/chosen": -387.2914123535156, "logps/rejected": -369.0691833496094, "loss": 1.0499, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2815773487091064, "rewards/margins": -0.24632211029529572, "rewards/rejected": 2.5278992652893066, "step": 20070 }, { "epoch": 0.9322624077255212, "grad_norm": 114.66118621826172, "learning_rate": 2.441004689168485e-07, "logits/chosen": -18.625507354736328, "logits/rejected": -17.505325317382812, "logps/chosen": -417.809326171875, "logps/rejected": -266.9476013183594, "loss": 0.915, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3304202556610107, "rewards/margins": 0.3851584494113922, "rewards/rejected": 1.9452617168426514, "step": 20080 }, { "epoch": 0.9327266818329542, "grad_norm": 35.670745849609375, "learning_rate": 2.4407261247040254e-07, "logits/chosen": -18.63443374633789, "logits/rejected": -17.112123489379883, "logps/chosen": -384.37945556640625, "logps/rejected": -240.0674285888672, "loss": 0.3389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.445826768875122, "rewards/margins": 1.771855354309082, "rewards/rejected": 1.67397141456604, "step": 20090 }, { "epoch": 0.9331909559403873, "grad_norm": 29.136106491088867, "learning_rate": 2.4404475602395653e-07, "logits/chosen": -18.112844467163086, "logits/rejected": 
-17.46695327758789, "logps/chosen": -439.73931884765625, "logps/rejected": -357.42108154296875, "loss": 0.7538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8712406158447266, "rewards/margins": 0.19776077568531036, "rewards/rejected": 2.6734797954559326, "step": 20100 }, { "epoch": 0.9336552300478203, "grad_norm": 90.48795318603516, "learning_rate": 2.4401689957751057e-07, "logits/chosen": -19.30154037475586, "logits/rejected": -18.93529510498047, "logps/chosen": -447.1344299316406, "logps/rejected": -346.4186706542969, "loss": 0.608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8897013664245605, "rewards/margins": 0.526455819606781, "rewards/rejected": 2.3632452487945557, "step": 20110 }, { "epoch": 0.9341195041552532, "grad_norm": 24.14893913269043, "learning_rate": 2.4398904313106456e-07, "logits/chosen": -18.972213745117188, "logits/rejected": -18.40311050415039, "logps/chosen": -438.01556396484375, "logps/rejected": -358.185546875, "loss": 0.7716, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.122124195098877, "rewards/margins": 0.4503743648529053, "rewards/rejected": 2.6717495918273926, "step": 20120 }, { "epoch": 0.9345837782626862, "grad_norm": 173.10470581054688, "learning_rate": 2.439611866846186e-07, "logits/chosen": -19.534360885620117, "logits/rejected": -18.485065460205078, "logps/chosen": -430.043701171875, "logps/rejected": -311.7687072753906, "loss": 0.4965, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.332383394241333, "rewards/margins": 0.7005282640457153, "rewards/rejected": 1.6318552494049072, "step": 20130 }, { "epoch": 0.9350480523701193, "grad_norm": 169.50376892089844, "learning_rate": 2.439333302381726e-07, "logits/chosen": -19.15752410888672, "logits/rejected": -18.830163955688477, "logps/chosen": -414.244873046875, "logps/rejected": -381.5523681640625, "loss": 0.5249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9501304626464844, "rewards/margins": 
0.5212963819503784, "rewards/rejected": 2.4288344383239746, "step": 20140 }, { "epoch": 0.9355123264775523, "grad_norm": 50.92372512817383, "learning_rate": 2.4390547379172663e-07, "logits/chosen": -18.844141006469727, "logits/rejected": -18.34808349609375, "logps/chosen": -478.6512145996094, "logps/rejected": -414.0409240722656, "loss": 0.6341, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7593936920166016, "rewards/margins": 0.2794664204120636, "rewards/rejected": 2.4799273014068604, "step": 20150 }, { "epoch": 0.9359766005849853, "grad_norm": 94.53909301757812, "learning_rate": 2.4387761734528067e-07, "logits/chosen": -18.891145706176758, "logits/rejected": -18.53583526611328, "logps/chosen": -383.0144348144531, "logps/rejected": -321.52691650390625, "loss": 0.8344, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.1417198181152344, "rewards/margins": -0.03580514341592789, "rewards/rejected": 2.1775248050689697, "step": 20160 }, { "epoch": 0.9364408746924184, "grad_norm": 62.03891372680664, "learning_rate": 2.4384976089883466e-07, "logits/chosen": -18.809043884277344, "logits/rejected": -18.56195068359375, "logps/chosen": -317.03680419921875, "logps/rejected": -244.93563842773438, "loss": 0.7088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7765165567398071, "rewards/margins": 0.2625555396080017, "rewards/rejected": 1.5139610767364502, "step": 20170 }, { "epoch": 0.9369051487998514, "grad_norm": 27.8289852142334, "learning_rate": 2.438219044523887e-07, "logits/chosen": -18.86368179321289, "logits/rejected": -18.023645401000977, "logps/chosen": -414.604248046875, "logps/rejected": -373.1541442871094, "loss": 0.6007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.787248373031616, "rewards/margins": 0.4121319651603699, "rewards/rejected": 2.3751163482666016, "step": 20180 }, { "epoch": 0.9373694229072844, "grad_norm": 38.32358169555664, "learning_rate": 2.437940480059427e-07, "logits/chosen": 
-19.407949447631836, "logits/rejected": -17.5104923248291, "logps/chosen": -275.15032958984375, "logps/rejected": -160.4896697998047, "loss": 0.395, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1148695945739746, "rewards/margins": 1.066300868988037, "rewards/rejected": 1.0485684871673584, "step": 20190 }, { "epoch": 0.9378336970147175, "grad_norm": 69.34165954589844, "learning_rate": 2.437661915594967e-07, "logits/chosen": -18.55815315246582, "logits/rejected": -17.889331817626953, "logps/chosen": -303.2253723144531, "logps/rejected": -241.6479949951172, "loss": 0.4994, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2367448806762695, "rewards/margins": 0.5126790404319763, "rewards/rejected": 1.7240657806396484, "step": 20200 }, { "epoch": 0.9382979711221505, "grad_norm": 49.12765121459961, "learning_rate": 2.437383351130507e-07, "logits/chosen": -18.90143585205078, "logits/rejected": -18.229236602783203, "logps/chosen": -454.2369079589844, "logps/rejected": -389.7986755371094, "loss": 0.6446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.682638645172119, "rewards/margins": 0.36987027525901794, "rewards/rejected": 2.3127682209014893, "step": 20210 }, { "epoch": 0.9387622452295835, "grad_norm": 72.3390884399414, "learning_rate": 2.4371047866660476e-07, "logits/chosen": -18.14175796508789, "logits/rejected": -18.22462272644043, "logps/chosen": -351.1178894042969, "logps/rejected": -386.7228088378906, "loss": 1.0615, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6777634620666504, "rewards/margins": -0.33153918385505676, "rewards/rejected": 3.0093026161193848, "step": 20220 }, { "epoch": 0.9392265193370166, "grad_norm": 110.64962768554688, "learning_rate": 2.436826222201588e-07, "logits/chosen": -18.104639053344727, "logits/rejected": -18.386472702026367, "logps/chosen": -388.8214111328125, "logps/rejected": -432.0992126464844, "loss": 0.9426, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 
2.6535544395446777, "rewards/margins": -0.07981090247631073, "rewards/rejected": 2.733365297317505, "step": 20230 }, { "epoch": 0.9396907934444496, "grad_norm": 62.95847702026367, "learning_rate": 2.436547657737128e-07, "logits/chosen": -19.367368698120117, "logits/rejected": -18.3323974609375, "logps/chosen": -490.07806396484375, "logps/rejected": -371.3105163574219, "loss": 0.5624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9475455284118652, "rewards/margins": 0.5735946893692017, "rewards/rejected": 2.3739511966705322, "step": 20240 }, { "epoch": 0.9401550675518826, "grad_norm": 21.391754150390625, "learning_rate": 2.4362690932726677e-07, "logits/chosen": -18.870922088623047, "logits/rejected": -18.371644973754883, "logps/chosen": -473.38671875, "logps/rejected": -368.78485107421875, "loss": 0.4557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.138275146484375, "rewards/margins": 1.0838367938995361, "rewards/rejected": 2.054438352584839, "step": 20250 }, { "epoch": 0.9406193416593157, "grad_norm": 50.24077606201172, "learning_rate": 2.435990528808208e-07, "logits/chosen": -18.510257720947266, "logits/rejected": -18.948823928833008, "logps/chosen": -188.53436279296875, "logps/rejected": -240.6068115234375, "loss": 1.0622, "rewards/accuracies": 0.5, "rewards/chosen": 1.3172444105148315, "rewards/margins": -0.42042359709739685, "rewards/rejected": 1.7376680374145508, "step": 20260 }, { "epoch": 0.9410836157667487, "grad_norm": 43.25446319580078, "learning_rate": 2.4357119643437485e-07, "logits/chosen": -18.5415096282959, "logits/rejected": -18.178007125854492, "logps/chosen": -423.1353454589844, "logps/rejected": -290.7975158691406, "loss": 0.5538, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9881763458251953, "rewards/margins": 0.7314821481704712, "rewards/rejected": 2.2566940784454346, "step": 20270 }, { "epoch": 0.9415478898741817, "grad_norm": 67.51314544677734, "learning_rate": 2.435433399879289e-07, 
"logits/chosen": -19.88479232788086, "logits/rejected": -18.68858528137207, "logps/chosen": -393.5616760253906, "logps/rejected": -394.94036865234375, "loss": 0.756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.977630138397217, "rewards/margins": 0.26374223828315735, "rewards/rejected": 2.7138876914978027, "step": 20280 }, { "epoch": 0.9420121639816147, "grad_norm": 209.0025177001953, "learning_rate": 2.435154835414829e-07, "logits/chosen": -19.54983901977539, "logits/rejected": -18.32821273803711, "logps/chosen": -540.3135986328125, "logps/rejected": -375.8413391113281, "loss": 0.428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0654802322387695, "rewards/margins": 1.2456659078598022, "rewards/rejected": 2.8198139667510986, "step": 20290 }, { "epoch": 0.9424764380890478, "grad_norm": 287.3117980957031, "learning_rate": 2.4348762709503687e-07, "logits/chosen": -18.604846954345703, "logits/rejected": -18.43038558959961, "logps/chosen": -352.40289306640625, "logps/rejected": -401.25714111328125, "loss": 1.3601, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.694964647293091, "rewards/margins": -0.19572663307189941, "rewards/rejected": 2.890691041946411, "step": 20300 }, { "epoch": 0.9429407121964808, "grad_norm": 52.81477737426758, "learning_rate": 2.434597706485909e-07, "logits/chosen": -19.127899169921875, "logits/rejected": -18.080604553222656, "logps/chosen": -442.1630859375, "logps/rejected": -286.15057373046875, "loss": 0.604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1035683155059814, "rewards/margins": 0.9620596170425415, "rewards/rejected": 2.1415085792541504, "step": 20310 }, { "epoch": 0.9434049863039138, "grad_norm": 123.08409881591797, "learning_rate": 2.4343191420214495e-07, "logits/chosen": -18.290145874023438, "logits/rejected": -18.15365219116211, "logps/chosen": -384.0400695800781, "logps/rejected": -342.8251037597656, "loss": 0.7784, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 2.8726439476013184, "rewards/margins": 0.26938337087631226, "rewards/rejected": 2.6032609939575195, "step": 20320 }, { "epoch": 0.9438692604113469, "grad_norm": 37.21461486816406, "learning_rate": 2.4340405775569894e-07, "logits/chosen": -18.737079620361328, "logits/rejected": -17.905136108398438, "logps/chosen": -430.35687255859375, "logps/rejected": -370.854736328125, "loss": 0.6846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.23002290725708, "rewards/margins": 0.5612504482269287, "rewards/rejected": 2.6687726974487305, "step": 20330 }, { "epoch": 0.9443335345187799, "grad_norm": 166.0794677734375, "learning_rate": 2.43376201309253e-07, "logits/chosen": -20.25225257873535, "logits/rejected": -18.253389358520508, "logps/chosen": -408.14337158203125, "logps/rejected": -289.2590637207031, "loss": 0.4732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.355311870574951, "rewards/margins": 1.4293768405914307, "rewards/rejected": 1.92593514919281, "step": 20340 }, { "epoch": 0.9447978086262129, "grad_norm": 104.27459716796875, "learning_rate": 2.43348344862807e-07, "logits/chosen": -18.380319595336914, "logits/rejected": -17.888851165771484, "logps/chosen": -451.13116455078125, "logps/rejected": -398.41845703125, "loss": 0.5928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.036400079727173, "rewards/margins": 0.5076542496681213, "rewards/rejected": 2.528745651245117, "step": 20350 }, { "epoch": 0.945262082733646, "grad_norm": 84.12064361572266, "learning_rate": 2.43320488416361e-07, "logits/chosen": -17.54698371887207, "logits/rejected": -17.30733299255371, "logps/chosen": -281.9017028808594, "logps/rejected": -273.3544006347656, "loss": 0.8592, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2916059494018555, "rewards/margins": 0.131286159157753, "rewards/rejected": 2.1603198051452637, "step": 20360 }, { "epoch": 0.945726356841079, "grad_norm": 61.06726837158203, "learning_rate": 
2.43292631969915e-07, "logits/chosen": -18.88711929321289, "logits/rejected": -18.407814025878906, "logps/chosen": -483.00299072265625, "logps/rejected": -275.1528625488281, "loss": 0.8262, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1424777507781982, "rewards/margins": 1.019474983215332, "rewards/rejected": 2.123002529144287, "step": 20370 }, { "epoch": 0.946190630948512, "grad_norm": 169.29254150390625, "learning_rate": 2.4326477552346904e-07, "logits/chosen": -18.58125877380371, "logits/rejected": -17.997970581054688, "logps/chosen": -481.1875915527344, "logps/rejected": -431.78564453125, "loss": 0.5148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8588974475860596, "rewards/margins": 0.9543401002883911, "rewards/rejected": 1.9045569896697998, "step": 20380 }, { "epoch": 0.9466549050559451, "grad_norm": 190.04879760742188, "learning_rate": 2.432369190770231e-07, "logits/chosen": -19.25209617614746, "logits/rejected": -18.29265594482422, "logps/chosen": -417.37030029296875, "logps/rejected": -373.26513671875, "loss": 0.5574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4971790313720703, "rewards/margins": 0.5927161574363708, "rewards/rejected": 1.9044628143310547, "step": 20390 }, { "epoch": 0.9471191791633781, "grad_norm": 47.987701416015625, "learning_rate": 2.4320906263057707e-07, "logits/chosen": -18.866289138793945, "logits/rejected": -18.75702476501465, "logps/chosen": -399.9764099121094, "logps/rejected": -395.01715087890625, "loss": 0.6447, "rewards/accuracies": 0.5, "rewards/chosen": 2.5259029865264893, "rewards/margins": 0.21379761397838593, "rewards/rejected": 2.312105178833008, "step": 20400 }, { "epoch": 0.9475834532708111, "grad_norm": 235.06484985351562, "learning_rate": 2.431812061841311e-07, "logits/chosen": -18.64822769165039, "logits/rejected": -17.695547103881836, "logps/chosen": -358.80096435546875, "logps/rejected": -286.27947998046875, "loss": 0.8463, "rewards/accuracies": 0.5, 
"rewards/chosen": 2.4170992374420166, "rewards/margins": 0.17565229535102844, "rewards/rejected": 2.2414469718933105, "step": 20410 }, { "epoch": 0.9480477273782442, "grad_norm": 30.191308975219727, "learning_rate": 2.431533497376851e-07, "logits/chosen": -19.227693557739258, "logits/rejected": -17.431283950805664, "logps/chosen": -430.8219299316406, "logps/rejected": -207.1918182373047, "loss": 0.2936, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0912961959838867, "rewards/margins": 1.917207956314087, "rewards/rejected": 1.1740883588790894, "step": 20420 }, { "epoch": 0.9485120014856772, "grad_norm": 58.81120681762695, "learning_rate": 2.4312549329123914e-07, "logits/chosen": -18.825130462646484, "logits/rejected": -19.30121421813965, "logps/chosen": -333.9123229980469, "logps/rejected": -367.44158935546875, "loss": 0.7025, "rewards/accuracies": 0.5, "rewards/chosen": 2.3155531883239746, "rewards/margins": 0.24242189526557922, "rewards/rejected": 2.0731310844421387, "step": 20430 }, { "epoch": 0.9489762755931102, "grad_norm": 148.8240509033203, "learning_rate": 2.4309763684479313e-07, "logits/chosen": -19.12548828125, "logits/rejected": -18.09963035583496, "logps/chosen": -475.5892639160156, "logps/rejected": -336.9183044433594, "loss": 0.6525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0501437187194824, "rewards/margins": 0.9414867162704468, "rewards/rejected": 2.108656883239746, "step": 20440 }, { "epoch": 0.9494405497005431, "grad_norm": 65.21855926513672, "learning_rate": 2.4306978039834717e-07, "logits/chosen": -18.880992889404297, "logits/rejected": -17.866315841674805, "logps/chosen": -331.5992431640625, "logps/rejected": -265.83721923828125, "loss": 0.7828, "rewards/accuracies": 0.5, "rewards/chosen": 1.894107460975647, "rewards/margins": 0.1073300689458847, "rewards/rejected": 1.7867774963378906, "step": 20450 }, { "epoch": 0.9499048238079763, "grad_norm": 22.53732681274414, "learning_rate": 2.430419239519012e-07, 
"logits/chosen": -18.806577682495117, "logits/rejected": -18.087512969970703, "logps/chosen": -515.9894409179688, "logps/rejected": -376.01715087890625, "loss": 0.7628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.186127185821533, "rewards/margins": 0.5104553699493408, "rewards/rejected": 2.6756720542907715, "step": 20460 }, { "epoch": 0.9503690979154092, "grad_norm": 93.40220642089844, "learning_rate": 2.430140675054552e-07, "logits/chosen": -19.622617721557617, "logits/rejected": -18.365514755249023, "logps/chosen": -431.35443115234375, "logps/rejected": -342.6749267578125, "loss": 0.5438, "rewards/accuracies": 0.5, "rewards/chosen": 3.392427921295166, "rewards/margins": 0.9063812494277954, "rewards/rejected": 2.486046552658081, "step": 20470 }, { "epoch": 0.9508333720228422, "grad_norm": 9.813693046569824, "learning_rate": 2.4298621105900924e-07, "logits/chosen": -18.806800842285156, "logits/rejected": -17.806549072265625, "logps/chosen": -346.78631591796875, "logps/rejected": -257.04620361328125, "loss": 0.6714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4856035709381104, "rewards/margins": 0.30257853865623474, "rewards/rejected": 2.1830248832702637, "step": 20480 }, { "epoch": 0.9512976461302753, "grad_norm": 281.1492004394531, "learning_rate": 2.4295835461256323e-07, "logits/chosen": -19.321130752563477, "logits/rejected": -18.404117584228516, "logps/chosen": -461.6766662597656, "logps/rejected": -377.0672912597656, "loss": 0.7871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.596605062484741, "rewards/margins": 0.3507365584373474, "rewards/rejected": 2.245868682861328, "step": 20490 }, { "epoch": 0.9517619202377083, "grad_norm": 24.4541015625, "learning_rate": 2.4293049816611727e-07, "logits/chosen": -19.307645797729492, "logits/rejected": -17.626022338867188, "logps/chosen": -370.3136291503906, "logps/rejected": -219.12631225585938, "loss": 0.7188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
2.4537198543548584, "rewards/margins": 1.3730411529541016, "rewards/rejected": 1.0806790590286255, "step": 20500 }, { "epoch": 0.9522261943451413, "grad_norm": 48.544097900390625, "learning_rate": 2.429026417196713e-07, "logits/chosen": -19.118175506591797, "logits/rejected": -18.144550323486328, "logps/chosen": -338.3911437988281, "logps/rejected": -232.4357452392578, "loss": 0.4405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.448914051055908, "rewards/margins": 1.0882525444030762, "rewards/rejected": 1.3606617450714111, "step": 20510 }, { "epoch": 0.9526904684525744, "grad_norm": 5.3093061447143555, "learning_rate": 2.428747852732253e-07, "logits/chosen": -19.22370719909668, "logits/rejected": -19.12260627746582, "logps/chosen": -422.6112365722656, "logps/rejected": -365.70855712890625, "loss": 0.6684, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8523902893066406, "rewards/margins": 0.7161380052566528, "rewards/rejected": 3.1362526416778564, "step": 20520 }, { "epoch": 0.9531547425600074, "grad_norm": 105.04380798339844, "learning_rate": 2.4284692882677934e-07, "logits/chosen": -19.200801849365234, "logits/rejected": -18.316953659057617, "logps/chosen": -310.1661682128906, "logps/rejected": -295.6565246582031, "loss": 0.6401, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9928414821624756, "rewards/margins": 0.32114988565444946, "rewards/rejected": 1.671691656112671, "step": 20530 }, { "epoch": 0.9536190166674404, "grad_norm": 293.7852783203125, "learning_rate": 2.428190723803333e-07, "logits/chosen": -19.039608001708984, "logits/rejected": -18.196311950683594, "logps/chosen": -400.3170471191406, "logps/rejected": -360.9009094238281, "loss": 0.615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0701844692230225, "rewards/margins": 0.7309759855270386, "rewards/rejected": 2.3392088413238525, "step": 20540 }, { "epoch": 0.9540832907748735, "grad_norm": 25.88773536682129, "learning_rate": 
2.4279121593388737e-07, "logits/chosen": -18.798913955688477, "logits/rejected": -18.279247283935547, "logps/chosen": -437.71771240234375, "logps/rejected": -427.83868408203125, "loss": 0.7641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5857303142547607, "rewards/margins": 0.22686652839183807, "rewards/rejected": 2.3588638305664062, "step": 20550 }, { "epoch": 0.9545475648823065, "grad_norm": 2.0662546157836914, "learning_rate": 2.4276335948744136e-07, "logits/chosen": -18.244693756103516, "logits/rejected": -16.742467880249023, "logps/chosen": -323.63507080078125, "logps/rejected": -251.84127807617188, "loss": 0.4028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.978297233581543, "rewards/margins": 1.7296419143676758, "rewards/rejected": 1.2486555576324463, "step": 20560 }, { "epoch": 0.9550118389897395, "grad_norm": 256.27862548828125, "learning_rate": 2.427355030409954e-07, "logits/chosen": -18.622455596923828, "logits/rejected": -18.781496047973633, "logps/chosen": -406.1847229003906, "logps/rejected": -445.9791564941406, "loss": 0.9917, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0548338890075684, "rewards/margins": 0.14828427135944366, "rewards/rejected": 2.9065492153167725, "step": 20570 }, { "epoch": 0.9554761130971726, "grad_norm": 151.2677001953125, "learning_rate": 2.4270764659454944e-07, "logits/chosen": -18.893539428710938, "logits/rejected": -18.62128257751465, "logps/chosen": -389.4562072753906, "logps/rejected": -347.9086608886719, "loss": 0.6766, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9694766998291016, "rewards/margins": 0.38193923234939575, "rewards/rejected": 2.5875372886657715, "step": 20580 }, { "epoch": 0.9559403872046056, "grad_norm": 15.915486335754395, "learning_rate": 2.426797901481034e-07, "logits/chosen": -19.0126953125, "logits/rejected": -18.56436538696289, "logps/chosen": -359.2879333496094, "logps/rejected": -304.4205017089844, "loss": 0.873, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.048795700073242, "rewards/margins": -0.03044872358441353, "rewards/rejected": 2.079244375228882, "step": 20590 }, { "epoch": 0.9564046613120386, "grad_norm": 21.756126403808594, "learning_rate": 2.4265193370165747e-07, "logits/chosen": -18.55098533630371, "logits/rejected": -18.385343551635742, "logps/chosen": -384.0273742675781, "logps/rejected": -342.63037109375, "loss": 0.6166, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1631484031677246, "rewards/margins": 0.6440606117248535, "rewards/rejected": 2.519087553024292, "step": 20600 }, { "epoch": 0.9568689354194717, "grad_norm": 39.53655242919922, "learning_rate": 2.4262407725521145e-07, "logits/chosen": -19.093082427978516, "logits/rejected": -18.609832763671875, "logps/chosen": -332.9226989746094, "logps/rejected": -352.3563232421875, "loss": 0.9894, "rewards/accuracies": 0.5, "rewards/chosen": 2.477830410003662, "rewards/margins": 0.08939981460571289, "rewards/rejected": 2.38843035697937, "step": 20610 }, { "epoch": 0.9573332095269047, "grad_norm": 82.61290740966797, "learning_rate": 2.425962208087655e-07, "logits/chosen": -19.01239013671875, "logits/rejected": -18.441909790039062, "logps/chosen": -381.2186584472656, "logps/rejected": -293.93829345703125, "loss": 0.6334, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9783077239990234, "rewards/margins": 0.8876174688339233, "rewards/rejected": 2.0906901359558105, "step": 20620 }, { "epoch": 0.9577974836343377, "grad_norm": 130.6522979736328, "learning_rate": 2.425683643623195e-07, "logits/chosen": -20.058284759521484, "logits/rejected": -18.931150436401367, "logps/chosen": -372.13043212890625, "logps/rejected": -316.4005432128906, "loss": 0.7356, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.131474733352661, "rewards/margins": 0.3749983608722687, "rewards/rejected": 2.756476402282715, "step": 20630 }, { "epoch": 0.9582617577417707, "grad_norm": 
3.7685043811798096, "learning_rate": 2.425405079158735e-07, "logits/chosen": -18.539926528930664, "logits/rejected": -17.361873626708984, "logps/chosen": -372.1537170410156, "logps/rejected": -216.9752197265625, "loss": 0.3984, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0060486793518066, "rewards/margins": 1.2228469848632812, "rewards/rejected": 1.7832015752792358, "step": 20640 }, { "epoch": 0.9587260318492038, "grad_norm": 59.064613342285156, "learning_rate": 2.4251265146942757e-07, "logits/chosen": -19.14889907836914, "logits/rejected": -19.11370849609375, "logps/chosen": -402.87225341796875, "logps/rejected": -281.8335266113281, "loss": 0.8821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6072609424591064, "rewards/margins": 0.41454213857650757, "rewards/rejected": 2.192718505859375, "step": 20650 }, { "epoch": 0.9591903059566368, "grad_norm": 12.527081489562988, "learning_rate": 2.4248479502298155e-07, "logits/chosen": -18.849178314208984, "logits/rejected": -18.184526443481445, "logps/chosen": -405.4967956542969, "logps/rejected": -352.04498291015625, "loss": 0.6515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.998345971107483, "rewards/margins": 0.46297329664230347, "rewards/rejected": 1.5353727340698242, "step": 20660 }, { "epoch": 0.9596545800640698, "grad_norm": 65.13081359863281, "learning_rate": 2.4245693857653554e-07, "logits/chosen": -19.68012046813965, "logits/rejected": -18.866519927978516, "logps/chosen": -310.95074462890625, "logps/rejected": -265.16680908203125, "loss": 0.7472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2761826515197754, "rewards/margins": 0.25219249725341797, "rewards/rejected": 2.0239901542663574, "step": 20670 }, { "epoch": 0.9601188541715029, "grad_norm": 115.50590515136719, "learning_rate": 2.424290821300896e-07, "logits/chosen": -18.786609649658203, "logits/rejected": -17.347389221191406, "logps/chosen": -347.61285400390625, "logps/rejected": 
-223.1951446533203, "loss": 0.2791, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.471952438354492, "rewards/margins": 1.757974624633789, "rewards/rejected": 1.7139778137207031, "step": 20680 }, { "epoch": 0.9605831282789359, "grad_norm": 11.52908992767334, "learning_rate": 2.424012256836436e-07, "logits/chosen": -19.21662712097168, "logits/rejected": -18.07376480102539, "logps/chosen": -391.2644958496094, "logps/rejected": -261.2808837890625, "loss": 0.487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9227676391601562, "rewards/margins": 0.7813154458999634, "rewards/rejected": 2.1414520740509033, "step": 20690 }, { "epoch": 0.9610474023863689, "grad_norm": 5.749521732330322, "learning_rate": 2.4237336923719766e-07, "logits/chosen": -18.62506103515625, "logits/rejected": -18.50562286376953, "logps/chosen": -306.25054931640625, "logps/rejected": -262.7430114746094, "loss": 0.6714, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5757827758789062, "rewards/margins": 0.8706458806991577, "rewards/rejected": 1.7051372528076172, "step": 20700 }, { "epoch": 0.961511676493802, "grad_norm": 19.441226959228516, "learning_rate": 2.4234551279075165e-07, "logits/chosen": -19.708431243896484, "logits/rejected": -19.235111236572266, "logps/chosen": -369.84075927734375, "logps/rejected": -304.95538330078125, "loss": 0.466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5879335403442383, "rewards/margins": 0.6540980339050293, "rewards/rejected": 1.9338356256484985, "step": 20710 }, { "epoch": 0.961975950601235, "grad_norm": 35.223304748535156, "learning_rate": 2.4231765634430564e-07, "logits/chosen": -19.52290916442871, "logits/rejected": -18.691308975219727, "logps/chosen": -431.18707275390625, "logps/rejected": -354.2045593261719, "loss": 0.5568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.326054811477661, "rewards/margins": 0.534070611000061, "rewards/rejected": 2.7919838428497314, "step": 20720 }, { 
"epoch": 0.962440224708668, "grad_norm": 116.34671783447266, "learning_rate": 2.422897998978597e-07, "logits/chosen": -20.12729835510254, "logits/rejected": -18.80353355407715, "logps/chosen": -465.9170837402344, "logps/rejected": -311.1917724609375, "loss": 0.5369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7560055255889893, "rewards/margins": 0.776689350605011, "rewards/rejected": 1.9793163537979126, "step": 20730 }, { "epoch": 0.9629044988161011, "grad_norm": 54.61654281616211, "learning_rate": 2.422619434514137e-07, "logits/chosen": -18.270490646362305, "logits/rejected": -17.867725372314453, "logps/chosen": -340.8927917480469, "logps/rejected": -255.07968139648438, "loss": 0.7326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.181342124938965, "rewards/margins": 0.5498939752578735, "rewards/rejected": 1.6314481496810913, "step": 20740 }, { "epoch": 0.9633687729235341, "grad_norm": 144.80523681640625, "learning_rate": 2.422340870049677e-07, "logits/chosen": -19.102909088134766, "logits/rejected": -17.73507308959961, "logps/chosen": -451.3819274902344, "logps/rejected": -366.4860534667969, "loss": 0.6165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.197854518890381, "rewards/margins": 0.806587815284729, "rewards/rejected": 2.3912668228149414, "step": 20750 }, { "epoch": 0.9638330470309671, "grad_norm": 3.5049350261688232, "learning_rate": 2.4220623055852175e-07, "logits/chosen": -18.5966796875, "logits/rejected": -17.35857391357422, "logps/chosen": -388.7454528808594, "logps/rejected": -300.68524169921875, "loss": 0.7788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.764580488204956, "rewards/margins": 1.037939190864563, "rewards/rejected": 1.726641297340393, "step": 20760 }, { "epoch": 0.9642973211384002, "grad_norm": 77.6693344116211, "learning_rate": 2.421783741120758e-07, "logits/chosen": -18.83106231689453, "logits/rejected": -18.498289108276367, "logps/chosen": -327.2667541503906, 
"logps/rejected": -263.6080322265625, "loss": 0.652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0520222187042236, "rewards/margins": 0.39044469594955444, "rewards/rejected": 1.6615774631500244, "step": 20770 }, { "epoch": 0.9647615952458332, "grad_norm": 247.63462829589844, "learning_rate": 2.421505176656298e-07, "logits/chosen": -19.566579818725586, "logits/rejected": -19.567115783691406, "logps/chosen": -542.97021484375, "logps/rejected": -518.3968505859375, "loss": 0.6049, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7304368019104004, "rewards/margins": 0.7402805089950562, "rewards/rejected": 2.9901561737060547, "step": 20780 }, { "epoch": 0.9652258693532662, "grad_norm": 22.10137176513672, "learning_rate": 2.4212266121918377e-07, "logits/chosen": -18.3774471282959, "logits/rejected": -18.37676429748535, "logps/chosen": -325.24456787109375, "logps/rejected": -315.32244873046875, "loss": 0.8396, "rewards/accuracies": 0.5, "rewards/chosen": 2.533033609390259, "rewards/margins": 0.04753117635846138, "rewards/rejected": 2.485502243041992, "step": 20790 }, { "epoch": 0.9656901434606991, "grad_norm": 13.357858657836914, "learning_rate": 2.420948047727378e-07, "logits/chosen": -18.166736602783203, "logits/rejected": -18.5794734954834, "logps/chosen": -366.5799865722656, "logps/rejected": -392.7134704589844, "loss": 1.4211, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3602914810180664, "rewards/margins": -0.5580175518989563, "rewards/rejected": 2.918309211730957, "step": 20800 }, { "epoch": 0.9661544175681323, "grad_norm": 133.69419860839844, "learning_rate": 2.4206694832629185e-07, "logits/chosen": -18.95076560974121, "logits/rejected": -18.409008026123047, "logps/chosen": -313.31280517578125, "logps/rejected": -234.1813507080078, "loss": 0.6172, "rewards/accuracies": 0.5, "rewards/chosen": 2.270315408706665, "rewards/margins": 0.5419884324073792, "rewards/rejected": 1.7283267974853516, "step": 20810 }, { "epoch": 
0.9666186916755652, "grad_norm": 28.027610778808594, "learning_rate": 2.4203909187984584e-07, "logits/chosen": -19.346683502197266, "logits/rejected": -17.913089752197266, "logps/chosen": -363.2513122558594, "logps/rejected": -236.3734893798828, "loss": 0.5297, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2578346729278564, "rewards/margins": 1.1440523862838745, "rewards/rejected": 2.1137824058532715, "step": 20820 }, { "epoch": 0.9670829657829982, "grad_norm": 131.2572479248047, "learning_rate": 2.420112354333999e-07, "logits/chosen": -18.254390716552734, "logits/rejected": -17.860790252685547, "logps/chosen": -422.0545959472656, "logps/rejected": -363.39788818359375, "loss": 0.429, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3911900520324707, "rewards/margins": 0.9245802164077759, "rewards/rejected": 2.4666097164154053, "step": 20830 }, { "epoch": 0.9675472398904313, "grad_norm": 128.86502075195312, "learning_rate": 2.4198337898695387e-07, "logits/chosen": -17.91927719116211, "logits/rejected": -17.79293441772461, "logps/chosen": -353.29986572265625, "logps/rejected": -311.3824768066406, "loss": 1.0568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.287698268890381, "rewards/margins": -0.07092173397541046, "rewards/rejected": 2.3586201667785645, "step": 20840 }, { "epoch": 0.9680115139978643, "grad_norm": 240.775634765625, "learning_rate": 2.419555225405079e-07, "logits/chosen": -18.32805061340332, "logits/rejected": -18.560009002685547, "logps/chosen": -385.07342529296875, "logps/rejected": -388.70220947265625, "loss": 0.8533, "rewards/accuracies": 0.5, "rewards/chosen": 2.7004666328430176, "rewards/margins": 0.11781734228134155, "rewards/rejected": 2.5826494693756104, "step": 20850 }, { "epoch": 0.9684757881052973, "grad_norm": 172.51516723632812, "learning_rate": 2.419276660940619e-07, "logits/chosen": -17.973712921142578, "logits/rejected": -18.223957061767578, "logps/chosen": -311.91546630859375, 
"logps/rejected": -333.392333984375, "loss": 1.2915, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.529785394668579, "rewards/margins": -0.7233747243881226, "rewards/rejected": 2.253159999847412, "step": 20860 }, { "epoch": 0.9689400622127304, "grad_norm": 43.01150131225586, "learning_rate": 2.4189980964761594e-07, "logits/chosen": -18.290584564208984, "logits/rejected": -17.644695281982422, "logps/chosen": -360.55731201171875, "logps/rejected": -337.31988525390625, "loss": 0.5225, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.927522659301758, "rewards/margins": 0.8357445001602173, "rewards/rejected": 2.091778039932251, "step": 20870 }, { "epoch": 0.9694043363201634, "grad_norm": 173.72840881347656, "learning_rate": 2.4187195320117e-07, "logits/chosen": -18.672748565673828, "logits/rejected": -18.34334373474121, "logps/chosen": -455.945068359375, "logps/rejected": -350.3865661621094, "loss": 0.6702, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.236637830734253, "rewards/margins": 0.6277218461036682, "rewards/rejected": 2.6089160442352295, "step": 20880 }, { "epoch": 0.9698686104275964, "grad_norm": 19.173463821411133, "learning_rate": 2.4184409675472397e-07, "logits/chosen": -18.80147933959961, "logits/rejected": -18.002460479736328, "logps/chosen": -446.55767822265625, "logps/rejected": -288.7930603027344, "loss": 0.7617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.675612211227417, "rewards/margins": 0.730681300163269, "rewards/rejected": 1.9449307918548584, "step": 20890 }, { "epoch": 0.9703328845350295, "grad_norm": 245.54905700683594, "learning_rate": 2.41816240308278e-07, "logits/chosen": -17.912614822387695, "logits/rejected": -17.45652198791504, "logps/chosen": -501.75970458984375, "logps/rejected": -364.89398193359375, "loss": 0.6913, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7772274017333984, "rewards/margins": 0.5836722254753113, "rewards/rejected": 2.1935553550720215, 
"step": 20900 }, { "epoch": 0.9707971586424625, "grad_norm": 65.09909057617188, "learning_rate": 2.41788383861832e-07, "logits/chosen": -18.697690963745117, "logits/rejected": -17.682453155517578, "logps/chosen": -374.3916931152344, "logps/rejected": -273.2067565917969, "loss": 0.5768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4752697944641113, "rewards/margins": 0.6925532817840576, "rewards/rejected": 1.7827165126800537, "step": 20910 }, { "epoch": 0.9712614327498955, "grad_norm": 92.39125061035156, "learning_rate": 2.4176052741538604e-07, "logits/chosen": -19.24541473388672, "logits/rejected": -16.93024253845215, "logps/chosen": -455.91436767578125, "logps/rejected": -202.005859375, "loss": 0.3098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2839488983154297, "rewards/margins": 1.9684242010116577, "rewards/rejected": 1.3155248165130615, "step": 20920 }, { "epoch": 0.9717257068573286, "grad_norm": 48.131656646728516, "learning_rate": 2.417326709689401e-07, "logits/chosen": -18.62669563293457, "logits/rejected": -18.066944122314453, "logps/chosen": -458.616455078125, "logps/rejected": -358.69085693359375, "loss": 0.7911, "rewards/accuracies": 0.5, "rewards/chosen": 2.9967923164367676, "rewards/margins": 0.44263410568237305, "rewards/rejected": 2.5541579723358154, "step": 20930 }, { "epoch": 0.9721899809647616, "grad_norm": 26.353469848632812, "learning_rate": 2.4170481452249407e-07, "logits/chosen": -17.578611373901367, "logits/rejected": -16.878135681152344, "logps/chosen": -359.1676025390625, "logps/rejected": -286.13397216796875, "loss": 0.4107, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.643735885620117, "rewards/margins": 1.3490606546401978, "rewards/rejected": 1.2946752309799194, "step": 20940 }, { "epoch": 0.9726542550721946, "grad_norm": 138.60079956054688, "learning_rate": 2.416769580760481e-07, "logits/chosen": -18.67506980895996, "logits/rejected": -18.198917388916016, "logps/chosen": 
-399.5739440917969, "logps/rejected": -306.4136962890625, "loss": 0.9496, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.809504985809326, "rewards/margins": 0.020899271592497826, "rewards/rejected": 2.7886059284210205, "step": 20950 }, { "epoch": 0.9731185291796276, "grad_norm": 85.96636199951172, "learning_rate": 2.416491016296021e-07, "logits/chosen": -17.93380355834961, "logits/rejected": -17.333965301513672, "logps/chosen": -415.6966857910156, "logps/rejected": -355.810546875, "loss": 0.6077, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7207534313201904, "rewards/margins": 0.8025045394897461, "rewards/rejected": 1.9182491302490234, "step": 20960 }, { "epoch": 0.9735828032870607, "grad_norm": 9.361270904541016, "learning_rate": 2.416212451831561e-07, "logits/chosen": -18.76861000061035, "logits/rejected": -17.572097778320312, "logps/chosen": -350.16241455078125, "logps/rejected": -239.3689727783203, "loss": 0.5393, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9965012073516846, "rewards/margins": 1.3230831623077393, "rewards/rejected": 1.6734182834625244, "step": 20970 }, { "epoch": 0.9740470773944937, "grad_norm": 21.679481506347656, "learning_rate": 2.415933887367101e-07, "logits/chosen": -18.218124389648438, "logits/rejected": -18.391164779663086, "logps/chosen": -338.080322265625, "logps/rejected": -357.42779541015625, "loss": 1.3228, "rewards/accuracies": 0.5, "rewards/chosen": 2.730584144592285, "rewards/margins": -0.18537601828575134, "rewards/rejected": 2.9159600734710693, "step": 20980 }, { "epoch": 0.9745113515019267, "grad_norm": 75.77027893066406, "learning_rate": 2.4156553229026417e-07, "logits/chosen": -19.283321380615234, "logits/rejected": -17.883882522583008, "logps/chosen": -387.41827392578125, "logps/rejected": -245.34207153320312, "loss": 0.327, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.964454412460327, "rewards/margins": 1.4303009510040283, "rewards/rejected": 
1.5341533422470093, "step": 20990 }, { "epoch": 0.9749756256093598, "grad_norm": 55.01731872558594, "learning_rate": 2.415376758438182e-07, "logits/chosen": -19.584909439086914, "logits/rejected": -17.790775299072266, "logps/chosen": -358.89764404296875, "logps/rejected": -241.46902465820312, "loss": 0.6229, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.32771635055542, "rewards/margins": 0.39482277631759644, "rewards/rejected": 1.9328933954238892, "step": 21000 }, { "epoch": 0.9754398997167928, "grad_norm": 67.36219024658203, "learning_rate": 2.415098193973722e-07, "logits/chosen": -18.288955688476562, "logits/rejected": -17.779552459716797, "logps/chosen": -450.0345764160156, "logps/rejected": -371.27642822265625, "loss": 0.6187, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8519790172576904, "rewards/margins": 0.6392045617103577, "rewards/rejected": 3.2127747535705566, "step": 21010 }, { "epoch": 0.9759041738242258, "grad_norm": 94.11001586914062, "learning_rate": 2.4148196295092624e-07, "logits/chosen": -18.694149017333984, "logits/rejected": -18.1081485748291, "logps/chosen": -378.6706848144531, "logps/rejected": -332.2809143066406, "loss": 0.588, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.328253984451294, "rewards/margins": 0.7382160425186157, "rewards/rejected": 2.5900378227233887, "step": 21020 }, { "epoch": 0.9763684479316589, "grad_norm": 72.73444366455078, "learning_rate": 2.414541065044802e-07, "logits/chosen": -19.285579681396484, "logits/rejected": -17.449594497680664, "logps/chosen": -492.0865783691406, "logps/rejected": -331.129638671875, "loss": 0.4882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.536848545074463, "rewards/margins": 1.3090969324111938, "rewards/rejected": 2.2277512550354004, "step": 21030 }, { "epoch": 0.9768327220390919, "grad_norm": 59.160316467285156, "learning_rate": 2.4142625005803426e-07, "logits/chosen": -18.442636489868164, "logits/rejected": 
-17.915481567382812, "logps/chosen": -287.9095153808594, "logps/rejected": -251.1481475830078, "loss": 0.6452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.943853735923767, "rewards/margins": 0.533036470413208, "rewards/rejected": 1.4108171463012695, "step": 21040 }, { "epoch": 0.9772969961465249, "grad_norm": 34.58491516113281, "learning_rate": 2.4139839361158825e-07, "logits/chosen": -18.54486083984375, "logits/rejected": -18.028553009033203, "logps/chosen": -416.75311279296875, "logps/rejected": -320.3389892578125, "loss": 0.4278, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.861295223236084, "rewards/margins": 0.8561975359916687, "rewards/rejected": 2.0050971508026123, "step": 21050 }, { "epoch": 0.977761270253958, "grad_norm": 32.15159606933594, "learning_rate": 2.413705371651423e-07, "logits/chosen": -18.381885528564453, "logits/rejected": -17.965627670288086, "logps/chosen": -451.91229248046875, "logps/rejected": -400.446533203125, "loss": 0.7686, "rewards/accuracies": 0.5, "rewards/chosen": 2.9912619590759277, "rewards/margins": 0.21917609870433807, "rewards/rejected": 2.772085666656494, "step": 21060 }, { "epoch": 0.978225544361391, "grad_norm": 156.9772491455078, "learning_rate": 2.4134268071869633e-07, "logits/chosen": -19.703014373779297, "logits/rejected": -18.69711685180664, "logps/chosen": -480.4188537597656, "logps/rejected": -390.016357421875, "loss": 0.5687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.797534227371216, "rewards/margins": 1.1335302591323853, "rewards/rejected": 2.664003849029541, "step": 21070 }, { "epoch": 0.978689818468824, "grad_norm": 277.8682861328125, "learning_rate": 2.413148242722503e-07, "logits/chosen": -17.85569190979004, "logits/rejected": -16.885990142822266, "logps/chosen": -407.5410461425781, "logps/rejected": -350.1913146972656, "loss": 0.7161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8839547634124756, "rewards/margins": 0.826947033405304, 
"rewards/rejected": 2.0570075511932373, "step": 21080 }, { "epoch": 0.9791540925762571, "grad_norm": 122.89956665039062, "learning_rate": 2.412869678258043e-07, "logits/chosen": -18.49737548828125, "logits/rejected": -17.718360900878906, "logps/chosen": -373.86260986328125, "logps/rejected": -276.4549560546875, "loss": 0.6183, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3679027557373047, "rewards/margins": 0.4532049298286438, "rewards/rejected": 1.9146978855133057, "step": 21090 }, { "epoch": 0.9796183666836901, "grad_norm": 38.521568298339844, "learning_rate": 2.4125911137935835e-07, "logits/chosen": -18.30678939819336, "logits/rejected": -17.597454071044922, "logps/chosen": -415.34417724609375, "logps/rejected": -328.5213317871094, "loss": 0.7251, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6666808128356934, "rewards/margins": 0.2555478513240814, "rewards/rejected": 2.411133050918579, "step": 21100 }, { "epoch": 0.9800826407911231, "grad_norm": 29.32243537902832, "learning_rate": 2.412312549329124e-07, "logits/chosen": -19.165164947509766, "logits/rejected": -18.31668472290039, "logps/chosen": -429.27740478515625, "logps/rejected": -289.05712890625, "loss": 0.4654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.912440538406372, "rewards/margins": 1.0587446689605713, "rewards/rejected": 1.8536958694458008, "step": 21110 }, { "epoch": 0.980546914898556, "grad_norm": 81.91728210449219, "learning_rate": 2.4120339848646643e-07, "logits/chosen": -20.174623489379883, "logits/rejected": -19.490245819091797, "logps/chosen": -445.18426513671875, "logps/rejected": -388.1654357910156, "loss": 0.5539, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1384470462799072, "rewards/margins": 0.42745256423950195, "rewards/rejected": 2.7109947204589844, "step": 21120 }, { "epoch": 0.9810111890059892, "grad_norm": 30.403539657592773, "learning_rate": 2.411755420400204e-07, "logits/chosen": -19.11136245727539, 
"logits/rejected": -18.669519424438477, "logps/chosen": -569.4951171875, "logps/rejected": -468.67138671875, "loss": 0.45, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.327226638793945, "rewards/margins": 0.8971498608589172, "rewards/rejected": 3.4300765991210938, "step": 21130 }, { "epoch": 0.9814754631134222, "grad_norm": 45.30847930908203, "learning_rate": 2.411476855935744e-07, "logits/chosen": -18.06039810180664, "logits/rejected": -17.175334930419922, "logps/chosen": -410.84283447265625, "logps/rejected": -302.9797058105469, "loss": 0.293, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1026928424835205, "rewards/margins": 1.6106488704681396, "rewards/rejected": 1.4920438528060913, "step": 21140 }, { "epoch": 0.9819397372208551, "grad_norm": 239.25108337402344, "learning_rate": 2.4111982914712845e-07, "logits/chosen": -19.818506240844727, "logits/rejected": -18.937198638916016, "logps/chosen": -511.77008056640625, "logps/rejected": -385.7794494628906, "loss": 0.5572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5330028533935547, "rewards/margins": 1.1505917310714722, "rewards/rejected": 2.382411479949951, "step": 21150 }, { "epoch": 0.9824040113282883, "grad_norm": 89.74585723876953, "learning_rate": 2.4109197270068244e-07, "logits/chosen": -19.85243034362793, "logits/rejected": -18.79391860961914, "logps/chosen": -496.60113525390625, "logps/rejected": -370.3004455566406, "loss": 0.6709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5369162559509277, "rewards/margins": 0.6908614039421082, "rewards/rejected": 2.846055269241333, "step": 21160 }, { "epoch": 0.9828682854357212, "grad_norm": 64.55964660644531, "learning_rate": 2.410641162542365e-07, "logits/chosen": -18.52135467529297, "logits/rejected": -18.223163604736328, "logps/chosen": -385.37860107421875, "logps/rejected": -357.95965576171875, "loss": 0.6466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5758790969848633, 
"rewards/margins": 0.3512020707130432, "rewards/rejected": 2.2246766090393066, "step": 21170 }, { "epoch": 0.9833325595431542, "grad_norm": 98.04158782958984, "learning_rate": 2.410362598077905e-07, "logits/chosen": -19.228687286376953, "logits/rejected": -18.418737411499023, "logps/chosen": -515.212646484375, "logps/rejected": -425.03887939453125, "loss": 0.6927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2021431922912598, "rewards/margins": 0.0749351978302002, "rewards/rejected": 3.1272079944610596, "step": 21180 }, { "epoch": 0.9837968336505873, "grad_norm": 11.051469802856445, "learning_rate": 2.4100840336134456e-07, "logits/chosen": -17.78104591369629, "logits/rejected": -17.718631744384766, "logps/chosen": -315.46673583984375, "logps/rejected": -238.20608520507812, "loss": 0.8164, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9997713565826416, "rewards/margins": 0.36404430866241455, "rewards/rejected": 1.6357269287109375, "step": 21190 }, { "epoch": 0.9842611077580203, "grad_norm": 196.54452514648438, "learning_rate": 2.4098054691489855e-07, "logits/chosen": -19.730518341064453, "logits/rejected": -18.973936080932617, "logps/chosen": -598.2975463867188, "logps/rejected": -410.2093811035156, "loss": 0.5815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.555363893508911, "rewards/margins": 1.1330550909042358, "rewards/rejected": 2.4223084449768066, "step": 21200 }, { "epoch": 0.9847253818654533, "grad_norm": 89.48567199707031, "learning_rate": 2.4095269046845254e-07, "logits/chosen": -17.868112564086914, "logits/rejected": -17.062028884887695, "logps/chosen": -430.13153076171875, "logps/rejected": -280.8673095703125, "loss": 0.5966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.132554292678833, "rewards/margins": 0.5861418843269348, "rewards/rejected": 1.546412467956543, "step": 21210 }, { "epoch": 0.9851896559728864, "grad_norm": 184.3759307861328, "learning_rate": 2.409248340220066e-07, 
"logits/chosen": -18.536758422851562, "logits/rejected": -17.58734893798828, "logps/chosen": -395.2417907714844, "logps/rejected": -271.79486083984375, "loss": 0.6568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.636183023452759, "rewards/margins": 0.6126433610916138, "rewards/rejected": 2.0235395431518555, "step": 21220 }, { "epoch": 0.9856539300803194, "grad_norm": 9.510436058044434, "learning_rate": 2.408969775755606e-07, "logits/chosen": -18.029233932495117, "logits/rejected": -17.73089027404785, "logps/chosen": -478.56787109375, "logps/rejected": -427.8880310058594, "loss": 0.7245, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.399120330810547, "rewards/margins": 0.31827595829963684, "rewards/rejected": 3.0808441638946533, "step": 21230 }, { "epoch": 0.9861182041877524, "grad_norm": 20.078685760498047, "learning_rate": 2.408691211291146e-07, "logits/chosen": -18.45351791381836, "logits/rejected": -17.781452178955078, "logps/chosen": -467.9912109375, "logps/rejected": -361.1719055175781, "loss": 0.9143, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4690749645233154, "rewards/margins": 0.6898983716964722, "rewards/rejected": 2.779176712036133, "step": 21240 }, { "epoch": 0.9865824782951855, "grad_norm": 31.867233276367188, "learning_rate": 2.4084126468266865e-07, "logits/chosen": -19.560352325439453, "logits/rejected": -17.762794494628906, "logps/chosen": -425.3675231933594, "logps/rejected": -308.7383728027344, "loss": 0.3229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4305527210235596, "rewards/margins": 1.4275485277175903, "rewards/rejected": 2.0030040740966797, "step": 21250 }, { "epoch": 0.9870467524026185, "grad_norm": 105.7501449584961, "learning_rate": 2.4081340823622264e-07, "logits/chosen": -18.055681228637695, "logits/rejected": -17.876802444458008, "logps/chosen": -346.72174072265625, "logps/rejected": -343.73724365234375, "loss": 1.208, "rewards/accuracies": 0.4000000059604645, 
"rewards/chosen": 2.5528597831726074, "rewards/margins": -0.11758210510015488, "rewards/rejected": 2.6704418659210205, "step": 21260 }, { "epoch": 0.9875110265100515, "grad_norm": 19.994239807128906, "learning_rate": 2.407855517897767e-07, "logits/chosen": -18.530420303344727, "logits/rejected": -18.614315032958984, "logps/chosen": -342.85211181640625, "logps/rejected": -410.67291259765625, "loss": 0.5198, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.288520336151123, "rewards/margins": 0.49035030603408813, "rewards/rejected": 1.7981698513031006, "step": 21270 }, { "epoch": 0.9879753006174845, "grad_norm": 13.571608543395996, "learning_rate": 2.4075769534333067e-07, "logits/chosen": -19.251262664794922, "logits/rejected": -17.473724365234375, "logps/chosen": -437.4131774902344, "logps/rejected": -289.28656005859375, "loss": 0.242, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.610712766647339, "rewards/margins": 1.843286156654358, "rewards/rejected": 1.7674267292022705, "step": 21280 }, { "epoch": 0.9884395747249176, "grad_norm": 19.710002899169922, "learning_rate": 2.407298388968847e-07, "logits/chosen": -19.616107940673828, "logits/rejected": -18.183887481689453, "logps/chosen": -510.37579345703125, "logps/rejected": -370.4143981933594, "loss": 0.7238, "rewards/accuracies": 0.5, "rewards/chosen": 3.848790407180786, "rewards/margins": 0.6685779690742493, "rewards/rejected": 3.1802122592926025, "step": 21290 }, { "epoch": 0.9889038488323506, "grad_norm": 156.6053009033203, "learning_rate": 2.4070198245043875e-07, "logits/chosen": -18.917699813842773, "logits/rejected": -18.35397720336914, "logps/chosen": -482.8443908691406, "logps/rejected": -364.5286560058594, "loss": 0.7396, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7754671573638916, "rewards/margins": 0.20694954693317413, "rewards/rejected": 2.5685174465179443, "step": 21300 }, { "epoch": 0.9893681229397836, "grad_norm": 43.557891845703125, "learning_rate": 
2.4067412600399274e-07, "logits/chosen": -19.196826934814453, "logits/rejected": -17.75238609313965, "logps/chosen": -406.3460388183594, "logps/rejected": -284.1103210449219, "loss": 0.7288, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0992281436920166, "rewards/margins": 0.6666619777679443, "rewards/rejected": 2.4325664043426514, "step": 21310 }, { "epoch": 0.9898323970472167, "grad_norm": 66.5528335571289, "learning_rate": 2.406462695575468e-07, "logits/chosen": -18.41220474243164, "logits/rejected": -16.97612762451172, "logps/chosen": -369.82391357421875, "logps/rejected": -266.2668151855469, "loss": 0.5102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2912521362304688, "rewards/margins": 1.2430837154388428, "rewards/rejected": 2.048168182373047, "step": 21320 }, { "epoch": 0.9902966711546497, "grad_norm": 23.197952270507812, "learning_rate": 2.4061841311110077e-07, "logits/chosen": -17.74526596069336, "logits/rejected": -16.806049346923828, "logps/chosen": -349.00054931640625, "logps/rejected": -230.6898193359375, "loss": 0.3916, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7743639945983887, "rewards/margins": 1.2480530738830566, "rewards/rejected": 1.5263110399246216, "step": 21330 }, { "epoch": 0.9907609452620827, "grad_norm": 77.03560638427734, "learning_rate": 2.405905566646548e-07, "logits/chosen": -19.227344512939453, "logits/rejected": -18.225900650024414, "logps/chosen": -464.415283203125, "logps/rejected": -294.39581298828125, "loss": 0.4225, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0990726947784424, "rewards/margins": 1.4276092052459717, "rewards/rejected": 1.6714633703231812, "step": 21340 }, { "epoch": 0.9912252193695158, "grad_norm": 27.8839111328125, "learning_rate": 2.4056270021820885e-07, "logits/chosen": -17.991558074951172, "logits/rejected": -17.041967391967773, "logps/chosen": -396.71429443359375, "logps/rejected": -214.36666870117188, "loss": 0.7893, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.3702681064605713, "rewards/margins": 0.7884491682052612, "rewards/rejected": 1.58181893825531, "step": 21350 }, { "epoch": 0.9916894934769488, "grad_norm": 98.5675048828125, "learning_rate": 2.4053484377176284e-07, "logits/chosen": -18.337121963500977, "logits/rejected": -17.326749801635742, "logps/chosen": -380.5214538574219, "logps/rejected": -290.1826171875, "loss": 0.503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.24211049079895, "rewards/margins": 0.7465342283248901, "rewards/rejected": 1.4955763816833496, "step": 21360 }, { "epoch": 0.9921537675843818, "grad_norm": 45.42047119140625, "learning_rate": 2.405069873253169e-07, "logits/chosen": -19.081130981445312, "logits/rejected": -18.922637939453125, "logps/chosen": -416.4317321777344, "logps/rejected": -364.7292175292969, "loss": 0.7765, "rewards/accuracies": 0.5, "rewards/chosen": 2.4289095401763916, "rewards/margins": 0.1291993111371994, "rewards/rejected": 2.299710273742676, "step": 21370 }, { "epoch": 0.9926180416918149, "grad_norm": 67.17512512207031, "learning_rate": 2.4047913087887086e-07, "logits/chosen": -18.738176345825195, "logits/rejected": -18.350370407104492, "logps/chosen": -268.6308288574219, "logps/rejected": -254.46713256835938, "loss": 0.7673, "rewards/accuracies": 0.5, "rewards/chosen": 1.9324432611465454, "rewards/margins": 0.4748566150665283, "rewards/rejected": 1.457586646080017, "step": 21380 }, { "epoch": 0.9930823157992479, "grad_norm": 180.3765411376953, "learning_rate": 2.4045127443242485e-07, "logits/chosen": -19.196910858154297, "logits/rejected": -18.20933723449707, "logps/chosen": -457.58624267578125, "logps/rejected": -334.90777587890625, "loss": 0.6985, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1045899391174316, "rewards/margins": 0.7882585525512695, "rewards/rejected": 2.316331386566162, "step": 21390 }, { "epoch": 0.9935465899066809, "grad_norm": 134.63812255859375, "learning_rate": 
2.404234179859789e-07, "logits/chosen": -19.84215545654297, "logits/rejected": -18.65430450439453, "logps/chosen": -464.03179931640625, "logps/rejected": -394.82379150390625, "loss": 0.5685, "rewards/accuracies": 0.5, "rewards/chosen": 3.9585330486297607, "rewards/margins": 0.882441520690918, "rewards/rejected": 3.0760910511016846, "step": 21400 }, { "epoch": 0.994010864014114, "grad_norm": 57.75420379638672, "learning_rate": 2.4039556153953294e-07, "logits/chosen": -18.051584243774414, "logits/rejected": -17.107088088989258, "logps/chosen": -318.3204040527344, "logps/rejected": -237.89047241210938, "loss": 0.4506, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5487782955169678, "rewards/margins": 1.1148409843444824, "rewards/rejected": 1.433937430381775, "step": 21410 }, { "epoch": 0.994475138121547, "grad_norm": 184.7110137939453, "learning_rate": 2.40367705093087e-07, "logits/chosen": -18.561702728271484, "logits/rejected": -18.682470321655273, "logps/chosen": -353.4746398925781, "logps/rejected": -438.6805114746094, "loss": 1.0231, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.1674208641052246, "rewards/margins": -0.10538582503795624, "rewards/rejected": 3.2728066444396973, "step": 21420 }, { "epoch": 0.99493941222898, "grad_norm": 27.358278274536133, "learning_rate": 2.4033984864664096e-07, "logits/chosen": -18.511632919311523, "logits/rejected": -18.21030044555664, "logps/chosen": -368.70831298828125, "logps/rejected": -386.13031005859375, "loss": 0.9233, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8366386890411377, "rewards/margins": 0.18840746581554413, "rewards/rejected": 2.648231029510498, "step": 21430 }, { "epoch": 0.995403686336413, "grad_norm": 91.99036407470703, "learning_rate": 2.40311992200195e-07, "logits/chosen": -17.554975509643555, "logits/rejected": -17.300159454345703, "logps/chosen": -323.02984619140625, "logps/rejected": -267.91192626953125, "loss": 0.6197, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.650505304336548, "rewards/margins": 0.8040727376937866, "rewards/rejected": 1.8464324474334717, "step": 21440 }, { "epoch": 0.9958679604438461, "grad_norm": 11.700161933898926, "learning_rate": 2.40284135753749e-07, "logits/chosen": -18.94779396057129, "logits/rejected": -18.851099014282227, "logps/chosen": -223.4447784423828, "logps/rejected": -262.2383117675781, "loss": 0.6975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8420015573501587, "rewards/margins": 0.0858200341463089, "rewards/rejected": 1.7561814785003662, "step": 21450 }, { "epoch": 0.9963322345512791, "grad_norm": 104.43504333496094, "learning_rate": 2.4025627930730303e-07, "logits/chosen": -17.762405395507812, "logits/rejected": -18.08595848083496, "logps/chosen": -266.16571044921875, "logps/rejected": -310.53717041015625, "loss": 0.892, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.4408369064331055, "rewards/margins": -0.2661723792552948, "rewards/rejected": 1.7070090770721436, "step": 21460 }, { "epoch": 0.996796508658712, "grad_norm": 75.52791595458984, "learning_rate": 2.40228422860857e-07, "logits/chosen": -18.37563705444336, "logits/rejected": -18.357847213745117, "logps/chosen": -387.4335021972656, "logps/rejected": -396.8916931152344, "loss": 0.9975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7148947715759277, "rewards/margins": -0.12154705822467804, "rewards/rejected": 2.836442232131958, "step": 21470 }, { "epoch": 0.9972607827661452, "grad_norm": 15.612990379333496, "learning_rate": 2.4020056641441106e-07, "logits/chosen": -17.608409881591797, "logits/rejected": -17.64681625366211, "logps/chosen": -358.66729736328125, "logps/rejected": -285.52679443359375, "loss": 0.7655, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.462141275405884, "rewards/margins": 0.16924509406089783, "rewards/rejected": 2.292896032333374, "step": 21480 }, { "epoch": 0.9977250568735782, "grad_norm": 
178.9767303466797, "learning_rate": 2.401727099679651e-07, "logits/chosen": -19.130861282348633, "logits/rejected": -17.906909942626953, "logps/chosen": -509.8016052246094, "logps/rejected": -368.00152587890625, "loss": 0.4545, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.255478620529175, "rewards/margins": 1.1525156497955322, "rewards/rejected": 2.1029627323150635, "step": 21490 }, { "epoch": 0.9981893309810111, "grad_norm": 175.7614288330078, "learning_rate": 2.401448535215191e-07, "logits/chosen": -18.933116912841797, "logits/rejected": -17.895795822143555, "logps/chosen": -612.8489990234375, "logps/rejected": -489.30078125, "loss": 0.7841, "rewards/accuracies": 0.5, "rewards/chosen": 3.6973507404327393, "rewards/margins": 0.6875113248825073, "rewards/rejected": 3.0098395347595215, "step": 21500 }, { "epoch": 0.9986536050884443, "grad_norm": 31.964120864868164, "learning_rate": 2.401169970750731e-07, "logits/chosen": -19.291322708129883, "logits/rejected": -18.159297943115234, "logps/chosen": -404.26788330078125, "logps/rejected": -365.89886474609375, "loss": 0.6459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4136340618133545, "rewards/margins": 0.6146584749221802, "rewards/rejected": 1.7989753484725952, "step": 21510 }, { "epoch": 0.9991178791958772, "grad_norm": 55.562164306640625, "learning_rate": 2.400891406286271e-07, "logits/chosen": -17.87203025817871, "logits/rejected": -17.79766273498535, "logps/chosen": -338.53741455078125, "logps/rejected": -332.49139404296875, "loss": 0.8422, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9948759078979492, "rewards/margins": -0.035716284066438675, "rewards/rejected": 2.030592203140259, "step": 21520 }, { "epoch": 0.9995821533033102, "grad_norm": 94.29273986816406, "learning_rate": 2.4006128418218116e-07, "logits/chosen": -19.597454071044922, "logits/rejected": -18.396625518798828, "logps/chosen": -465.46124267578125, "logps/rejected": -332.5419921875, "loss": 0.5568, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.254300594329834, "rewards/margins": 0.9020244479179382, "rewards/rejected": 2.35227632522583, "step": 21530 }, { "epoch": 1.0000464274107432, "grad_norm": 84.6627426147461, "learning_rate": 2.400334277357352e-07, "logits/chosen": -18.229427337646484, "logits/rejected": -16.967391967773438, "logps/chosen": -399.0006103515625, "logps/rejected": -231.47891235351562, "loss": 0.5693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.105912685394287, "rewards/margins": 1.3233981132507324, "rewards/rejected": 1.7825143337249756, "step": 21540 }, { "epoch": 1.0005107015181762, "grad_norm": 10.726170539855957, "learning_rate": 2.400055712892892e-07, "logits/chosen": -18.42273712158203, "logits/rejected": -17.440378189086914, "logps/chosen": -384.04278564453125, "logps/rejected": -290.21710205078125, "loss": 0.4502, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1925976276397705, "rewards/margins": 1.4045469760894775, "rewards/rejected": 1.788050651550293, "step": 21550 }, { "epoch": 1.0009749756256094, "grad_norm": 7.9539923667907715, "learning_rate": 2.399777148428432e-07, "logits/chosen": -19.78354835510254, "logits/rejected": -19.96065330505371, "logps/chosen": -469.1485900878906, "logps/rejected": -422.8935546875, "loss": 0.7317, "rewards/accuracies": 0.5, "rewards/chosen": 3.8132083415985107, "rewards/margins": 0.3486361801624298, "rewards/rejected": 3.4645721912384033, "step": 21560 }, { "epoch": 1.0014392497330424, "grad_norm": 0.22385358810424805, "learning_rate": 2.399526440410418e-07, "logits/chosen": -18.850107192993164, "logits/rejected": -17.42901611328125, "logps/chosen": -488.7427673339844, "logps/rejected": -358.7039794921875, "loss": 1.0778, "rewards/accuracies": 0.5, "rewards/chosen": 3.8430442810058594, "rewards/margins": 1.0372310876846313, "rewards/rejected": 2.8058128356933594, "step": 21570 }, { "epoch": 1.0019035238404754, "grad_norm": 92.34454345703125, 
"learning_rate": 2.3992478759459584e-07, "logits/chosen": -19.030689239501953, "logits/rejected": -18.43028450012207, "logps/chosen": -383.76788330078125, "logps/rejected": -272.07830810546875, "loss": 0.4722, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9766926765441895, "rewards/margins": 1.0618942975997925, "rewards/rejected": 1.914798378944397, "step": 21580 }, { "epoch": 1.0023677979479084, "grad_norm": 181.8019256591797, "learning_rate": 2.398969311481499e-07, "logits/chosen": -17.2821102142334, "logits/rejected": -17.707897186279297, "logps/chosen": -302.01055908203125, "logps/rejected": -354.7565612792969, "loss": 1.1585, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3270912170410156, "rewards/margins": -0.3730614185333252, "rewards/rejected": 2.70015287399292, "step": 21590 }, { "epoch": 1.0028320720553414, "grad_norm": 6.963881015777588, "learning_rate": 2.3986907470170387e-07, "logits/chosen": -18.660282135009766, "logits/rejected": -17.527759552001953, "logps/chosen": -367.0307312011719, "logps/rejected": -216.1371612548828, "loss": 0.603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4422988891601562, "rewards/margins": 0.6943722367286682, "rewards/rejected": 1.7479264736175537, "step": 21600 }, { "epoch": 1.0032963461627744, "grad_norm": 5.521093845367432, "learning_rate": 2.398412182552579e-07, "logits/chosen": -18.409481048583984, "logits/rejected": -16.789203643798828, "logps/chosen": -432.03814697265625, "logps/rejected": -239.6392059326172, "loss": 0.3046, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9637467861175537, "rewards/margins": 1.5715672969818115, "rewards/rejected": 1.3921793699264526, "step": 21610 }, { "epoch": 1.0037606202702076, "grad_norm": 74.38572692871094, "learning_rate": 2.398133618088119e-07, "logits/chosen": -17.18222427368164, "logits/rejected": -17.572620391845703, "logps/chosen": -294.9849548339844, "logps/rejected": -291.2200012207031, "loss": 0.8193, 
"rewards/accuracies": 0.5, "rewards/chosen": 1.7372363805770874, "rewards/margins": -0.017219746485352516, "rewards/rejected": 1.7544561624526978, "step": 21620 }, { "epoch": 1.0042248943776406, "grad_norm": 37.99829864501953, "learning_rate": 2.397855053623659e-07, "logits/chosen": -18.369098663330078, "logits/rejected": -18.39431381225586, "logps/chosen": -357.6648864746094, "logps/rejected": -342.05047607421875, "loss": 1.0117, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.349259376525879, "rewards/margins": -0.10843907296657562, "rewards/rejected": 2.457698345184326, "step": 21630 }, { "epoch": 1.0046891684850736, "grad_norm": 110.97793579101562, "learning_rate": 2.3975764891591993e-07, "logits/chosen": -19.164459228515625, "logits/rejected": -18.426097869873047, "logps/chosen": -399.2301330566406, "logps/rejected": -306.70196533203125, "loss": 0.7548, "rewards/accuracies": 0.5, "rewards/chosen": 2.9889464378356934, "rewards/margins": 0.6013345718383789, "rewards/rejected": 2.3876121044158936, "step": 21640 }, { "epoch": 1.0051534425925066, "grad_norm": 5.706279277801514, "learning_rate": 2.3972979246947397e-07, "logits/chosen": -18.572696685791016, "logits/rejected": -18.122303009033203, "logps/chosen": -329.2222595214844, "logps/rejected": -261.6236267089844, "loss": 0.6456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7781014442443848, "rewards/margins": 0.8930050134658813, "rewards/rejected": 1.885096549987793, "step": 21650 }, { "epoch": 1.0056177166999396, "grad_norm": 98.09994506835938, "learning_rate": 2.39701936023028e-07, "logits/chosen": -18.512157440185547, "logits/rejected": -18.276369094848633, "logps/chosen": -357.4679260253906, "logps/rejected": -370.9734802246094, "loss": 1.198, "rewards/accuracies": 0.5, "rewards/chosen": 2.3389105796813965, "rewards/margins": 0.038114357739686966, "rewards/rejected": 2.3007962703704834, "step": 21660 }, { "epoch": 1.0060819908073726, "grad_norm": 46.83769607543945, 
"learning_rate": 2.39674079576582e-07, "logits/chosen": -18.919023513793945, "logits/rejected": -17.938785552978516, "logps/chosen": -522.7526245117188, "logps/rejected": -359.8689880371094, "loss": 0.4689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2184648513793945, "rewards/margins": 1.3899829387664795, "rewards/rejected": 2.828481912612915, "step": 21670 }, { "epoch": 1.0065462649148058, "grad_norm": 176.1453857421875, "learning_rate": 2.3964622313013604e-07, "logits/chosen": -18.763076782226562, "logits/rejected": -17.685522079467773, "logps/chosen": -368.9934997558594, "logps/rejected": -278.28680419921875, "loss": 0.3455, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.627639055252075, "rewards/margins": 1.966566801071167, "rewards/rejected": 1.6610723733901978, "step": 21680 }, { "epoch": 1.0070105390222388, "grad_norm": 63.16496658325195, "learning_rate": 2.3961836668369003e-07, "logits/chosen": -18.822973251342773, "logits/rejected": -18.763465881347656, "logps/chosen": -457.95782470703125, "logps/rejected": -431.2998962402344, "loss": 0.6779, "rewards/accuracies": 0.5, "rewards/chosen": 2.951364278793335, "rewards/margins": 0.2240767925977707, "rewards/rejected": 2.727287530899048, "step": 21690 }, { "epoch": 1.0074748131296718, "grad_norm": 80.23242950439453, "learning_rate": 2.3959051023724407e-07, "logits/chosen": -19.106035232543945, "logits/rejected": -18.707700729370117, "logps/chosen": -453.18109130859375, "logps/rejected": -395.3197021484375, "loss": 0.6022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9977500438690186, "rewards/margins": 0.7282992005348206, "rewards/rejected": 2.2694506645202637, "step": 21700 }, { "epoch": 1.0079390872371048, "grad_norm": 5.65567684173584, "learning_rate": 2.3956265379079806e-07, "logits/chosen": -19.30196189880371, "logits/rejected": -19.043245315551758, "logps/chosen": -562.2272338867188, "logps/rejected": -469.9535217285156, "loss": 0.5754, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 4.5553202629089355, "rewards/margins": 1.182633399963379, "rewards/rejected": 3.3726868629455566, "step": 21710 }, { "epoch": 1.0084033613445378, "grad_norm": 19.987173080444336, "learning_rate": 2.395347973443521e-07, "logits/chosen": -18.31161880493164, "logits/rejected": -17.394994735717773, "logps/chosen": -330.41650390625, "logps/rejected": -246.7578125, "loss": 0.7861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.448209524154663, "rewards/margins": 0.3984508514404297, "rewards/rejected": 2.0497584342956543, "step": 21720 }, { "epoch": 1.0088676354519708, "grad_norm": 91.3105239868164, "learning_rate": 2.3950694089790614e-07, "logits/chosen": -18.460189819335938, "logits/rejected": -18.070173263549805, "logps/chosen": -428.4134826660156, "logps/rejected": -414.9630432128906, "loss": 0.769, "rewards/accuracies": 0.5, "rewards/chosen": 2.6401102542877197, "rewards/margins": 0.12223577499389648, "rewards/rejected": 2.517874240875244, "step": 21730 }, { "epoch": 1.0093319095594038, "grad_norm": 66.58158111572266, "learning_rate": 2.3947908445146013e-07, "logits/chosen": -18.518909454345703, "logits/rejected": -16.754575729370117, "logps/chosen": -332.73297119140625, "logps/rejected": -164.64407348632812, "loss": 0.3522, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.757903575897217, "rewards/margins": 1.7568986415863037, "rewards/rejected": 1.001004695892334, "step": 21740 }, { "epoch": 1.009796183666837, "grad_norm": 125.86105346679688, "learning_rate": 2.394512280050141e-07, "logits/chosen": -18.77765464782715, "logits/rejected": -18.0032958984375, "logps/chosen": -457.4369201660156, "logps/rejected": -440.2593688964844, "loss": 0.7501, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.332756519317627, "rewards/margins": 0.7280723452568054, "rewards/rejected": 2.6046838760375977, "step": 21750 }, { "epoch": 1.01026045777427, "grad_norm": 176.3226318359375, "learning_rate": 
2.3942337155856816e-07, "logits/chosen": -18.262786865234375, "logits/rejected": -18.103580474853516, "logps/chosen": -332.4146423339844, "logps/rejected": -350.358154296875, "loss": 1.0049, "rewards/accuracies": 0.5, "rewards/chosen": 3.219139814376831, "rewards/margins": -0.1094822883605957, "rewards/rejected": 3.3286221027374268, "step": 21760 }, { "epoch": 1.010724731881703, "grad_norm": 44.008052825927734, "learning_rate": 2.393955151121222e-07, "logits/chosen": -18.40793228149414, "logits/rejected": -18.224782943725586, "logps/chosen": -416.217041015625, "logps/rejected": -442.5848083496094, "loss": 0.6257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.390347719192505, "rewards/margins": 0.7967535853385925, "rewards/rejected": 2.5935938358306885, "step": 21770 }, { "epoch": 1.011189005989136, "grad_norm": 94.64957427978516, "learning_rate": 2.3936765866567624e-07, "logits/chosen": -18.85373878479004, "logits/rejected": -18.843055725097656, "logps/chosen": -423.65594482421875, "logps/rejected": -392.4258728027344, "loss": 0.6899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.833979845046997, "rewards/margins": 0.36219674348831177, "rewards/rejected": 2.471782922744751, "step": 21780 }, { "epoch": 1.011653280096569, "grad_norm": 45.37017822265625, "learning_rate": 2.393398022192302e-07, "logits/chosen": -19.18056869506836, "logits/rejected": -17.78316879272461, "logps/chosen": -283.93212890625, "logps/rejected": -205.4832000732422, "loss": 0.5973, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.560230255126953, "rewards/margins": 1.13631010055542, "rewards/rejected": 1.4239200353622437, "step": 21790 }, { "epoch": 1.012117554204002, "grad_norm": 78.99787139892578, "learning_rate": 2.393119457727842e-07, "logits/chosen": -19.16388511657715, "logits/rejected": -19.058698654174805, "logps/chosen": -351.31243896484375, "logps/rejected": -331.7069091796875, "loss": 0.957, "rewards/accuracies": 0.5, "rewards/chosen": 
2.797982692718506, "rewards/margins": 0.20753049850463867, "rewards/rejected": 2.590452194213867, "step": 21800 }, { "epoch": 1.0125818283114352, "grad_norm": 0.7932457327842712, "learning_rate": 2.3928408932633826e-07, "logits/chosen": -18.99460792541504, "logits/rejected": -17.283676147460938, "logps/chosen": -490.47039794921875, "logps/rejected": -292.71234130859375, "loss": 0.4752, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.212940216064453, "rewards/margins": 1.1037890911102295, "rewards/rejected": 2.1091513633728027, "step": 21810 }, { "epoch": 1.0130461024188682, "grad_norm": 130.02296447753906, "learning_rate": 2.3925623287989224e-07, "logits/chosen": -18.309101104736328, "logits/rejected": -17.560523986816406, "logps/chosen": -349.6729431152344, "logps/rejected": -317.9148864746094, "loss": 0.8737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.426530361175537, "rewards/margins": 0.4717583656311035, "rewards/rejected": 1.9547719955444336, "step": 21820 }, { "epoch": 1.0135103765263012, "grad_norm": 26.72321319580078, "learning_rate": 2.392283764334463e-07, "logits/chosen": -19.469497680664062, "logits/rejected": -18.86574363708496, "logps/chosen": -308.04193115234375, "logps/rejected": -250.7322540283203, "loss": 0.3751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.142322063446045, "rewards/margins": 1.5560498237609863, "rewards/rejected": 1.5862720012664795, "step": 21830 }, { "epoch": 1.0139746506337342, "grad_norm": 13.608311653137207, "learning_rate": 2.392005199870003e-07, "logits/chosen": -19.15682029724121, "logits/rejected": -18.481555938720703, "logps/chosen": -403.6722106933594, "logps/rejected": -265.21160888671875, "loss": 0.3879, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1596202850341797, "rewards/margins": 1.1011762619018555, "rewards/rejected": 2.058444023132324, "step": 21840 }, { "epoch": 1.0144389247411671, "grad_norm": 22.186519622802734, "learning_rate": 
2.3917266354055437e-07, "logits/chosen": -19.6431884765625, "logits/rejected": -19.092981338500977, "logps/chosen": -447.298583984375, "logps/rejected": -336.59051513671875, "loss": 0.3054, "rewards/accuracies": 1.0, "rewards/chosen": 3.313448429107666, "rewards/margins": 1.2120614051818848, "rewards/rejected": 2.1013870239257812, "step": 21850 }, { "epoch": 1.0149031988486001, "grad_norm": 10.834357261657715, "learning_rate": 2.3914480709410835e-07, "logits/chosen": -19.397584915161133, "logits/rejected": -18.563459396362305, "logps/chosen": -371.20013427734375, "logps/rejected": -343.41302490234375, "loss": 0.951, "rewards/accuracies": 0.5, "rewards/chosen": 3.0137221813201904, "rewards/margins": 0.15334942936897278, "rewards/rejected": 2.86037278175354, "step": 21860 }, { "epoch": 1.0153674729560331, "grad_norm": 65.40023803710938, "learning_rate": 2.3911695064766234e-07, "logits/chosen": -18.319355010986328, "logits/rejected": -18.283710479736328, "logps/chosen": -417.4076232910156, "logps/rejected": -309.25897216796875, "loss": 0.6696, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.373511791229248, "rewards/margins": 0.8040969967842102, "rewards/rejected": 2.5694148540496826, "step": 21870 }, { "epoch": 1.0158317470634664, "grad_norm": 1.6628010272979736, "learning_rate": 2.390890942012164e-07, "logits/chosen": -18.453969955444336, "logits/rejected": -17.76837158203125, "logps/chosen": -290.19317626953125, "logps/rejected": -234.2562713623047, "loss": 0.4676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6106386184692383, "rewards/margins": 0.9773849248886108, "rewards/rejected": 1.6332536935806274, "step": 21880 }, { "epoch": 1.0162960211708993, "grad_norm": 63.08100128173828, "learning_rate": 2.390612377547704e-07, "logits/chosen": -19.272241592407227, "logits/rejected": -18.022119522094727, "logps/chosen": -513.3377685546875, "logps/rejected": -352.2947998046875, "loss": 0.4078, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 2.8396239280700684, "rewards/margins": 1.014611005783081, "rewards/rejected": 1.8250129222869873, "step": 21890 }, { "epoch": 1.0167602952783323, "grad_norm": 39.857994079589844, "learning_rate": 2.390333813083244e-07, "logits/chosen": -19.359067916870117, "logits/rejected": -18.667407989501953, "logps/chosen": -355.1852111816406, "logps/rejected": -257.246826171875, "loss": 0.312, "rewards/accuracies": 1.0, "rewards/chosen": 3.320970058441162, "rewards/margins": 1.4527943134307861, "rewards/rejected": 1.8681758642196655, "step": 21900 }, { "epoch": 1.0172245693857653, "grad_norm": 18.25986671447754, "learning_rate": 2.3900552486187845e-07, "logits/chosen": -19.490310668945312, "logits/rejected": -18.366167068481445, "logps/chosen": -442.36962890625, "logps/rejected": -330.83367919921875, "loss": 0.6971, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4985122680664062, "rewards/margins": 1.2307802438735962, "rewards/rejected": 2.2677316665649414, "step": 21910 }, { "epoch": 1.0176888434931983, "grad_norm": 30.28508186340332, "learning_rate": 2.3897766841543244e-07, "logits/chosen": -19.792465209960938, "logits/rejected": -18.462003707885742, "logps/chosen": -345.4298095703125, "logps/rejected": -264.77685546875, "loss": 0.721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7452197074890137, "rewards/margins": 0.5328834056854248, "rewards/rejected": 2.212336301803589, "step": 21920 }, { "epoch": 1.0181531176006313, "grad_norm": 35.48616027832031, "learning_rate": 2.389498119689865e-07, "logits/chosen": -19.990997314453125, "logits/rejected": -19.729894638061523, "logps/chosen": -357.1988220214844, "logps/rejected": -318.4587097167969, "loss": 0.7044, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0543735027313232, "rewards/margins": 0.5324773788452148, "rewards/rejected": 2.5218958854675293, "step": 21930 }, { "epoch": 1.0186173917080645, "grad_norm": 171.22592163085938, "learning_rate": 
2.3892195552254047e-07, "logits/chosen": -19.738922119140625, "logits/rejected": -19.202861785888672, "logps/chosen": -440.70379638671875, "logps/rejected": -403.52459716796875, "loss": 0.8788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.429227113723755, "rewards/margins": 0.3374699056148529, "rewards/rejected": 3.09175705909729, "step": 21940 }, { "epoch": 1.0190816658154975, "grad_norm": 59.57865905761719, "learning_rate": 2.388940990760945e-07, "logits/chosen": -18.463336944580078, "logits/rejected": -18.592134475708008, "logps/chosen": -408.91387939453125, "logps/rejected": -360.69476318359375, "loss": 0.8108, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0370781421661377, "rewards/margins": 0.5503044128417969, "rewards/rejected": 2.486773729324341, "step": 21950 }, { "epoch": 1.0195459399229305, "grad_norm": 22.04859733581543, "learning_rate": 2.3886624262964855e-07, "logits/chosen": -18.085693359375, "logits/rejected": -17.74291229248047, "logps/chosen": -407.6688537597656, "logps/rejected": -340.6572265625, "loss": 1.1124, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.462747097015381, "rewards/margins": 0.2675718665122986, "rewards/rejected": 3.1951751708984375, "step": 21960 }, { "epoch": 1.0200102140303635, "grad_norm": 140.3461456298828, "learning_rate": 2.3883838618320254e-07, "logits/chosen": -19.420989990234375, "logits/rejected": -18.043703079223633, "logps/chosen": -413.6529846191406, "logps/rejected": -288.61846923828125, "loss": 0.4558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.046607732772827, "rewards/margins": 1.1633288860321045, "rewards/rejected": 1.8832788467407227, "step": 21970 }, { "epoch": 1.0204744881377965, "grad_norm": 43.837703704833984, "learning_rate": 2.388105297367566e-07, "logits/chosen": -18.842166900634766, "logits/rejected": -17.879199981689453, "logps/chosen": -356.3682861328125, "logps/rejected": -246.3392333984375, "loss": 0.4605, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.050251007080078, "rewards/margins": 0.7014716863632202, "rewards/rejected": 2.3487794399261475, "step": 21980 }, { "epoch": 1.0209387622452295, "grad_norm": 60.76342010498047, "learning_rate": 2.3878267329031057e-07, "logits/chosen": -19.205774307250977, "logits/rejected": -18.784679412841797, "logps/chosen": -381.60455322265625, "logps/rejected": -332.9681396484375, "loss": 0.6373, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0846943855285645, "rewards/margins": 0.9062229990959167, "rewards/rejected": 2.178471565246582, "step": 21990 }, { "epoch": 1.0214030363526627, "grad_norm": 13.975828170776367, "learning_rate": 2.387548168438646e-07, "logits/chosen": -18.65142250061035, "logits/rejected": -18.026531219482422, "logps/chosen": -386.09991455078125, "logps/rejected": -310.57073974609375, "loss": 0.4929, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6931803226470947, "rewards/margins": 0.7190347909927368, "rewards/rejected": 1.9741452932357788, "step": 22000 }, { "epoch": 1.0218673104600957, "grad_norm": 37.947052001953125, "learning_rate": 2.387269603974186e-07, "logits/chosen": -20.048851013183594, "logits/rejected": -19.752899169921875, "logps/chosen": -385.8157958984375, "logps/rejected": -328.0766296386719, "loss": 0.6601, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6324362754821777, "rewards/margins": 0.23316629230976105, "rewards/rejected": 2.3992698192596436, "step": 22010 }, { "epoch": 1.0223315845675287, "grad_norm": 49.24042510986328, "learning_rate": 2.3869910395097264e-07, "logits/chosen": -18.974475860595703, "logits/rejected": -18.14960479736328, "logps/chosen": -383.7648620605469, "logps/rejected": -286.21142578125, "loss": 0.8999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9920647144317627, "rewards/margins": 0.5847082138061523, "rewards/rejected": 2.4073567390441895, "step": 22020 }, { "epoch": 1.0227958586749617, "grad_norm": 
4.34745454788208, "learning_rate": 2.386712475045267e-07, "logits/chosen": -19.484838485717773, "logits/rejected": -18.12672996520996, "logps/chosen": -439.5361328125, "logps/rejected": -303.6593017578125, "loss": 0.7306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.431367874145508, "rewards/margins": 1.013670563697815, "rewards/rejected": 2.4176974296569824, "step": 22030 }, { "epoch": 1.0232601327823947, "grad_norm": 11.545674324035645, "learning_rate": 2.3864339105808067e-07, "logits/chosen": -18.596153259277344, "logits/rejected": -18.94126319885254, "logps/chosen": -448.63763427734375, "logps/rejected": -443.33709716796875, "loss": 1.0726, "rewards/accuracies": 0.5, "rewards/chosen": 2.744164228439331, "rewards/margins": -0.18938913941383362, "rewards/rejected": 2.9335532188415527, "step": 22040 }, { "epoch": 1.0237244068898277, "grad_norm": 176.5596160888672, "learning_rate": 2.3861553461163466e-07, "logits/chosen": -18.56145477294922, "logits/rejected": -18.46230697631836, "logps/chosen": -403.63616943359375, "logps/rejected": -298.4925231933594, "loss": 0.6979, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.832156181335449, "rewards/margins": 0.7244376540184021, "rewards/rejected": 2.1077182292938232, "step": 22050 }, { "epoch": 1.0241886809972607, "grad_norm": 24.13832664489746, "learning_rate": 2.385876781651887e-07, "logits/chosen": -18.517545700073242, "logits/rejected": -18.312244415283203, "logps/chosen": -388.03228759765625, "logps/rejected": -312.14739990234375, "loss": 0.7142, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.577098846435547, "rewards/margins": 0.32953864336013794, "rewards/rejected": 2.2475602626800537, "step": 22060 }, { "epoch": 1.024652955104694, "grad_norm": 118.86389923095703, "learning_rate": 2.3855982171874274e-07, "logits/chosen": -17.74078941345215, "logits/rejected": -17.56119155883789, "logps/chosen": -232.4047393798828, "logps/rejected": -236.43667602539062, "loss": 0.6972, 
"rewards/accuracies": 0.5, "rewards/chosen": 2.165588617324829, "rewards/margins": 0.641659140586853, "rewards/rejected": 1.5239293575286865, "step": 22070 }, { "epoch": 1.025117229212127, "grad_norm": 28.798500061035156, "learning_rate": 2.385319652722968e-07, "logits/chosen": -18.45040512084961, "logits/rejected": -17.910552978515625, "logps/chosen": -424.7957458496094, "logps/rejected": -415.25787353515625, "loss": 0.8158, "rewards/accuracies": 0.5, "rewards/chosen": 2.9022529125213623, "rewards/margins": 0.2943686842918396, "rewards/rejected": 2.607884168624878, "step": 22080 }, { "epoch": 1.0255815033195599, "grad_norm": 166.06710815429688, "learning_rate": 2.3850410882585077e-07, "logits/chosen": -18.553714752197266, "logits/rejected": -18.29936408996582, "logps/chosen": -364.829833984375, "logps/rejected": -345.40252685546875, "loss": 0.7685, "rewards/accuracies": 0.5, "rewards/chosen": 2.309417247772217, "rewards/margins": 0.16151303052902222, "rewards/rejected": 2.147904396057129, "step": 22090 }, { "epoch": 1.0260457774269929, "grad_norm": 25.694440841674805, "learning_rate": 2.384762523794048e-07, "logits/chosen": -19.429134368896484, "logits/rejected": -18.353912353515625, "logps/chosen": -454.218017578125, "logps/rejected": -252.9027557373047, "loss": 0.354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2368292808532715, "rewards/margins": 1.6634471416473389, "rewards/rejected": 1.5733819007873535, "step": 22100 }, { "epoch": 1.0265100515344259, "grad_norm": 154.9888916015625, "learning_rate": 2.384483959329588e-07, "logits/chosen": -19.40115737915039, "logits/rejected": -18.97138786315918, "logps/chosen": -444.76373291015625, "logps/rejected": -427.0077209472656, "loss": 0.9169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1536097526550293, "rewards/margins": 0.22118397057056427, "rewards/rejected": 2.9324257373809814, "step": 22110 }, { "epoch": 1.0269743256418589, "grad_norm": 4.1649885177612305, "learning_rate": 
2.3842053948651284e-07, "logits/chosen": -18.142026901245117, "logits/rejected": -17.091276168823242, "logps/chosen": -365.7855224609375, "logps/rejected": -273.1417236328125, "loss": 0.7215, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.072129249572754, "rewards/margins": 1.307166576385498, "rewards/rejected": 1.7649625539779663, "step": 22120 }, { "epoch": 1.027438599749292, "grad_norm": 221.72279357910156, "learning_rate": 2.3839268304006683e-07, "logits/chosen": -19.1533203125, "logits/rejected": -19.196077346801758, "logps/chosen": -511.4110412597656, "logps/rejected": -498.6888122558594, "loss": 0.5355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8026270866394043, "rewards/margins": 0.625287652015686, "rewards/rejected": 3.1773390769958496, "step": 22130 }, { "epoch": 1.027902873856725, "grad_norm": 14.404990196228027, "learning_rate": 2.3836482659362087e-07, "logits/chosen": -18.80169105529785, "logits/rejected": -17.705890655517578, "logps/chosen": -431.042236328125, "logps/rejected": -282.89495849609375, "loss": 0.3624, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5148353576660156, "rewards/margins": 1.451250433921814, "rewards/rejected": 2.063584804534912, "step": 22140 }, { "epoch": 1.028367147964158, "grad_norm": 28.71943473815918, "learning_rate": 2.3833697014717488e-07, "logits/chosen": -19.277177810668945, "logits/rejected": -18.306842803955078, "logps/chosen": -383.7249450683594, "logps/rejected": -346.4025573730469, "loss": 0.9296, "rewards/accuracies": 0.5, "rewards/chosen": 2.6783859729766846, "rewards/margins": -0.058988023549318314, "rewards/rejected": 2.7373738288879395, "step": 22150 }, { "epoch": 1.028831422071591, "grad_norm": 48.65168380737305, "learning_rate": 2.3830911370072892e-07, "logits/chosen": -20.581607818603516, "logits/rejected": -18.7110595703125, "logps/chosen": -379.97821044921875, "logps/rejected": -223.4341583251953, "loss": 0.5431, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 2.132516384124756, "rewards/margins": 0.6525750160217285, "rewards/rejected": 1.4799412488937378, "step": 22160 }, { "epoch": 1.029295696179024, "grad_norm": 29.19335174560547, "learning_rate": 2.382812572542829e-07, "logits/chosen": -18.94596290588379, "logits/rejected": -17.170969009399414, "logps/chosen": -370.4413146972656, "logps/rejected": -207.7113800048828, "loss": 0.3003, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.680997371673584, "rewards/margins": 1.5956003665924072, "rewards/rejected": 1.0853970050811768, "step": 22170 }, { "epoch": 1.029759970286457, "grad_norm": 4.3537983894348145, "learning_rate": 2.3825340080783693e-07, "logits/chosen": -18.566307067871094, "logits/rejected": -18.483882904052734, "logps/chosen": -340.965576171875, "logps/rejected": -371.7527160644531, "loss": 0.7581, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8978993892669678, "rewards/margins": 0.25027403235435486, "rewards/rejected": 2.64762544631958, "step": 22180 }, { "epoch": 1.03022424439389, "grad_norm": 190.77723693847656, "learning_rate": 2.3822554436139097e-07, "logits/chosen": -18.66915512084961, "logits/rejected": -18.098590850830078, "logps/chosen": -486.4130859375, "logps/rejected": -410.07867431640625, "loss": 0.6172, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0565061569213867, "rewards/margins": 0.632944643497467, "rewards/rejected": 2.4235613346099854, "step": 22190 }, { "epoch": 1.0306885185013233, "grad_norm": 39.81309509277344, "learning_rate": 2.3819768791494498e-07, "logits/chosen": -18.663694381713867, "logits/rejected": -18.136938095092773, "logps/chosen": -320.11865234375, "logps/rejected": -276.9496765136719, "loss": 0.6166, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.680727958679199, "rewards/margins": 0.867601752281189, "rewards/rejected": 1.8131262063980103, "step": 22200 }, { "epoch": 1.0311527926087563, "grad_norm": 39.35480499267578, "learning_rate": 
2.38169831468499e-07, "logits/chosen": -19.060588836669922, "logits/rejected": -18.60521125793457, "logps/chosen": -369.92913818359375, "logps/rejected": -329.56756591796875, "loss": 0.6829, "rewards/accuracies": 0.5, "rewards/chosen": 2.378124952316284, "rewards/margins": 0.2669500708580017, "rewards/rejected": 2.111175060272217, "step": 22210 }, { "epoch": 1.0316170667161892, "grad_norm": 96.10916900634766, "learning_rate": 2.38141975022053e-07, "logits/chosen": -19.157970428466797, "logits/rejected": -19.228336334228516, "logps/chosen": -431.6377868652344, "logps/rejected": -415.3260803222656, "loss": 0.6612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3554158210754395, "rewards/margins": 0.5672286152839661, "rewards/rejected": 2.788187026977539, "step": 22220 }, { "epoch": 1.0320813408236222, "grad_norm": 75.7420425415039, "learning_rate": 2.3811411857560702e-07, "logits/chosen": -18.360248565673828, "logits/rejected": -18.087066650390625, "logps/chosen": -365.5277099609375, "logps/rejected": -308.40399169921875, "loss": 0.605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0545480251312256, "rewards/margins": 0.7592307329177856, "rewards/rejected": 2.2953174114227295, "step": 22230 }, { "epoch": 1.0325456149310552, "grad_norm": 133.0104217529297, "learning_rate": 2.3808626212916104e-07, "logits/chosen": -18.616283416748047, "logits/rejected": -17.716909408569336, "logps/chosen": -370.5841064453125, "logps/rejected": -290.87884521484375, "loss": 0.5288, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6749191284179688, "rewards/margins": 1.0414199829101562, "rewards/rejected": 1.6334991455078125, "step": 22240 }, { "epoch": 1.0330098890384882, "grad_norm": 197.6552734375, "learning_rate": 2.3805840568271505e-07, "logits/chosen": -18.988462448120117, "logits/rejected": -19.028575897216797, "logps/chosen": -330.45465087890625, "logps/rejected": -308.516357421875, "loss": 0.9023, "rewards/accuracies": 
0.30000001192092896, "rewards/chosen": 2.864179849624634, "rewards/margins": 0.1281093806028366, "rewards/rejected": 2.736070156097412, "step": 22250 }, { "epoch": 1.0334741631459214, "grad_norm": 195.34864807128906, "learning_rate": 2.380305492362691e-07, "logits/chosen": -18.27051544189453, "logits/rejected": -16.939111709594727, "logps/chosen": -442.276123046875, "logps/rejected": -293.3707580566406, "loss": 0.5978, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4868040084838867, "rewards/margins": 0.7314227819442749, "rewards/rejected": 1.7553813457489014, "step": 22260 }, { "epoch": 1.0339384372533544, "grad_norm": 42.92384338378906, "learning_rate": 2.380026927898231e-07, "logits/chosen": -19.129087448120117, "logits/rejected": -17.64040184020996, "logps/chosen": -392.0390625, "logps/rejected": -300.7165832519531, "loss": 0.7425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.167483329772949, "rewards/margins": 0.9663144946098328, "rewards/rejected": 2.2011687755584717, "step": 22270 }, { "epoch": 1.0344027113607874, "grad_norm": 71.95558166503906, "learning_rate": 2.379748363433771e-07, "logits/chosen": -18.80081558227539, "logits/rejected": -18.864837646484375, "logps/chosen": -351.22161865234375, "logps/rejected": -347.87164306640625, "loss": 0.788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5130159854888916, "rewards/margins": 0.5386803150177002, "rewards/rejected": 1.9743356704711914, "step": 22280 }, { "epoch": 1.0348669854682204, "grad_norm": 156.77536010742188, "learning_rate": 2.3794697989693114e-07, "logits/chosen": -19.531818389892578, "logits/rejected": -19.19968032836914, "logps/chosen": -444.968505859375, "logps/rejected": -417.11285400390625, "loss": 0.6568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.911436080932617, "rewards/margins": 0.19455139338970184, "rewards/rejected": 2.7168846130371094, "step": 22290 }, { "epoch": 1.0353312595756534, "grad_norm": 85.1632308959961, 
"learning_rate": 2.3791912345048515e-07, "logits/chosen": -18.785850524902344, "logits/rejected": -18.129823684692383, "logps/chosen": -384.21807861328125, "logps/rejected": -352.6146545410156, "loss": 0.4899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0446267127990723, "rewards/margins": 0.8552257418632507, "rewards/rejected": 2.1894006729125977, "step": 22300 }, { "epoch": 1.0357955336830864, "grad_norm": 200.79623413085938, "learning_rate": 2.378912670040392e-07, "logits/chosen": -18.190820693969727, "logits/rejected": -17.633804321289062, "logps/chosen": -442.3619079589844, "logps/rejected": -296.3648376464844, "loss": 0.6782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7811672687530518, "rewards/margins": 0.55085688829422, "rewards/rejected": 2.2303099632263184, "step": 22310 }, { "epoch": 1.0362598077905196, "grad_norm": 228.02072143554688, "learning_rate": 2.3786341055759318e-07, "logits/chosen": -17.79266357421875, "logits/rejected": -17.289798736572266, "logps/chosen": -373.49383544921875, "logps/rejected": -297.2209167480469, "loss": 0.6119, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4756743907928467, "rewards/margins": 0.5338881611824036, "rewards/rejected": 1.9417864084243774, "step": 22320 }, { "epoch": 1.0367240818979526, "grad_norm": 1.812130093574524, "learning_rate": 2.378355541111472e-07, "logits/chosen": -18.101192474365234, "logits/rejected": -17.27963638305664, "logps/chosen": -385.779296875, "logps/rejected": -314.6846923828125, "loss": 0.6339, "rewards/accuracies": 0.5, "rewards/chosen": 2.8282384872436523, "rewards/margins": 1.030376672744751, "rewards/rejected": 1.7978618144989014, "step": 22330 }, { "epoch": 1.0371883560053856, "grad_norm": 39.674644470214844, "learning_rate": 2.3780769766470124e-07, "logits/chosen": -18.203510284423828, "logits/rejected": -17.123287200927734, "logps/chosen": -352.7478332519531, "logps/rejected": -248.59762573242188, "loss": 0.5034, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 2.6970202922821045, "rewards/margins": 1.236463189125061, "rewards/rejected": 1.460557460784912, "step": 22340 }, { "epoch": 1.0376526301128186, "grad_norm": 16.94393539428711, "learning_rate": 2.3777984121825525e-07, "logits/chosen": -19.01755142211914, "logits/rejected": -17.89952278137207, "logps/chosen": -542.7056884765625, "logps/rejected": -424.60711669921875, "loss": 0.3616, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.481554985046387, "rewards/margins": 1.3925344944000244, "rewards/rejected": 3.089020252227783, "step": 22350 }, { "epoch": 1.0381169042202516, "grad_norm": 86.05901336669922, "learning_rate": 2.3775198477180927e-07, "logits/chosen": -19.608125686645508, "logits/rejected": -19.060409545898438, "logps/chosen": -451.90155029296875, "logps/rejected": -364.41156005859375, "loss": 0.3546, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8585376739501953, "rewards/margins": 1.8414208889007568, "rewards/rejected": 2.0171170234680176, "step": 22360 }, { "epoch": 1.0385811783276846, "grad_norm": 168.3948516845703, "learning_rate": 2.3772412832536328e-07, "logits/chosen": -18.240781784057617, "logits/rejected": -18.34273338317871, "logps/chosen": -349.36541748046875, "logps/rejected": -376.27117919921875, "loss": 0.8278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9747166633605957, "rewards/margins": 0.028277074918150902, "rewards/rejected": 2.946439266204834, "step": 22370 }, { "epoch": 1.0390454524351176, "grad_norm": 4.181264877319336, "learning_rate": 2.3769627187891732e-07, "logits/chosen": -18.20564079284668, "logits/rejected": -17.345882415771484, "logps/chosen": -352.3907775878906, "logps/rejected": -254.8579559326172, "loss": 0.3609, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8032119274139404, "rewards/margins": 1.3263704776763916, "rewards/rejected": 1.4768418073654175, "step": 22380 }, { "epoch": 1.0395097265425508, "grad_norm": 
25.71256446838379, "learning_rate": 2.3766841543247134e-07, "logits/chosen": -18.570499420166016, "logits/rejected": -16.945842742919922, "logps/chosen": -365.1255187988281, "logps/rejected": -193.72329711914062, "loss": 0.3899, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9216630458831787, "rewards/margins": 1.7520465850830078, "rewards/rejected": 1.16961669921875, "step": 22390 }, { "epoch": 1.0399740006499838, "grad_norm": 269.7687683105469, "learning_rate": 2.3764055898602532e-07, "logits/chosen": -18.351478576660156, "logits/rejected": -17.52964973449707, "logps/chosen": -379.67413330078125, "logps/rejected": -378.3522033691406, "loss": 0.9632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9671077728271484, "rewards/margins": 0.32856667041778564, "rewards/rejected": 2.638540744781494, "step": 22400 }, { "epoch": 1.0404382747574168, "grad_norm": 7.463982582092285, "learning_rate": 2.3761270253957937e-07, "logits/chosen": -19.584571838378906, "logits/rejected": -18.29166030883789, "logps/chosen": -465.5724182128906, "logps/rejected": -330.5075988769531, "loss": 0.4591, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.569514513015747, "rewards/margins": 0.8744619488716125, "rewards/rejected": 2.6950523853302, "step": 22410 }, { "epoch": 1.0409025488648498, "grad_norm": 25.31884765625, "learning_rate": 2.3758484609313338e-07, "logits/chosen": -19.266834259033203, "logits/rejected": -18.782329559326172, "logps/chosen": -327.3034973144531, "logps/rejected": -306.2454528808594, "loss": 0.6112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.873591661453247, "rewards/margins": 0.6912603378295898, "rewards/rejected": 2.1823313236236572, "step": 22420 }, { "epoch": 1.0413668229722828, "grad_norm": 69.78288269042969, "learning_rate": 2.3755698964668737e-07, "logits/chosen": -18.841995239257812, "logits/rejected": -17.719818115234375, "logps/chosen": -436.15338134765625, "logps/rejected": -291.1514892578125, 
"loss": 0.5554, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3140759468078613, "rewards/margins": 0.6679633855819702, "rewards/rejected": 1.6461127996444702, "step": 22430 }, { "epoch": 1.0418310970797158, "grad_norm": 128.88494873046875, "learning_rate": 2.375291332002414e-07, "logits/chosen": -19.567333221435547, "logits/rejected": -19.22487449645996, "logps/chosen": -539.9241333007812, "logps/rejected": -480.86761474609375, "loss": 0.6497, "rewards/accuracies": 0.5, "rewards/chosen": 3.4938979148864746, "rewards/margins": 0.4235815405845642, "rewards/rejected": 3.0703165531158447, "step": 22440 }, { "epoch": 1.042295371187149, "grad_norm": 70.1510009765625, "learning_rate": 2.3750127675379542e-07, "logits/chosen": -18.690837860107422, "logits/rejected": -18.353553771972656, "logps/chosen": -375.75262451171875, "logps/rejected": -258.4176025390625, "loss": 0.5567, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.254847764968872, "rewards/margins": 0.8297477960586548, "rewards/rejected": 2.4250998497009277, "step": 22450 }, { "epoch": 1.042759645294582, "grad_norm": 50.44823455810547, "learning_rate": 2.3747342030734946e-07, "logits/chosen": -18.434040069580078, "logits/rejected": -18.51473045349121, "logps/chosen": -313.19427490234375, "logps/rejected": -291.34222412109375, "loss": 0.7704, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3103461265563965, "rewards/margins": 0.353920042514801, "rewards/rejected": 1.9564260244369507, "step": 22460 }, { "epoch": 1.043223919402015, "grad_norm": 194.40823364257812, "learning_rate": 2.3744556386090345e-07, "logits/chosen": -20.311235427856445, "logits/rejected": -19.330303192138672, "logps/chosen": -631.0428466796875, "logps/rejected": -464.9229431152344, "loss": 0.8467, "rewards/accuracies": 0.5, "rewards/chosen": 3.8197131156921387, "rewards/margins": 0.34767770767211914, "rewards/rejected": 3.4720351696014404, "step": 22470 }, { "epoch": 1.043688193509448, "grad_norm": 
24.266706466674805, "learning_rate": 2.3741770741445747e-07, "logits/chosen": -18.525667190551758, "logits/rejected": -17.43465805053711, "logps/chosen": -326.94036865234375, "logps/rejected": -233.6077880859375, "loss": 0.578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.093010425567627, "rewards/margins": 0.49667805433273315, "rewards/rejected": 1.5963325500488281, "step": 22480 }, { "epoch": 1.044152467616881, "grad_norm": 38.38383102416992, "learning_rate": 2.373898509680115e-07, "logits/chosen": -19.706449508666992, "logits/rejected": -19.185260772705078, "logps/chosen": -432.50567626953125, "logps/rejected": -399.64202880859375, "loss": 0.7339, "rewards/accuracies": 0.5, "rewards/chosen": 3.1599345207214355, "rewards/margins": 0.33825960755348206, "rewards/rejected": 2.8216748237609863, "step": 22490 }, { "epoch": 1.044616741724314, "grad_norm": 120.74862670898438, "learning_rate": 2.3736199452156552e-07, "logits/chosen": -18.49054718017578, "logits/rejected": -18.320873260498047, "logps/chosen": -485.656494140625, "logps/rejected": -410.2786560058594, "loss": 0.6253, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.760690212249756, "rewards/margins": 0.8099912405014038, "rewards/rejected": 1.9506992101669312, "step": 22500 }, { "epoch": 1.0450810158317472, "grad_norm": 54.18418502807617, "learning_rate": 2.3733413807511954e-07, "logits/chosen": -18.751623153686523, "logits/rejected": -17.818920135498047, "logps/chosen": -545.0924072265625, "logps/rejected": -477.041015625, "loss": 0.8936, "rewards/accuracies": 0.5, "rewards/chosen": 3.380005359649658, "rewards/margins": 0.8075555562973022, "rewards/rejected": 2.5724499225616455, "step": 22510 }, { "epoch": 1.0455452899391802, "grad_norm": 29.686321258544922, "learning_rate": 2.3730628162867355e-07, "logits/chosen": -18.872655868530273, "logits/rejected": -18.361854553222656, "logps/chosen": -445.2152404785156, "logps/rejected": -349.39251708984375, "loss": 0.7016, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.056579828262329, "rewards/margins": 0.4906177520751953, "rewards/rejected": 2.565962076187134, "step": 22520 }, { "epoch": 1.0460095640466132, "grad_norm": 62.37142562866211, "learning_rate": 2.372784251822276e-07, "logits/chosen": -18.63852882385254, "logits/rejected": -18.05098533630371, "logps/chosen": -305.6545715332031, "logps/rejected": -240.431396484375, "loss": 0.6045, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0724475383758545, "rewards/margins": 0.5620837211608887, "rewards/rejected": 1.5103639364242554, "step": 22530 }, { "epoch": 1.0464738381540462, "grad_norm": 121.98523712158203, "learning_rate": 2.372505687357816e-07, "logits/chosen": -17.998950958251953, "logits/rejected": -17.705596923828125, "logps/chosen": -489.0765686035156, "logps/rejected": -327.5035095214844, "loss": 0.3371, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.351386547088623, "rewards/margins": 1.6178340911865234, "rewards/rejected": 1.7335525751113892, "step": 22540 }, { "epoch": 1.0469381122614791, "grad_norm": 42.16164016723633, "learning_rate": 2.372227122893356e-07, "logits/chosen": -18.623111724853516, "logits/rejected": -17.991535186767578, "logps/chosen": -451.9461975097656, "logps/rejected": -376.21063232421875, "loss": 0.779, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.884143352508545, "rewards/margins": 0.4171174466609955, "rewards/rejected": 2.4670255184173584, "step": 22550 }, { "epoch": 1.0474023863689121, "grad_norm": 70.50128173828125, "learning_rate": 2.3719485584288964e-07, "logits/chosen": -18.199676513671875, "logits/rejected": -18.025419235229492, "logps/chosen": -412.80291748046875, "logps/rejected": -367.09918212890625, "loss": 0.8952, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.823385715484619, "rewards/margins": -0.06235436350107193, "rewards/rejected": 2.885739803314209, "step": 22560 }, { "epoch": 1.0478666604763451, 
"grad_norm": 36.415245056152344, "learning_rate": 2.3716699939644365e-07, "logits/chosen": -18.517751693725586, "logits/rejected": -17.144147872924805, "logps/chosen": -387.17486572265625, "logps/rejected": -312.48614501953125, "loss": 0.8331, "rewards/accuracies": 0.5, "rewards/chosen": 2.697721004486084, "rewards/margins": 0.30194956064224243, "rewards/rejected": 2.3957715034484863, "step": 22570 }, { "epoch": 1.0483309345837784, "grad_norm": 30.98571014404297, "learning_rate": 2.371391429499977e-07, "logits/chosen": -17.990482330322266, "logits/rejected": -17.02402114868164, "logps/chosen": -397.22528076171875, "logps/rejected": -303.03533935546875, "loss": 0.5347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4348740577697754, "rewards/margins": 1.0387073755264282, "rewards/rejected": 2.3961663246154785, "step": 22580 }, { "epoch": 1.0487952086912113, "grad_norm": 152.0569305419922, "learning_rate": 2.3711128650355168e-07, "logits/chosen": -18.693105697631836, "logits/rejected": -17.091449737548828, "logps/chosen": -497.01885986328125, "logps/rejected": -289.9069519042969, "loss": 0.4703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.97131609916687, "rewards/margins": 1.5226318836212158, "rewards/rejected": 2.4486842155456543, "step": 22590 }, { "epoch": 1.0492594827986443, "grad_norm": 186.28172302246094, "learning_rate": 2.370834300571057e-07, "logits/chosen": -17.989299774169922, "logits/rejected": -18.391080856323242, "logps/chosen": -257.55499267578125, "logps/rejected": -315.73150634765625, "loss": 1.223, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9810192584991455, "rewards/margins": -0.40555962920188904, "rewards/rejected": 2.3865790367126465, "step": 22600 }, { "epoch": 1.0497237569060773, "grad_norm": 178.71397399902344, "learning_rate": 2.3705557361065974e-07, "logits/chosen": -18.292999267578125, "logits/rejected": -17.283336639404297, "logps/chosen": -386.9603271484375, "logps/rejected": 
-242.3845977783203, "loss": 0.7468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3354601860046387, "rewards/margins": 1.2281397581100464, "rewards/rejected": 2.1073203086853027, "step": 22610 }, { "epoch": 1.0501880310135103, "grad_norm": 193.86761474609375, "learning_rate": 2.3702771716421372e-07, "logits/chosen": -18.44148063659668, "logits/rejected": -17.93107795715332, "logps/chosen": -458.064208984375, "logps/rejected": -427.62646484375, "loss": 0.9307, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1982219219207764, "rewards/margins": 0.16966864466667175, "rewards/rejected": 3.0285534858703613, "step": 22620 }, { "epoch": 1.0506523051209433, "grad_norm": 18.690608978271484, "learning_rate": 2.3699986071776777e-07, "logits/chosen": -19.97869300842285, "logits/rejected": -17.655929565429688, "logps/chosen": -393.6239318847656, "logps/rejected": -230.75888061523438, "loss": 0.4095, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0947024822235107, "rewards/margins": 1.3443071842193604, "rewards/rejected": 1.7503955364227295, "step": 22630 }, { "epoch": 1.0511165792283765, "grad_norm": 35.40564727783203, "learning_rate": 2.3697200427132178e-07, "logits/chosen": -18.884231567382812, "logits/rejected": -18.422176361083984, "logps/chosen": -303.64959716796875, "logps/rejected": -286.3586120605469, "loss": 0.6592, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.038280963897705, "rewards/margins": 0.3554307222366333, "rewards/rejected": 1.6828498840332031, "step": 22640 }, { "epoch": 1.0515808533358095, "grad_norm": 22.5025634765625, "learning_rate": 2.369441478248758e-07, "logits/chosen": -19.088733673095703, "logits/rejected": -18.07349395751953, "logps/chosen": -412.37646484375, "logps/rejected": -340.94561767578125, "loss": 0.6083, "rewards/accuracies": 0.5, "rewards/chosen": 2.921236276626587, "rewards/margins": 0.7261257171630859, "rewards/rejected": 2.19511079788208, "step": 22650 }, { "epoch": 
1.0520451274432425, "grad_norm": 26.81976318359375, "learning_rate": 2.369162913784298e-07, "logits/chosen": -19.330421447753906, "logits/rejected": -18.11387825012207, "logps/chosen": -412.4649353027344, "logps/rejected": -273.2723388671875, "loss": 0.5241, "rewards/accuracies": 0.5, "rewards/chosen": 3.0724334716796875, "rewards/margins": 1.485049843788147, "rewards/rejected": 1.5873836278915405, "step": 22660 }, { "epoch": 1.0525094015506755, "grad_norm": 208.90927124023438, "learning_rate": 2.3688843493198382e-07, "logits/chosen": -20.236120223999023, "logits/rejected": -18.635738372802734, "logps/chosen": -531.284423828125, "logps/rejected": -447.91473388671875, "loss": 0.7474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.309629440307617, "rewards/margins": 0.517706036567688, "rewards/rejected": 2.791923999786377, "step": 22670 }, { "epoch": 1.0529736756581085, "grad_norm": 46.66632843017578, "learning_rate": 2.3686057848553786e-07, "logits/chosen": -18.90218734741211, "logits/rejected": -18.383607864379883, "logps/chosen": -522.3806762695312, "logps/rejected": -454.63787841796875, "loss": 0.6879, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3550868034362793, "rewards/margins": 0.4576565623283386, "rewards/rejected": 2.897429943084717, "step": 22680 }, { "epoch": 1.0534379497655415, "grad_norm": 34.47146224975586, "learning_rate": 2.3683272203909188e-07, "logits/chosen": -18.14767074584961, "logits/rejected": -18.19900131225586, "logps/chosen": -359.2087097167969, "logps/rejected": -371.2052307128906, "loss": 0.6245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7577805519104004, "rewards/margins": 0.4568510949611664, "rewards/rejected": 2.3009297847747803, "step": 22690 }, { "epoch": 1.0539022238729745, "grad_norm": 148.430908203125, "learning_rate": 2.3680486559264587e-07, "logits/chosen": -18.72308349609375, "logits/rejected": -18.660079956054688, "logps/chosen": -408.882568359375, "logps/rejected": 
-483.848876953125, "loss": 1.3692, "rewards/accuracies": 0.5, "rewards/chosen": 3.701925277709961, "rewards/margins": -0.1646738350391388, "rewards/rejected": 3.8665993213653564, "step": 22700 }, { "epoch": 1.0543664979804077, "grad_norm": 103.25174713134766, "learning_rate": 2.367770091461999e-07, "logits/chosen": -19.23680305480957, "logits/rejected": -18.96704864501953, "logps/chosen": -332.4515075683594, "logps/rejected": -285.7466125488281, "loss": 0.7137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.119955539703369, "rewards/margins": 0.5425604581832886, "rewards/rejected": 1.5773952007293701, "step": 22710 }, { "epoch": 1.0548307720878407, "grad_norm": 193.85720825195312, "learning_rate": 2.3674915269975392e-07, "logits/chosen": -18.393497467041016, "logits/rejected": -17.85983657836914, "logps/chosen": -344.0680236816406, "logps/rejected": -305.6195983886719, "loss": 0.6396, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1733791828155518, "rewards/margins": 0.5883447527885437, "rewards/rejected": 1.5850343704223633, "step": 22720 }, { "epoch": 1.0552950461952737, "grad_norm": 63.51322555541992, "learning_rate": 2.3672129625330796e-07, "logits/chosen": -18.368183135986328, "logits/rejected": -18.145214080810547, "logps/chosen": -375.8562316894531, "logps/rejected": -336.621826171875, "loss": 0.8199, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2122859954833984, "rewards/margins": 0.05428602546453476, "rewards/rejected": 2.1579999923706055, "step": 22730 }, { "epoch": 1.0557593203027067, "grad_norm": 110.78101348876953, "learning_rate": 2.3669343980686195e-07, "logits/chosen": -18.555330276489258, "logits/rejected": -18.489770889282227, "logps/chosen": -365.879150390625, "logps/rejected": -394.0786437988281, "loss": 0.7796, "rewards/accuracies": 0.5, "rewards/chosen": 2.831874370574951, "rewards/margins": 0.5249431729316711, "rewards/rejected": 2.306931257247925, "step": 22740 }, { "epoch": 1.0562235944101397, 
"grad_norm": 60.301231384277344, "learning_rate": 2.3666558336041597e-07, "logits/chosen": -18.60061264038086, "logits/rejected": -18.402950286865234, "logps/chosen": -355.51336669921875, "logps/rejected": -304.2140808105469, "loss": 0.8679, "rewards/accuracies": 0.5, "rewards/chosen": 3.0812277793884277, "rewards/margins": 0.25936752557754517, "rewards/rejected": 2.8218600749969482, "step": 22750 }, { "epoch": 1.0566878685175727, "grad_norm": 153.74652099609375, "learning_rate": 2.3663772691397e-07, "logits/chosen": -19.81959342956543, "logits/rejected": -18.5766658782959, "logps/chosen": -396.7963562011719, "logps/rejected": -375.32025146484375, "loss": 0.847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.114222288131714, "rewards/margins": 0.5709769129753113, "rewards/rejected": 2.543245315551758, "step": 22760 }, { "epoch": 1.057152142625006, "grad_norm": 100.56706237792969, "learning_rate": 2.3660987046752402e-07, "logits/chosen": -19.463642120361328, "logits/rejected": -18.92176055908203, "logps/chosen": -449.1792907714844, "logps/rejected": -363.8092346191406, "loss": 0.6521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4960620403289795, "rewards/margins": 0.7038542628288269, "rewards/rejected": 2.792207717895508, "step": 22770 }, { "epoch": 1.057616416732439, "grad_norm": 19.240840911865234, "learning_rate": 2.3658201402107804e-07, "logits/chosen": -18.95403480529785, "logits/rejected": -17.886390686035156, "logps/chosen": -430.56256103515625, "logps/rejected": -340.3604736328125, "loss": 0.3437, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1521434783935547, "rewards/margins": 1.3798741102218628, "rewards/rejected": 1.772269606590271, "step": 22780 }, { "epoch": 1.0580806908398719, "grad_norm": 7.074061870574951, "learning_rate": 2.3655415757463205e-07, "logits/chosen": -19.046043395996094, "logits/rejected": -18.095726013183594, "logps/chosen": -452.6675720214844, "logps/rejected": -355.0009460449219, "loss": 
0.5245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.152867078781128, "rewards/margins": 1.1837496757507324, "rewards/rejected": 1.9691177606582642, "step": 22790 }, { "epoch": 1.0585449649473049, "grad_norm": 31.81899642944336, "learning_rate": 2.365263011281861e-07, "logits/chosen": -18.450246810913086, "logits/rejected": -17.554807662963867, "logps/chosen": -369.4458923339844, "logps/rejected": -295.4534606933594, "loss": 0.4991, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0217981338500977, "rewards/margins": 0.9409278035163879, "rewards/rejected": 2.0808701515197754, "step": 22800 }, { "epoch": 1.0590092390547379, "grad_norm": 178.4234619140625, "learning_rate": 2.3649844468174008e-07, "logits/chosen": -18.05059242248535, "logits/rejected": -17.004756927490234, "logps/chosen": -328.74639892578125, "logps/rejected": -252.0745849609375, "loss": 0.6299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7929465770721436, "rewards/margins": 0.48864665627479553, "rewards/rejected": 1.3042999505996704, "step": 22810 }, { "epoch": 1.0594735131621709, "grad_norm": 5.715762615203857, "learning_rate": 2.364705882352941e-07, "logits/chosen": -17.531381607055664, "logits/rejected": -16.802778244018555, "logps/chosen": -335.2543029785156, "logps/rejected": -214.5463104248047, "loss": 0.5435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.428896427154541, "rewards/margins": 0.9839097857475281, "rewards/rejected": 1.4449865818023682, "step": 22820 }, { "epoch": 1.059937787269604, "grad_norm": 102.41553497314453, "learning_rate": 2.3644273178884814e-07, "logits/chosen": -19.110214233398438, "logits/rejected": -18.264352798461914, "logps/chosen": -470.4471740722656, "logps/rejected": -368.8863220214844, "loss": 0.4724, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.583461046218872, "rewards/margins": 0.9425790905952454, "rewards/rejected": 2.6408820152282715, "step": 22830 }, { "epoch": 1.060402061377037, 
"grad_norm": 18.438276290893555, "learning_rate": 2.3641487534240215e-07, "logits/chosen": -18.178279876708984, "logits/rejected": -17.671995162963867, "logps/chosen": -307.08428955078125, "logps/rejected": -310.075439453125, "loss": 0.662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4925835132598877, "rewards/margins": 0.6463577151298523, "rewards/rejected": 1.8462259769439697, "step": 22840 }, { "epoch": 1.06086633548447, "grad_norm": 57.34502029418945, "learning_rate": 2.3638701889595614e-07, "logits/chosen": -19.687110900878906, "logits/rejected": -18.448570251464844, "logps/chosen": -463.8622131347656, "logps/rejected": -379.8113708496094, "loss": 0.5671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.111380577087402, "rewards/margins": 1.1165971755981445, "rewards/rejected": 2.9947831630706787, "step": 22850 }, { "epoch": 1.061330609591903, "grad_norm": 119.26652526855469, "learning_rate": 2.3635916244951018e-07, "logits/chosen": -18.91383171081543, "logits/rejected": -18.018585205078125, "logps/chosen": -492.4327697753906, "logps/rejected": -398.9065246582031, "loss": 0.5884, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.394979476928711, "rewards/margins": 0.7690953016281128, "rewards/rejected": 2.6258842945098877, "step": 22860 }, { "epoch": 1.061794883699336, "grad_norm": 138.16302490234375, "learning_rate": 2.363313060030642e-07, "logits/chosen": -18.731584548950195, "logits/rejected": -18.05683708190918, "logps/chosen": -336.60589599609375, "logps/rejected": -321.2329406738281, "loss": 0.7645, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.466803550720215, "rewards/margins": 0.5509889721870422, "rewards/rejected": 1.9158146381378174, "step": 22870 }, { "epoch": 1.062259157806769, "grad_norm": 76.5791015625, "learning_rate": 2.3630344955661823e-07, "logits/chosen": -18.410114288330078, "logits/rejected": -17.902881622314453, "logps/chosen": -396.86309814453125, "logps/rejected": 
-322.0843200683594, "loss": 0.4369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2876598834991455, "rewards/margins": 1.1598217487335205, "rewards/rejected": 2.127838134765625, "step": 22880 }, { "epoch": 1.062723431914202, "grad_norm": 21.808473587036133, "learning_rate": 2.3627559311017222e-07, "logits/chosen": -18.10752296447754, "logits/rejected": -16.87574005126953, "logps/chosen": -297.68255615234375, "logps/rejected": -195.80628967285156, "loss": 0.3152, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7284159660339355, "rewards/margins": 1.5221318006515503, "rewards/rejected": 1.2062841653823853, "step": 22890 }, { "epoch": 1.0631877060216353, "grad_norm": 220.0042724609375, "learning_rate": 2.3624773666372624e-07, "logits/chosen": -18.493276596069336, "logits/rejected": -17.389484405517578, "logps/chosen": -467.64190673828125, "logps/rejected": -271.7931213378906, "loss": 0.4201, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4813730716705322, "rewards/margins": 0.9683625102043152, "rewards/rejected": 1.5130105018615723, "step": 22900 }, { "epoch": 1.0636519801290683, "grad_norm": 69.50721740722656, "learning_rate": 2.3621988021728028e-07, "logits/chosen": -18.13918685913086, "logits/rejected": -17.40239715576172, "logps/chosen": -440.83612060546875, "logps/rejected": -320.0356140136719, "loss": 0.492, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.658188819885254, "rewards/margins": 1.0548591613769531, "rewards/rejected": 1.6033296585083008, "step": 22910 }, { "epoch": 1.0641162542365012, "grad_norm": 26.799304962158203, "learning_rate": 2.361920237708343e-07, "logits/chosen": -17.68588638305664, "logits/rejected": -16.656511306762695, "logps/chosen": -424.44049072265625, "logps/rejected": -299.7814025878906, "loss": 0.4121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9011502265930176, "rewards/margins": 1.261307716369629, "rewards/rejected": 2.6398422718048096, "step": 22920 }, { 
"epoch": 1.0645805283439342, "grad_norm": 79.78792572021484, "learning_rate": 2.361641673243883e-07, "logits/chosen": -19.059764862060547, "logits/rejected": -18.264461517333984, "logps/chosen": -491.2911071777344, "logps/rejected": -348.03460693359375, "loss": 0.9643, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.786557197570801, "rewards/margins": 0.30503249168395996, "rewards/rejected": 2.481524705886841, "step": 22930 }, { "epoch": 1.0650448024513672, "grad_norm": 42.642391204833984, "learning_rate": 2.3613631087794232e-07, "logits/chosen": -19.007801055908203, "logits/rejected": -18.75229835510254, "logps/chosen": -323.1712951660156, "logps/rejected": -281.22418212890625, "loss": 0.6699, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.568300485610962, "rewards/margins": 0.4267595410346985, "rewards/rejected": 2.14154052734375, "step": 22940 }, { "epoch": 1.0655090765588002, "grad_norm": 56.661460876464844, "learning_rate": 2.3610845443149636e-07, "logits/chosen": -19.399307250976562, "logits/rejected": -18.378765106201172, "logps/chosen": -397.2916259765625, "logps/rejected": -253.3761444091797, "loss": 0.4821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9248948097229004, "rewards/margins": 1.1725928783416748, "rewards/rejected": 1.7523019313812256, "step": 22950 }, { "epoch": 1.0659733506662334, "grad_norm": 20.683012008666992, "learning_rate": 2.3608059798505038e-07, "logits/chosen": -18.525859832763672, "logits/rejected": -18.018770217895508, "logps/chosen": -362.086181640625, "logps/rejected": -367.0615539550781, "loss": 0.5975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1820530891418457, "rewards/margins": 0.5306394696235657, "rewards/rejected": 1.6514135599136353, "step": 22960 }, { "epoch": 1.0664376247736664, "grad_norm": 190.337890625, "learning_rate": 2.3605274153860437e-07, "logits/chosen": -18.404558181762695, "logits/rejected": -18.01307487487793, "logps/chosen": -509.56976318359375, 
"logps/rejected": -465.2068786621094, "loss": 0.5715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8405327796936035, "rewards/margins": 0.7353305220603943, "rewards/rejected": 3.1052021980285645, "step": 22970 }, { "epoch": 1.0669018988810994, "grad_norm": 138.3874053955078, "learning_rate": 2.360248850921584e-07, "logits/chosen": -19.095705032348633, "logits/rejected": -18.880308151245117, "logps/chosen": -347.07672119140625, "logps/rejected": -303.96856689453125, "loss": 0.5001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.991896152496338, "rewards/margins": 0.7358840107917786, "rewards/rejected": 2.256011962890625, "step": 22980 }, { "epoch": 1.0673661729885324, "grad_norm": 65.34537506103516, "learning_rate": 2.3599702864571242e-07, "logits/chosen": -18.786563873291016, "logits/rejected": -18.64527130126953, "logps/chosen": -377.3655090332031, "logps/rejected": -353.7533264160156, "loss": 0.6824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7854509353637695, "rewards/margins": 0.36231356859207153, "rewards/rejected": 2.4231371879577637, "step": 22990 }, { "epoch": 1.0678304470959654, "grad_norm": 287.08551025390625, "learning_rate": 2.359691721992664e-07, "logits/chosen": -19.98164939880371, "logits/rejected": -18.919864654541016, "logps/chosen": -524.4254150390625, "logps/rejected": -402.9707336425781, "loss": 0.5541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6278388500213623, "rewards/margins": 1.0518122911453247, "rewards/rejected": 2.576026439666748, "step": 23000 }, { "epoch": 1.0682947212033984, "grad_norm": 205.00331115722656, "learning_rate": 2.3594131575282045e-07, "logits/chosen": -19.5206356048584, "logits/rejected": -18.476497650146484, "logps/chosen": -372.75018310546875, "logps/rejected": -294.26239013671875, "loss": 0.7453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.725442886352539, "rewards/margins": 0.8335418701171875, "rewards/rejected": 1.8919010162353516, 
"step": 23010 }, { "epoch": 1.0687589953108314, "grad_norm": 278.40869140625, "learning_rate": 2.3591345930637446e-07, "logits/chosen": -18.46686553955078, "logits/rejected": -17.149744033813477, "logps/chosen": -462.9498596191406, "logps/rejected": -325.8370056152344, "loss": 0.5774, "rewards/accuracies": 0.5, "rewards/chosen": 3.265465497970581, "rewards/margins": 1.1674377918243408, "rewards/rejected": 2.0980277061462402, "step": 23020 }, { "epoch": 1.0692232694182646, "grad_norm": 65.3451919555664, "learning_rate": 2.358856028599285e-07, "logits/chosen": -17.549230575561523, "logits/rejected": -17.72077178955078, "logps/chosen": -299.3705139160156, "logps/rejected": -335.36767578125, "loss": 1.4569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2437644004821777, "rewards/margins": -0.5322353839874268, "rewards/rejected": 2.7759995460510254, "step": 23030 }, { "epoch": 1.0696875435256976, "grad_norm": 99.57756805419922, "learning_rate": 2.358577464134825e-07, "logits/chosen": -18.70667266845703, "logits/rejected": -17.691326141357422, "logps/chosen": -325.52178955078125, "logps/rejected": -215.85873413085938, "loss": 0.5369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6818270683288574, "rewards/margins": 1.2174313068389893, "rewards/rejected": 1.4643959999084473, "step": 23040 }, { "epoch": 1.0701518176331306, "grad_norm": 88.32393646240234, "learning_rate": 2.3582988996703653e-07, "logits/chosen": -18.979265213012695, "logits/rejected": -18.967958450317383, "logps/chosen": -299.1240539550781, "logps/rejected": -318.1752014160156, "loss": 0.8228, "rewards/accuracies": 0.5, "rewards/chosen": 2.3564341068267822, "rewards/margins": 0.15899844467639923, "rewards/rejected": 2.1974358558654785, "step": 23050 }, { "epoch": 1.0706160917405636, "grad_norm": 19.284992218017578, "learning_rate": 2.3580203352059055e-07, "logits/chosen": -19.202350616455078, "logits/rejected": -18.17324447631836, "logps/chosen": -448.48565673828125, 
"logps/rejected": -356.39898681640625, "loss": 0.5411, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.091076850891113, "rewards/margins": 1.56282639503479, "rewards/rejected": 2.528250217437744, "step": 23060 }, { "epoch": 1.0710803658479966, "grad_norm": 52.57805252075195, "learning_rate": 2.3577417707414456e-07, "logits/chosen": -18.55828857421875, "logits/rejected": -17.615100860595703, "logps/chosen": -420.8431701660156, "logps/rejected": -330.87847900390625, "loss": 0.3657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.634986400604248, "rewards/margins": 1.1962449550628662, "rewards/rejected": 2.43874192237854, "step": 23070 }, { "epoch": 1.0715446399554296, "grad_norm": 176.92079162597656, "learning_rate": 2.3574632062769858e-07, "logits/chosen": -18.329471588134766, "logits/rejected": -17.928464889526367, "logps/chosen": -284.99932861328125, "logps/rejected": -243.9373016357422, "loss": 0.8461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.193469524383545, "rewards/margins": 0.18151581287384033, "rewards/rejected": 2.011953592300415, "step": 23080 }, { "epoch": 1.0720089140628628, "grad_norm": 43.40766906738281, "learning_rate": 2.357184641812526e-07, "logits/chosen": -19.07125473022461, "logits/rejected": -18.400663375854492, "logps/chosen": -411.97161865234375, "logps/rejected": -315.4972229003906, "loss": 0.6676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.155534267425537, "rewards/margins": 0.7917765974998474, "rewards/rejected": 2.363757848739624, "step": 23090 }, { "epoch": 1.0724731881702958, "grad_norm": 259.7353820800781, "learning_rate": 2.3569060773480663e-07, "logits/chosen": -18.384191513061523, "logits/rejected": -18.33824348449707, "logps/chosen": -312.6724548339844, "logps/rejected": -321.204345703125, "loss": 0.9877, "rewards/accuracies": 0.5, "rewards/chosen": 2.0618627071380615, "rewards/margins": -0.021475184708833694, "rewards/rejected": 2.0833377838134766, "step": 23100 }, { 
"epoch": 1.0729374622777288, "grad_norm": 83.23680877685547, "learning_rate": 2.3566275128836065e-07, "logits/chosen": -18.62158203125, "logits/rejected": -18.80561637878418, "logps/chosen": -386.215087890625, "logps/rejected": -428.1329040527344, "loss": 1.1487, "rewards/accuracies": 0.5, "rewards/chosen": 2.9568350315093994, "rewards/margins": -0.4262072443962097, "rewards/rejected": 3.383042097091675, "step": 23110 }, { "epoch": 1.0734017363851618, "grad_norm": 170.77964782714844, "learning_rate": 2.3563489484191464e-07, "logits/chosen": -19.088817596435547, "logits/rejected": -17.60464859008789, "logps/chosen": -423.799560546875, "logps/rejected": -278.79046630859375, "loss": 0.4966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.198572635650635, "rewards/margins": 1.731958031654358, "rewards/rejected": 2.4666144847869873, "step": 23120 }, { "epoch": 1.0738660104925948, "grad_norm": 110.48435974121094, "learning_rate": 2.3560703839546868e-07, "logits/chosen": -18.33454704284668, "logits/rejected": -17.238418579101562, "logps/chosen": -509.83807373046875, "logps/rejected": -314.35638427734375, "loss": 0.4407, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.54822039604187, "rewards/margins": 1.3147103786468506, "rewards/rejected": 2.2335104942321777, "step": 23130 }, { "epoch": 1.0743302846000278, "grad_norm": 49.12064743041992, "learning_rate": 2.355791819490227e-07, "logits/chosen": -19.476634979248047, "logits/rejected": -19.145435333251953, "logps/chosen": -478.1319274902344, "logps/rejected": -339.79034423828125, "loss": 0.4979, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2158679962158203, "rewards/margins": 0.6336815357208252, "rewards/rejected": 2.582186222076416, "step": 23140 }, { "epoch": 1.074794558707461, "grad_norm": 29.75376319885254, "learning_rate": 2.3555132550257673e-07, "logits/chosen": -18.953847885131836, "logits/rejected": -17.617969512939453, "logps/chosen": -485.27093505859375, "logps/rejected": 
-361.1050720214844, "loss": 0.4834, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.052980899810791, "rewards/margins": 1.7373348474502563, "rewards/rejected": 2.315645694732666, "step": 23150 }, { "epoch": 1.075258832814894, "grad_norm": 54.50069046020508, "learning_rate": 2.3552346905613072e-07, "logits/chosen": -19.753116607666016, "logits/rejected": -18.154056549072266, "logps/chosen": -491.08331298828125, "logps/rejected": -403.36907958984375, "loss": 0.2895, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.588856220245361, "rewards/margins": 1.9777024984359741, "rewards/rejected": 2.6111531257629395, "step": 23160 }, { "epoch": 1.075723106922327, "grad_norm": 139.59303283691406, "learning_rate": 2.3549561260968474e-07, "logits/chosen": -18.460453033447266, "logits/rejected": -18.3945255279541, "logps/chosen": -325.2479553222656, "logps/rejected": -288.76654052734375, "loss": 0.7393, "rewards/accuracies": 0.5, "rewards/chosen": 1.9865754842758179, "rewards/margins": 0.10667155683040619, "rewards/rejected": 1.879903793334961, "step": 23170 }, { "epoch": 1.07618738102976, "grad_norm": 59.55826950073242, "learning_rate": 2.3546775616323878e-07, "logits/chosen": -18.174785614013672, "logits/rejected": -17.694812774658203, "logps/chosen": -376.60504150390625, "logps/rejected": -281.19586181640625, "loss": 0.5822, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4093124866485596, "rewards/margins": 1.085223913192749, "rewards/rejected": 2.3240885734558105, "step": 23180 }, { "epoch": 1.076651655137193, "grad_norm": 138.04620361328125, "learning_rate": 2.354398997167928e-07, "logits/chosen": -18.401885986328125, "logits/rejected": -17.239225387573242, "logps/chosen": -453.44586181640625, "logps/rejected": -299.07366943359375, "loss": 0.4517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.534635543823242, "rewards/margins": 1.5907974243164062, "rewards/rejected": 1.9438377618789673, "step": 23190 }, { "epoch": 
1.077115929244626, "grad_norm": 125.43761444091797, "learning_rate": 2.354120432703468e-07, "logits/chosen": -18.44120979309082, "logits/rejected": -17.99869155883789, "logps/chosen": -400.1316223144531, "logps/rejected": -335.03118896484375, "loss": 0.7901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.898606777191162, "rewards/margins": 0.3611012101173401, "rewards/rejected": 2.537505626678467, "step": 23200 }, { "epoch": 1.077580203352059, "grad_norm": 64.25167846679688, "learning_rate": 2.3538418682390082e-07, "logits/chosen": -17.45220184326172, "logits/rejected": -18.023731231689453, "logps/chosen": -348.3139343261719, "logps/rejected": -414.7159118652344, "loss": 1.5351, "rewards/accuracies": 0.5, "rewards/chosen": 2.6095330715179443, "rewards/margins": -0.45978134870529175, "rewards/rejected": 3.069314479827881, "step": 23210 }, { "epoch": 1.0780444774594922, "grad_norm": 142.7454833984375, "learning_rate": 2.3535633037745486e-07, "logits/chosen": -18.37826919555664, "logits/rejected": -17.377992630004883, "logps/chosen": -362.5783386230469, "logps/rejected": -265.41876220703125, "loss": 0.3537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8566627502441406, "rewards/margins": 1.3270056247711182, "rewards/rejected": 1.5296571254730225, "step": 23220 }, { "epoch": 1.0785087515669252, "grad_norm": 92.68923950195312, "learning_rate": 2.3532847393100885e-07, "logits/chosen": -18.068729400634766, "logits/rejected": -18.7530517578125, "logps/chosen": -364.9539794921875, "logps/rejected": -306.1300048828125, "loss": 0.6883, "rewards/accuracies": 0.5, "rewards/chosen": 2.850766181945801, "rewards/margins": 0.4094582498073578, "rewards/rejected": 2.441307544708252, "step": 23230 }, { "epoch": 1.0789730256743582, "grad_norm": 132.7196807861328, "learning_rate": 2.3530061748456286e-07, "logits/chosen": -18.81279945373535, "logits/rejected": -18.636398315429688, "logps/chosen": -429.2544860839844, "logps/rejected": -403.54107666015625, 
"loss": 0.6172, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.681067943572998, "rewards/margins": 0.3580932915210724, "rewards/rejected": 2.322974681854248, "step": 23240 }, { "epoch": 1.0794372997817911, "grad_norm": 10.871969223022461, "learning_rate": 2.352727610381169e-07, "logits/chosen": -19.176753997802734, "logits/rejected": -19.06784439086914, "logps/chosen": -355.40179443359375, "logps/rejected": -394.4081726074219, "loss": 1.197, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.314338207244873, "rewards/margins": -0.40517157316207886, "rewards/rejected": 2.7195096015930176, "step": 23250 }, { "epoch": 1.0799015738892241, "grad_norm": 61.43288803100586, "learning_rate": 2.3524490459167092e-07, "logits/chosen": -19.918729782104492, "logits/rejected": -18.903100967407227, "logps/chosen": -504.5184631347656, "logps/rejected": -347.9563903808594, "loss": 0.3433, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.788881778717041, "rewards/margins": 1.4614267349243164, "rewards/rejected": 2.3274550437927246, "step": 23260 }, { "epoch": 1.0803658479966571, "grad_norm": 68.92878723144531, "learning_rate": 2.352170481452249e-07, "logits/chosen": -18.96927261352539, "logits/rejected": -17.846572875976562, "logps/chosen": -390.54754638671875, "logps/rejected": -306.36798095703125, "loss": 0.5489, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0929086208343506, "rewards/margins": 1.0346801280975342, "rewards/rejected": 2.0582287311553955, "step": 23270 }, { "epoch": 1.0808301221040904, "grad_norm": 10.387775421142578, "learning_rate": 2.3518919169877895e-07, "logits/chosen": -18.56865882873535, "logits/rejected": -17.643447875976562, "logps/chosen": -402.83428955078125, "logps/rejected": -292.072021484375, "loss": 0.6094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.954320192337036, "rewards/margins": 0.9119365811347961, "rewards/rejected": 2.0423834323883057, "step": 23280 }, { "epoch": 
1.0812943962115233, "grad_norm": 38.39263153076172, "learning_rate": 2.3516133525233296e-07, "logits/chosen": -19.08053970336914, "logits/rejected": -18.609203338623047, "logps/chosen": -447.683349609375, "logps/rejected": -465.796630859375, "loss": 0.7047, "rewards/accuracies": 0.5, "rewards/chosen": 3.4038150310516357, "rewards/margins": 0.5802645683288574, "rewards/rejected": 2.8235504627227783, "step": 23290 }, { "epoch": 1.0817586703189563, "grad_norm": 149.66812133789062, "learning_rate": 2.35133478805887e-07, "logits/chosen": -19.07756805419922, "logits/rejected": -18.3265323638916, "logps/chosen": -370.4060974121094, "logps/rejected": -391.5447998046875, "loss": 0.7462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.858079433441162, "rewards/margins": 0.24986115097999573, "rewards/rejected": 2.60821795463562, "step": 23300 }, { "epoch": 1.0822229444263893, "grad_norm": 89.50426483154297, "learning_rate": 2.35105622359441e-07, "logits/chosen": -19.465286254882812, "logits/rejected": -19.023773193359375, "logps/chosen": -377.2901306152344, "logps/rejected": -357.20013427734375, "loss": 0.6432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0884809494018555, "rewards/margins": 0.599408745765686, "rewards/rejected": 2.489072322845459, "step": 23310 }, { "epoch": 1.0826872185338223, "grad_norm": 3.3079397678375244, "learning_rate": 2.35077765912995e-07, "logits/chosen": -18.04123306274414, "logits/rejected": -16.7903995513916, "logps/chosen": -509.5963439941406, "logps/rejected": -324.1330261230469, "loss": 0.5175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.214110851287842, "rewards/margins": 1.2466135025024414, "rewards/rejected": 1.9674972295761108, "step": 23320 }, { "epoch": 1.0831514926412553, "grad_norm": 13.597923278808594, "learning_rate": 2.3504990946654905e-07, "logits/chosen": -18.583450317382812, "logits/rejected": -17.80376625061035, "logps/chosen": -521.4869995117188, "logps/rejected": 
-408.268310546875, "loss": 0.3549, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2709739208221436, "rewards/margins": 1.2280161380767822, "rewards/rejected": 2.0429577827453613, "step": 23330 }, { "epoch": 1.0836157667486885, "grad_norm": 53.58246612548828, "learning_rate": 2.3502205302010306e-07, "logits/chosen": -17.996492385864258, "logits/rejected": -17.46396255493164, "logps/chosen": -269.20770263671875, "logps/rejected": -235.5272216796875, "loss": 0.6339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8765220642089844, "rewards/margins": 0.3477631211280823, "rewards/rejected": 1.5287590026855469, "step": 23340 }, { "epoch": 1.0840800408561215, "grad_norm": 28.040193557739258, "learning_rate": 2.3499419657365708e-07, "logits/chosen": -18.59152603149414, "logits/rejected": -17.888147354125977, "logps/chosen": -516.042236328125, "logps/rejected": -363.2898864746094, "loss": 0.5564, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8186802864074707, "rewards/margins": 1.2236652374267578, "rewards/rejected": 2.595015048980713, "step": 23350 }, { "epoch": 1.0845443149635545, "grad_norm": 46.232635498046875, "learning_rate": 2.349663401272111e-07, "logits/chosen": -18.450565338134766, "logits/rejected": -19.181123733520508, "logps/chosen": -404.67388916015625, "logps/rejected": -359.70013427734375, "loss": 0.8398, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8970999717712402, "rewards/margins": 0.20470905303955078, "rewards/rejected": 2.6923911571502686, "step": 23360 }, { "epoch": 1.0850085890709875, "grad_norm": 0.6143584251403809, "learning_rate": 2.3493848368076513e-07, "logits/chosen": -20.017553329467773, "logits/rejected": -19.7340030670166, "logps/chosen": -385.55181884765625, "logps/rejected": -328.9388427734375, "loss": 0.785, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5664610862731934, "rewards/margins": 0.40456515550613403, "rewards/rejected": 2.161895990371704, "step": 23370 }, 
{ "epoch": 1.0854728631784205, "grad_norm": 27.964767456054688, "learning_rate": 2.3491062723431915e-07, "logits/chosen": -19.450695037841797, "logits/rejected": -17.9097957611084, "logps/chosen": -379.3727722167969, "logps/rejected": -282.9169006347656, "loss": 0.4378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.083650588989258, "rewards/margins": 0.9680303335189819, "rewards/rejected": 2.1156201362609863, "step": 23380 }, { "epoch": 1.0859371372858535, "grad_norm": 7.551602840423584, "learning_rate": 2.3488277078787313e-07, "logits/chosen": -19.040302276611328, "logits/rejected": -18.85808563232422, "logps/chosen": -411.03924560546875, "logps/rejected": -381.52496337890625, "loss": 1.0427, "rewards/accuracies": 0.5, "rewards/chosen": 2.8869850635528564, "rewards/margins": -0.026840269565582275, "rewards/rejected": 2.913825273513794, "step": 23390 }, { "epoch": 1.0864014113932865, "grad_norm": 67.10152435302734, "learning_rate": 2.3485491434142718e-07, "logits/chosen": -18.580961227416992, "logits/rejected": -18.08378791809082, "logps/chosen": -369.17864990234375, "logps/rejected": -336.232666015625, "loss": 0.7939, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5929198265075684, "rewards/margins": 0.3290136158466339, "rewards/rejected": 3.2639060020446777, "step": 23400 }, { "epoch": 1.0868656855007197, "grad_norm": 82.62651824951172, "learning_rate": 2.348270578949812e-07, "logits/chosen": -19.080608367919922, "logits/rejected": -18.089536666870117, "logps/chosen": -366.33087158203125, "logps/rejected": -336.9837341308594, "loss": 0.5345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.748425006866455, "rewards/margins": 0.9232792854309082, "rewards/rejected": 1.8251457214355469, "step": 23410 }, { "epoch": 1.0873299596081527, "grad_norm": 254.64749145507812, "learning_rate": 2.3479920144853518e-07, "logits/chosen": -17.924806594848633, "logits/rejected": -18.300670623779297, "logps/chosen": -323.6124572753906, 
"logps/rejected": -358.3525085449219, "loss": 1.1587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1801609992980957, "rewards/margins": -0.3339020609855652, "rewards/rejected": 2.5140633583068848, "step": 23420 }, { "epoch": 1.0877942337155857, "grad_norm": 129.6081085205078, "learning_rate": 2.3477134500208922e-07, "logits/chosen": -18.970712661743164, "logits/rejected": -18.98512077331543, "logps/chosen": -399.8154296875, "logps/rejected": -313.1138916015625, "loss": 0.7187, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1913094520568848, "rewards/margins": 0.32386571168899536, "rewards/rejected": 2.867443799972534, "step": 23430 }, { "epoch": 1.0882585078230187, "grad_norm": 31.540735244750977, "learning_rate": 2.3474348855564323e-07, "logits/chosen": -17.85113525390625, "logits/rejected": -17.527931213378906, "logps/chosen": -300.2437744140625, "logps/rejected": -194.63616943359375, "loss": 0.6026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1017508506774902, "rewards/margins": 0.6300625801086426, "rewards/rejected": 1.4716885089874268, "step": 23440 }, { "epoch": 1.0887227819304517, "grad_norm": 68.07840728759766, "learning_rate": 2.3471563210919727e-07, "logits/chosen": -18.995380401611328, "logits/rejected": -17.97195053100586, "logps/chosen": -478.76043701171875, "logps/rejected": -355.8778991699219, "loss": 1.0613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9231982231140137, "rewards/margins": 0.553702175617218, "rewards/rejected": 2.3694963455200195, "step": 23450 }, { "epoch": 1.0891870560378847, "grad_norm": 108.07132720947266, "learning_rate": 2.3468777566275126e-07, "logits/chosen": -18.149768829345703, "logits/rejected": -17.750362396240234, "logps/chosen": -395.19891357421875, "logps/rejected": -386.7384338378906, "loss": 0.7125, "rewards/accuracies": 0.5, "rewards/chosen": 2.545825481414795, "rewards/margins": 0.5155268907546997, "rewards/rejected": 2.0302987098693848, "step": 
23460 }, { "epoch": 1.089651330145318, "grad_norm": 15.055057525634766, "learning_rate": 2.346599192163053e-07, "logits/chosen": -20.41596794128418, "logits/rejected": -18.501224517822266, "logps/chosen": -491.29742431640625, "logps/rejected": -245.4952392578125, "loss": 0.2554, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.0148396492004395, "rewards/margins": 2.5817370414733887, "rewards/rejected": 1.4331023693084717, "step": 23470 }, { "epoch": 1.090115604252751, "grad_norm": 101.32254028320312, "learning_rate": 2.3463206276985932e-07, "logits/chosen": -19.634082794189453, "logits/rejected": -18.49907875061035, "logps/chosen": -382.32415771484375, "logps/rejected": -248.3306427001953, "loss": 0.6072, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8126449584960938, "rewards/margins": 1.209463357925415, "rewards/rejected": 1.6031814813613892, "step": 23480 }, { "epoch": 1.0905798783601839, "grad_norm": 53.34271240234375, "learning_rate": 2.3460420632341333e-07, "logits/chosen": -18.440441131591797, "logits/rejected": -18.09459686279297, "logps/chosen": -363.2284240722656, "logps/rejected": -347.616455078125, "loss": 0.6846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.699937343597412, "rewards/margins": 0.32164496183395386, "rewards/rejected": 2.3782920837402344, "step": 23490 }, { "epoch": 1.0910441524676169, "grad_norm": 23.616840362548828, "learning_rate": 2.3457634987696735e-07, "logits/chosen": -18.34328842163086, "logits/rejected": -18.280948638916016, "logps/chosen": -463.8306579589844, "logps/rejected": -423.0440979003906, "loss": 0.6859, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.927727460861206, "rewards/margins": 0.6369615197181702, "rewards/rejected": 2.2907660007476807, "step": 23500 }, { "epoch": 1.0915084265750499, "grad_norm": 141.10858154296875, "learning_rate": 2.3454849343052136e-07, "logits/chosen": -17.80512046813965, "logits/rejected": -17.933635711669922, "logps/chosen": 
-446.5555725097656, "logps/rejected": -404.96893310546875, "loss": 0.9457, "rewards/accuracies": 0.5, "rewards/chosen": 3.426225185394287, "rewards/margins": 0.39552879333496094, "rewards/rejected": 3.0306968688964844, "step": 23510 }, { "epoch": 1.0919727006824829, "grad_norm": 130.52027893066406, "learning_rate": 2.345206369840754e-07, "logits/chosen": -18.64670753479004, "logits/rejected": -17.511592864990234, "logps/chosen": -409.6202697753906, "logps/rejected": -378.7496337890625, "loss": 0.5446, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0279157161712646, "rewards/margins": 1.1044529676437378, "rewards/rejected": 1.9234625101089478, "step": 23520 }, { "epoch": 1.092436974789916, "grad_norm": 8.420686721801758, "learning_rate": 2.3449278053762942e-07, "logits/chosen": -18.69698715209961, "logits/rejected": -18.11660385131836, "logps/chosen": -428.82318115234375, "logps/rejected": -355.5853271484375, "loss": 0.8042, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.183948278427124, "rewards/margins": 0.5200909972190857, "rewards/rejected": 2.6638574600219727, "step": 23530 }, { "epoch": 1.092901248897349, "grad_norm": 47.166927337646484, "learning_rate": 2.344649240911834e-07, "logits/chosen": -18.619579315185547, "logits/rejected": -18.112346649169922, "logps/chosen": -402.36883544921875, "logps/rejected": -364.91717529296875, "loss": 0.6347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.759917736053467, "rewards/margins": 0.666357159614563, "rewards/rejected": 2.0935606956481934, "step": 23540 }, { "epoch": 1.093365523004782, "grad_norm": 125.74405670166016, "learning_rate": 2.3443706764473745e-07, "logits/chosen": -18.64808464050293, "logits/rejected": -18.10173988342285, "logps/chosen": -481.65081787109375, "logps/rejected": -391.84710693359375, "loss": 0.6268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.918682813644409, "rewards/margins": 0.5879215598106384, "rewards/rejected": 
2.330760955810547, "step": 23550 }, { "epoch": 1.093829797112215, "grad_norm": 13.109357833862305, "learning_rate": 2.3440921119829146e-07, "logits/chosen": -19.899585723876953, "logits/rejected": -18.704126358032227, "logps/chosen": -414.9019470214844, "logps/rejected": -364.3253173828125, "loss": 0.536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9226596355438232, "rewards/margins": 0.9038742780685425, "rewards/rejected": 2.0187854766845703, "step": 23560 }, { "epoch": 1.094294071219648, "grad_norm": 2.72462797164917, "learning_rate": 2.343813547518455e-07, "logits/chosen": -18.736021041870117, "logits/rejected": -17.639320373535156, "logps/chosen": -451.70684814453125, "logps/rejected": -309.2541198730469, "loss": 0.4174, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7553272247314453, "rewards/margins": 1.553877830505371, "rewards/rejected": 2.201449155807495, "step": 23570 }, { "epoch": 1.094758345327081, "grad_norm": 229.4574432373047, "learning_rate": 2.343534983053995e-07, "logits/chosen": -18.181182861328125, "logits/rejected": -18.088150024414062, "logps/chosen": -338.52392578125, "logps/rejected": -370.19415283203125, "loss": 0.9642, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.171745777130127, "rewards/margins": -0.3267819285392761, "rewards/rejected": 2.498528003692627, "step": 23580 }, { "epoch": 1.095222619434514, "grad_norm": 16.33554458618164, "learning_rate": 2.343256418589535e-07, "logits/chosen": -20.05307388305664, "logits/rejected": -19.480051040649414, "logps/chosen": -398.68243408203125, "logps/rejected": -308.54425048828125, "loss": 0.3949, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4363956451416016, "rewards/margins": 1.1740062236785889, "rewards/rejected": 2.2623894214630127, "step": 23590 }, { "epoch": 1.0956868935419473, "grad_norm": 140.2584228515625, "learning_rate": 2.3429778541250755e-07, "logits/chosen": -20.385793685913086, "logits/rejected": -18.978557586669922, 
"logps/chosen": -357.26556396484375, "logps/rejected": -267.61248779296875, "loss": 0.5285, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5856175422668457, "rewards/margins": 1.0348005294799805, "rewards/rejected": 2.550816774368286, "step": 23600 }, { "epoch": 1.0961511676493803, "grad_norm": 4.375033855438232, "learning_rate": 2.3426992896606153e-07, "logits/chosen": -19.356971740722656, "logits/rejected": -18.25616455078125, "logps/chosen": -416.39129638671875, "logps/rejected": -310.3428039550781, "loss": 0.5655, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.557976484298706, "rewards/margins": 1.2770435810089111, "rewards/rejected": 2.280933141708374, "step": 23610 }, { "epoch": 1.0966154417568132, "grad_norm": 75.27999114990234, "learning_rate": 2.3424207251961557e-07, "logits/chosen": -19.70428466796875, "logits/rejected": -17.865966796875, "logps/chosen": -432.38543701171875, "logps/rejected": -247.3412628173828, "loss": 0.2947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.466230869293213, "rewards/margins": 1.884836196899414, "rewards/rejected": 1.5813944339752197, "step": 23620 }, { "epoch": 1.0970797158642462, "grad_norm": 20.4556884765625, "learning_rate": 2.342142160731696e-07, "logits/chosen": -19.44094467163086, "logits/rejected": -18.622512817382812, "logps/chosen": -401.60888671875, "logps/rejected": -245.3892822265625, "loss": 0.6401, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6353752613067627, "rewards/margins": 0.6170187592506409, "rewards/rejected": 2.0183563232421875, "step": 23630 }, { "epoch": 1.0975439899716792, "grad_norm": 56.24900436401367, "learning_rate": 2.3418635962672363e-07, "logits/chosen": -19.410987854003906, "logits/rejected": -18.808223724365234, "logps/chosen": -493.92138671875, "logps/rejected": -390.81219482421875, "loss": 0.5777, "rewards/accuracies": 0.5, "rewards/chosen": 3.7161865234375, "rewards/margins": 0.9414604902267456, "rewards/rejected": 
2.774726152420044, "step": 23640 }, { "epoch": 1.0980082640791122, "grad_norm": 160.42018127441406, "learning_rate": 2.3415850318027762e-07, "logits/chosen": -19.379064559936523, "logits/rejected": -18.300201416015625, "logps/chosen": -429.3892517089844, "logps/rejected": -348.8871154785156, "loss": 0.7905, "rewards/accuracies": 0.5, "rewards/chosen": 2.9387576580047607, "rewards/margins": 0.23782309889793396, "rewards/rejected": 2.700934648513794, "step": 23650 }, { "epoch": 1.0984725381865452, "grad_norm": 105.2302474975586, "learning_rate": 2.3413064673383163e-07, "logits/chosen": -18.298307418823242, "logits/rejected": -18.732192993164062, "logps/chosen": -373.1392517089844, "logps/rejected": -341.5333251953125, "loss": 0.9357, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.688485622406006, "rewards/margins": -0.09361829608678818, "rewards/rejected": 2.782104015350342, "step": 23660 }, { "epoch": 1.0989368122939784, "grad_norm": 14.251707077026367, "learning_rate": 2.3410279028738567e-07, "logits/chosen": -18.368602752685547, "logits/rejected": -18.3624267578125, "logps/chosen": -265.32171630859375, "logps/rejected": -279.3697204589844, "loss": 0.6245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.457731008529663, "rewards/margins": 0.4800213873386383, "rewards/rejected": 1.9777095317840576, "step": 23670 }, { "epoch": 1.0994010864014114, "grad_norm": 92.36599731445312, "learning_rate": 2.340749338409397e-07, "logits/chosen": -19.128643035888672, "logits/rejected": -18.44698715209961, "logps/chosen": -484.9970703125, "logps/rejected": -400.85943603515625, "loss": 0.6904, "rewards/accuracies": 0.5, "rewards/chosen": 3.1164186000823975, "rewards/margins": 0.32313671708106995, "rewards/rejected": 2.7932820320129395, "step": 23680 }, { "epoch": 1.0998653605088444, "grad_norm": 89.186767578125, "learning_rate": 2.3404707739449368e-07, "logits/chosen": -19.3995361328125, "logits/rejected": -19.161609649658203, "logps/chosen": 
-443.8963928222656, "logps/rejected": -385.36395263671875, "loss": 0.6011, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8083910942077637, "rewards/margins": 0.4086182117462158, "rewards/rejected": 3.399773120880127, "step": 23690 }, { "epoch": 1.1003296346162774, "grad_norm": 243.25900268554688, "learning_rate": 2.3401922094804772e-07, "logits/chosen": -17.5753231048584, "logits/rejected": -17.60504722595215, "logps/chosen": -352.2152099609375, "logps/rejected": -349.67633056640625, "loss": 1.3361, "rewards/accuracies": 0.5, "rewards/chosen": 2.7718122005462646, "rewards/margins": -0.550682544708252, "rewards/rejected": 3.3224945068359375, "step": 23700 }, { "epoch": 1.1007939087237104, "grad_norm": 145.21722412109375, "learning_rate": 2.3399136450160173e-07, "logits/chosen": -19.20829963684082, "logits/rejected": -18.57729721069336, "logps/chosen": -448.25775146484375, "logps/rejected": -318.0284423828125, "loss": 0.936, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.478196620941162, "rewards/margins": 1.1883478164672852, "rewards/rejected": 2.289848804473877, "step": 23710 }, { "epoch": 1.1012581828311434, "grad_norm": 98.80973815917969, "learning_rate": 2.3396350805515577e-07, "logits/chosen": -20.009531021118164, "logits/rejected": -18.97560691833496, "logps/chosen": -423.3992614746094, "logps/rejected": -324.49884033203125, "loss": 0.7169, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.635274887084961, "rewards/margins": 0.6944080591201782, "rewards/rejected": 2.9408669471740723, "step": 23720 }, { "epoch": 1.1017224569385766, "grad_norm": 84.26566314697266, "learning_rate": 2.3393565160870976e-07, "logits/chosen": -18.4862117767334, "logits/rejected": -18.16842269897461, "logps/chosen": -358.1749267578125, "logps/rejected": -348.53057861328125, "loss": 0.6758, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3620097637176514, "rewards/margins": 0.1850709617137909, "rewards/rejected": 
2.176938772201538, "step": 23730 }, { "epoch": 1.1021867310460096, "grad_norm": 21.519668579101562, "learning_rate": 2.3390779516226378e-07, "logits/chosen": -19.37173843383789, "logits/rejected": -18.71306610107422, "logps/chosen": -451.8998107910156, "logps/rejected": -368.7172546386719, "loss": 0.639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7762932777404785, "rewards/margins": 0.5484746694564819, "rewards/rejected": 2.227818727493286, "step": 23740 }, { "epoch": 1.1026510051534426, "grad_norm": 65.62327575683594, "learning_rate": 2.3387993871581782e-07, "logits/chosen": -18.433135986328125, "logits/rejected": -17.69972801208496, "logps/chosen": -335.136474609375, "logps/rejected": -254.1887969970703, "loss": 0.4974, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7042198181152344, "rewards/margins": 1.177494764328003, "rewards/rejected": 1.5267250537872314, "step": 23750 }, { "epoch": 1.1031152792608756, "grad_norm": 6.607600212097168, "learning_rate": 2.3385208226937183e-07, "logits/chosen": -19.261165618896484, "logits/rejected": -17.3148193359375, "logps/chosen": -558.816650390625, "logps/rejected": -369.74224853515625, "loss": 0.2571, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.163745403289795, "rewards/margins": 1.925967812538147, "rewards/rejected": 2.2377772331237793, "step": 23760 }, { "epoch": 1.1035795533683086, "grad_norm": 34.04473876953125, "learning_rate": 2.3382422582292585e-07, "logits/chosen": -18.677745819091797, "logits/rejected": -17.323707580566406, "logps/chosen": -395.79644775390625, "logps/rejected": -317.40496826171875, "loss": 0.4705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.16679048538208, "rewards/margins": 0.7762985229492188, "rewards/rejected": 2.3904922008514404, "step": 23770 }, { "epoch": 1.1040438274757416, "grad_norm": 49.16667938232422, "learning_rate": 2.3379636937647986e-07, "logits/chosen": -18.988122940063477, "logits/rejected": -17.81595802307129, 
"logps/chosen": -378.9319763183594, "logps/rejected": -262.9934997558594, "loss": 0.4283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.078481674194336, "rewards/margins": 1.4194180965423584, "rewards/rejected": 1.6590633392333984, "step": 23780 }, { "epoch": 1.1045081015831748, "grad_norm": 37.26877212524414, "learning_rate": 2.337685129300339e-07, "logits/chosen": -18.394939422607422, "logits/rejected": -17.818180084228516, "logps/chosen": -330.8591003417969, "logps/rejected": -228.9468994140625, "loss": 0.6075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7300455570220947, "rewards/margins": 0.8116661310195923, "rewards/rejected": 1.918379545211792, "step": 23790 }, { "epoch": 1.1049723756906078, "grad_norm": 132.4901885986328, "learning_rate": 2.337406564835879e-07, "logits/chosen": -18.997148513793945, "logits/rejected": -18.77273178100586, "logps/chosen": -403.89910888671875, "logps/rejected": -387.6661376953125, "loss": 0.8513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.515434741973877, "rewards/margins": 0.16679833829402924, "rewards/rejected": 3.3486361503601074, "step": 23800 }, { "epoch": 1.1054366497980408, "grad_norm": 1.4105784893035889, "learning_rate": 2.337128000371419e-07, "logits/chosen": -18.6169376373291, "logits/rejected": -18.0109920501709, "logps/chosen": -511.9164123535156, "logps/rejected": -397.0283508300781, "loss": 0.5619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.96484112739563, "rewards/margins": 0.8316572308540344, "rewards/rejected": 2.1331839561462402, "step": 23810 }, { "epoch": 1.1059009239054738, "grad_norm": 0.7459545135498047, "learning_rate": 2.3368494359069594e-07, "logits/chosen": -19.036378860473633, "logits/rejected": -17.47177505493164, "logps/chosen": -510.43212890625, "logps/rejected": -361.13311767578125, "loss": 0.369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.661637544631958, "rewards/margins": 1.4708060026168823, 
"rewards/rejected": 2.190831422805786, "step": 23820 }, { "epoch": 1.1063651980129068, "grad_norm": 8.749420166015625, "learning_rate": 2.3365708714424996e-07, "logits/chosen": -18.919536590576172, "logits/rejected": -17.73887062072754, "logps/chosen": -383.0301208496094, "logps/rejected": -221.8726348876953, "loss": 0.4467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9143612384796143, "rewards/margins": 1.0616018772125244, "rewards/rejected": 1.8527591228485107, "step": 23830 }, { "epoch": 1.1068294721203398, "grad_norm": 23.09894561767578, "learning_rate": 2.3362923069780395e-07, "logits/chosen": -19.515018463134766, "logits/rejected": -18.22696304321289, "logps/chosen": -312.3348693847656, "logps/rejected": -221.8218231201172, "loss": 0.4109, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0969176292419434, "rewards/margins": 1.3256809711456299, "rewards/rejected": 1.7712364196777344, "step": 23840 }, { "epoch": 1.1072937462277728, "grad_norm": 51.5621452331543, "learning_rate": 2.33601374251358e-07, "logits/chosen": -18.418874740600586, "logits/rejected": -17.125442504882812, "logps/chosen": -430.9339904785156, "logps/rejected": -273.27532958984375, "loss": 0.4249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7763009071350098, "rewards/margins": 0.8946260213851929, "rewards/rejected": 1.8816745281219482, "step": 23850 }, { "epoch": 1.107758020335206, "grad_norm": 37.289207458496094, "learning_rate": 2.33573517804912e-07, "logits/chosen": -18.106918334960938, "logits/rejected": -18.04049301147461, "logps/chosen": -312.6864318847656, "logps/rejected": -259.39483642578125, "loss": 1.0617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4443390369415283, "rewards/margins": 0.1619090735912323, "rewards/rejected": 2.2824294567108154, "step": 23860 }, { "epoch": 1.108222294442639, "grad_norm": 22.125097274780273, "learning_rate": 2.3354566135846604e-07, "logits/chosen": -19.433378219604492, "logits/rejected": 
-18.747411727905273, "logps/chosen": -455.040283203125, "logps/rejected": -314.0188903808594, "loss": 0.6682, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0307891368865967, "rewards/margins": 0.8141992688179016, "rewards/rejected": 2.21658992767334, "step": 23870 }, { "epoch": 1.108686568550072, "grad_norm": 106.78963470458984, "learning_rate": 2.3351780491202003e-07, "logits/chosen": -17.776721954345703, "logits/rejected": -18.271465301513672, "logps/chosen": -326.4738464355469, "logps/rejected": -290.178955078125, "loss": 0.7584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3772454261779785, "rewards/margins": 0.14240312576293945, "rewards/rejected": 2.234842300415039, "step": 23880 }, { "epoch": 1.109150842657505, "grad_norm": 155.9320526123047, "learning_rate": 2.3348994846557407e-07, "logits/chosen": -18.710384368896484, "logits/rejected": -18.82706642150879, "logps/chosen": -389.02520751953125, "logps/rejected": -362.8385314941406, "loss": 0.704, "rewards/accuracies": 0.5, "rewards/chosen": 3.119920253753662, "rewards/margins": 0.39047765731811523, "rewards/rejected": 2.729442596435547, "step": 23890 }, { "epoch": 1.109615116764938, "grad_norm": 123.5234146118164, "learning_rate": 2.334620920191281e-07, "logits/chosen": -18.422489166259766, "logits/rejected": -17.453710556030273, "logps/chosen": -464.4932556152344, "logps/rejected": -342.3681335449219, "loss": 0.6918, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.796668529510498, "rewards/margins": 1.2239841222763062, "rewards/rejected": 2.5726845264434814, "step": 23900 }, { "epoch": 1.110079390872371, "grad_norm": 15.174360275268555, "learning_rate": 2.334342355726821e-07, "logits/chosen": -18.081989288330078, "logits/rejected": -17.66452980041504, "logps/chosen": -462.62237548828125, "logps/rejected": -397.40838623046875, "loss": 0.8176, "rewards/accuracies": 0.5, "rewards/chosen": 3.1786375045776367, "rewards/margins": 0.545732319355011, "rewards/rejected": 
2.6329052448272705, "step": 23910 }, { "epoch": 1.1105436649798042, "grad_norm": 93.06986999511719, "learning_rate": 2.3340637912623612e-07, "logits/chosen": -18.408937454223633, "logits/rejected": -18.379636764526367, "logps/chosen": -330.63037109375, "logps/rejected": -390.132080078125, "loss": 0.6341, "rewards/accuracies": 0.5, "rewards/chosen": 2.7488813400268555, "rewards/margins": 0.2992585599422455, "rewards/rejected": 2.449622869491577, "step": 23920 }, { "epoch": 1.1110079390872372, "grad_norm": 9.413131713867188, "learning_rate": 2.3337852267979013e-07, "logits/chosen": -18.63210678100586, "logits/rejected": -17.963809967041016, "logps/chosen": -355.80303955078125, "logps/rejected": -336.11993408203125, "loss": 0.7384, "rewards/accuracies": 0.5, "rewards/chosen": 2.939911365509033, "rewards/margins": 0.3726152777671814, "rewards/rejected": 2.567296266555786, "step": 23930 }, { "epoch": 1.1114722131946702, "grad_norm": 207.59043884277344, "learning_rate": 2.3335066623334417e-07, "logits/chosen": -18.644119262695312, "logits/rejected": -18.38530921936035, "logps/chosen": -418.08837890625, "logps/rejected": -425.53594970703125, "loss": 1.2905, "rewards/accuracies": 0.5, "rewards/chosen": 2.752307176589966, "rewards/margins": -0.42328938841819763, "rewards/rejected": 3.1755967140197754, "step": 23940 }, { "epoch": 1.1119364873021031, "grad_norm": 5.110988616943359, "learning_rate": 2.333228097868982e-07, "logits/chosen": -17.987714767456055, "logits/rejected": -16.961627960205078, "logps/chosen": -596.5084228515625, "logps/rejected": -385.6759338378906, "loss": 0.42, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.974116802215576, "rewards/margins": 1.913900375366211, "rewards/rejected": 2.0602166652679443, "step": 23950 }, { "epoch": 1.1124007614095361, "grad_norm": 240.72938537597656, "learning_rate": 2.3329495334045217e-07, "logits/chosen": -18.339046478271484, "logits/rejected": -17.987590789794922, "logps/chosen": -462.701416015625, 
"logps/rejected": -442.1275329589844, "loss": 0.9929, "rewards/accuracies": 0.5, "rewards/chosen": 2.9233450889587402, "rewards/margins": 0.2222648561000824, "rewards/rejected": 2.701080083847046, "step": 23960 }, { "epoch": 1.1128650355169691, "grad_norm": 71.65239715576172, "learning_rate": 2.3326709689400622e-07, "logits/chosen": -18.146528244018555, "logits/rejected": -17.3972110748291, "logps/chosen": -394.8650817871094, "logps/rejected": -257.84100341796875, "loss": 0.3029, "rewards/accuracies": 1.0, "rewards/chosen": 2.9760890007019043, "rewards/margins": 1.2327297925949097, "rewards/rejected": 1.7433593273162842, "step": 23970 }, { "epoch": 1.1133293096244024, "grad_norm": 213.16038513183594, "learning_rate": 2.3323924044756023e-07, "logits/chosen": -19.060028076171875, "logits/rejected": -18.201608657836914, "logps/chosen": -518.211181640625, "logps/rejected": -390.08514404296875, "loss": 0.6821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9506688117980957, "rewards/margins": 1.1558796167373657, "rewards/rejected": 2.7947890758514404, "step": 23980 }, { "epoch": 1.1137935837318353, "grad_norm": 44.2175178527832, "learning_rate": 2.3321138400111422e-07, "logits/chosen": -17.828359603881836, "logits/rejected": -16.40289306640625, "logps/chosen": -512.2268676757812, "logps/rejected": -321.8602294921875, "loss": 0.3067, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9774410724639893, "rewards/margins": 2.0693178176879883, "rewards/rejected": 1.9081230163574219, "step": 23990 }, { "epoch": 1.1142578578392683, "grad_norm": 75.71199798583984, "learning_rate": 2.3318352755466826e-07, "logits/chosen": -18.441696166992188, "logits/rejected": -19.289997100830078, "logps/chosen": -469.62384033203125, "logps/rejected": -534.8359985351562, "loss": 1.0838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.171314239501953, "rewards/margins": -0.007339179515838623, "rewards/rejected": 4.178652763366699, "step": 24000 }, { "epoch": 
1.1147221319467013, "grad_norm": 3.0016560554504395, "learning_rate": 2.3315567110822227e-07, "logits/chosen": -18.660316467285156, "logits/rejected": -17.869789123535156, "logps/chosen": -391.4615173339844, "logps/rejected": -230.289306640625, "loss": 0.7166, "rewards/accuracies": 0.5, "rewards/chosen": 2.793816566467285, "rewards/margins": 1.171121597290039, "rewards/rejected": 1.622694730758667, "step": 24010 }, { "epoch": 1.1151864060541343, "grad_norm": 22.623424530029297, "learning_rate": 2.3312781466177632e-07, "logits/chosen": -18.397489547729492, "logits/rejected": -18.132007598876953, "logps/chosen": -254.19174194335938, "logps/rejected": -250.109375, "loss": 0.5843, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.820237398147583, "rewards/margins": 0.48588666319847107, "rewards/rejected": 1.3343505859375, "step": 24020 }, { "epoch": 1.1156506801615673, "grad_norm": 77.9597396850586, "learning_rate": 2.330999582153303e-07, "logits/chosen": -19.649982452392578, "logits/rejected": -18.767202377319336, "logps/chosen": -525.2984008789062, "logps/rejected": -396.83984375, "loss": 0.6652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2806811332702637, "rewards/margins": 0.27327847480773926, "rewards/rejected": 3.0074024200439453, "step": 24030 }, { "epoch": 1.1161149542690003, "grad_norm": 153.15597534179688, "learning_rate": 2.3307210176888434e-07, "logits/chosen": -18.82374382019043, "logits/rejected": -19.37152671813965, "logps/chosen": -435.88702392578125, "logps/rejected": -508.66229248046875, "loss": 0.7817, "rewards/accuracies": 0.5, "rewards/chosen": 3.0400547981262207, "rewards/margins": 0.1194804310798645, "rewards/rejected": 2.920574188232422, "step": 24040 }, { "epoch": 1.1165792283764335, "grad_norm": 28.868684768676758, "learning_rate": 2.3304424532243836e-07, "logits/chosen": -18.864168167114258, "logits/rejected": -18.553050994873047, "logps/chosen": -391.5315246582031, "logps/rejected": -246.9920654296875, "loss": 
0.4464, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.026538372039795, "rewards/margins": 1.40316903591156, "rewards/rejected": 1.6233694553375244, "step": 24050 }, { "epoch": 1.1170435024838665, "grad_norm": 44.98659133911133, "learning_rate": 2.330163888759924e-07, "logits/chosen": -19.61490249633789, "logits/rejected": -18.707517623901367, "logps/chosen": -274.6824035644531, "logps/rejected": -205.7314910888672, "loss": 0.4527, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.2155075073242188, "rewards/margins": 0.6184402108192444, "rewards/rejected": 1.5970672369003296, "step": 24060 }, { "epoch": 1.1175077765912995, "grad_norm": 45.454105377197266, "learning_rate": 2.329885324295464e-07, "logits/chosen": -18.551610946655273, "logits/rejected": -17.697660446166992, "logps/chosen": -322.9637145996094, "logps/rejected": -311.9259948730469, "loss": 0.5285, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8619112968444824, "rewards/margins": 0.7364807724952698, "rewards/rejected": 2.1254305839538574, "step": 24070 }, { "epoch": 1.1179720506987325, "grad_norm": 92.61617279052734, "learning_rate": 2.329606759831004e-07, "logits/chosen": -18.613807678222656, "logits/rejected": -18.230756759643555, "logps/chosen": -382.40203857421875, "logps/rejected": -307.2623596191406, "loss": 0.6668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9549150466918945, "rewards/margins": 0.6487458944320679, "rewards/rejected": 2.306168794631958, "step": 24080 }, { "epoch": 1.1184363248061655, "grad_norm": 15.354265213012695, "learning_rate": 2.3293281953665444e-07, "logits/chosen": -19.18792152404785, "logits/rejected": -18.049470901489258, "logps/chosen": -396.6446838378906, "logps/rejected": -319.9052734375, "loss": 0.4487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6287147998809814, "rewards/margins": 1.4180606603622437, "rewards/rejected": 2.2106540203094482, "step": 24090 }, { "epoch": 1.1189005989135985, 
"grad_norm": 104.86848449707031, "learning_rate": 2.3290496309020846e-07, "logits/chosen": -19.208181381225586, "logits/rejected": -17.402563095092773, "logps/chosen": -434.34722900390625, "logps/rejected": -258.3559875488281, "loss": 0.3852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.159895658493042, "rewards/margins": 1.519156813621521, "rewards/rejected": 1.640738844871521, "step": 24100 }, { "epoch": 1.1193648730210317, "grad_norm": 259.2200622558594, "learning_rate": 2.3287710664376245e-07, "logits/chosen": -17.514537811279297, "logits/rejected": -17.74252700805664, "logps/chosen": -250.1202850341797, "logps/rejected": -269.2574462890625, "loss": 0.8636, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6918766498565674, "rewards/margins": 0.07194747030735016, "rewards/rejected": 1.6199290752410889, "step": 24110 }, { "epoch": 1.1198291471284647, "grad_norm": 94.9755630493164, "learning_rate": 2.328492501973165e-07, "logits/chosen": -18.826778411865234, "logits/rejected": -18.150318145751953, "logps/chosen": -377.6490173339844, "logps/rejected": -338.4078674316406, "loss": 0.5538, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4028420448303223, "rewards/margins": 0.7180137038230896, "rewards/rejected": 2.684828281402588, "step": 24120 }, { "epoch": 1.1202934212358977, "grad_norm": 46.59503936767578, "learning_rate": 2.328213937508705e-07, "logits/chosen": -18.64742660522461, "logits/rejected": -17.854366302490234, "logps/chosen": -434.68109130859375, "logps/rejected": -350.1424865722656, "loss": 0.4662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.81839656829834, "rewards/margins": 0.7183424830436707, "rewards/rejected": 2.1000545024871826, "step": 24130 }, { "epoch": 1.1207576953433307, "grad_norm": 14.006324768066406, "learning_rate": 2.3279353730442454e-07, "logits/chosen": -18.79324722290039, "logits/rejected": -18.974328994750977, "logps/chosen": -360.0517883300781, "logps/rejected": 
-320.2154541015625, "loss": 0.7843, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8931922912597656, "rewards/margins": 0.07524079084396362, "rewards/rejected": 2.8179516792297363, "step": 24140 }, { "epoch": 1.1212219694507637, "grad_norm": 66.12642669677734, "learning_rate": 2.3276568085797853e-07, "logits/chosen": -19.579792022705078, "logits/rejected": -18.665233612060547, "logps/chosen": -475.3631286621094, "logps/rejected": -428.51495361328125, "loss": 0.9442, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2828643321990967, "rewards/margins": 0.12966518104076385, "rewards/rejected": 3.1531994342803955, "step": 24150 }, { "epoch": 1.1216862435581967, "grad_norm": 18.568418502807617, "learning_rate": 2.3273782441153255e-07, "logits/chosen": -18.33364486694336, "logits/rejected": -17.388042449951172, "logps/chosen": -403.8465881347656, "logps/rejected": -283.1879577636719, "loss": 0.4608, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.175443172454834, "rewards/margins": 1.0590788125991821, "rewards/rejected": 2.1163644790649414, "step": 24160 }, { "epoch": 1.12215051766563, "grad_norm": 66.73400115966797, "learning_rate": 2.3270996796508659e-07, "logits/chosen": -18.372554779052734, "logits/rejected": -17.85502052307129, "logps/chosen": -297.653076171875, "logps/rejected": -225.07095336914062, "loss": 0.6729, "rewards/accuracies": 0.5, "rewards/chosen": 2.026346445083618, "rewards/margins": 0.267961323261261, "rewards/rejected": 1.7583850622177124, "step": 24170 }, { "epoch": 1.122614791773063, "grad_norm": 141.9724884033203, "learning_rate": 2.326821115186406e-07, "logits/chosen": -18.860761642456055, "logits/rejected": -18.506460189819336, "logps/chosen": -475.4823303222656, "logps/rejected": -336.7106628417969, "loss": 0.9149, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.899268388748169, "rewards/margins": 0.7105605602264404, "rewards/rejected": 3.1887075901031494, "step": 24180 }, { "epoch": 
1.1230790658804959, "grad_norm": 5.022098064422607, "learning_rate": 2.3265425507219462e-07, "logits/chosen": -19.314044952392578, "logits/rejected": -18.99930191040039, "logps/chosen": -349.3987731933594, "logps/rejected": -288.81402587890625, "loss": 0.8631, "rewards/accuracies": 0.5, "rewards/chosen": 2.3331825733184814, "rewards/margins": 0.3827732801437378, "rewards/rejected": 1.950409173965454, "step": 24190 }, { "epoch": 1.1235433399879289, "grad_norm": 126.07661437988281, "learning_rate": 2.3262639862574863e-07, "logits/chosen": -19.194988250732422, "logits/rejected": -18.630088806152344, "logps/chosen": -375.725830078125, "logps/rejected": -322.52569580078125, "loss": 0.6566, "rewards/accuracies": 0.5, "rewards/chosen": 2.384073495864868, "rewards/margins": 0.4123102128505707, "rewards/rejected": 1.9717633724212646, "step": 24200 }, { "epoch": 1.1240076140953619, "grad_norm": 79.05992889404297, "learning_rate": 2.3259854217930267e-07, "logits/chosen": -17.819677352905273, "logits/rejected": -16.73272705078125, "logps/chosen": -515.352294921875, "logps/rejected": -369.6036376953125, "loss": 0.4739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2702107429504395, "rewards/margins": 1.2214804887771606, "rewards/rejected": 2.04872989654541, "step": 24210 }, { "epoch": 1.1244718882027949, "grad_norm": 27.28386116027832, "learning_rate": 2.3257068573285666e-07, "logits/chosen": -18.69248390197754, "logits/rejected": -16.920469284057617, "logps/chosen": -355.7872314453125, "logps/rejected": -176.06871032714844, "loss": 0.6973, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.83817195892334, "rewards/margins": 1.3735427856445312, "rewards/rejected": 1.4646294116973877, "step": 24220 }, { "epoch": 1.1249361623102279, "grad_norm": 1.6741291284561157, "learning_rate": 2.3254282928641067e-07, "logits/chosen": -18.11018943786621, "logits/rejected": -17.347393035888672, "logps/chosen": -416.18157958984375, "logps/rejected": -277.4557800292969, 
"loss": 0.5178, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.463627815246582, "rewards/margins": 1.0669777393341064, "rewards/rejected": 1.396649956703186, "step": 24230 }, { "epoch": 1.125400436417661, "grad_norm": 140.16818237304688, "learning_rate": 2.3251497283996471e-07, "logits/chosen": -18.164844512939453, "logits/rejected": -17.285451889038086, "logps/chosen": -435.66595458984375, "logps/rejected": -243.87112426757812, "loss": 0.6775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.25862455368042, "rewards/margins": 1.2248567342758179, "rewards/rejected": 2.0337681770324707, "step": 24240 }, { "epoch": 1.125864710525094, "grad_norm": 102.78564453125, "learning_rate": 2.3248711639351873e-07, "logits/chosen": -19.68304443359375, "logits/rejected": -18.029325485229492, "logps/chosen": -406.48236083984375, "logps/rejected": -272.67059326171875, "loss": 0.3843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9234097003936768, "rewards/margins": 1.6172764301300049, "rewards/rejected": 1.306133508682251, "step": 24250 }, { "epoch": 1.126328984632527, "grad_norm": 28.235551834106445, "learning_rate": 2.3245925994707272e-07, "logits/chosen": -19.86056900024414, "logits/rejected": -18.480703353881836, "logps/chosen": -403.4568786621094, "logps/rejected": -293.22760009765625, "loss": 0.6482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6790313720703125, "rewards/margins": 0.6197096109390259, "rewards/rejected": 2.059321880340576, "step": 24260 }, { "epoch": 1.12679325873996, "grad_norm": 12.660358428955078, "learning_rate": 2.3243140350062676e-07, "logits/chosen": -20.274879455566406, "logits/rejected": -18.789005279541016, "logps/chosen": -387.8339538574219, "logps/rejected": -315.98223876953125, "loss": 0.4389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.086549758911133, "rewards/margins": 0.9299103021621704, "rewards/rejected": 2.156639575958252, "step": 24270 }, { "epoch": 
1.127257532847393, "grad_norm": 62.93621063232422, "learning_rate": 2.3240354705418077e-07, "logits/chosen": -18.729217529296875, "logits/rejected": -18.774799346923828, "logps/chosen": -296.2144470214844, "logps/rejected": -396.5965576171875, "loss": 1.339, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9463932514190674, "rewards/margins": -0.6835517287254333, "rewards/rejected": 2.6299450397491455, "step": 24280 }, { "epoch": 1.127721806954826, "grad_norm": 266.7148132324219, "learning_rate": 2.3237569060773481e-07, "logits/chosen": -18.877710342407227, "logits/rejected": -18.521347045898438, "logps/chosen": -464.10107421875, "logps/rejected": -414.84747314453125, "loss": 0.7569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2887539863586426, "rewards/margins": 0.243887260556221, "rewards/rejected": 3.0448668003082275, "step": 24290 }, { "epoch": 1.128186081062259, "grad_norm": 7.770655155181885, "learning_rate": 2.323478341612888e-07, "logits/chosen": -19.089387893676758, "logits/rejected": -18.60628890991211, "logps/chosen": -393.77728271484375, "logps/rejected": -352.451416015625, "loss": 0.6308, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7949728965759277, "rewards/margins": 0.954508900642395, "rewards/rejected": 2.840463876724243, "step": 24300 }, { "epoch": 1.1286503551696923, "grad_norm": 1.5377906560897827, "learning_rate": 2.3231997771484284e-07, "logits/chosen": -18.53389549255371, "logits/rejected": -18.6148681640625, "logps/chosen": -417.769287109375, "logps/rejected": -429.77752685546875, "loss": 1.0551, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.135442018508911, "rewards/margins": 0.10451145470142365, "rewards/rejected": 3.030930519104004, "step": 24310 }, { "epoch": 1.1291146292771252, "grad_norm": 51.99894714355469, "learning_rate": 2.3229212126839686e-07, "logits/chosen": -19.291467666625977, "logits/rejected": -18.957616806030273, "logps/chosen": -488.8734436035156, 
"logps/rejected": -430.6068420410156, "loss": 0.8271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6779656410217285, "rewards/margins": 0.5400521159172058, "rewards/rejected": 3.137913465499878, "step": 24320 }, { "epoch": 1.1295789033845582, "grad_norm": 67.99976348876953, "learning_rate": 2.3226426482195087e-07, "logits/chosen": -18.051931381225586, "logits/rejected": -17.26953125, "logps/chosen": -317.7138366699219, "logps/rejected": -264.43170166015625, "loss": 0.629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.171865701675415, "rewards/margins": 0.7361939549446106, "rewards/rejected": 1.4356719255447388, "step": 24330 }, { "epoch": 1.1300431774919912, "grad_norm": 163.4481658935547, "learning_rate": 2.322391940201495e-07, "logits/chosen": -19.946809768676758, "logits/rejected": -19.425674438476562, "logps/chosen": -404.68902587890625, "logps/rejected": -379.31927490234375, "loss": 0.7603, "rewards/accuracies": 0.5, "rewards/chosen": 2.7422165870666504, "rewards/margins": 0.6674789190292358, "rewards/rejected": 2.074738025665283, "step": 24340 }, { "epoch": 1.1305074515994242, "grad_norm": 0.18753443658351898, "learning_rate": 2.3221133757370348e-07, "logits/chosen": -19.404144287109375, "logits/rejected": -18.204774856567383, "logps/chosen": -413.60687255859375, "logps/rejected": -312.0764465332031, "loss": 0.4701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7966010570526123, "rewards/margins": 1.7923743724822998, "rewards/rejected": 2.0042266845703125, "step": 24350 }, { "epoch": 1.1309717257068574, "grad_norm": 11.088709831237793, "learning_rate": 2.3218348112725752e-07, "logits/chosen": -18.669925689697266, "logits/rejected": -18.020456314086914, "logps/chosen": -459.36279296875, "logps/rejected": -348.2635803222656, "loss": 0.5222, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4255013465881348, "rewards/margins": 0.9462637901306152, "rewards/rejected": 2.4792373180389404, "step": 24360 }, { 
"epoch": 1.1314359998142904, "grad_norm": 61.770408630371094, "learning_rate": 2.3215562468081154e-07, "logits/chosen": -18.28277587890625, "logits/rejected": -18.28512191772461, "logps/chosen": -384.0445861816406, "logps/rejected": -279.78314208984375, "loss": 0.8469, "rewards/accuracies": 0.5, "rewards/chosen": 1.9386497735977173, "rewards/margins": 0.002584826899692416, "rewards/rejected": 1.9360649585723877, "step": 24370 }, { "epoch": 1.1319002739217234, "grad_norm": 16.156787872314453, "learning_rate": 2.3212776823436558e-07, "logits/chosen": -17.75918197631836, "logits/rejected": -17.80380630493164, "logps/chosen": -434.39361572265625, "logps/rejected": -420.06488037109375, "loss": 1.2711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9990592002868652, "rewards/margins": 0.10808177292346954, "rewards/rejected": 2.890977144241333, "step": 24380 }, { "epoch": 1.1323645480291564, "grad_norm": 47.56979751586914, "learning_rate": 2.3209991178791957e-07, "logits/chosen": -19.259763717651367, "logits/rejected": -18.871105194091797, "logps/chosen": -489.4876403808594, "logps/rejected": -445.4497985839844, "loss": 0.4557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.05831241607666, "rewards/margins": 1.1614125967025757, "rewards/rejected": 2.896899700164795, "step": 24390 }, { "epoch": 1.1328288221365894, "grad_norm": 80.80441284179688, "learning_rate": 2.320720553414736e-07, "logits/chosen": -18.429662704467773, "logits/rejected": -18.478179931640625, "logps/chosen": -336.0533447265625, "logps/rejected": -387.4865417480469, "loss": 1.195, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.349958658218384, "rewards/margins": -0.36138439178466797, "rewards/rejected": 2.7113428115844727, "step": 24400 }, { "epoch": 1.1332930962440224, "grad_norm": 33.81291961669922, "learning_rate": 2.3204419889502762e-07, "logits/chosen": -18.507694244384766, "logits/rejected": -17.77579689025879, "logps/chosen": -482.0448303222656, 
"logps/rejected": -256.2014465332031, "loss": 0.5536, "rewards/accuracies": 0.5, "rewards/chosen": 3.0973455905914307, "rewards/margins": 1.1343187093734741, "rewards/rejected": 1.963026762008667, "step": 24410 }, { "epoch": 1.1337573703514554, "grad_norm": 5.710779190063477, "learning_rate": 2.3201634244858164e-07, "logits/chosen": -18.1677188873291, "logits/rejected": -17.350299835205078, "logps/chosen": -358.6073303222656, "logps/rejected": -302.7471008300781, "loss": 0.6084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3561606407165527, "rewards/margins": 0.701184093952179, "rewards/rejected": 1.6549766063690186, "step": 24420 }, { "epoch": 1.1342216444588886, "grad_norm": 187.190185546875, "learning_rate": 2.3198848600213565e-07, "logits/chosen": -17.529687881469727, "logits/rejected": -18.112642288208008, "logps/chosen": -255.83816528320312, "logps/rejected": -312.6164245605469, "loss": 1.1497, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.7496551275253296, "rewards/margins": -0.6492700576782227, "rewards/rejected": 2.398925304412842, "step": 24430 }, { "epoch": 1.1346859185663216, "grad_norm": 127.74352264404297, "learning_rate": 2.3196062955568966e-07, "logits/chosen": -19.92841911315918, "logits/rejected": -18.565853118896484, "logps/chosen": -362.99407958984375, "logps/rejected": -258.0299377441406, "loss": 0.6291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.774303674697876, "rewards/margins": 0.6072408556938171, "rewards/rejected": 2.167062520980835, "step": 24440 }, { "epoch": 1.1351501926737546, "grad_norm": 91.04652404785156, "learning_rate": 2.319327731092437e-07, "logits/chosen": -19.453283309936523, "logits/rejected": -18.604984283447266, "logps/chosen": -431.46429443359375, "logps/rejected": -342.06854248046875, "loss": 0.5918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.208135604858398, "rewards/margins": 1.2466802597045898, "rewards/rejected": 2.9614551067352295, "step": 24450 
}, { "epoch": 1.1356144667811876, "grad_norm": 17.863006591796875, "learning_rate": 2.319049166627977e-07, "logits/chosen": -20.68062973022461, "logits/rejected": -19.859424591064453, "logps/chosen": -344.4933776855469, "logps/rejected": -370.2170715332031, "loss": 0.615, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.608415126800537, "rewards/margins": 0.4862369894981384, "rewards/rejected": 2.122178316116333, "step": 24460 }, { "epoch": 1.1360787408886206, "grad_norm": 46.14609909057617, "learning_rate": 2.318770602163517e-07, "logits/chosen": -18.804622650146484, "logits/rejected": -17.021312713623047, "logps/chosen": -423.63983154296875, "logps/rejected": -275.64276123046875, "loss": 0.4582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5069117546081543, "rewards/margins": 1.5116859674453735, "rewards/rejected": 1.9952255487442017, "step": 24470 }, { "epoch": 1.1365430149960536, "grad_norm": 47.9732780456543, "learning_rate": 2.3184920376990575e-07, "logits/chosen": -18.891572952270508, "logits/rejected": -17.407733917236328, "logps/chosen": -342.2648620605469, "logps/rejected": -269.41522216796875, "loss": 0.6222, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9514896869659424, "rewards/margins": 1.2890074253082275, "rewards/rejected": 1.662481665611267, "step": 24480 }, { "epoch": 1.1370072891034866, "grad_norm": 44.334938049316406, "learning_rate": 2.3182134732345976e-07, "logits/chosen": -19.12904930114746, "logits/rejected": -18.603343963623047, "logps/chosen": -302.9671630859375, "logps/rejected": -267.7325134277344, "loss": 0.3695, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3440966606140137, "rewards/margins": 1.2059838771820068, "rewards/rejected": 1.1381127834320068, "step": 24490 }, { "epoch": 1.1374715632109198, "grad_norm": 55.072601318359375, "learning_rate": 2.3179349087701375e-07, "logits/chosen": -18.793132781982422, "logits/rejected": -18.933475494384766, "logps/chosen": 
-410.6122131347656, "logps/rejected": -392.210205078125, "loss": 0.6186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.768500566482544, "rewards/margins": 0.6476088762283325, "rewards/rejected": 2.1208913326263428, "step": 24500 }, { "epoch": 1.1379358373183528, "grad_norm": 131.56936645507812, "learning_rate": 2.317656344305678e-07, "logits/chosen": -18.353952407836914, "logits/rejected": -17.787540435791016, "logps/chosen": -398.72918701171875, "logps/rejected": -332.37945556640625, "loss": 0.9239, "rewards/accuracies": 0.5, "rewards/chosen": 3.8605499267578125, "rewards/margins": 0.5561213493347168, "rewards/rejected": 3.304428815841675, "step": 24510 }, { "epoch": 1.1384001114257858, "grad_norm": 56.4591178894043, "learning_rate": 2.317377779841218e-07, "logits/chosen": -18.51685905456543, "logits/rejected": -18.367042541503906, "logps/chosen": -389.97515869140625, "logps/rejected": -386.25323486328125, "loss": 0.7489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9388818740844727, "rewards/margins": 0.1931804120540619, "rewards/rejected": 2.745702028274536, "step": 24520 }, { "epoch": 1.1388643855332188, "grad_norm": 49.73676681518555, "learning_rate": 2.3170992153767585e-07, "logits/chosen": -19.167179107666016, "logits/rejected": -18.947179794311523, "logps/chosen": -434.17230224609375, "logps/rejected": -449.70172119140625, "loss": 0.8784, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.282463550567627, "rewards/margins": 0.01375799160450697, "rewards/rejected": 3.268704891204834, "step": 24530 }, { "epoch": 1.1393286596406518, "grad_norm": 40.11857223510742, "learning_rate": 2.3168206509122984e-07, "logits/chosen": -20.48246192932129, "logits/rejected": -20.129375457763672, "logps/chosen": -320.1930236816406, "logps/rejected": -287.4174499511719, "loss": 0.543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.771798610687256, "rewards/margins": 0.6915974617004395, "rewards/rejected": 
2.0802011489868164, "step": 24540 }, { "epoch": 1.139792933748085, "grad_norm": 64.90733337402344, "learning_rate": 2.3165420864478388e-07, "logits/chosen": -17.931001663208008, "logits/rejected": -17.64967918395996, "logps/chosen": -257.47027587890625, "logps/rejected": -220.6314239501953, "loss": 0.6337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8292964696884155, "rewards/margins": 0.5404552221298218, "rewards/rejected": 1.2888414859771729, "step": 24550 }, { "epoch": 1.140257207855518, "grad_norm": 88.8658218383789, "learning_rate": 2.316263521983379e-07, "logits/chosen": -19.764068603515625, "logits/rejected": -19.15439224243164, "logps/chosen": -452.2509765625, "logps/rejected": -375.98516845703125, "loss": 0.7208, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.749724864959717, "rewards/margins": 1.1675970554351807, "rewards/rejected": 2.5821280479431152, "step": 24560 }, { "epoch": 1.140721481962951, "grad_norm": 14.8189058303833, "learning_rate": 2.3159849575189193e-07, "logits/chosen": -18.181285858154297, "logits/rejected": -17.320892333984375, "logps/chosen": -353.0947570800781, "logps/rejected": -250.725341796875, "loss": 0.6466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2101850509643555, "rewards/margins": 0.6377929449081421, "rewards/rejected": 1.5723921060562134, "step": 24570 }, { "epoch": 1.141185756070384, "grad_norm": 14.582944869995117, "learning_rate": 2.3157063930544592e-07, "logits/chosen": -17.92072868347168, "logits/rejected": -16.66061782836914, "logps/chosen": -474.2835388183594, "logps/rejected": -260.0629577636719, "loss": 0.2631, "rewards/accuracies": 1.0, "rewards/chosen": 3.625621795654297, "rewards/margins": 1.8382351398468018, "rewards/rejected": 1.7873866558074951, "step": 24580 }, { "epoch": 1.141650030177817, "grad_norm": 190.0041961669922, "learning_rate": 2.3154278285899994e-07, "logits/chosen": -18.67954444885254, "logits/rejected": -18.037105560302734, "logps/chosen": 
-397.37591552734375, "logps/rejected": -312.7639465332031, "loss": 0.8231, "rewards/accuracies": 0.5, "rewards/chosen": 3.136197805404663, "rewards/margins": 0.7055668830871582, "rewards/rejected": 2.430630922317505, "step": 24590 }, { "epoch": 1.14211430428525, "grad_norm": 234.23712158203125, "learning_rate": 2.3151492641255398e-07, "logits/chosen": -19.51858139038086, "logits/rejected": -18.78078842163086, "logps/chosen": -442.811767578125, "logps/rejected": -282.5558776855469, "loss": 0.7637, "rewards/accuracies": 0.5, "rewards/chosen": 2.852647542953491, "rewards/margins": 0.509562611579895, "rewards/rejected": 2.3430848121643066, "step": 24600 }, { "epoch": 1.142578578392683, "grad_norm": 57.936981201171875, "learning_rate": 2.31487069966108e-07, "logits/chosen": -18.75131607055664, "logits/rejected": -17.98633575439453, "logps/chosen": -334.35028076171875, "logps/rejected": -285.0020751953125, "loss": 0.541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6674180030822754, "rewards/margins": 0.5043926239013672, "rewards/rejected": 2.163025379180908, "step": 24610 }, { "epoch": 1.1430428525001162, "grad_norm": 14.724791526794434, "learning_rate": 2.3145921351966198e-07, "logits/chosen": -17.917909622192383, "logits/rejected": -17.776630401611328, "logps/chosen": -354.39813232421875, "logps/rejected": -291.8137512207031, "loss": 0.6618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7536208629608154, "rewards/margins": 0.8994365930557251, "rewards/rejected": 1.8541843891143799, "step": 24620 }, { "epoch": 1.1435071266075492, "grad_norm": 33.56155014038086, "learning_rate": 2.3143135707321602e-07, "logits/chosen": -19.153549194335938, "logits/rejected": -17.805471420288086, "logps/chosen": -358.8225402832031, "logps/rejected": -318.6228332519531, "loss": 0.7601, "rewards/accuracies": 0.5, "rewards/chosen": 2.7133002281188965, "rewards/margins": 0.46165624260902405, "rewards/rejected": 2.2516438961029053, "step": 24630 }, { "epoch": 
1.1439714007149822, "grad_norm": 168.40464782714844, "learning_rate": 2.3140350062677003e-07, "logits/chosen": -19.100337982177734, "logits/rejected": -18.252809524536133, "logps/chosen": -556.9487915039062, "logps/rejected": -417.4564514160156, "loss": 0.3688, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7031643390655518, "rewards/margins": 1.3063760995864868, "rewards/rejected": 2.396787643432617, "step": 24640 }, { "epoch": 1.1444356748224151, "grad_norm": 61.822898864746094, "learning_rate": 2.3137564418032405e-07, "logits/chosen": -19.578569412231445, "logits/rejected": -18.77605438232422, "logps/chosen": -487.77215576171875, "logps/rejected": -396.17529296875, "loss": 0.781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.181243658065796, "rewards/margins": 0.8771944046020508, "rewards/rejected": 2.304049253463745, "step": 24650 }, { "epoch": 1.1448999489298481, "grad_norm": 45.79975891113281, "learning_rate": 2.3134778773387806e-07, "logits/chosen": -19.116283416748047, "logits/rejected": -18.872777938842773, "logps/chosen": -455.61383056640625, "logps/rejected": -387.520751953125, "loss": 1.1409, "rewards/accuracies": 0.5, "rewards/chosen": 3.131030559539795, "rewards/margins": -0.2497672587633133, "rewards/rejected": 3.3807976245880127, "step": 24660 }, { "epoch": 1.1453642230372811, "grad_norm": 26.460899353027344, "learning_rate": 2.3131993128743208e-07, "logits/chosen": -18.937679290771484, "logits/rejected": -18.222400665283203, "logps/chosen": -275.13519287109375, "logps/rejected": -217.7093963623047, "loss": 0.4857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7281413078308105, "rewards/margins": 0.7369135618209839, "rewards/rejected": 1.9912281036376953, "step": 24670 }, { "epoch": 1.1458284971447141, "grad_norm": 23.56918716430664, "learning_rate": 2.3129207484098612e-07, "logits/chosen": -19.963787078857422, "logits/rejected": -18.157581329345703, "logps/chosen": -351.5709228515625, "logps/rejected": 
-236.80874633789062, "loss": 0.7012, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.949216365814209, "rewards/margins": 1.0282608270645142, "rewards/rejected": 1.9209553003311157, "step": 24680 }, { "epoch": 1.1462927712521473, "grad_norm": 49.14011764526367, "learning_rate": 2.312642183945401e-07, "logits/chosen": -18.933944702148438, "logits/rejected": -18.821086883544922, "logps/chosen": -441.8084411621094, "logps/rejected": -422.66534423828125, "loss": 0.9356, "rewards/accuracies": 0.5, "rewards/chosen": 3.1000595092773438, "rewards/margins": 0.0826156884431839, "rewards/rejected": 3.0174436569213867, "step": 24690 }, { "epoch": 1.1467570453595803, "grad_norm": 45.85734558105469, "learning_rate": 2.3123636194809415e-07, "logits/chosen": -18.423412322998047, "logits/rejected": -17.275794982910156, "logps/chosen": -424.15814208984375, "logps/rejected": -268.22271728515625, "loss": 0.4909, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1289045810699463, "rewards/margins": 1.0221513509750366, "rewards/rejected": 2.106753349304199, "step": 24700 }, { "epoch": 1.1472213194670133, "grad_norm": 19.94774055480957, "learning_rate": 2.3120850550164816e-07, "logits/chosen": -19.300437927246094, "logits/rejected": -18.36725425720215, "logps/chosen": -492.6963806152344, "logps/rejected": -436.3182678222656, "loss": 0.6302, "rewards/accuracies": 0.5, "rewards/chosen": 3.3473949432373047, "rewards/margins": 0.49673357605934143, "rewards/rejected": 2.850661516189575, "step": 24710 }, { "epoch": 1.1476855935744463, "grad_norm": 26.480192184448242, "learning_rate": 2.311806490552022e-07, "logits/chosen": -17.45543098449707, "logits/rejected": -16.996421813964844, "logps/chosen": -390.07025146484375, "logps/rejected": -273.031005859375, "loss": 0.7384, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1286330223083496, "rewards/margins": 0.5533198118209839, "rewards/rejected": 1.5753133296966553, "step": 24720 }, { "epoch": 
1.1481498676818793, "grad_norm": 156.10369873046875, "learning_rate": 2.311527926087562e-07, "logits/chosen": -19.818025588989258, "logits/rejected": -18.214710235595703, "logps/chosen": -373.6640930175781, "logps/rejected": -305.25213623046875, "loss": 0.365, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.392800807952881, "rewards/margins": 1.24282968044281, "rewards/rejected": 2.1499712467193604, "step": 24730 }, { "epoch": 1.1486141417893123, "grad_norm": 50.86033248901367, "learning_rate": 2.311249361623102e-07, "logits/chosen": -18.66566276550293, "logits/rejected": -17.42190933227539, "logps/chosen": -317.9676818847656, "logps/rejected": -233.7772674560547, "loss": 0.649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.398019313812256, "rewards/margins": 0.630205512046814, "rewards/rejected": 1.7678142786026, "step": 24740 }, { "epoch": 1.1490784158967455, "grad_norm": 94.56820678710938, "learning_rate": 2.3109707971586425e-07, "logits/chosen": -18.98788833618164, "logits/rejected": -19.370014190673828, "logps/chosen": -360.47906494140625, "logps/rejected": -399.56451416015625, "loss": 1.319, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7074663639068604, "rewards/margins": -0.7481219172477722, "rewards/rejected": 3.4555881023406982, "step": 24750 }, { "epoch": 1.1495426900041785, "grad_norm": 12.051746368408203, "learning_rate": 2.3106922326941826e-07, "logits/chosen": -19.743257522583008, "logits/rejected": -18.74258041381836, "logps/chosen": -487.3063049316406, "logps/rejected": -340.2850036621094, "loss": 0.4254, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.435821056365967, "rewards/margins": 1.1823644638061523, "rewards/rejected": 2.2534568309783936, "step": 24760 }, { "epoch": 1.1500069641116115, "grad_norm": 58.20087814331055, "learning_rate": 2.3104136682297225e-07, "logits/chosen": -17.967655181884766, "logits/rejected": -17.52920913696289, "logps/chosen": -342.2914123535156, 
"logps/rejected": -264.23095703125, "loss": 0.5106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.369751453399658, "rewards/margins": 1.0112111568450928, "rewards/rejected": 1.3585405349731445, "step": 24770 }, { "epoch": 1.1504712382190445, "grad_norm": 2.4441256523132324, "learning_rate": 2.310135103765263e-07, "logits/chosen": -18.451648712158203, "logits/rejected": -17.79920768737793, "logps/chosen": -432.35089111328125, "logps/rejected": -313.916015625, "loss": 0.6206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3017241954803467, "rewards/margins": 1.0672274827957153, "rewards/rejected": 2.234496593475342, "step": 24780 }, { "epoch": 1.1509355123264775, "grad_norm": 25.569656372070312, "learning_rate": 2.309856539300803e-07, "logits/chosen": -18.120756149291992, "logits/rejected": -17.284971237182617, "logps/chosen": -389.4742126464844, "logps/rejected": -329.5943603515625, "loss": 0.61, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7186570167541504, "rewards/margins": 0.8955747485160828, "rewards/rejected": 1.8230822086334229, "step": 24790 }, { "epoch": 1.1513997864339105, "grad_norm": 22.179655075073242, "learning_rate": 2.3095779748363435e-07, "logits/chosen": -19.38514518737793, "logits/rejected": -18.70474624633789, "logps/chosen": -268.8356018066406, "logps/rejected": -240.29605102539062, "loss": 0.6785, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4760501384735107, "rewards/margins": 0.560701310634613, "rewards/rejected": 1.915348768234253, "step": 24800 }, { "epoch": 1.1518640605413437, "grad_norm": 49.0268440246582, "learning_rate": 2.3092994103718833e-07, "logits/chosen": -17.746952056884766, "logits/rejected": -17.121997833251953, "logps/chosen": -394.10736083984375, "logps/rejected": -323.9619445800781, "loss": 0.6839, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8262627124786377, "rewards/margins": 0.8853042721748352, "rewards/rejected": 1.9409583806991577, "step": 
24810 }, { "epoch": 1.1523283346487767, "grad_norm": 77.47825622558594, "learning_rate": 2.3090208459074238e-07, "logits/chosen": -18.461246490478516, "logits/rejected": -16.983558654785156, "logps/chosen": -353.6230163574219, "logps/rejected": -229.68148803710938, "loss": 0.7694, "rewards/accuracies": 0.5, "rewards/chosen": 2.342447280883789, "rewards/margins": 0.6561403274536133, "rewards/rejected": 1.6863069534301758, "step": 24820 }, { "epoch": 1.1527926087562097, "grad_norm": 78.94287872314453, "learning_rate": 2.308742281442964e-07, "logits/chosen": -18.00530433654785, "logits/rejected": -18.017702102661133, "logps/chosen": -364.206787109375, "logps/rejected": -401.40625, "loss": 0.696, "rewards/accuracies": 0.5, "rewards/chosen": 2.9766488075256348, "rewards/margins": 0.4306471347808838, "rewards/rejected": 2.546001434326172, "step": 24830 }, { "epoch": 1.1532568828636427, "grad_norm": 125.89282989501953, "learning_rate": 2.3084637169785038e-07, "logits/chosen": -18.369380950927734, "logits/rejected": -17.463932037353516, "logps/chosen": -406.8461608886719, "logps/rejected": -260.81146240234375, "loss": 0.4788, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.535003662109375, "rewards/margins": 1.301232099533081, "rewards/rejected": 2.233771562576294, "step": 24840 }, { "epoch": 1.1537211569710757, "grad_norm": 59.073360443115234, "learning_rate": 2.3081851525140442e-07, "logits/chosen": -18.009838104248047, "logits/rejected": -17.054439544677734, "logps/chosen": -390.884765625, "logps/rejected": -365.7491455078125, "loss": 0.471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2791171073913574, "rewards/margins": 0.879958987236023, "rewards/rejected": 2.399158000946045, "step": 24850 }, { "epoch": 1.1541854310785087, "grad_norm": 12.616838455200195, "learning_rate": 2.3079065880495843e-07, "logits/chosen": -19.517440795898438, "logits/rejected": -18.7778377532959, "logps/chosen": -416.83892822265625, "logps/rejected": 
-426.4002380371094, "loss": 0.5174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.263859987258911, "rewards/margins": 0.8800994753837585, "rewards/rejected": 2.383760452270508, "step": 24860 }, { "epoch": 1.1546497051859417, "grad_norm": 64.17056274414062, "learning_rate": 2.3076280235851247e-07, "logits/chosen": -19.516881942749023, "logits/rejected": -18.65766143798828, "logps/chosen": -491.9638671875, "logps/rejected": -445.06292724609375, "loss": 0.5712, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3596882820129395, "rewards/margins": 0.9215296506881714, "rewards/rejected": 2.4381582736968994, "step": 24870 }, { "epoch": 1.155113979293375, "grad_norm": 27.76569175720215, "learning_rate": 2.3073494591206646e-07, "logits/chosen": -18.63046646118164, "logits/rejected": -18.573978424072266, "logps/chosen": -396.7411193847656, "logps/rejected": -403.84417724609375, "loss": 0.8251, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3147644996643066, "rewards/margins": 0.6416038870811462, "rewards/rejected": 2.6731603145599365, "step": 24880 }, { "epoch": 1.1555782534008079, "grad_norm": 10.63296890258789, "learning_rate": 2.3070708946562048e-07, "logits/chosen": -18.565454483032227, "logits/rejected": -17.981693267822266, "logps/chosen": -477.5858459472656, "logps/rejected": -399.4411926269531, "loss": 1.0089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4382071495056152, "rewards/margins": 0.6900867223739624, "rewards/rejected": 2.7481203079223633, "step": 24890 }, { "epoch": 1.1560425275082409, "grad_norm": 11.328617095947266, "learning_rate": 2.3067923301917452e-07, "logits/chosen": -19.112895965576172, "logits/rejected": -18.451295852661133, "logps/chosen": -534.4278564453125, "logps/rejected": -347.9554138183594, "loss": 0.3697, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8598990440368652, "rewards/margins": 1.480078935623169, "rewards/rejected": 2.3798201084136963, "step": 24900 }, { 
"epoch": 1.1565068016156739, "grad_norm": 54.46181869506836, "learning_rate": 2.3065137657272853e-07, "logits/chosen": -18.935344696044922, "logits/rejected": -17.98575782775879, "logps/chosen": -348.4180908203125, "logps/rejected": -217.5741729736328, "loss": 0.4916, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.851160764694214, "rewards/margins": 0.9792073369026184, "rewards/rejected": 1.8719536066055298, "step": 24910 }, { "epoch": 1.1569710757231069, "grad_norm": 23.24711799621582, "learning_rate": 2.3062352012628252e-07, "logits/chosen": -18.507152557373047, "logits/rejected": -17.429672241210938, "logps/chosen": -393.9203186035156, "logps/rejected": -281.9635009765625, "loss": 0.5137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8107168674468994, "rewards/margins": 0.7254303693771362, "rewards/rejected": 2.0852866172790527, "step": 24920 }, { "epoch": 1.1574353498305399, "grad_norm": 12.60099983215332, "learning_rate": 2.3059566367983656e-07, "logits/chosen": -18.60142707824707, "logits/rejected": -18.010164260864258, "logps/chosen": -432.38616943359375, "logps/rejected": -409.93780517578125, "loss": 1.2036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.132324695587158, "rewards/margins": 0.39693084359169006, "rewards/rejected": 2.735393762588501, "step": 24930 }, { "epoch": 1.1578996239379729, "grad_norm": 136.44285583496094, "learning_rate": 2.3056780723339058e-07, "logits/chosen": -18.075546264648438, "logits/rejected": -17.97252655029297, "logps/chosen": -297.880126953125, "logps/rejected": -273.8750915527344, "loss": 0.7332, "rewards/accuracies": 0.5, "rewards/chosen": 2.259052038192749, "rewards/margins": 0.29674360156059265, "rewards/rejected": 1.9623081684112549, "step": 24940 }, { "epoch": 1.158363898045406, "grad_norm": 81.0479736328125, "learning_rate": 2.3053995078694462e-07, "logits/chosen": -19.13484764099121, "logits/rejected": -18.368127822875977, "logps/chosen": -434.4019470214844, 
"logps/rejected": -320.99871826171875, "loss": 0.3791, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.654543399810791, "rewards/margins": 1.3359334468841553, "rewards/rejected": 2.3186097145080566, "step": 24950 }, { "epoch": 1.158828172152839, "grad_norm": 244.16995239257812, "learning_rate": 2.305120943404986e-07, "logits/chosen": -18.534828186035156, "logits/rejected": -17.754243850708008, "logps/chosen": -486.65692138671875, "logps/rejected": -399.4744873046875, "loss": 0.8252, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2978224754333496, "rewards/margins": 0.48926371335983276, "rewards/rejected": 2.808558464050293, "step": 24960 }, { "epoch": 1.159292446260272, "grad_norm": 31.090316772460938, "learning_rate": 2.3048423789405265e-07, "logits/chosen": -18.833847045898438, "logits/rejected": -17.78860092163086, "logps/chosen": -433.7406311035156, "logps/rejected": -256.77655029296875, "loss": 0.4067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.598278760910034, "rewards/margins": 1.479293942451477, "rewards/rejected": 2.1189846992492676, "step": 24970 }, { "epoch": 1.159756720367705, "grad_norm": 26.084716796875, "learning_rate": 2.3045638144760666e-07, "logits/chosen": -18.894256591796875, "logits/rejected": -18.523656845092773, "logps/chosen": -440.68341064453125, "logps/rejected": -399.3804931640625, "loss": 0.9088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.544980525970459, "rewards/margins": 0.17500773072242737, "rewards/rejected": 2.3699727058410645, "step": 24980 }, { "epoch": 1.160220994475138, "grad_norm": 147.92642211914062, "learning_rate": 2.304285250011607e-07, "logits/chosen": -19.587753295898438, "logits/rejected": -18.658859252929688, "logps/chosen": -379.5489196777344, "logps/rejected": -413.42633056640625, "loss": 0.9032, "rewards/accuracies": 0.5, "rewards/chosen": 2.8541007041931152, "rewards/margins": -0.029348814859986305, "rewards/rejected": 2.8834495544433594, "step": 
24990 }, { "epoch": 1.1606852685825713, "grad_norm": 90.1060562133789, "learning_rate": 2.304006685547147e-07, "logits/chosen": -18.331247329711914, "logits/rejected": -17.880523681640625, "logps/chosen": -316.127197265625, "logps/rejected": -248.2584686279297, "loss": 0.4624, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9401941299438477, "rewards/margins": 0.7314738035202026, "rewards/rejected": 1.2087204456329346, "step": 25000 }, { "epoch": 1.1611495426900043, "grad_norm": 193.89683532714844, "learning_rate": 2.303728121082687e-07, "logits/chosen": -18.138351440429688, "logits/rejected": -17.413400650024414, "logps/chosen": -348.14495849609375, "logps/rejected": -226.95321655273438, "loss": 0.8087, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9828805923461914, "rewards/margins": 1.1399974822998047, "rewards/rejected": 1.8428828716278076, "step": 25010 }, { "epoch": 1.1616138167974372, "grad_norm": 12.745986938476562, "learning_rate": 2.3034495566182275e-07, "logits/chosen": -18.587596893310547, "logits/rejected": -18.404253005981445, "logps/chosen": -462.840087890625, "logps/rejected": -378.9287414550781, "loss": 0.5729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.172077655792236, "rewards/margins": 1.2829853296279907, "rewards/rejected": 2.889091968536377, "step": 25020 }, { "epoch": 1.1620780909048702, "grad_norm": 57.53092575073242, "learning_rate": 2.3031709921537676e-07, "logits/chosen": -19.03474998474121, "logits/rejected": -17.52083969116211, "logps/chosen": -533.884765625, "logps/rejected": -378.3073425292969, "loss": 0.5741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.421450138092041, "rewards/margins": 1.450409173965454, "rewards/rejected": 2.9710402488708496, "step": 25030 }, { "epoch": 1.1625423650123032, "grad_norm": 155.42372131347656, "learning_rate": 2.3028924276893075e-07, "logits/chosen": -20.079965591430664, "logits/rejected": -19.22983169555664, "logps/chosen": 
-546.5855712890625, "logps/rejected": -455.142822265625, "loss": 0.9952, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5270943641662598, "rewards/margins": 0.22998511791229248, "rewards/rejected": 3.297109603881836, "step": 25040 }, { "epoch": 1.1630066391197362, "grad_norm": 100.45872497558594, "learning_rate": 2.302613863224848e-07, "logits/chosen": -18.402996063232422, "logits/rejected": -17.395362854003906, "logps/chosen": -344.38287353515625, "logps/rejected": -234.85446166992188, "loss": 0.6722, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.090771198272705, "rewards/margins": 0.7006543874740601, "rewards/rejected": 1.390116810798645, "step": 25050 }, { "epoch": 1.1634709132271692, "grad_norm": 112.49092102050781, "learning_rate": 2.302335298760388e-07, "logits/chosen": -18.24624252319336, "logits/rejected": -17.9709415435791, "logps/chosen": -349.5583190917969, "logps/rejected": -370.2745666503906, "loss": 0.7797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.461217164993286, "rewards/margins": 0.13944634795188904, "rewards/rejected": 3.321770429611206, "step": 25060 }, { "epoch": 1.1639351873346024, "grad_norm": 22.555015563964844, "learning_rate": 2.3020567342959282e-07, "logits/chosen": -18.01424789428711, "logits/rejected": -16.98668098449707, "logps/chosen": -309.29132080078125, "logps/rejected": -212.9183807373047, "loss": 0.5638, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.666041612625122, "rewards/margins": 1.2442386150360107, "rewards/rejected": 1.4218028783798218, "step": 25070 }, { "epoch": 1.1643994614420354, "grad_norm": 119.29778289794922, "learning_rate": 2.3017781698314683e-07, "logits/chosen": -17.586565017700195, "logits/rejected": -17.33010482788086, "logps/chosen": -405.75189208984375, "logps/rejected": -299.9683837890625, "loss": 0.8655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.588251829147339, "rewards/margins": 0.6231354475021362, "rewards/rejected": 
1.965116262435913, "step": 25080 }, { "epoch": 1.1648637355494684, "grad_norm": 30.189701080322266, "learning_rate": 2.3014996053670085e-07, "logits/chosen": -19.18890380859375, "logits/rejected": -17.71551513671875, "logps/chosen": -411.1582946777344, "logps/rejected": -346.32940673828125, "loss": 0.4875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.146242618560791, "rewards/margins": 0.8789067268371582, "rewards/rejected": 2.267336368560791, "step": 25090 }, { "epoch": 1.1653280096569014, "grad_norm": 38.30793762207031, "learning_rate": 2.301221040902549e-07, "logits/chosen": -18.56631851196289, "logits/rejected": -17.751766204833984, "logps/chosen": -378.48602294921875, "logps/rejected": -295.12591552734375, "loss": 0.6122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8484482765197754, "rewards/margins": 1.0816519260406494, "rewards/rejected": 1.7667964696884155, "step": 25100 }, { "epoch": 1.1657922837643344, "grad_norm": 23.025060653686523, "learning_rate": 2.3009424764380888e-07, "logits/chosen": -19.186933517456055, "logits/rejected": -18.042085647583008, "logps/chosen": -418.58636474609375, "logps/rejected": -357.11920166015625, "loss": 0.4065, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.850635528564453, "rewards/margins": 1.2290422916412354, "rewards/rejected": 2.621593713760376, "step": 25110 }, { "epoch": 1.1662565578717674, "grad_norm": 6.542489528656006, "learning_rate": 2.3006639119736292e-07, "logits/chosen": -20.08696746826172, "logits/rejected": -18.294105529785156, "logps/chosen": -426.6231384277344, "logps/rejected": -256.1253356933594, "loss": 0.6415, "rewards/accuracies": 0.5, "rewards/chosen": 3.592777967453003, "rewards/margins": 1.3947415351867676, "rewards/rejected": 2.1980364322662354, "step": 25120 }, { "epoch": 1.1667208319792004, "grad_norm": 38.598602294921875, "learning_rate": 2.3003853475091693e-07, "logits/chosen": -18.422143936157227, "logits/rejected": -17.65265655517578, 
"logps/chosen": -415.91961669921875, "logps/rejected": -328.80511474609375, "loss": 0.5576, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8006906509399414, "rewards/margins": 0.4529023766517639, "rewards/rejected": 2.3477883338928223, "step": 25130 }, { "epoch": 1.1671851060866336, "grad_norm": 53.47502517700195, "learning_rate": 2.3001067830447097e-07, "logits/chosen": -18.79631805419922, "logits/rejected": -17.781452178955078, "logps/chosen": -392.11785888671875, "logps/rejected": -323.19342041015625, "loss": 0.8495, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.573538303375244, "rewards/margins": -0.035877861082553864, "rewards/rejected": 2.6094160079956055, "step": 25140 }, { "epoch": 1.1676493801940666, "grad_norm": 50.905757904052734, "learning_rate": 2.2998282185802496e-07, "logits/chosen": -18.178491592407227, "logits/rejected": -18.99294662475586, "logps/chosen": -380.0366516113281, "logps/rejected": -379.0440673828125, "loss": 0.7424, "rewards/accuracies": 0.5, "rewards/chosen": 3.0570578575134277, "rewards/margins": 0.22269991040229797, "rewards/rejected": 2.834357738494873, "step": 25150 }, { "epoch": 1.1681136543014996, "grad_norm": 3.952911138534546, "learning_rate": 2.2995496541157898e-07, "logits/chosen": -18.42087173461914, "logits/rejected": -17.579038619995117, "logps/chosen": -355.2574462890625, "logps/rejected": -292.19061279296875, "loss": 0.6941, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9364922046661377, "rewards/margins": 0.8837423324584961, "rewards/rejected": 2.0527498722076416, "step": 25160 }, { "epoch": 1.1685779284089326, "grad_norm": 40.17168045043945, "learning_rate": 2.2992710896513302e-07, "logits/chosen": -18.331302642822266, "logits/rejected": -17.23093032836914, "logps/chosen": -397.3661193847656, "logps/rejected": -246.0234832763672, "loss": 0.5893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.151216506958008, "rewards/margins": 1.2795406579971313, 
"rewards/rejected": 1.871675729751587, "step": 25170 }, { "epoch": 1.1690422025163656, "grad_norm": 68.15444946289062, "learning_rate": 2.2989925251868703e-07, "logits/chosen": -18.567312240600586, "logits/rejected": -18.424699783325195, "logps/chosen": -488.561767578125, "logps/rejected": -434.11932373046875, "loss": 0.9207, "rewards/accuracies": 0.5, "rewards/chosen": 2.6832351684570312, "rewards/margins": -0.16409043967723846, "rewards/rejected": 2.8473258018493652, "step": 25180 }, { "epoch": 1.1695064766237988, "grad_norm": 168.68475341796875, "learning_rate": 2.2987139607224102e-07, "logits/chosen": -18.229156494140625, "logits/rejected": -17.731082916259766, "logps/chosen": -336.15338134765625, "logps/rejected": -268.5363464355469, "loss": 0.8147, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8301985263824463, "rewards/margins": 0.09718076139688492, "rewards/rejected": 1.7330175638198853, "step": 25190 }, { "epoch": 1.1699707507312318, "grad_norm": 20.694162368774414, "learning_rate": 2.2984353962579506e-07, "logits/chosen": -18.981494903564453, "logits/rejected": -17.925886154174805, "logps/chosen": -374.2256774902344, "logps/rejected": -275.9838562011719, "loss": 0.5683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7354073524475098, "rewards/margins": 0.7770344018936157, "rewards/rejected": 1.9583728313446045, "step": 25200 }, { "epoch": 1.1704350248386648, "grad_norm": 223.71044921875, "learning_rate": 2.2981568317934907e-07, "logits/chosen": -18.26744842529297, "logits/rejected": -18.26230239868164, "logps/chosen": -423.5868225097656, "logps/rejected": -326.01055908203125, "loss": 1.3233, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5411572456359863, "rewards/margins": -0.16538572311401367, "rewards/rejected": 2.706543445587158, "step": 25210 }, { "epoch": 1.1708992989460978, "grad_norm": 54.078392028808594, "learning_rate": 2.2978782673290312e-07, "logits/chosen": -17.392391204833984, "logits/rejected": 
-17.214550018310547, "logps/chosen": -344.1957092285156, "logps/rejected": -318.71600341796875, "loss": 0.6138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7214345932006836, "rewards/margins": 0.5440085530281067, "rewards/rejected": 2.1774258613586426, "step": 25220 }, { "epoch": 1.1713635730535308, "grad_norm": 41.88188552856445, "learning_rate": 2.297599702864571e-07, "logits/chosen": -18.327396392822266, "logits/rejected": -17.472312927246094, "logps/chosen": -288.7088317871094, "logps/rejected": -213.2657012939453, "loss": 0.4183, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6301467418670654, "rewards/margins": 1.2924715280532837, "rewards/rejected": 1.3376752138137817, "step": 25230 }, { "epoch": 1.1718278471609638, "grad_norm": 61.19443130493164, "learning_rate": 2.2973211384001115e-07, "logits/chosen": -19.256328582763672, "logits/rejected": -19.06907081604004, "logps/chosen": -386.30047607421875, "logps/rejected": -329.99456787109375, "loss": 0.6772, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1503725051879883, "rewards/margins": 0.9038734436035156, "rewards/rejected": 2.2464990615844727, "step": 25240 }, { "epoch": 1.1722921212683968, "grad_norm": 57.00078201293945, "learning_rate": 2.2970425739356516e-07, "logits/chosen": -18.595088958740234, "logits/rejected": -18.348281860351562, "logps/chosen": -350.4451599121094, "logps/rejected": -269.054931640625, "loss": 0.5302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9370977878570557, "rewards/margins": 0.9424684643745422, "rewards/rejected": 1.9946295022964478, "step": 25250 }, { "epoch": 1.17275639537583, "grad_norm": 58.20320129394531, "learning_rate": 2.2967640094711915e-07, "logits/chosen": -18.76167869567871, "logits/rejected": -18.252182006835938, "logps/chosen": -457.13201904296875, "logps/rejected": -404.281494140625, "loss": 0.6481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1031582355499268, "rewards/margins": 
0.2717275619506836, "rewards/rejected": 2.831430435180664, "step": 25260 }, { "epoch": 1.173220669483263, "grad_norm": 128.0050506591797, "learning_rate": 2.296485445006732e-07, "logits/chosen": -18.849315643310547, "logits/rejected": -18.188467025756836, "logps/chosen": -539.7435913085938, "logps/rejected": -428.30224609375, "loss": 0.6701, "rewards/accuracies": 0.5, "rewards/chosen": 3.576662540435791, "rewards/margins": 0.44445866346359253, "rewards/rejected": 3.1322033405303955, "step": 25270 }, { "epoch": 1.173684943590696, "grad_norm": 80.98594665527344, "learning_rate": 2.296206880542272e-07, "logits/chosen": -18.03525161743164, "logits/rejected": -17.786884307861328, "logps/chosen": -478.6861267089844, "logps/rejected": -440.27703857421875, "loss": 0.5671, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.343141078948975, "rewards/margins": 0.6139813661575317, "rewards/rejected": 3.729159116744995, "step": 25280 }, { "epoch": 1.174149217698129, "grad_norm": 83.88079833984375, "learning_rate": 2.2959283160778124e-07, "logits/chosen": -18.565309524536133, "logits/rejected": -18.50205421447754, "logps/chosen": -424.94024658203125, "logps/rejected": -326.13800048828125, "loss": 0.5741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9715092182159424, "rewards/margins": 0.7377649545669556, "rewards/rejected": 2.2337441444396973, "step": 25290 }, { "epoch": 1.174613491805562, "grad_norm": 27.267349243164062, "learning_rate": 2.2956497516133523e-07, "logits/chosen": -19.69718360900879, "logits/rejected": -17.844959259033203, "logps/chosen": -473.47833251953125, "logps/rejected": -385.515869140625, "loss": 0.5841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9370319843292236, "rewards/margins": 0.7612287402153015, "rewards/rejected": 2.1758031845092773, "step": 25300 }, { "epoch": 1.175077765912995, "grad_norm": 149.47950744628906, "learning_rate": 2.2953711871488925e-07, "logits/chosen": -18.682313919067383, 
"logits/rejected": -18.131324768066406, "logps/chosen": -526.2410888671875, "logps/rejected": -437.89019775390625, "loss": 0.4871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7243990898132324, "rewards/margins": 1.1767871379852295, "rewards/rejected": 2.547611951828003, "step": 25310 }, { "epoch": 1.175542040020428, "grad_norm": 45.69804763793945, "learning_rate": 2.295092622684433e-07, "logits/chosen": -19.493946075439453, "logits/rejected": -18.29086685180664, "logps/chosen": -354.79559326171875, "logps/rejected": -328.8020324707031, "loss": 0.5361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3280436992645264, "rewards/margins": 0.7492537498474121, "rewards/rejected": 2.5787901878356934, "step": 25320 }, { "epoch": 1.1760063141278612, "grad_norm": 70.71430206298828, "learning_rate": 2.294814058219973e-07, "logits/chosen": -18.69692039489746, "logits/rejected": -17.93062400817871, "logps/chosen": -408.95172119140625, "logps/rejected": -300.62091064453125, "loss": 0.5412, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1107029914855957, "rewards/margins": 0.7076135873794556, "rewards/rejected": 2.4030895233154297, "step": 25330 }, { "epoch": 1.1764705882352942, "grad_norm": 123.63517761230469, "learning_rate": 2.294535493755513e-07, "logits/chosen": -19.36124038696289, "logits/rejected": -18.70895004272461, "logps/chosen": -400.5054626464844, "logps/rejected": -338.3856201171875, "loss": 0.871, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4573588371276855, "rewards/margins": 0.4417250156402588, "rewards/rejected": 3.0156335830688477, "step": 25340 }, { "epoch": 1.1769348623427272, "grad_norm": 10.35975456237793, "learning_rate": 2.2942569292910533e-07, "logits/chosen": -19.25037384033203, "logits/rejected": -18.783594131469727, "logps/chosen": -463.417236328125, "logps/rejected": -452.2925720214844, "loss": 0.6893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6946029663085938, 
"rewards/margins": 0.778597354888916, "rewards/rejected": 2.9160056114196777, "step": 25350 }, { "epoch": 1.1773991364501601, "grad_norm": 67.8857421875, "learning_rate": 2.2939783648265935e-07, "logits/chosen": -19.97662925720215, "logits/rejected": -18.131561279296875, "logps/chosen": -420.65753173828125, "logps/rejected": -280.153564453125, "loss": 0.5719, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.886171817779541, "rewards/margins": 1.3689345121383667, "rewards/rejected": 1.5172369480133057, "step": 25360 }, { "epoch": 1.1778634105575931, "grad_norm": 59.21289825439453, "learning_rate": 2.293699800362134e-07, "logits/chosen": -18.668943405151367, "logits/rejected": -18.571369171142578, "logps/chosen": -387.9268798828125, "logps/rejected": -343.7186584472656, "loss": 0.9414, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5468335151672363, "rewards/margins": -0.26861563324928284, "rewards/rejected": 2.8154492378234863, "step": 25370 }, { "epoch": 1.1783276846650264, "grad_norm": 7.026782035827637, "learning_rate": 2.2934212358976738e-07, "logits/chosen": -18.219131469726562, "logits/rejected": -17.611522674560547, "logps/chosen": -438.6002502441406, "logps/rejected": -346.5802307128906, "loss": 0.8957, "rewards/accuracies": 0.5, "rewards/chosen": 3.7812037467956543, "rewards/margins": 0.9154647588729858, "rewards/rejected": 2.865739107131958, "step": 25380 }, { "epoch": 1.1787919587724593, "grad_norm": 14.357980728149414, "learning_rate": 2.2931426714332142e-07, "logits/chosen": -19.397985458374023, "logits/rejected": -17.358776092529297, "logps/chosen": -465.56768798828125, "logps/rejected": -272.0730285644531, "loss": 0.2624, "rewards/accuracies": 1.0, "rewards/chosen": 4.138858795166016, "rewards/margins": 1.8254954814910889, "rewards/rejected": 2.313363552093506, "step": 25390 }, { "epoch": 1.1792562328798923, "grad_norm": 26.661104202270508, "learning_rate": 2.2928641069687543e-07, "logits/chosen": -19.362173080444336, 
"logits/rejected": -17.834474563598633, "logps/chosen": -461.38677978515625, "logps/rejected": -272.8827209472656, "loss": 0.3907, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0163638591766357, "rewards/margins": 1.4161603450775146, "rewards/rejected": 1.6002037525177002, "step": 25400 }, { "epoch": 1.1797205069873253, "grad_norm": 26.696475982666016, "learning_rate": 2.2925855425042947e-07, "logits/chosen": -19.74039077758789, "logits/rejected": -18.960447311401367, "logps/chosen": -442.9535217285156, "logps/rejected": -388.63165283203125, "loss": 0.7134, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2358245849609375, "rewards/margins": 0.5379517674446106, "rewards/rejected": 2.69787335395813, "step": 25410 }, { "epoch": 1.1801847810947583, "grad_norm": 31.066104888916016, "learning_rate": 2.2923069780398346e-07, "logits/chosen": -18.03618812561035, "logits/rejected": -17.507930755615234, "logps/chosen": -329.78411865234375, "logps/rejected": -329.64227294921875, "loss": 0.9031, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4502556324005127, "rewards/margins": 0.1597789078950882, "rewards/rejected": 2.2904765605926514, "step": 25420 }, { "epoch": 1.1806490552021913, "grad_norm": 55.532466888427734, "learning_rate": 2.2920284135753747e-07, "logits/chosen": -17.592546463012695, "logits/rejected": -16.631811141967773, "logps/chosen": -334.98602294921875, "logps/rejected": -188.55514526367188, "loss": 0.4354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8934452533721924, "rewards/margins": 1.6342620849609375, "rewards/rejected": 1.2591831684112549, "step": 25430 }, { "epoch": 1.1811133293096243, "grad_norm": 43.47135543823242, "learning_rate": 2.2917498491109152e-07, "logits/chosen": -19.01609230041504, "logits/rejected": -18.968334197998047, "logps/chosen": -391.81781005859375, "logps/rejected": -348.19830322265625, "loss": 1.0031, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
2.6983416080474854, "rewards/margins": -0.048636794090270996, "rewards/rejected": 2.746978521347046, "step": 25440 }, { "epoch": 1.1815776034170575, "grad_norm": 64.26664733886719, "learning_rate": 2.291471284646455e-07, "logits/chosen": -19.00113296508789, "logits/rejected": -18.987760543823242, "logps/chosen": -374.84112548828125, "logps/rejected": -348.235595703125, "loss": 0.6504, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9848780632019043, "rewards/margins": 0.2679276466369629, "rewards/rejected": 2.7169501781463623, "step": 25450 }, { "epoch": 1.1820418775244905, "grad_norm": 100.19708251953125, "learning_rate": 2.2912205766284415e-07, "logits/chosen": -18.408489227294922, "logits/rejected": -18.490161895751953, "logps/chosen": -478.792236328125, "logps/rejected": -414.35333251953125, "loss": 0.9401, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0813283920288086, "rewards/margins": 0.2877081036567688, "rewards/rejected": 2.7936203479766846, "step": 25460 }, { "epoch": 1.1825061516319235, "grad_norm": 257.58502197265625, "learning_rate": 2.2909420121639814e-07, "logits/chosen": -17.26886749267578, "logits/rejected": -18.79363441467285, "logps/chosen": -399.85308837890625, "logps/rejected": -468.15008544921875, "loss": 1.6204, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.43646240234375, "rewards/margins": -0.40790730714797974, "rewards/rejected": 3.844369888305664, "step": 25470 }, { "epoch": 1.1829704257393565, "grad_norm": 42.50630569458008, "learning_rate": 2.2906634476995218e-07, "logits/chosen": -19.176124572753906, "logits/rejected": -19.1925106048584, "logps/chosen": -354.8936462402344, "logps/rejected": -295.74127197265625, "loss": 0.8078, "rewards/accuracies": 0.5, "rewards/chosen": 3.1521334648132324, "rewards/margins": 0.18126268684864044, "rewards/rejected": 2.9708707332611084, "step": 25480 }, { "epoch": 1.1834346998467895, "grad_norm": 51.633853912353516, "learning_rate": 2.290384883235062e-07, 
"logits/chosen": -17.945756912231445, "logits/rejected": -17.6645450592041, "logps/chosen": -340.57952880859375, "logps/rejected": -305.8779296875, "loss": 0.683, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0941529273986816, "rewards/margins": 0.3620893061161041, "rewards/rejected": 2.7320637702941895, "step": 25490 }, { "epoch": 1.1838989739542225, "grad_norm": 195.2766876220703, "learning_rate": 2.2901063187706018e-07, "logits/chosen": -19.147010803222656, "logits/rejected": -17.415464401245117, "logps/chosen": -553.4291381835938, "logps/rejected": -410.59063720703125, "loss": 0.6121, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.971013307571411, "rewards/margins": 1.3830374479293823, "rewards/rejected": 2.5879757404327393, "step": 25500 }, { "epoch": 1.1843632480616555, "grad_norm": 29.223243713378906, "learning_rate": 2.2898277543061422e-07, "logits/chosen": -17.971359252929688, "logits/rejected": -18.605422973632812, "logps/chosen": -319.8343505859375, "logps/rejected": -397.10699462890625, "loss": 1.3879, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.104362964630127, "rewards/margins": -0.826330304145813, "rewards/rejected": 2.9306931495666504, "step": 25510 }, { "epoch": 1.1848275221690887, "grad_norm": 1.9899795055389404, "learning_rate": 2.2895491898416824e-07, "logits/chosen": -18.610469818115234, "logits/rejected": -17.108627319335938, "logps/chosen": -414.9339904785156, "logps/rejected": -251.14651489257812, "loss": 0.5242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1175971031188965, "rewards/margins": 1.816725730895996, "rewards/rejected": 1.3008716106414795, "step": 25520 }, { "epoch": 1.1852917962765217, "grad_norm": 39.296417236328125, "learning_rate": 2.2892706253772228e-07, "logits/chosen": -19.10787010192871, "logits/rejected": -17.739940643310547, "logps/chosen": -338.1080017089844, "logps/rejected": -236.65518188476562, "loss": 0.9168, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 2.626891613006592, "rewards/margins": 0.5589755773544312, "rewards/rejected": 2.067915916442871, "step": 25530 }, { "epoch": 1.1857560703839547, "grad_norm": 128.1027374267578, "learning_rate": 2.2889920609127627e-07, "logits/chosen": -19.556819915771484, "logits/rejected": -18.391151428222656, "logps/chosen": -430.6061096191406, "logps/rejected": -305.4397888183594, "loss": 0.4304, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3277931213378906, "rewards/margins": 1.1769824028015137, "rewards/rejected": 2.150810718536377, "step": 25540 }, { "epoch": 1.1862203444913877, "grad_norm": 43.68759536743164, "learning_rate": 2.2887134964483028e-07, "logits/chosen": -19.363441467285156, "logits/rejected": -18.23908042907715, "logps/chosen": -348.6918640136719, "logps/rejected": -225.5408477783203, "loss": 0.3241, "rewards/accuracies": 1.0, "rewards/chosen": 3.080467700958252, "rewards/margins": 1.2718322277069092, "rewards/rejected": 1.8086354732513428, "step": 25550 }, { "epoch": 1.1866846185988207, "grad_norm": 240.18545532226562, "learning_rate": 2.2884349319838432e-07, "logits/chosen": -18.50339126586914, "logits/rejected": -17.73000144958496, "logps/chosen": -364.35784912109375, "logps/rejected": -324.56158447265625, "loss": 0.6603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8474888801574707, "rewards/margins": 0.5719720125198364, "rewards/rejected": 2.275516986846924, "step": 25560 }, { "epoch": 1.1871488927062537, "grad_norm": 38.35886001586914, "learning_rate": 2.2881563675193834e-07, "logits/chosen": -17.683914184570312, "logits/rejected": -17.734615325927734, "logps/chosen": -346.9702453613281, "logps/rejected": -273.13177490234375, "loss": 0.8009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4926609992980957, "rewards/margins": 0.4957306981086731, "rewards/rejected": 1.9969303607940674, "step": 25570 }, { "epoch": 1.187613166813687, "grad_norm": 19.18947410583496, "learning_rate": 
2.2878778030549235e-07, "logits/chosen": -19.11223030090332, "logits/rejected": -18.202844619750977, "logps/chosen": -403.9774475097656, "logps/rejected": -335.74139404296875, "loss": 0.3414, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1161293983459473, "rewards/margins": 1.1146466732025146, "rewards/rejected": 2.0014827251434326, "step": 25580 }, { "epoch": 1.1880774409211199, "grad_norm": 13.714282035827637, "learning_rate": 2.2875992385904637e-07, "logits/chosen": -18.365863800048828, "logits/rejected": -17.57077980041504, "logps/chosen": -393.7373046875, "logps/rejected": -286.9057312011719, "loss": 0.4185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6378986835479736, "rewards/margins": 1.7725681066513062, "rewards/rejected": 1.865330696105957, "step": 25590 }, { "epoch": 1.1885417150285529, "grad_norm": 122.0636978149414, "learning_rate": 2.2873206741260038e-07, "logits/chosen": -19.46526527404785, "logits/rejected": -18.05030059814453, "logps/chosen": -483.44915771484375, "logps/rejected": -367.7001953125, "loss": 0.5212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.951519250869751, "rewards/margins": 0.6324038505554199, "rewards/rejected": 2.319115400314331, "step": 25600 }, { "epoch": 1.1890059891359859, "grad_norm": 114.72903442382812, "learning_rate": 2.2870421096615442e-07, "logits/chosen": -18.62298011779785, "logits/rejected": -17.592960357666016, "logps/chosen": -367.4377746582031, "logps/rejected": -309.087890625, "loss": 0.4776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.783184289932251, "rewards/margins": 0.8490779995918274, "rewards/rejected": 1.934106469154358, "step": 25610 }, { "epoch": 1.1894702632434189, "grad_norm": 98.9190902709961, "learning_rate": 2.286763545197084e-07, "logits/chosen": -18.957229614257812, "logits/rejected": -17.834362030029297, "logps/chosen": -453.027099609375, "logps/rejected": -333.5621643066406, "loss": 0.6673, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.2099781036376953, "rewards/margins": 0.29857778549194336, "rewards/rejected": 2.911400318145752, "step": 25620 }, { "epoch": 1.1899345373508519, "grad_norm": 242.0653839111328, "learning_rate": 2.2864849807326245e-07, "logits/chosen": -17.89550018310547, "logits/rejected": -17.197940826416016, "logps/chosen": -359.65057373046875, "logps/rejected": -353.07672119140625, "loss": 0.8611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.262221097946167, "rewards/margins": 0.8560999035835266, "rewards/rejected": 2.4061214923858643, "step": 25630 }, { "epoch": 1.190398811458285, "grad_norm": 74.95887756347656, "learning_rate": 2.2862064162681647e-07, "logits/chosen": -18.94052505493164, "logits/rejected": -18.478158950805664, "logps/chosen": -363.9005126953125, "logps/rejected": -338.2109375, "loss": 0.8493, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.306954860687256, "rewards/margins": 0.10859671980142593, "rewards/rejected": 2.1983580589294434, "step": 25640 }, { "epoch": 1.190863085565718, "grad_norm": 201.55982971191406, "learning_rate": 2.285927851803705e-07, "logits/chosen": -19.063520431518555, "logits/rejected": -17.343090057373047, "logps/chosen": -441.32275390625, "logps/rejected": -237.4306182861328, "loss": 0.2633, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3223488330841064, "rewards/margins": 1.9544456005096436, "rewards/rejected": 1.3679031133651733, "step": 25650 }, { "epoch": 1.191327359673151, "grad_norm": 48.04218292236328, "learning_rate": 2.285649287339245e-07, "logits/chosen": -17.989181518554688, "logits/rejected": -17.00417709350586, "logps/chosen": -254.98849487304688, "logps/rejected": -162.8937225341797, "loss": 0.4781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6903507709503174, "rewards/margins": 1.3520108461380005, "rewards/rejected": 1.338339924812317, "step": 25660 }, { "epoch": 1.191791633780584, "grad_norm": 36.970481872558594, 
"learning_rate": 2.285370722874785e-07, "logits/chosen": -18.82268714904785, "logits/rejected": -17.480710983276367, "logps/chosen": -371.80316162109375, "logps/rejected": -273.6416015625, "loss": 0.355, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.228278636932373, "rewards/margins": 1.1214587688446045, "rewards/rejected": 1.106819987297058, "step": 25670 }, { "epoch": 1.192255907888017, "grad_norm": 59.29256057739258, "learning_rate": 2.2850921584103255e-07, "logits/chosen": -18.902851104736328, "logits/rejected": -17.66832733154297, "logps/chosen": -509.73760986328125, "logps/rejected": -276.482666015625, "loss": 0.3847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5346145629882812, "rewards/margins": 1.6113799810409546, "rewards/rejected": 1.9232347011566162, "step": 25680 }, { "epoch": 1.19272018199545, "grad_norm": 184.89060974121094, "learning_rate": 2.2848135939458654e-07, "logits/chosen": -17.933605194091797, "logits/rejected": -17.486888885498047, "logps/chosen": -435.66796875, "logps/rejected": -333.2447509765625, "loss": 0.7138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8621058464050293, "rewards/margins": 1.0193814039230347, "rewards/rejected": 1.8427244424819946, "step": 25690 }, { "epoch": 1.193184456102883, "grad_norm": 95.46224212646484, "learning_rate": 2.2845350294814055e-07, "logits/chosen": -17.49890899658203, "logits/rejected": -17.372190475463867, "logps/chosen": -367.40924072265625, "logps/rejected": -279.31329345703125, "loss": 1.2808, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.649563789367676, "rewards/margins": -0.04940216615796089, "rewards/rejected": 2.6989662647247314, "step": 25700 }, { "epoch": 1.1936487302103163, "grad_norm": 93.66029357910156, "learning_rate": 2.284256465016946e-07, "logits/chosen": -17.966012954711914, "logits/rejected": -17.743295669555664, "logps/chosen": -285.1363525390625, "logps/rejected": -279.29083251953125, "loss": 0.7598, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2436442375183105, "rewards/margins": 0.219883531332016, "rewards/rejected": 2.0237607955932617, "step": 25710 }, { "epoch": 1.1941130043177492, "grad_norm": 112.66319274902344, "learning_rate": 2.283977900552486e-07, "logits/chosen": -18.66032600402832, "logits/rejected": -19.08513069152832, "logps/chosen": -402.80621337890625, "logps/rejected": -417.62060546875, "loss": 1.003, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5975160598754883, "rewards/margins": -0.2467842847108841, "rewards/rejected": 2.8443002700805664, "step": 25720 }, { "epoch": 1.1945772784251822, "grad_norm": 38.41669464111328, "learning_rate": 2.2836993360880262e-07, "logits/chosen": -18.92767333984375, "logits/rejected": -17.137592315673828, "logps/chosen": -358.7694091796875, "logps/rejected": -222.5920867919922, "loss": 0.4947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.266806125640869, "rewards/margins": 1.5432765483856201, "rewards/rejected": 1.7235294580459595, "step": 25730 }, { "epoch": 1.1950415525326152, "grad_norm": 58.223052978515625, "learning_rate": 2.2834207716235664e-07, "logits/chosen": -18.883426666259766, "logits/rejected": -17.422649383544922, "logps/chosen": -299.56121826171875, "logps/rejected": -191.10940551757812, "loss": 0.363, "rewards/accuracies": 1.0, "rewards/chosen": 1.9579846858978271, "rewards/margins": 1.2413804531097412, "rewards/rejected": 0.7166044116020203, "step": 25740 }, { "epoch": 1.1955058266400482, "grad_norm": 45.43611526489258, "learning_rate": 2.2831422071591068e-07, "logits/chosen": -18.568683624267578, "logits/rejected": -17.980356216430664, "logps/chosen": -453.34698486328125, "logps/rejected": -365.83843994140625, "loss": 0.6334, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.107147216796875, "rewards/margins": 0.7641891241073608, "rewards/rejected": 2.3429579734802246, "step": 25750 }, { "epoch": 1.1959701007474812, "grad_norm": 
6.983313083648682, "learning_rate": 2.282863642694647e-07, "logits/chosen": -18.563772201538086, "logits/rejected": -17.82912826538086, "logps/chosen": -447.77587890625, "logps/rejected": -328.6944885253906, "loss": 0.2902, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3686740398406982, "rewards/margins": 1.8920154571533203, "rewards/rejected": 1.476658821105957, "step": 25760 }, { "epoch": 1.1964343748549142, "grad_norm": 99.27774047851562, "learning_rate": 2.2825850782301868e-07, "logits/chosen": -17.917518615722656, "logits/rejected": -17.261598587036133, "logps/chosen": -294.80963134765625, "logps/rejected": -225.50204467773438, "loss": 0.6416, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5137838125228882, "rewards/margins": 0.41705235838890076, "rewards/rejected": 1.096731424331665, "step": 25770 }, { "epoch": 1.1968986489623474, "grad_norm": 96.5755386352539, "learning_rate": 2.2823065137657272e-07, "logits/chosen": -18.786481857299805, "logits/rejected": -18.058462142944336, "logps/chosen": -521.8956909179688, "logps/rejected": -436.55413818359375, "loss": 0.5398, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.715588092803955, "rewards/margins": 0.8095999956130981, "rewards/rejected": 2.905987501144409, "step": 25780 }, { "epoch": 1.1973629230697804, "grad_norm": 54.744754791259766, "learning_rate": 2.2820279493012674e-07, "logits/chosen": -18.153270721435547, "logits/rejected": -17.73398780822754, "logps/chosen": -417.28167724609375, "logps/rejected": -397.924072265625, "loss": 0.5946, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7855887413024902, "rewards/margins": 0.736557126045227, "rewards/rejected": 2.0490317344665527, "step": 25790 }, { "epoch": 1.1978271971772134, "grad_norm": 78.94920349121094, "learning_rate": 2.2817493848368078e-07, "logits/chosen": -18.954504013061523, "logits/rejected": -18.61783218383789, "logps/chosen": -396.7102355957031, "logps/rejected": -369.2766418457031, 
"loss": 0.5094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.640939235687256, "rewards/margins": 1.095367193222046, "rewards/rejected": 2.54557204246521, "step": 25800 }, { "epoch": 1.1982914712846464, "grad_norm": 173.16200256347656, "learning_rate": 2.2814708203723477e-07, "logits/chosen": -18.298200607299805, "logits/rejected": -17.615764617919922, "logps/chosen": -434.45440673828125, "logps/rejected": -396.30487060546875, "loss": 1.0205, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4857406616210938, "rewards/margins": 0.4345265328884125, "rewards/rejected": 3.0512139797210693, "step": 25810 }, { "epoch": 1.1987557453920794, "grad_norm": 139.93678283691406, "learning_rate": 2.2811922559078878e-07, "logits/chosen": -19.42354965209961, "logits/rejected": -18.369800567626953, "logps/chosen": -373.96221923828125, "logps/rejected": -383.95965576171875, "loss": 0.7227, "rewards/accuracies": 0.5, "rewards/chosen": 3.3806090354919434, "rewards/margins": 0.5893500447273254, "rewards/rejected": 2.7912588119506836, "step": 25820 }, { "epoch": 1.1992200194995126, "grad_norm": 88.49958801269531, "learning_rate": 2.2809136914434282e-07, "logits/chosen": -19.538494110107422, "logits/rejected": -18.78810691833496, "logps/chosen": -538.1329345703125, "logps/rejected": -461.72198486328125, "loss": 0.7365, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8715248107910156, "rewards/margins": 0.7571015357971191, "rewards/rejected": 3.1144237518310547, "step": 25830 }, { "epoch": 1.1996842936069456, "grad_norm": 100.3970718383789, "learning_rate": 2.2806351269789684e-07, "logits/chosen": -19.96845817565918, "logits/rejected": -18.990463256835938, "logps/chosen": -461.959228515625, "logps/rejected": -397.78314208984375, "loss": 0.891, "rewards/accuracies": 0.5, "rewards/chosen": 3.631009578704834, "rewards/margins": 0.5649512410163879, "rewards/rejected": 3.06605863571167, "step": 25840 }, { "epoch": 1.2001485677143786, "grad_norm": 
88.34645080566406, "learning_rate": 2.2803565625145082e-07, "logits/chosen": -18.301671981811523, "logits/rejected": -17.524944305419922, "logps/chosen": -358.96844482421875, "logps/rejected": -293.07318115234375, "loss": 0.7953, "rewards/accuracies": 0.5, "rewards/chosen": 3.527524948120117, "rewards/margins": 0.7572106719017029, "rewards/rejected": 2.7703139781951904, "step": 25850 }, { "epoch": 1.2006128418218116, "grad_norm": 65.3553466796875, "learning_rate": 2.2800779980500486e-07, "logits/chosen": -19.79770851135254, "logits/rejected": -19.40823745727539, "logps/chosen": -444.93743896484375, "logps/rejected": -405.47686767578125, "loss": 0.8824, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.2836546897888184, "rewards/margins": -0.18420182168483734, "rewards/rejected": 3.4678566455841064, "step": 25860 }, { "epoch": 1.2010771159292446, "grad_norm": 166.76605224609375, "learning_rate": 2.2797994335855888e-07, "logits/chosen": -18.780139923095703, "logits/rejected": -17.57621192932129, "logps/chosen": -370.71087646484375, "logps/rejected": -303.2621765136719, "loss": 0.7272, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.669550657272339, "rewards/margins": 1.3104753494262695, "rewards/rejected": 2.3590757846832275, "step": 25870 }, { "epoch": 1.2015413900366776, "grad_norm": 91.66097259521484, "learning_rate": 2.2795208691211292e-07, "logits/chosen": -17.806560516357422, "logits/rejected": -17.440074920654297, "logps/chosen": -399.3840026855469, "logps/rejected": -361.6353759765625, "loss": 0.6806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.194542169570923, "rewards/margins": 0.7301832437515259, "rewards/rejected": 2.4643588066101074, "step": 25880 }, { "epoch": 1.2020056641441106, "grad_norm": 32.876461029052734, "learning_rate": 2.279242304656669e-07, "logits/chosen": -18.448627471923828, "logits/rejected": -18.339839935302734, "logps/chosen": -315.0210876464844, "logps/rejected": -344.81524658203125, "loss": 
0.926, "rewards/accuracies": 0.5, "rewards/chosen": 3.044178009033203, "rewards/margins": 0.017506051808595657, "rewards/rejected": 3.026671886444092, "step": 25890 }, { "epoch": 1.2024699382515438, "grad_norm": 16.077693939208984, "learning_rate": 2.2789637401922095e-07, "logits/chosen": -18.797880172729492, "logits/rejected": -18.38292694091797, "logps/chosen": -326.1229553222656, "logps/rejected": -275.62445068359375, "loss": 0.5052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9576261043548584, "rewards/margins": 1.1110281944274902, "rewards/rejected": 1.8465980291366577, "step": 25900 }, { "epoch": 1.2029342123589768, "grad_norm": 35.62919235229492, "learning_rate": 2.2786851757277496e-07, "logits/chosen": -18.491823196411133, "logits/rejected": -17.57341194152832, "logps/chosen": -377.41644287109375, "logps/rejected": -255.1399383544922, "loss": 0.85, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.086765766143799, "rewards/margins": 0.3235136568546295, "rewards/rejected": 1.7632522583007812, "step": 25910 }, { "epoch": 1.2033984864664098, "grad_norm": 83.26139831542969, "learning_rate": 2.2784066112632895e-07, "logits/chosen": -18.182668685913086, "logits/rejected": -17.803417205810547, "logps/chosen": -403.12982177734375, "logps/rejected": -354.2142028808594, "loss": 0.5544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5720713138580322, "rewards/margins": 0.5775516033172607, "rewards/rejected": 1.994519591331482, "step": 25920 }, { "epoch": 1.2038627605738428, "grad_norm": 38.8469352722168, "learning_rate": 2.27812804679883e-07, "logits/chosen": -18.223346710205078, "logits/rejected": -17.669788360595703, "logps/chosen": -491.8379821777344, "logps/rejected": -417.0399475097656, "loss": 0.6203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6177749633789062, "rewards/margins": 0.8792669177055359, "rewards/rejected": 2.7385079860687256, "step": 25930 }, { "epoch": 1.2043270346812758, "grad_norm": 
205.5621795654297, "learning_rate": 2.27784948233437e-07, "logits/chosen": -19.04863929748535, "logits/rejected": -18.275135040283203, "logps/chosen": -443.305908203125, "logps/rejected": -326.09674072265625, "loss": 0.5089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2110321521759033, "rewards/margins": 0.7730633616447449, "rewards/rejected": 2.437969207763672, "step": 25940 }, { "epoch": 1.2047913087887088, "grad_norm": 12.208451271057129, "learning_rate": 2.2775709178699105e-07, "logits/chosen": -19.99191665649414, "logits/rejected": -18.385089874267578, "logps/chosen": -414.21954345703125, "logps/rejected": -282.9340515136719, "loss": 0.4387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.76853609085083, "rewards/margins": 1.358862280845642, "rewards/rejected": 2.4096741676330566, "step": 25950 }, { "epoch": 1.2052555828961418, "grad_norm": 6.89195442199707, "learning_rate": 2.2772923534054504e-07, "logits/chosen": -18.23155403137207, "logits/rejected": -17.80583381652832, "logps/chosen": -488.5712890625, "logps/rejected": -328.0736999511719, "loss": 0.5309, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7099804878234863, "rewards/margins": 0.7987867593765259, "rewards/rejected": 1.9111934900283813, "step": 25960 }, { "epoch": 1.205719857003575, "grad_norm": 180.14132690429688, "learning_rate": 2.2770137889409905e-07, "logits/chosen": -18.435375213623047, "logits/rejected": -17.847637176513672, "logps/chosen": -331.36163330078125, "logps/rejected": -249.50778198242188, "loss": 0.4471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.325587749481201, "rewards/margins": 0.9214149713516235, "rewards/rejected": 1.4041727781295776, "step": 25970 }, { "epoch": 1.206184131111008, "grad_norm": 245.03244018554688, "learning_rate": 2.276735224476531e-07, "logits/chosen": -18.323362350463867, "logits/rejected": -18.973758697509766, "logps/chosen": -358.7205505371094, "logps/rejected": -375.2798767089844, "loss": 
1.1971, "rewards/accuracies": 0.5, "rewards/chosen": 2.5926661491394043, "rewards/margins": -0.17143258452415466, "rewards/rejected": 2.764098644256592, "step": 25980 }, { "epoch": 1.206648405218441, "grad_norm": 19.468515396118164, "learning_rate": 2.276456660012071e-07, "logits/chosen": -18.540964126586914, "logits/rejected": -17.76406478881836, "logps/chosen": -371.8440856933594, "logps/rejected": -322.697509765625, "loss": 0.2903, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4032890796661377, "rewards/margins": 1.5014755725860596, "rewards/rejected": 1.9018138647079468, "step": 25990 }, { "epoch": 1.207112679325874, "grad_norm": 24.43100357055664, "learning_rate": 2.2761780955476112e-07, "logits/chosen": -17.936668395996094, "logits/rejected": -17.484874725341797, "logps/chosen": -427.6499938964844, "logps/rejected": -399.1722412109375, "loss": 0.9357, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.206502914428711, "rewards/margins": 0.5536386370658875, "rewards/rejected": 2.652864694595337, "step": 26000 }, { "epoch": 1.207576953433307, "grad_norm": 49.90932083129883, "learning_rate": 2.2758995310831514e-07, "logits/chosen": -18.753582000732422, "logits/rejected": -18.54625701904297, "logps/chosen": -459.8118591308594, "logps/rejected": -383.2272644042969, "loss": 0.8622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.207240343093872, "rewards/margins": 0.23069123923778534, "rewards/rejected": 2.976548910140991, "step": 26010 }, { "epoch": 1.2080412275407402, "grad_norm": 285.466552734375, "learning_rate": 2.2756209666186915e-07, "logits/chosen": -19.039039611816406, "logits/rejected": -18.831342697143555, "logps/chosen": -346.3467712402344, "logps/rejected": -388.3407897949219, "loss": 0.8437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.881730556488037, "rewards/margins": 0.10927454382181168, "rewards/rejected": 2.772455930709839, "step": 26020 }, { "epoch": 1.2085055016481732, "grad_norm": 
115.95744323730469, "learning_rate": 2.275342402154232e-07, "logits/chosen": -20.319133758544922, "logits/rejected": -19.607707977294922, "logps/chosen": -466.58001708984375, "logps/rejected": -373.96343994140625, "loss": 0.3521, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6275336742401123, "rewards/margins": 1.510495901107788, "rewards/rejected": 2.117037296295166, "step": 26030 }, { "epoch": 1.2089697757556062, "grad_norm": 41.857337951660156, "learning_rate": 2.2750638376897718e-07, "logits/chosen": -18.744976043701172, "logits/rejected": -18.016557693481445, "logps/chosen": -396.42669677734375, "logps/rejected": -273.54547119140625, "loss": 0.6351, "rewards/accuracies": 0.5, "rewards/chosen": 2.2830491065979004, "rewards/margins": 0.7826115489006042, "rewards/rejected": 1.5004373788833618, "step": 26040 }, { "epoch": 1.2094340498630392, "grad_norm": 33.65116500854492, "learning_rate": 2.2747852732253122e-07, "logits/chosen": -18.700923919677734, "logits/rejected": -17.97300148010254, "logps/chosen": -386.21221923828125, "logps/rejected": -320.8712463378906, "loss": 0.5458, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7431912422180176, "rewards/margins": 0.6910432577133179, "rewards/rejected": 2.0521481037139893, "step": 26050 }, { "epoch": 1.2098983239704721, "grad_norm": 9.944738388061523, "learning_rate": 2.2745067087608523e-07, "logits/chosen": -18.292476654052734, "logits/rejected": -19.21879005432129, "logps/chosen": -345.95501708984375, "logps/rejected": -436.0638122558594, "loss": 0.8566, "rewards/accuracies": 0.5, "rewards/chosen": 2.9721550941467285, "rewards/margins": 0.15225748717784882, "rewards/rejected": 2.819897174835205, "step": 26060 }, { "epoch": 1.2103625980779051, "grad_norm": 104.730712890625, "learning_rate": 2.2742281442963928e-07, "logits/chosen": -19.358484268188477, "logits/rejected": -18.26430320739746, "logps/chosen": -380.93621826171875, "logps/rejected": -291.32000732421875, "loss": 0.7167, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5394792556762695, "rewards/margins": 0.5782829523086548, "rewards/rejected": 1.9611963033676147, "step": 26070 }, { "epoch": 1.2108268721853381, "grad_norm": 120.66316986083984, "learning_rate": 2.2739495798319326e-07, "logits/chosen": -18.585193634033203, "logits/rejected": -18.451675415039062, "logps/chosen": -330.69012451171875, "logps/rejected": -332.22198486328125, "loss": 0.7698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2316102981567383, "rewards/margins": 0.04183689504861832, "rewards/rejected": 2.1897730827331543, "step": 26080 }, { "epoch": 1.2112911462927713, "grad_norm": 16.489707946777344, "learning_rate": 2.2736710153674728e-07, "logits/chosen": -18.972970962524414, "logits/rejected": -19.346681594848633, "logps/chosen": -385.9499816894531, "logps/rejected": -383.40643310546875, "loss": 0.8457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8936915397644043, "rewards/margins": 0.26811662316322327, "rewards/rejected": 2.6255745887756348, "step": 26090 }, { "epoch": 1.2117554204002043, "grad_norm": 173.158935546875, "learning_rate": 2.2733924509030132e-07, "logits/chosen": -18.51213264465332, "logits/rejected": -19.9158878326416, "logps/chosen": -339.642578125, "logps/rejected": -477.614990234375, "loss": 1.1602, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.910853862762451, "rewards/margins": -0.3291667401790619, "rewards/rejected": 3.240020751953125, "step": 26100 }, { "epoch": 1.2122196945076373, "grad_norm": 47.334564208984375, "learning_rate": 2.273113886438553e-07, "logits/chosen": -19.437786102294922, "logits/rejected": -17.91563606262207, "logps/chosen": -414.5877990722656, "logps/rejected": -280.78216552734375, "loss": 0.7407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.688779354095459, "rewards/margins": 1.4713174104690552, "rewards/rejected": 2.2174620628356934, "step": 26110 }, { "epoch": 1.2126839686150703, 
"grad_norm": 191.62538146972656, "learning_rate": 2.2728353219740932e-07, "logits/chosen": -18.09847640991211, "logits/rejected": -17.97531509399414, "logps/chosen": -260.9403381347656, "logps/rejected": -253.1968994140625, "loss": 0.6806, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9522621631622314, "rewards/margins": 0.22315946221351624, "rewards/rejected": 1.7291024923324585, "step": 26120 }, { "epoch": 1.2131482427225033, "grad_norm": 123.56151580810547, "learning_rate": 2.2725567575096336e-07, "logits/chosen": -18.74112892150879, "logits/rejected": -18.121326446533203, "logps/chosen": -480.48931884765625, "logps/rejected": -427.5271911621094, "loss": 0.7137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.333380937576294, "rewards/margins": 0.3427525758743286, "rewards/rejected": 2.990628480911255, "step": 26130 }, { "epoch": 1.2136125168299363, "grad_norm": 85.29383850097656, "learning_rate": 2.2722781930451738e-07, "logits/chosen": -18.604320526123047, "logits/rejected": -17.40860366821289, "logps/chosen": -366.48162841796875, "logps/rejected": -331.0735168457031, "loss": 0.5548, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.740419864654541, "rewards/margins": 0.6386991739273071, "rewards/rejected": 2.1017205715179443, "step": 26140 }, { "epoch": 1.2140767909373693, "grad_norm": 108.13909912109375, "learning_rate": 2.271999628580714e-07, "logits/chosen": -19.28420066833496, "logits/rejected": -18.613384246826172, "logps/chosen": -459.3080139160156, "logps/rejected": -369.2974853515625, "loss": 0.3984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.315584659576416, "rewards/margins": 1.0639835596084595, "rewards/rejected": 2.251600742340088, "step": 26150 }, { "epoch": 1.2145410650448025, "grad_norm": 72.47946166992188, "learning_rate": 2.271721064116254e-07, "logits/chosen": -18.51705551147461, "logits/rejected": -17.593076705932617, "logps/chosen": -335.65673828125, "logps/rejected": 
-253.36288452148438, "loss": 0.4772, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1343984603881836, "rewards/margins": 1.1561777591705322, "rewards/rejected": 1.9782209396362305, "step": 26160 }, { "epoch": 1.2150053391522355, "grad_norm": 57.410404205322266, "learning_rate": 2.2714424996517945e-07, "logits/chosen": -18.208324432373047, "logits/rejected": -17.271892547607422, "logps/chosen": -393.8982849121094, "logps/rejected": -271.0118713378906, "loss": 0.4015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3335843086242676, "rewards/margins": 1.199730634689331, "rewards/rejected": 2.1338534355163574, "step": 26170 }, { "epoch": 1.2154696132596685, "grad_norm": 8.777316093444824, "learning_rate": 2.2711639351873346e-07, "logits/chosen": -18.78342056274414, "logits/rejected": -17.744144439697266, "logps/chosen": -503.6905822753906, "logps/rejected": -362.70977783203125, "loss": 0.4024, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8636863231658936, "rewards/margins": 1.6207736730575562, "rewards/rejected": 2.2429122924804688, "step": 26180 }, { "epoch": 1.2159338873671015, "grad_norm": 11.031211853027344, "learning_rate": 2.2708853707228745e-07, "logits/chosen": -18.175796508789062, "logits/rejected": -17.46197509765625, "logps/chosen": -350.1917724609375, "logps/rejected": -219.05447387695312, "loss": 0.4332, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9645848274230957, "rewards/margins": 1.3559707403182983, "rewards/rejected": 1.6086139678955078, "step": 26190 }, { "epoch": 1.2163981614745345, "grad_norm": 117.26887512207031, "learning_rate": 2.270606806258415e-07, "logits/chosen": -18.31443977355957, "logits/rejected": -18.11252212524414, "logps/chosen": -344.99676513671875, "logps/rejected": -393.848876953125, "loss": 0.9245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.377699375152588, "rewards/margins": 0.0569564513862133, "rewards/rejected": 2.3207428455352783, "step": 26200 }, { 
"epoch": 1.2168624355819677, "grad_norm": 35.37714385986328, "learning_rate": 2.270328241793955e-07, "logits/chosen": -18.246835708618164, "logits/rejected": -17.903430938720703, "logps/chosen": -427.8998107910156, "logps/rejected": -346.10797119140625, "loss": 0.5829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7962074279785156, "rewards/margins": 0.880495548248291, "rewards/rejected": 1.9157119989395142, "step": 26210 }, { "epoch": 1.2173267096894007, "grad_norm": 47.05765151977539, "learning_rate": 2.2700496773294955e-07, "logits/chosen": -17.711746215820312, "logits/rejected": -17.798099517822266, "logps/chosen": -398.75439453125, "logps/rejected": -400.80474853515625, "loss": 0.7445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.987696409225464, "rewards/margins": 0.6115447878837585, "rewards/rejected": 2.3761518001556396, "step": 26220 }, { "epoch": 1.2177909837968337, "grad_norm": 262.8958740234375, "learning_rate": 2.2697711128650353e-07, "logits/chosen": -18.61861801147461, "logits/rejected": -17.48394012451172, "logps/chosen": -409.13372802734375, "logps/rejected": -304.2777404785156, "loss": 0.9159, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.57914400100708, "rewards/margins": 0.1520681530237198, "rewards/rejected": 2.4270758628845215, "step": 26230 }, { "epoch": 1.2182552579042667, "grad_norm": 7.462989807128906, "learning_rate": 2.2694925484005755e-07, "logits/chosen": -18.8256893157959, "logits/rejected": -18.288698196411133, "logps/chosen": -397.70440673828125, "logps/rejected": -324.0061950683594, "loss": 0.6295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4982521533966064, "rewards/margins": 1.1990454196929932, "rewards/rejected": 2.299206495285034, "step": 26240 }, { "epoch": 1.2187195320116997, "grad_norm": 189.43438720703125, "learning_rate": 2.269213983936116e-07, "logits/chosen": -18.433460235595703, "logits/rejected": -18.048416137695312, "logps/chosen": -448.2801818847656, 
"logps/rejected": -374.8477478027344, "loss": 0.6769, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9717624187469482, "rewards/margins": 0.459979385137558, "rewards/rejected": 2.5117831230163574, "step": 26250 }, { "epoch": 1.2191838061191327, "grad_norm": 94.84452819824219, "learning_rate": 2.268935419471656e-07, "logits/chosen": -18.345111846923828, "logits/rejected": -16.57012939453125, "logps/chosen": -395.92388916015625, "logps/rejected": -213.0150604248047, "loss": 0.2937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.336686372756958, "rewards/margins": 2.025042772293091, "rewards/rejected": 1.3116434812545776, "step": 26260 }, { "epoch": 1.2196480802265657, "grad_norm": 60.24872589111328, "learning_rate": 2.268656855007196e-07, "logits/chosen": -18.314029693603516, "logits/rejected": -17.79142951965332, "logps/chosen": -352.55987548828125, "logps/rejected": -301.54583740234375, "loss": 0.6253, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.113248348236084, "rewards/margins": 0.9180103540420532, "rewards/rejected": 2.1952383518218994, "step": 26270 }, { "epoch": 1.220112354333999, "grad_norm": 13.534360885620117, "learning_rate": 2.2683782905427363e-07, "logits/chosen": -19.496536254882812, "logits/rejected": -18.759981155395508, "logps/chosen": -326.3200378417969, "logps/rejected": -283.3829040527344, "loss": 0.7088, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2307417392730713, "rewards/margins": 0.6383981704711914, "rewards/rejected": 1.5923435688018799, "step": 26280 }, { "epoch": 1.2205766284414319, "grad_norm": 87.94407653808594, "learning_rate": 2.2680997260782765e-07, "logits/chosen": -18.28795051574707, "logits/rejected": -17.98958396911621, "logps/chosen": -385.87799072265625, "logps/rejected": -400.51239013671875, "loss": 0.5415, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7662177085876465, "rewards/margins": 0.7126909494400024, "rewards/rejected": 2.0535268783569336, 
"step": 26290 }, { "epoch": 1.2210409025488649, "grad_norm": 164.44952392578125, "learning_rate": 2.2678211616138166e-07, "logits/chosen": -19.296871185302734, "logits/rejected": -19.59732437133789, "logps/chosen": -410.22100830078125, "logps/rejected": -482.29473876953125, "loss": 0.9413, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.7457947731018066, "rewards/margins": 0.18651843070983887, "rewards/rejected": 3.5592758655548096, "step": 26300 }, { "epoch": 1.2215051766562979, "grad_norm": 32.23936462402344, "learning_rate": 2.2675425971493568e-07, "logits/chosen": -18.831239700317383, "logits/rejected": -18.143814086914062, "logps/chosen": -336.2738952636719, "logps/rejected": -302.4380798339844, "loss": 0.6112, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.065168857574463, "rewards/margins": 0.5849058032035828, "rewards/rejected": 2.4802629947662354, "step": 26310 }, { "epoch": 1.2219694507637309, "grad_norm": 69.23609924316406, "learning_rate": 2.2672640326848972e-07, "logits/chosen": -19.53461265563965, "logits/rejected": -18.28130531311035, "logps/chosen": -407.99664306640625, "logps/rejected": -288.4060974121094, "loss": 0.4099, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6968822479248047, "rewards/margins": 1.8114837408065796, "rewards/rejected": 1.885398268699646, "step": 26320 }, { "epoch": 1.2224337248711639, "grad_norm": 216.20651245117188, "learning_rate": 2.2669854682204373e-07, "logits/chosen": -19.03024673461914, "logits/rejected": -18.26157569885254, "logps/chosen": -421.5751037597656, "logps/rejected": -360.5625915527344, "loss": 0.6397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7623989582061768, "rewards/margins": 0.7006049752235413, "rewards/rejected": 3.0617940425872803, "step": 26330 }, { "epoch": 1.2228979989785969, "grad_norm": 15.153545379638672, "learning_rate": 2.2667069037559772e-07, "logits/chosen": -18.77290153503418, "logits/rejected": -17.35110092163086, 
"logps/chosen": -468.7491149902344, "logps/rejected": -291.3679504394531, "loss": 0.3525, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.423797607421875, "rewards/margins": 1.6414718627929688, "rewards/rejected": 1.7823255062103271, "step": 26340 }, { "epoch": 1.22336227308603, "grad_norm": 82.51900482177734, "learning_rate": 2.2664283392915176e-07, "logits/chosen": -18.341856002807617, "logits/rejected": -16.955236434936523, "logps/chosen": -479.354736328125, "logps/rejected": -291.6070861816406, "loss": 0.4454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7521424293518066, "rewards/margins": 1.7900187969207764, "rewards/rejected": 1.9621235132217407, "step": 26350 }, { "epoch": 1.223826547193463, "grad_norm": 99.07614135742188, "learning_rate": 2.2661497748270578e-07, "logits/chosen": -18.050371170043945, "logits/rejected": -17.831043243408203, "logps/chosen": -407.6334228515625, "logps/rejected": -330.0123291015625, "loss": 0.6157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1919970512390137, "rewards/margins": 0.6919033527374268, "rewards/rejected": 2.5000932216644287, "step": 26360 }, { "epoch": 1.224290821300896, "grad_norm": 30.328922271728516, "learning_rate": 2.2658712103625982e-07, "logits/chosen": -18.762042999267578, "logits/rejected": -17.754058837890625, "logps/chosen": -429.43963623046875, "logps/rejected": -324.4588928222656, "loss": 0.6268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5407981872558594, "rewards/margins": 0.7993383407592773, "rewards/rejected": 1.741459846496582, "step": 26370 }, { "epoch": 1.224755095408329, "grad_norm": 216.7106170654297, "learning_rate": 2.265592645898138e-07, "logits/chosen": -19.71759605407715, "logits/rejected": -19.141355514526367, "logps/chosen": -417.7137756347656, "logps/rejected": -354.02874755859375, "loss": 1.1086, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4892578125, "rewards/margins": 0.38699859380722046, 
"rewards/rejected": 3.102259635925293, "step": 26380 }, { "epoch": 1.225219369515762, "grad_norm": 97.4752197265625, "learning_rate": 2.2653140814336782e-07, "logits/chosen": -19.061601638793945, "logits/rejected": -18.65553855895996, "logps/chosen": -368.5208435058594, "logps/rejected": -280.3473815917969, "loss": 0.5094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2266452312469482, "rewards/margins": 1.0808745622634888, "rewards/rejected": 2.14577054977417, "step": 26390 }, { "epoch": 1.225683643623195, "grad_norm": 54.678192138671875, "learning_rate": 2.2650355169692186e-07, "logits/chosen": -18.770034790039062, "logits/rejected": -18.393278121948242, "logps/chosen": -363.2131042480469, "logps/rejected": -274.0679016113281, "loss": 0.8957, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.988884687423706, "rewards/margins": -0.1118096113204956, "rewards/rejected": 2.100694179534912, "step": 26400 }, { "epoch": 1.2261479177306283, "grad_norm": 2.078618288040161, "learning_rate": 2.2647569525047588e-07, "logits/chosen": -20.209461212158203, "logits/rejected": -18.459880828857422, "logps/chosen": -311.5202331542969, "logps/rejected": -183.34303283691406, "loss": 0.4464, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0317370891571045, "rewards/margins": 2.050507068634033, "rewards/rejected": 0.9812299013137817, "step": 26410 }, { "epoch": 1.2266121918380612, "grad_norm": 40.17897415161133, "learning_rate": 2.264478388040299e-07, "logits/chosen": -19.242734909057617, "logits/rejected": -18.451698303222656, "logps/chosen": -440.052978515625, "logps/rejected": -340.7781982421875, "loss": 0.6694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6468605995178223, "rewards/margins": 0.48093900084495544, "rewards/rejected": 2.165921449661255, "step": 26420 }, { "epoch": 1.2270764659454942, "grad_norm": 55.6024055480957, "learning_rate": 2.264199823575839e-07, "logits/chosen": -19.63626480102539, "logits/rejected": 
-18.147916793823242, "logps/chosen": -381.8561096191406, "logps/rejected": -300.9503479003906, "loss": 0.4244, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.150679111480713, "rewards/margins": 1.1621935367584229, "rewards/rejected": 1.9884859323501587, "step": 26430 }, { "epoch": 1.2275407400529272, "grad_norm": 0.49415528774261475, "learning_rate": 2.2639212591113792e-07, "logits/chosen": -18.86846351623535, "logits/rejected": -18.362781524658203, "logps/chosen": -377.35113525390625, "logps/rejected": -308.1499938964844, "loss": 0.8475, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.997356653213501, "rewards/margins": 0.353874146938324, "rewards/rejected": 2.6434826850891113, "step": 26440 }, { "epoch": 1.2280050141603602, "grad_norm": 260.37158203125, "learning_rate": 2.2636426946469196e-07, "logits/chosen": -18.2574405670166, "logits/rejected": -17.729167938232422, "logps/chosen": -328.0274658203125, "logps/rejected": -271.6080322265625, "loss": 0.6516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.671604871749878, "rewards/margins": 0.8250864148139954, "rewards/rejected": 1.8465187549591064, "step": 26450 }, { "epoch": 1.2284692882677932, "grad_norm": 60.77957534790039, "learning_rate": 2.2633641301824595e-07, "logits/chosen": -18.593425750732422, "logits/rejected": -17.28902816772461, "logps/chosen": -455.86279296875, "logps/rejected": -312.1059875488281, "loss": 0.3976, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.15380334854126, "rewards/margins": 1.565464973449707, "rewards/rejected": 2.5883381366729736, "step": 26460 }, { "epoch": 1.2289335623752264, "grad_norm": 112.91633605957031, "learning_rate": 2.263085565718e-07, "logits/chosen": -17.95547103881836, "logits/rejected": -18.993938446044922, "logps/chosen": -388.11669921875, "logps/rejected": -451.7691955566406, "loss": 1.1983, "rewards/accuracies": 0.5, "rewards/chosen": 2.748760223388672, "rewards/margins": -0.14318744838237762, 
"rewards/rejected": 2.8919479846954346, "step": 26470 }, { "epoch": 1.2293978364826594, "grad_norm": 12.981513977050781, "learning_rate": 2.26280700125354e-07, "logits/chosen": -18.690303802490234, "logits/rejected": -17.860082626342773, "logps/chosen": -458.59564208984375, "logps/rejected": -386.94366455078125, "loss": 0.7639, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4078502655029297, "rewards/margins": 0.6295459866523743, "rewards/rejected": 2.7783045768737793, "step": 26480 }, { "epoch": 1.2298621105900924, "grad_norm": 70.46629333496094, "learning_rate": 2.26252843678908e-07, "logits/chosen": -19.11136245727539, "logits/rejected": -18.678659439086914, "logps/chosen": -477.9192810058594, "logps/rejected": -332.2737121582031, "loss": 0.4225, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.215235471725464, "rewards/margins": 0.908115565776825, "rewards/rejected": 2.307119846343994, "step": 26490 }, { "epoch": 1.2303263846975254, "grad_norm": 1.7338109016418457, "learning_rate": 2.2622498723246203e-07, "logits/chosen": -18.577152252197266, "logits/rejected": -17.337772369384766, "logps/chosen": -476.80035400390625, "logps/rejected": -266.1208801269531, "loss": 0.3232, "rewards/accuracies": 1.0, "rewards/chosen": 3.321380615234375, "rewards/margins": 1.5464622974395752, "rewards/rejected": 1.7749183177947998, "step": 26500 }, { "epoch": 1.2307906588049584, "grad_norm": 47.10158920288086, "learning_rate": 2.2619713078601605e-07, "logits/chosen": -18.20856285095215, "logits/rejected": -17.738386154174805, "logps/chosen": -422.5848693847656, "logps/rejected": -404.19891357421875, "loss": 0.6979, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2066121101379395, "rewards/margins": 0.5969409942626953, "rewards/rejected": 2.609671115875244, "step": 26510 }, { "epoch": 1.2312549329123914, "grad_norm": 241.04254150390625, "learning_rate": 2.261692743395701e-07, "logits/chosen": -18.95876121520996, "logits/rejected": 
-19.10479736328125, "logps/chosen": -396.9951171875, "logps/rejected": -249.33157348632812, "loss": 0.5754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.166637420654297, "rewards/margins": 1.1547927856445312, "rewards/rejected": 2.0118443965911865, "step": 26520 }, { "epoch": 1.2317192070198244, "grad_norm": 45.25527572631836, "learning_rate": 2.2614141789312408e-07, "logits/chosen": -19.027708053588867, "logits/rejected": -17.707820892333984, "logps/chosen": -334.1341552734375, "logps/rejected": -270.76373291015625, "loss": 0.4664, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.785829544067383, "rewards/margins": 1.5515388250350952, "rewards/rejected": 2.234290599822998, "step": 26530 }, { "epoch": 1.2321834811272576, "grad_norm": 74.28425598144531, "learning_rate": 2.261135614466781e-07, "logits/chosen": -18.763837814331055, "logits/rejected": -18.32036781311035, "logps/chosen": -390.734619140625, "logps/rejected": -369.68487548828125, "loss": 1.2988, "rewards/accuracies": 0.5, "rewards/chosen": 2.688833475112915, "rewards/margins": -0.5129397511482239, "rewards/rejected": 3.2017734050750732, "step": 26540 }, { "epoch": 1.2326477552346906, "grad_norm": 14.025968551635742, "learning_rate": 2.2608570500023213e-07, "logits/chosen": -19.071374893188477, "logits/rejected": -17.294065475463867, "logps/chosen": -383.33953857421875, "logps/rejected": -296.6546936035156, "loss": 0.5502, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.297454357147217, "rewards/margins": 1.0486820936203003, "rewards/rejected": 2.248772144317627, "step": 26550 }, { "epoch": 1.2331120293421236, "grad_norm": 46.56739044189453, "learning_rate": 2.2605784855378615e-07, "logits/chosen": -19.980937957763672, "logits/rejected": -19.26267433166504, "logps/chosen": -329.5985412597656, "logps/rejected": -296.90252685546875, "loss": 0.565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.458604335784912, "rewards/margins": 0.844499945640564, 
"rewards/rejected": 2.614104747772217, "step": 26560 }, { "epoch": 1.2335763034495566, "grad_norm": 92.7117919921875, "learning_rate": 2.2602999210734016e-07, "logits/chosen": -18.496685028076172, "logits/rejected": -17.330734252929688, "logps/chosen": -397.5951232910156, "logps/rejected": -306.94219970703125, "loss": 0.5188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2691261768341064, "rewards/margins": 1.229547142982483, "rewards/rejected": 2.039578914642334, "step": 26570 }, { "epoch": 1.2340405775569896, "grad_norm": 27.182456970214844, "learning_rate": 2.2600213566089418e-07, "logits/chosen": -19.14019012451172, "logits/rejected": -17.523784637451172, "logps/chosen": -387.37127685546875, "logps/rejected": -280.07244873046875, "loss": 0.3975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.087359428405762, "rewards/margins": 1.8387033939361572, "rewards/rejected": 2.2486557960510254, "step": 26580 }, { "epoch": 1.2345048516644226, "grad_norm": 153.5219268798828, "learning_rate": 2.2597427921444822e-07, "logits/chosen": -18.878984451293945, "logits/rejected": -18.174318313598633, "logps/chosen": -458.82550048828125, "logps/rejected": -335.9886779785156, "loss": 0.6035, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9981493949890137, "rewards/margins": 0.7085078954696655, "rewards/rejected": 2.2896413803100586, "step": 26590 }, { "epoch": 1.2349691257718556, "grad_norm": 1.2191944122314453, "learning_rate": 2.2594642276800223e-07, "logits/chosen": -20.02285385131836, "logits/rejected": -18.704673767089844, "logps/chosen": -517.2922973632812, "logps/rejected": -368.2447814941406, "loss": 0.5063, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6362690925598145, "rewards/margins": 0.9634889364242554, "rewards/rejected": 2.6727800369262695, "step": 26600 }, { "epoch": 1.2354333998792888, "grad_norm": 11.323661804199219, "learning_rate": 2.2591856632155622e-07, "logits/chosen": -18.476062774658203, 
"logits/rejected": -18.350910186767578, "logps/chosen": -373.4455871582031, "logps/rejected": -371.6781311035156, "loss": 0.8638, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.47456431388855, "rewards/margins": 0.09927999973297119, "rewards/rejected": 2.375284194946289, "step": 26610 }, { "epoch": 1.2358976739867218, "grad_norm": 85.81209564208984, "learning_rate": 2.2589070987511026e-07, "logits/chosen": -19.15328598022461, "logits/rejected": -18.947906494140625, "logps/chosen": -416.8589782714844, "logps/rejected": -372.86981201171875, "loss": 0.4944, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5384914875030518, "rewards/margins": 0.7930593490600586, "rewards/rejected": 2.745432138442993, "step": 26620 }, { "epoch": 1.2363619480941548, "grad_norm": 14.583115577697754, "learning_rate": 2.2586285342866428e-07, "logits/chosen": -18.43950843811035, "logits/rejected": -17.99354362487793, "logps/chosen": -451.02392578125, "logps/rejected": -361.6495666503906, "loss": 0.6265, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4435298442840576, "rewards/margins": 0.6739908456802368, "rewards/rejected": 2.769538402557373, "step": 26630 }, { "epoch": 1.2368262222015878, "grad_norm": 119.47830963134766, "learning_rate": 2.2583499698221832e-07, "logits/chosen": -19.640504837036133, "logits/rejected": -17.45077133178711, "logps/chosen": -492.104736328125, "logps/rejected": -338.50042724609375, "loss": 0.4639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4561638832092285, "rewards/margins": 1.5700231790542603, "rewards/rejected": 1.8861404657363892, "step": 26640 }, { "epoch": 1.2372904963090208, "grad_norm": 64.68513488769531, "learning_rate": 2.258071405357723e-07, "logits/chosen": -18.830318450927734, "logits/rejected": -17.863889694213867, "logps/chosen": -361.9334716796875, "logps/rejected": -271.2427978515625, "loss": 0.5063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0156025886535645, 
"rewards/margins": 1.203613519668579, "rewards/rejected": 1.8119890689849854, "step": 26650 }, { "epoch": 1.237754770416454, "grad_norm": 103.9702377319336, "learning_rate": 2.2577928408932632e-07, "logits/chosen": -18.295392990112305, "logits/rejected": -17.314464569091797, "logps/chosen": -466.96356201171875, "logps/rejected": -354.0509338378906, "loss": 0.3968, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6571319103240967, "rewards/margins": 1.0023621320724487, "rewards/rejected": 2.6547701358795166, "step": 26660 }, { "epoch": 1.238219044523887, "grad_norm": 84.62726593017578, "learning_rate": 2.2575142764288036e-07, "logits/chosen": -18.649517059326172, "logits/rejected": -17.93168067932129, "logps/chosen": -480.556884765625, "logps/rejected": -430.146484375, "loss": 0.5695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9741485118865967, "rewards/margins": 1.2090212106704712, "rewards/rejected": 2.765127420425415, "step": 26670 }, { "epoch": 1.23868331863132, "grad_norm": 187.6856231689453, "learning_rate": 2.2572357119643435e-07, "logits/chosen": -18.79623794555664, "logits/rejected": -17.56106185913086, "logps/chosen": -401.69610595703125, "logps/rejected": -240.81362915039062, "loss": 0.6455, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2413814067840576, "rewards/margins": 0.9034392237663269, "rewards/rejected": 2.337942600250244, "step": 26680 }, { "epoch": 1.239147592738753, "grad_norm": 94.29015350341797, "learning_rate": 2.2569571474998836e-07, "logits/chosen": -18.287065505981445, "logits/rejected": -17.97220230102539, "logps/chosen": -343.58013916015625, "logps/rejected": -316.6505126953125, "loss": 0.6325, "rewards/accuracies": 0.5, "rewards/chosen": 2.6842048168182373, "rewards/margins": 0.47334757447242737, "rewards/rejected": 2.210857391357422, "step": 26690 }, { "epoch": 1.239611866846186, "grad_norm": 89.516357421875, "learning_rate": 2.256678583035424e-07, "logits/chosen": -18.993349075317383, 
"logits/rejected": -19.037006378173828, "logps/chosen": -363.0284729003906, "logps/rejected": -401.47381591796875, "loss": 0.6755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.780656099319458, "rewards/margins": 0.5706362128257751, "rewards/rejected": 2.210019588470459, "step": 26700 }, { "epoch": 1.240076140953619, "grad_norm": 17.03728485107422, "learning_rate": 2.2564000185709642e-07, "logits/chosen": -19.41684341430664, "logits/rejected": -18.371562957763672, "logps/chosen": -404.7804870605469, "logps/rejected": -286.14013671875, "loss": 0.4553, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6813855171203613, "rewards/margins": 1.472883939743042, "rewards/rejected": 2.2085013389587402, "step": 26710 }, { "epoch": 1.240540415061052, "grad_norm": 19.141170501708984, "learning_rate": 2.2561214541065043e-07, "logits/chosen": -19.392452239990234, "logits/rejected": -18.969741821289062, "logps/chosen": -355.2585754394531, "logps/rejected": -328.424560546875, "loss": 0.8804, "rewards/accuracies": 0.5, "rewards/chosen": 3.026637315750122, "rewards/margins": 0.4413262903690338, "rewards/rejected": 2.5853111743927, "step": 26720 }, { "epoch": 1.2410046891684852, "grad_norm": 16.87065315246582, "learning_rate": 2.2558428896420445e-07, "logits/chosen": -18.737781524658203, "logits/rejected": -17.98478126525879, "logps/chosen": -414.02880859375, "logps/rejected": -342.2918701171875, "loss": 0.5575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5314953327178955, "rewards/margins": 0.7064696550369263, "rewards/rejected": 1.8250255584716797, "step": 26730 }, { "epoch": 1.2414689632759182, "grad_norm": 9.125032424926758, "learning_rate": 2.255564325177585e-07, "logits/chosen": -18.533235549926758, "logits/rejected": -17.27880859375, "logps/chosen": -437.6492614746094, "logps/rejected": -300.11029052734375, "loss": 0.8535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8119382858276367, "rewards/margins": 
0.899459958076477, "rewards/rejected": 1.9124786853790283, "step": 26740 }, { "epoch": 1.2419332373833512, "grad_norm": 91.21480560302734, "learning_rate": 2.255285760713125e-07, "logits/chosen": -19.451522827148438, "logits/rejected": -18.28321075439453, "logps/chosen": -354.9679260253906, "logps/rejected": -283.0146789550781, "loss": 0.589, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9998538494110107, "rewards/margins": 0.5229818820953369, "rewards/rejected": 2.476871967315674, "step": 26750 }, { "epoch": 1.2423975114907841, "grad_norm": 62.01862335205078, "learning_rate": 2.255007196248665e-07, "logits/chosen": -19.287708282470703, "logits/rejected": -19.237564086914062, "logps/chosen": -379.3844909667969, "logps/rejected": -365.37115478515625, "loss": 0.7245, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5358641147613525, "rewards/margins": 0.24870562553405762, "rewards/rejected": 2.287158489227295, "step": 26760 }, { "epoch": 1.2428617855982171, "grad_norm": 61.74559783935547, "learning_rate": 2.2547286317842053e-07, "logits/chosen": -18.630901336669922, "logits/rejected": -18.497623443603516, "logps/chosen": -351.05242919921875, "logps/rejected": -301.75390625, "loss": 0.6351, "rewards/accuracies": 0.5, "rewards/chosen": 2.572918653488159, "rewards/margins": 0.31872522830963135, "rewards/rejected": 2.2541935443878174, "step": 26770 }, { "epoch": 1.2433260597056501, "grad_norm": 40.814476013183594, "learning_rate": 2.2544500673197455e-07, "logits/chosen": -18.344602584838867, "logits/rejected": -18.20493507385254, "logps/chosen": -319.0268249511719, "logps/rejected": -314.80096435546875, "loss": 0.8292, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6875102519989014, "rewards/margins": 0.20825958251953125, "rewards/rejected": 2.47925066947937, "step": 26780 }, { "epoch": 1.2437903338130831, "grad_norm": 34.22871398925781, "learning_rate": 2.254171502855286e-07, "logits/chosen": -18.048198699951172, 
"logits/rejected": -17.94524383544922, "logps/chosen": -397.64422607421875, "logps/rejected": -401.1236572265625, "loss": 1.111, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9797911643981934, "rewards/margins": 0.39402294158935547, "rewards/rejected": 3.585768222808838, "step": 26790 }, { "epoch": 1.2442546079205163, "grad_norm": 14.629817962646484, "learning_rate": 2.2538929383908258e-07, "logits/chosen": -19.323938369750977, "logits/rejected": -17.62617301940918, "logps/chosen": -353.52532958984375, "logps/rejected": -230.94479370117188, "loss": 0.5362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.963365077972412, "rewards/margins": 1.5047606229782104, "rewards/rejected": 1.458604335784912, "step": 26800 }, { "epoch": 1.2447188820279493, "grad_norm": 31.753494262695312, "learning_rate": 2.253614373926366e-07, "logits/chosen": -18.634780883789062, "logits/rejected": -17.386932373046875, "logps/chosen": -322.80828857421875, "logps/rejected": -251.8843536376953, "loss": 0.4702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6343131065368652, "rewards/margins": 0.8574848175048828, "rewards/rejected": 1.776828408241272, "step": 26810 }, { "epoch": 1.2451831561353823, "grad_norm": 169.34449768066406, "learning_rate": 2.2533358094619063e-07, "logits/chosen": -18.849262237548828, "logits/rejected": -18.703062057495117, "logps/chosen": -419.7828063964844, "logps/rejected": -370.31683349609375, "loss": 0.7472, "rewards/accuracies": 0.5, "rewards/chosen": 2.776320695877075, "rewards/margins": 0.3418978750705719, "rewards/rejected": 2.434422731399536, "step": 26820 }, { "epoch": 1.2456474302428153, "grad_norm": 28.324331283569336, "learning_rate": 2.2530572449974465e-07, "logits/chosen": -18.499710083007812, "logits/rejected": -18.304523468017578, "logps/chosen": -350.28973388671875, "logps/rejected": -364.91729736328125, "loss": 0.6797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.300067901611328, 
"rewards/margins": 0.5128040313720703, "rewards/rejected": 2.787264108657837, "step": 26830 }, { "epoch": 1.2461117043502483, "grad_norm": 38.447906494140625, "learning_rate": 2.2527786805329866e-07, "logits/chosen": -20.20830535888672, "logits/rejected": -19.314611434936523, "logps/chosen": -463.32171630859375, "logps/rejected": -450.6817321777344, "loss": 0.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6030821800231934, "rewards/margins": 0.5846080780029297, "rewards/rejected": 3.0184738636016846, "step": 26840 }, { "epoch": 1.2465759784576815, "grad_norm": 189.9942626953125, "learning_rate": 2.2525001160685267e-07, "logits/chosen": -18.54258918762207, "logits/rejected": -18.292835235595703, "logps/chosen": -384.01483154296875, "logps/rejected": -346.6154479980469, "loss": 0.9464, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.161501407623291, "rewards/margins": 0.18918438255786896, "rewards/rejected": 2.9723167419433594, "step": 26850 }, { "epoch": 1.2470402525651145, "grad_norm": 105.64529418945312, "learning_rate": 2.252221551604067e-07, "logits/chosen": -19.327640533447266, "logits/rejected": -18.804960250854492, "logps/chosen": -426.6156311035156, "logps/rejected": -322.60845947265625, "loss": 1.0986, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0226707458496094, "rewards/margins": 0.36624622344970703, "rewards/rejected": 2.6564242839813232, "step": 26860 }, { "epoch": 1.2475045266725475, "grad_norm": 28.535764694213867, "learning_rate": 2.2519429871396073e-07, "logits/chosen": -18.622406005859375, "logits/rejected": -18.11750030517578, "logps/chosen": -313.85418701171875, "logps/rejected": -260.48590087890625, "loss": 0.6667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5940299034118652, "rewards/margins": 0.4815402626991272, "rewards/rejected": 2.1124894618988037, "step": 26870 }, { "epoch": 1.2479688007799805, "grad_norm": 64.6421890258789, "learning_rate": 2.2516644226751472e-07, 
"logits/chosen": -19.165569305419922, "logits/rejected": -18.236698150634766, "logps/chosen": -417.210693359375, "logps/rejected": -240.23629760742188, "loss": 0.2977, "rewards/accuracies": 1.0, "rewards/chosen": 3.104835033416748, "rewards/margins": 1.1746962070465088, "rewards/rejected": 1.9301389455795288, "step": 26880 }, { "epoch": 1.2484330748874135, "grad_norm": 177.52554321289062, "learning_rate": 2.2513858582106876e-07, "logits/chosen": -19.275135040283203, "logits/rejected": -18.51932144165039, "logps/chosen": -364.96685791015625, "logps/rejected": -287.4686279296875, "loss": 0.8912, "rewards/accuracies": 0.5, "rewards/chosen": 2.9474778175354004, "rewards/margins": 0.46749362349510193, "rewards/rejected": 2.479984760284424, "step": 26890 }, { "epoch": 1.2488973489948465, "grad_norm": 8.434253692626953, "learning_rate": 2.2511072937462277e-07, "logits/chosen": -18.071163177490234, "logits/rejected": -17.528409957885742, "logps/chosen": -334.6692199707031, "logps/rejected": -204.8892822265625, "loss": 0.5765, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6986145973205566, "rewards/margins": 1.1483514308929443, "rewards/rejected": 1.5502634048461914, "step": 26900 }, { "epoch": 1.2493616231022795, "grad_norm": 108.34651947021484, "learning_rate": 2.2508287292817676e-07, "logits/chosen": -19.880754470825195, "logits/rejected": -20.034957885742188, "logps/chosen": -448.13763427734375, "logps/rejected": -430.05780029296875, "loss": 0.6991, "rewards/accuracies": 0.5, "rewards/chosen": 3.944009304046631, "rewards/margins": 0.13426926732063293, "rewards/rejected": 3.8097400665283203, "step": 26910 }, { "epoch": 1.2498258972097127, "grad_norm": 13.308463096618652, "learning_rate": 2.250550164817308e-07, "logits/chosen": -19.280256271362305, "logits/rejected": -17.72159194946289, "logps/chosen": -391.0003967285156, "logps/rejected": -257.35577392578125, "loss": 0.2379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8593337535858154, 
"rewards/margins": 1.8783899545669556, "rewards/rejected": 0.980944037437439, "step": 26920 }, { "epoch": 1.2502901713171457, "grad_norm": 101.5059585571289, "learning_rate": 2.2502716003528482e-07, "logits/chosen": -18.55109977722168, "logits/rejected": -18.14169692993164, "logps/chosen": -405.0726013183594, "logps/rejected": -340.574462890625, "loss": 0.587, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5499234199523926, "rewards/margins": 0.31233513355255127, "rewards/rejected": 2.237588405609131, "step": 26930 }, { "epoch": 1.2507544454245787, "grad_norm": 1.5785832405090332, "learning_rate": 2.2499930358883886e-07, "logits/chosen": -19.177154541015625, "logits/rejected": -18.6722412109375, "logps/chosen": -407.3284606933594, "logps/rejected": -303.77099609375, "loss": 0.4505, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2451796531677246, "rewards/margins": 0.9431538581848145, "rewards/rejected": 2.30202579498291, "step": 26940 }, { "epoch": 1.2512187195320117, "grad_norm": 108.65440368652344, "learning_rate": 2.2497144714239285e-07, "logits/chosen": -17.976806640625, "logits/rejected": -17.581113815307617, "logps/chosen": -265.20233154296875, "logps/rejected": -241.3566131591797, "loss": 0.5213, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.471524715423584, "rewards/margins": 0.6084370017051697, "rewards/rejected": 1.8630876541137695, "step": 26950 }, { "epoch": 1.2516829936394447, "grad_norm": 87.03662109375, "learning_rate": 2.2494359069594686e-07, "logits/chosen": -18.207019805908203, "logits/rejected": -17.602176666259766, "logps/chosen": -467.6863708496094, "logps/rejected": -364.1920166015625, "loss": 0.5121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9595179557800293, "rewards/margins": 1.4198156595230103, "rewards/rejected": 2.5397026538848877, "step": 26960 }, { "epoch": 1.2521472677468777, "grad_norm": 10.89175033569336, "learning_rate": 2.249157342495009e-07, "logits/chosen": 
-17.945110321044922, "logits/rejected": -17.415796279907227, "logps/chosen": -434.08026123046875, "logps/rejected": -369.8470764160156, "loss": 0.8784, "rewards/accuracies": 0.5, "rewards/chosen": 3.3493804931640625, "rewards/margins": 0.4740748405456543, "rewards/rejected": 2.8753058910369873, "step": 26970 }, { "epoch": 1.2526115418543107, "grad_norm": 99.17279052734375, "learning_rate": 2.2488787780305492e-07, "logits/chosen": -19.184892654418945, "logits/rejected": -18.858003616333008, "logps/chosen": -351.8506164550781, "logps/rejected": -333.5193786621094, "loss": 0.5779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2343082427978516, "rewards/margins": 0.5205462574958801, "rewards/rejected": 2.713761806488037, "step": 26980 }, { "epoch": 1.2530758159617439, "grad_norm": 128.6915740966797, "learning_rate": 2.2486002135660893e-07, "logits/chosen": -18.711490631103516, "logits/rejected": -18.09313201904297, "logps/chosen": -417.4087829589844, "logps/rejected": -271.99560546875, "loss": 0.4067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.313976764678955, "rewards/margins": 1.2457845211029053, "rewards/rejected": 2.06819224357605, "step": 26990 }, { "epoch": 1.2535400900691769, "grad_norm": 3.6624577045440674, "learning_rate": 2.2483216491016295e-07, "logits/chosen": -18.40427017211914, "logits/rejected": -18.023460388183594, "logps/chosen": -439.43121337890625, "logps/rejected": -427.6068420410156, "loss": 0.8527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3227572441101074, "rewards/margins": 0.6231563687324524, "rewards/rejected": 2.6996009349823, "step": 27000 }, { "epoch": 1.2540043641766099, "grad_norm": 3.688108444213867, "learning_rate": 2.2480430846371699e-07, "logits/chosen": -18.233001708984375, "logits/rejected": -17.63895606994629, "logps/chosen": -384.0355224609375, "logps/rejected": -330.5226135253906, "loss": 0.7254, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2725512981414795, 
"rewards/margins": 0.6007829904556274, "rewards/rejected": 2.6717681884765625, "step": 27010 }, { "epoch": 1.2544686382840429, "grad_norm": 25.95802879333496, "learning_rate": 2.24776452017271e-07, "logits/chosen": -17.84341812133789, "logits/rejected": -17.323957443237305, "logps/chosen": -275.10784912109375, "logps/rejected": -257.71728515625, "loss": 0.7933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.360182762145996, "rewards/margins": 0.5692646503448486, "rewards/rejected": 1.7909183502197266, "step": 27020 }, { "epoch": 1.2549329123914759, "grad_norm": 110.2599105834961, "learning_rate": 2.24748595570825e-07, "logits/chosen": -18.802278518676758, "logits/rejected": -18.557666778564453, "logps/chosen": -401.35076904296875, "logps/rejected": -356.07940673828125, "loss": 0.5778, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3049914836883545, "rewards/margins": 0.626264214515686, "rewards/rejected": 2.678727626800537, "step": 27030 }, { "epoch": 1.255397186498909, "grad_norm": 80.68828582763672, "learning_rate": 2.2472073912437903e-07, "logits/chosen": -18.47518539428711, "logits/rejected": -18.768705368041992, "logps/chosen": -387.10223388671875, "logps/rejected": -315.96282958984375, "loss": 1.0065, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5419678688049316, "rewards/margins": 0.2890937328338623, "rewards/rejected": 2.2528741359710693, "step": 27040 }, { "epoch": 1.2558614606063419, "grad_norm": 26.601903915405273, "learning_rate": 2.2469288267793304e-07, "logits/chosen": -18.31015968322754, "logits/rejected": -17.344036102294922, "logps/chosen": -309.0499572753906, "logps/rejected": -251.0706024169922, "loss": 0.4777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.261475086212158, "rewards/margins": 1.3099300861358643, "rewards/rejected": 1.951545000076294, "step": 27050 }, { "epoch": 1.256325734713775, "grad_norm": 124.77889251708984, "learning_rate": 2.2466502623148709e-07, "logits/chosen": 
-18.482765197753906, "logits/rejected": -17.424718856811523, "logps/chosen": -398.6727600097656, "logps/rejected": -293.5951843261719, "loss": 0.5654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2254199981689453, "rewards/margins": 1.1254174709320068, "rewards/rejected": 2.1000022888183594, "step": 27060 }, { "epoch": 1.256790008821208, "grad_norm": 2.838027000427246, "learning_rate": 2.2463716978504107e-07, "logits/chosen": -19.474727630615234, "logits/rejected": -19.277393341064453, "logps/chosen": -491.0738220214844, "logps/rejected": -480.931396484375, "loss": 0.701, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.8784847259521484, "rewards/margins": 0.38414764404296875, "rewards/rejected": 3.4943370819091797, "step": 27070 }, { "epoch": 1.257254282928641, "grad_norm": 88.65399932861328, "learning_rate": 2.246093133385951e-07, "logits/chosen": -19.04339599609375, "logits/rejected": -18.056640625, "logps/chosen": -622.0498657226562, "logps/rejected": -461.0049743652344, "loss": 0.6342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.244714736938477, "rewards/margins": 0.7834131717681885, "rewards/rejected": 3.461301326751709, "step": 27080 }, { "epoch": 1.257718557036074, "grad_norm": 177.53170776367188, "learning_rate": 2.2458145689214913e-07, "logits/chosen": -18.339479446411133, "logits/rejected": -18.584442138671875, "logps/chosen": -447.96490478515625, "logps/rejected": -387.2854919433594, "loss": 0.9594, "rewards/accuracies": 0.5, "rewards/chosen": 3.0521903038024902, "rewards/margins": 0.0779917985200882, "rewards/rejected": 2.974198341369629, "step": 27090 }, { "epoch": 1.258182831143507, "grad_norm": 108.90086364746094, "learning_rate": 2.2455360044570312e-07, "logits/chosen": -18.00931739807129, "logits/rejected": -17.290328979492188, "logps/chosen": -369.1174011230469, "logps/rejected": -316.8238220214844, "loss": 0.5453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9132437705993652, 
"rewards/margins": 1.2865557670593262, "rewards/rejected": 1.626688003540039, "step": 27100 }, { "epoch": 1.2586471052509403, "grad_norm": 24.435832977294922, "learning_rate": 2.2452574399925713e-07, "logits/chosen": -19.892337799072266, "logits/rejected": -19.512598037719727, "logps/chosen": -370.0697937011719, "logps/rejected": -303.6750183105469, "loss": 0.7563, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3462164402008057, "rewards/margins": 0.33592724800109863, "rewards/rejected": 2.010289430618286, "step": 27110 }, { "epoch": 1.2591113793583733, "grad_norm": 72.9441146850586, "learning_rate": 2.2449788755281117e-07, "logits/chosen": -20.191797256469727, "logits/rejected": -19.340534210205078, "logps/chosen": -349.3985595703125, "logps/rejected": -301.96527099609375, "loss": 0.5177, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1916024684906006, "rewards/margins": 0.8255593180656433, "rewards/rejected": 2.3660428524017334, "step": 27120 }, { "epoch": 1.2595756534658062, "grad_norm": 14.760904312133789, "learning_rate": 2.244700311063652e-07, "logits/chosen": -19.12088966369629, "logits/rejected": -18.271114349365234, "logps/chosen": -364.94720458984375, "logps/rejected": -293.87164306640625, "loss": 0.5713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.970928907394409, "rewards/margins": 0.6853410005569458, "rewards/rejected": 2.285587787628174, "step": 27130 }, { "epoch": 1.2600399275732392, "grad_norm": 43.08918762207031, "learning_rate": 2.244421746599192e-07, "logits/chosen": -18.105878829956055, "logits/rejected": -17.750871658325195, "logps/chosen": -397.1445007324219, "logps/rejected": -368.5334777832031, "loss": 0.6147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.440741777420044, "rewards/margins": 1.0522749423980713, "rewards/rejected": 2.3884665966033936, "step": 27140 }, { "epoch": 1.2605042016806722, "grad_norm": 5.657496452331543, "learning_rate": 2.2441431821347322e-07, 
"logits/chosen": -18.558917999267578, "logits/rejected": -18.59774398803711, "logps/chosen": -411.24432373046875, "logps/rejected": -478.1405334472656, "loss": 1.0356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0433883666992188, "rewards/margins": 0.1655346155166626, "rewards/rejected": 2.8778538703918457, "step": 27150 }, { "epoch": 1.2609684757881052, "grad_norm": 31.72960090637207, "learning_rate": 2.2438646176702726e-07, "logits/chosen": -18.596750259399414, "logits/rejected": -18.561738967895508, "logps/chosen": -345.41058349609375, "logps/rejected": -359.7509460449219, "loss": 1.2827, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.336235761642456, "rewards/margins": -0.4282306730747223, "rewards/rejected": 2.7644667625427246, "step": 27160 }, { "epoch": 1.2614327498955382, "grad_norm": 23.27060317993164, "learning_rate": 2.2435860532058127e-07, "logits/chosen": -19.589799880981445, "logits/rejected": -18.132213592529297, "logps/chosen": -356.8643493652344, "logps/rejected": -246.70742797851562, "loss": 0.6661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.458409309387207, "rewards/margins": 0.9731564521789551, "rewards/rejected": 1.4852527379989624, "step": 27170 }, { "epoch": 1.2618970240029714, "grad_norm": 70.76461029052734, "learning_rate": 2.2433074887413526e-07, "logits/chosen": -19.599355697631836, "logits/rejected": -18.919017791748047, "logps/chosen": -371.7813415527344, "logps/rejected": -306.90185546875, "loss": 0.6172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.181363344192505, "rewards/margins": 0.7889841794967651, "rewards/rejected": 2.39237904548645, "step": 27180 }, { "epoch": 1.2623612981104044, "grad_norm": 1.334658145904541, "learning_rate": 2.243028924276893e-07, "logits/chosen": -18.625293731689453, "logits/rejected": -18.63547134399414, "logps/chosen": -412.2630920410156, "logps/rejected": -365.29327392578125, "loss": 1.2435, "rewards/accuracies": 0.5, "rewards/chosen": 
3.0520801544189453, "rewards/margins": -0.13502565026283264, "rewards/rejected": 3.187106132507324, "step": 27190 }, { "epoch": 1.2628255722178374, "grad_norm": 37.799232482910156, "learning_rate": 2.2427503598124332e-07, "logits/chosen": -18.974102020263672, "logits/rejected": -18.617374420166016, "logps/chosen": -434.69403076171875, "logps/rejected": -348.3297424316406, "loss": 0.8094, "rewards/accuracies": 0.5, "rewards/chosen": 3.3662097454071045, "rewards/margins": 0.5884532928466797, "rewards/rejected": 2.777756929397583, "step": 27200 }, { "epoch": 1.2632898463252704, "grad_norm": 114.99311065673828, "learning_rate": 2.2424717953479736e-07, "logits/chosen": -18.075138092041016, "logits/rejected": -18.004243850708008, "logps/chosen": -431.9600524902344, "logps/rejected": -496.63006591796875, "loss": 0.9821, "rewards/accuracies": 0.5, "rewards/chosen": 3.1451923847198486, "rewards/margins": -0.20546868443489075, "rewards/rejected": 3.350661039352417, "step": 27210 }, { "epoch": 1.2637541204327034, "grad_norm": 1.6146612167358398, "learning_rate": 2.2421932308835134e-07, "logits/chosen": -19.394912719726562, "logits/rejected": -18.98599624633789, "logps/chosen": -484.75689697265625, "logps/rejected": -403.97454833984375, "loss": 0.8291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5181548595428467, "rewards/margins": 0.946880042552948, "rewards/rejected": 2.571275234222412, "step": 27220 }, { "epoch": 1.2642183945401366, "grad_norm": 25.461223602294922, "learning_rate": 2.2419146664190536e-07, "logits/chosen": -19.549968719482422, "logits/rejected": -18.068851470947266, "logps/chosen": -439.6224670410156, "logps/rejected": -292.6976013183594, "loss": 0.616, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1002485752105713, "rewards/margins": 1.2793152332305908, "rewards/rejected": 1.8209333419799805, "step": 27230 }, { "epoch": 1.2646826686475694, "grad_norm": 40.27078628540039, "learning_rate": 2.241636101954594e-07, 
"logits/chosen": -20.179370880126953, "logits/rejected": -20.054079055786133, "logps/chosen": -476.53271484375, "logps/rejected": -460.9225158691406, "loss": 0.7129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.113448143005371, "rewards/margins": 0.24626760184764862, "rewards/rejected": 2.867180347442627, "step": 27240 }, { "epoch": 1.2651469427550026, "grad_norm": 77.49908447265625, "learning_rate": 2.2413575374901341e-07, "logits/chosen": -17.46586036682129, "logits/rejected": -17.173709869384766, "logps/chosen": -346.1502380371094, "logps/rejected": -285.73016357421875, "loss": 0.5998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.33937406539917, "rewards/margins": 0.8254686594009399, "rewards/rejected": 1.51390540599823, "step": 27250 }, { "epoch": 1.2656112168624356, "grad_norm": 77.95346069335938, "learning_rate": 2.2410789730256743e-07, "logits/chosen": -17.78957176208496, "logits/rejected": -16.949399948120117, "logps/chosen": -360.081298828125, "logps/rejected": -237.80203247070312, "loss": 0.603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5277092456817627, "rewards/margins": 1.1779658794403076, "rewards/rejected": 1.349743366241455, "step": 27260 }, { "epoch": 1.2660754909698686, "grad_norm": 46.86552047729492, "learning_rate": 2.2408004085612144e-07, "logits/chosen": -19.022586822509766, "logits/rejected": -18.74250030517578, "logps/chosen": -454.2607421875, "logps/rejected": -366.52325439453125, "loss": 0.6785, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5006377696990967, "rewards/margins": 0.7246161699295044, "rewards/rejected": 2.776021718978882, "step": 27270 }, { "epoch": 1.2665397650773016, "grad_norm": 40.76836013793945, "learning_rate": 2.2405218440967546e-07, "logits/chosen": -19.005142211914062, "logits/rejected": -18.409034729003906, "logps/chosen": -427.6949768066406, "logps/rejected": -276.1646423339844, "loss": 0.5805, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.678043842315674, "rewards/margins": 0.9706453084945679, "rewards/rejected": 2.7073981761932373, "step": 27280 }, { "epoch": 1.2670040391847346, "grad_norm": 65.75064849853516, "learning_rate": 2.2402432796322947e-07, "logits/chosen": -17.92876434326172, "logits/rejected": -17.36474609375, "logps/chosen": -308.1967468261719, "logps/rejected": -262.04833984375, "loss": 0.6774, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8463973999023438, "rewards/margins": 1.1486258506774902, "rewards/rejected": 1.697771430015564, "step": 27290 }, { "epoch": 1.2674683132921678, "grad_norm": 212.73715209960938, "learning_rate": 2.239964715167835e-07, "logits/chosen": -17.92835807800293, "logits/rejected": -17.295536041259766, "logps/chosen": -461.7474060058594, "logps/rejected": -379.6353759765625, "loss": 0.7812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.84431791305542, "rewards/margins": 1.0159547328948975, "rewards/rejected": 2.8283629417419434, "step": 27300 }, { "epoch": 1.2679325873996008, "grad_norm": 28.25607681274414, "learning_rate": 2.2396861507033753e-07, "logits/chosen": -19.595966339111328, "logits/rejected": -18.883378982543945, "logps/chosen": -399.20428466796875, "logps/rejected": -296.0687255859375, "loss": 0.4833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.710998296737671, "rewards/margins": 0.6254213452339172, "rewards/rejected": 2.0855770111083984, "step": 27310 }, { "epoch": 1.2683968615070338, "grad_norm": 42.174739837646484, "learning_rate": 2.2394075862389154e-07, "logits/chosen": -17.875194549560547, "logits/rejected": -17.900970458984375, "logps/chosen": -457.89288330078125, "logps/rejected": -501.25653076171875, "loss": 0.7097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8895905017852783, "rewards/margins": 0.6033636331558228, "rewards/rejected": 3.286226749420166, "step": 27320 }, { "epoch": 1.2688611356144668, "grad_norm": 0.13288748264312744, "learning_rate": 
2.2391290217744553e-07, "logits/chosen": -18.30929946899414, "logits/rejected": -17.607458114624023, "logps/chosen": -380.1839294433594, "logps/rejected": -284.9244384765625, "loss": 0.4103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5661873817443848, "rewards/margins": 1.427361011505127, "rewards/rejected": 2.1388261318206787, "step": 27330 }, { "epoch": 1.2693254097218998, "grad_norm": 0.357470840215683, "learning_rate": 2.2388504573099957e-07, "logits/chosen": -18.351360321044922, "logits/rejected": -17.6016845703125, "logps/chosen": -358.69451904296875, "logps/rejected": -270.2682189941406, "loss": 0.5565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9902665615081787, "rewards/margins": 1.4749083518981934, "rewards/rejected": 1.5153582096099854, "step": 27340 }, { "epoch": 1.2697896838293328, "grad_norm": 1.6914690732955933, "learning_rate": 2.2385718928455359e-07, "logits/chosen": -17.537851333618164, "logits/rejected": -18.033002853393555, "logps/chosen": -396.43060302734375, "logps/rejected": -414.9183654785156, "loss": 1.3586, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.000491142272949, "rewards/margins": -0.06315012276172638, "rewards/rejected": 3.0636415481567383, "step": 27350 }, { "epoch": 1.2702539579367658, "grad_norm": 9.1185302734375, "learning_rate": 2.2382933283810763e-07, "logits/chosen": -18.468326568603516, "logits/rejected": -17.55438995361328, "logps/chosen": -429.1612854003906, "logps/rejected": -341.5014343261719, "loss": 0.5677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4469685554504395, "rewards/margins": 0.7821134924888611, "rewards/rejected": 2.6648545265197754, "step": 27360 }, { "epoch": 1.270718232044199, "grad_norm": 133.690185546875, "learning_rate": 2.2380147639166162e-07, "logits/chosen": -19.374736785888672, "logits/rejected": -18.665313720703125, "logps/chosen": -503.18487548828125, "logps/rejected": -443.3279724121094, "loss": 0.8302, "rewards/accuracies": 
0.5, "rewards/chosen": 3.3279213905334473, "rewards/margins": 0.6129552125930786, "rewards/rejected": 2.7149658203125, "step": 27370 }, { "epoch": 1.271182506151632, "grad_norm": 19.905120849609375, "learning_rate": 2.2377361994521563e-07, "logits/chosen": -18.04193115234375, "logits/rejected": -17.557485580444336, "logps/chosen": -303.61846923828125, "logps/rejected": -210.9472198486328, "loss": 1.0113, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2437305450439453, "rewards/margins": 0.45080089569091797, "rewards/rejected": 1.7929296493530273, "step": 27380 }, { "epoch": 1.271646780259065, "grad_norm": 44.07947540283203, "learning_rate": 2.2374576349876967e-07, "logits/chosen": -19.42642593383789, "logits/rejected": -18.009876251220703, "logps/chosen": -496.1640625, "logps/rejected": -308.0025634765625, "loss": 0.3491, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.081338405609131, "rewards/margins": 1.4843223094940186, "rewards/rejected": 1.5970160961151123, "step": 27390 }, { "epoch": 1.272111054366498, "grad_norm": 133.09788513183594, "learning_rate": 2.2371790705232369e-07, "logits/chosen": -18.204345703125, "logits/rejected": -17.572166442871094, "logps/chosen": -335.0914611816406, "logps/rejected": -304.3608703613281, "loss": 0.8357, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8585667610168457, "rewards/margins": 0.3377973139286041, "rewards/rejected": 2.5207691192626953, "step": 27400 }, { "epoch": 1.272575328473931, "grad_norm": 37.99427032470703, "learning_rate": 2.236900506058777e-07, "logits/chosen": -18.057514190673828, "logits/rejected": -17.57139015197754, "logps/chosen": -349.63226318359375, "logps/rejected": -287.84429931640625, "loss": 0.6152, "rewards/accuracies": 0.5, "rewards/chosen": 2.471733570098877, "rewards/margins": 0.7202601432800293, "rewards/rejected": 1.7514736652374268, "step": 27410 }, { "epoch": 1.2730396025813642, "grad_norm": 19.20818519592285, "learning_rate": 
2.2366219415943171e-07, "logits/chosen": -19.45533561706543, "logits/rejected": -17.873626708984375, "logps/chosen": -415.35260009765625, "logps/rejected": -260.5776062011719, "loss": 0.4856, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0708940029144287, "rewards/margins": 1.5906016826629639, "rewards/rejected": 1.4802924394607544, "step": 27420 }, { "epoch": 1.273503876688797, "grad_norm": 22.88006591796875, "learning_rate": 2.2363433771298576e-07, "logits/chosen": -18.668075561523438, "logits/rejected": -18.95305633544922, "logps/chosen": -450.93157958984375, "logps/rejected": -456.8270568847656, "loss": 1.0624, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.2809739112854004, "rewards/margins": 0.01556382142007351, "rewards/rejected": 3.2654106616973877, "step": 27430 }, { "epoch": 1.2739681507962302, "grad_norm": 112.43262481689453, "learning_rate": 2.2360648126653977e-07, "logits/chosen": -18.86394691467285, "logits/rejected": -18.36453628540039, "logps/chosen": -312.3498840332031, "logps/rejected": -295.51361083984375, "loss": 0.8282, "rewards/accuracies": 0.5, "rewards/chosen": 2.7655413150787354, "rewards/margins": 0.6492870450019836, "rewards/rejected": 2.1162540912628174, "step": 27440 }, { "epoch": 1.2744324249036632, "grad_norm": 45.3952522277832, "learning_rate": 2.2357862482009376e-07, "logits/chosen": -18.28241729736328, "logits/rejected": -18.480154037475586, "logps/chosen": -426.6544494628906, "logps/rejected": -305.2191467285156, "loss": 0.5526, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.422372817993164, "rewards/margins": 0.9241557121276855, "rewards/rejected": 2.4982171058654785, "step": 27450 }, { "epoch": 1.2748966990110961, "grad_norm": 103.96969604492188, "learning_rate": 2.235507683736478e-07, "logits/chosen": -19.710941314697266, "logits/rejected": -18.665605545043945, "logps/chosen": -408.87445068359375, "logps/rejected": -338.5869140625, "loss": 0.8121, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.3607017993927, "rewards/margins": 0.8820065259933472, "rewards/rejected": 2.4786953926086426, "step": 27460 }, { "epoch": 1.2753609731185291, "grad_norm": 72.48468780517578, "learning_rate": 2.2352291192720181e-07, "logits/chosen": -17.984603881835938, "logits/rejected": -17.718896865844727, "logps/chosen": -299.3255920410156, "logps/rejected": -263.9999694824219, "loss": 0.7511, "rewards/accuracies": 0.5, "rewards/chosen": 1.6772884130477905, "rewards/margins": 0.5110076665878296, "rewards/rejected": 1.166280746459961, "step": 27470 }, { "epoch": 1.2758252472259621, "grad_norm": 60.666385650634766, "learning_rate": 2.234950554807558e-07, "logits/chosen": -18.06826400756836, "logits/rejected": -18.272525787353516, "logps/chosen": -339.4171447753906, "logps/rejected": -335.78546142578125, "loss": 0.7629, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.699326753616333, "rewards/margins": 0.4711173474788666, "rewards/rejected": 3.2282092571258545, "step": 27480 }, { "epoch": 1.2762895213333953, "grad_norm": 196.29458618164062, "learning_rate": 2.2346719903430984e-07, "logits/chosen": -19.85730743408203, "logits/rejected": -19.39419937133789, "logps/chosen": -414.02569580078125, "logps/rejected": -371.300537109375, "loss": 0.905, "rewards/accuracies": 0.5, "rewards/chosen": 3.1424901485443115, "rewards/margins": 0.4414183497428894, "rewards/rejected": 2.7010719776153564, "step": 27490 }, { "epoch": 1.2767537954408283, "grad_norm": 1.515339970588684, "learning_rate": 2.2343934258786386e-07, "logits/chosen": -19.273691177368164, "logits/rejected": -18.299625396728516, "logps/chosen": -420.8919982910156, "logps/rejected": -317.07049560546875, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 4.009024143218994, "rewards/margins": 1.0539709329605103, "rewards/rejected": 2.9550538063049316, "step": 27500 }, { "epoch": 1.2772180695482613, "grad_norm": 103.55354309082031, "learning_rate": 2.234114861414179e-07, 
"logits/chosen": -19.246540069580078, "logits/rejected": -18.693071365356445, "logps/chosen": -518.112548828125, "logps/rejected": -465.1558532714844, "loss": 0.4037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5403480529785156, "rewards/margins": 1.0786484479904175, "rewards/rejected": 2.4616997241973877, "step": 27510 }, { "epoch": 1.2776823436556943, "grad_norm": 242.0132293701172, "learning_rate": 2.2338362969497189e-07, "logits/chosen": -18.661998748779297, "logits/rejected": -19.00585174560547, "logps/chosen": -355.7702331542969, "logps/rejected": -323.9360046386719, "loss": 1.107, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.10221266746521, "rewards/margins": -0.2967917323112488, "rewards/rejected": 3.3990044593811035, "step": 27520 }, { "epoch": 1.2781466177631273, "grad_norm": 44.32951736450195, "learning_rate": 2.233557732485259e-07, "logits/chosen": -17.734779357910156, "logits/rejected": -16.581058502197266, "logps/chosen": -367.50567626953125, "logps/rejected": -220.17642211914062, "loss": 0.2964, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4823625087738037, "rewards/margins": 1.616127610206604, "rewards/rejected": 0.8662349581718445, "step": 27530 }, { "epoch": 1.2786108918705603, "grad_norm": 4.855879306793213, "learning_rate": 2.2332791680207994e-07, "logits/chosen": -18.35894775390625, "logits/rejected": -17.484567642211914, "logps/chosen": -566.6214599609375, "logps/rejected": -383.88104248046875, "loss": 0.6147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.020424842834473, "rewards/margins": 1.2433881759643555, "rewards/rejected": 2.7770371437072754, "step": 27540 }, { "epoch": 1.2790751659779933, "grad_norm": 9.932167053222656, "learning_rate": 2.2330006035563396e-07, "logits/chosen": -18.33847427368164, "logits/rejected": -17.642093658447266, "logps/chosen": -393.1402893066406, "logps/rejected": -296.2943420410156, "loss": 0.7868, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.1909003257751465, "rewards/margins": 0.8650274276733398, "rewards/rejected": 2.3258728981018066, "step": 27550 }, { "epoch": 1.2795394400854265, "grad_norm": 97.88471221923828, "learning_rate": 2.2327220390918797e-07, "logits/chosen": -18.583599090576172, "logits/rejected": -18.403385162353516, "logps/chosen": -466.38665771484375, "logps/rejected": -380.5989685058594, "loss": 0.4536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.135867118835449, "rewards/margins": 0.8427049517631531, "rewards/rejected": 3.2931625843048096, "step": 27560 }, { "epoch": 1.2800037141928595, "grad_norm": 25.837276458740234, "learning_rate": 2.2324434746274199e-07, "logits/chosen": -20.188566207885742, "logits/rejected": -18.798564910888672, "logps/chosen": -481.2447814941406, "logps/rejected": -365.35272216796875, "loss": 0.565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.645207166671753, "rewards/margins": 0.8864274024963379, "rewards/rejected": 2.758780002593994, "step": 27570 }, { "epoch": 1.2804679883002925, "grad_norm": 45.467529296875, "learning_rate": 2.2321649101629603e-07, "logits/chosen": -19.227914810180664, "logits/rejected": -18.969697952270508, "logps/chosen": -349.4791259765625, "logps/rejected": -392.08660888671875, "loss": 0.4069, "rewards/accuracies": 1.0, "rewards/chosen": 3.129225730895996, "rewards/margins": 0.8002525568008423, "rewards/rejected": 2.3289732933044434, "step": 27580 }, { "epoch": 1.2809322624077255, "grad_norm": 27.946359634399414, "learning_rate": 2.2318863456985004e-07, "logits/chosen": -18.550832748413086, "logits/rejected": -17.89965057373047, "logps/chosen": -404.42803955078125, "logps/rejected": -355.17596435546875, "loss": 0.3943, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1065611839294434, "rewards/margins": 0.9820218086242676, "rewards/rejected": 2.1245391368865967, "step": 27590 }, { "epoch": 1.2813965365151585, "grad_norm": 94.96974182128906, "learning_rate": 
2.2316077812340403e-07, "logits/chosen": -20.207998275756836, "logits/rejected": -19.897314071655273, "logps/chosen": -508.34466552734375, "logps/rejected": -464.8546447753906, "loss": 0.5362, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.400770902633667, "rewards/margins": 0.9357576370239258, "rewards/rejected": 2.465013265609741, "step": 27600 }, { "epoch": 1.2818608106225915, "grad_norm": 48.719017028808594, "learning_rate": 2.2313292167695807e-07, "logits/chosen": -18.794010162353516, "logits/rejected": -18.195735931396484, "logps/chosen": -431.49053955078125, "logps/rejected": -377.6318664550781, "loss": 0.2862, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.634918212890625, "rewards/margins": 1.3828157186508179, "rewards/rejected": 2.2521023750305176, "step": 27610 }, { "epoch": 1.2823250847300245, "grad_norm": 34.758296966552734, "learning_rate": 2.2310506523051208e-07, "logits/chosen": -19.466983795166016, "logits/rejected": -18.835773468017578, "logps/chosen": -329.1402893066406, "logps/rejected": -246.6390838623047, "loss": 0.676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.346665859222412, "rewards/margins": 0.8095590472221375, "rewards/rejected": 1.5371068716049194, "step": 27620 }, { "epoch": 1.2827893588374577, "grad_norm": 1.6621413230895996, "learning_rate": 2.2307720878406613e-07, "logits/chosen": -18.20241355895996, "logits/rejected": -17.4534912109375, "logps/chosen": -340.46527099609375, "logps/rejected": -289.11468505859375, "loss": 0.8333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.998629331588745, "rewards/margins": 0.879707932472229, "rewards/rejected": 2.1189217567443848, "step": 27630 }, { "epoch": 1.2832536329448907, "grad_norm": 50.52735900878906, "learning_rate": 2.2304935233762011e-07, "logits/chosen": -18.935697555541992, "logits/rejected": -18.058277130126953, "logps/chosen": -366.9531555175781, "logps/rejected": -206.7763671875, "loss": 0.446, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.5538902282714844, "rewards/margins": 1.6314767599105835, "rewards/rejected": 1.9224132299423218, "step": 27640 }, { "epoch": 1.2837179070523237, "grad_norm": 74.66898345947266, "learning_rate": 2.2302149589117413e-07, "logits/chosen": -18.389339447021484, "logits/rejected": -17.895904541015625, "logps/chosen": -360.57098388671875, "logps/rejected": -355.5372619628906, "loss": 0.7505, "rewards/accuracies": 0.5, "rewards/chosen": 2.9910295009613037, "rewards/margins": 0.24604877829551697, "rewards/rejected": 2.744980812072754, "step": 27650 }, { "epoch": 1.2841821811597567, "grad_norm": 107.38294219970703, "learning_rate": 2.2299363944472817e-07, "logits/chosen": -18.49010467529297, "logits/rejected": -18.32530975341797, "logps/chosen": -485.91705322265625, "logps/rejected": -472.9237365722656, "loss": 1.2168, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.408102035522461, "rewards/margins": -0.2989954948425293, "rewards/rejected": 3.707097291946411, "step": 27660 }, { "epoch": 1.2846464552671897, "grad_norm": 209.86305236816406, "learning_rate": 2.2296578299828216e-07, "logits/chosen": -19.1876277923584, "logits/rejected": -18.073169708251953, "logps/chosen": -422.4393005371094, "logps/rejected": -316.46722412109375, "loss": 0.4644, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.769928455352783, "rewards/margins": 1.3555244207382202, "rewards/rejected": 2.4144036769866943, "step": 27670 }, { "epoch": 1.285110729374623, "grad_norm": 85.35726165771484, "learning_rate": 2.229379265518362e-07, "logits/chosen": -18.04338264465332, "logits/rejected": -16.67551612854004, "logps/chosen": -382.8206481933594, "logps/rejected": -313.8924865722656, "loss": 0.7155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4001152515411377, "rewards/margins": 0.6171437501907349, "rewards/rejected": 1.7829713821411133, "step": 27680 }, { "epoch": 1.2855750034820557, "grad_norm": 313.2302551269531, 
"learning_rate": 2.229100701053902e-07, "logits/chosen": -18.069175720214844, "logits/rejected": -18.123048782348633, "logps/chosen": -324.95562744140625, "logps/rejected": -480.44921875, "loss": 1.2865, "rewards/accuracies": 0.5, "rewards/chosen": 3.091069459915161, "rewards/margins": -0.04997382313013077, "rewards/rejected": 3.141043186187744, "step": 27690 }, { "epoch": 1.2860392775894889, "grad_norm": 202.63783264160156, "learning_rate": 2.2288221365894423e-07, "logits/chosen": -19.25231170654297, "logits/rejected": -19.13590431213379, "logps/chosen": -443.3775939941406, "logps/rejected": -428.1344299316406, "loss": 0.7423, "rewards/accuracies": 0.5, "rewards/chosen": 3.259673595428467, "rewards/margins": 0.3355800211429596, "rewards/rejected": 2.9240939617156982, "step": 27700 }, { "epoch": 1.2865035516969219, "grad_norm": 59.80741882324219, "learning_rate": 2.2285435721249824e-07, "logits/chosen": -19.31009864807129, "logits/rejected": -18.423114776611328, "logps/chosen": -372.51995849609375, "logps/rejected": -288.0373840332031, "loss": 0.6189, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.055873394012451, "rewards/margins": 0.8097555041313171, "rewards/rejected": 2.2461178302764893, "step": 27710 }, { "epoch": 1.2869678258043549, "grad_norm": 165.16282653808594, "learning_rate": 2.2282650076605226e-07, "logits/chosen": -18.38662338256836, "logits/rejected": -18.586307525634766, "logps/chosen": -359.09417724609375, "logps/rejected": -360.97052001953125, "loss": 0.7287, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4657905101776123, "rewards/margins": 0.7639302611351013, "rewards/rejected": 2.7018604278564453, "step": 27720 }, { "epoch": 1.2874320999117879, "grad_norm": 136.53150939941406, "learning_rate": 2.227986443196063e-07, "logits/chosen": -20.020015716552734, "logits/rejected": -17.81527328491211, "logps/chosen": -473.7594299316406, "logps/rejected": -296.0162353515625, "loss": 0.5166, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.7405502796173096, "rewards/margins": 1.1717534065246582, "rewards/rejected": 2.5687971115112305, "step": 27730 }, { "epoch": 1.2878963740192209, "grad_norm": 1.2112922668457031, "learning_rate": 2.227707878731603e-07, "logits/chosen": -18.555171966552734, "logits/rejected": -17.794078826904297, "logps/chosen": -416.06170654296875, "logps/rejected": -332.63140869140625, "loss": 0.6487, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2089781761169434, "rewards/margins": 0.8255038261413574, "rewards/rejected": 2.383474588394165, "step": 27740 }, { "epoch": 1.288360648126654, "grad_norm": 27.284029006958008, "learning_rate": 2.227429314267143e-07, "logits/chosen": -19.057607650756836, "logits/rejected": -19.215576171875, "logps/chosen": -375.846435546875, "logps/rejected": -337.672119140625, "loss": 0.3565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2989604473114014, "rewards/margins": 1.1810723543167114, "rewards/rejected": 2.1178879737854004, "step": 27750 }, { "epoch": 1.288824922234087, "grad_norm": 0.21207572519779205, "learning_rate": 2.2271507498026834e-07, "logits/chosen": -18.65309715270996, "logits/rejected": -17.450693130493164, "logps/chosen": -368.8487854003906, "logps/rejected": -249.0009765625, "loss": 0.6187, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.896595001220703, "rewards/margins": 2.3732781410217285, "rewards/rejected": 1.5233169794082642, "step": 27760 }, { "epoch": 1.28928919634152, "grad_norm": 10.485936164855957, "learning_rate": 2.2268721853382236e-07, "logits/chosen": -19.131839752197266, "logits/rejected": -17.82321548461914, "logps/chosen": -408.8031921386719, "logps/rejected": -350.12237548828125, "loss": 0.5827, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.709263563156128, "rewards/margins": 1.2577366828918457, "rewards/rejected": 2.4515273571014404, "step": 27770 }, { "epoch": 1.289753470448953, "grad_norm": 189.58602905273438, 
"learning_rate": 2.226593620873764e-07, "logits/chosen": -18.442155838012695, "logits/rejected": -17.068565368652344, "logps/chosen": -348.2153015136719, "logps/rejected": -243.19381713867188, "loss": 0.5051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8046650886535645, "rewards/margins": 1.624847173690796, "rewards/rejected": 1.1798176765441895, "step": 27780 }, { "epoch": 1.290217744556386, "grad_norm": 0.9569107294082642, "learning_rate": 2.2263150564093038e-07, "logits/chosen": -18.9923038482666, "logits/rejected": -17.3846492767334, "logps/chosen": -497.3192443847656, "logps/rejected": -282.3000183105469, "loss": 0.4765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.128631591796875, "rewards/margins": 1.8654630184173584, "rewards/rejected": 2.2631685733795166, "step": 27790 }, { "epoch": 1.290682018663819, "grad_norm": 65.57073974609375, "learning_rate": 2.226036491944844e-07, "logits/chosen": -18.49515151977539, "logits/rejected": -17.455528259277344, "logps/chosen": -381.6418762207031, "logps/rejected": -282.0870056152344, "loss": 0.5231, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.842702865600586, "rewards/margins": 0.933463454246521, "rewards/rejected": 1.9092395305633545, "step": 27800 }, { "epoch": 1.291146292771252, "grad_norm": 3.1092240810394287, "learning_rate": 2.2257579274803844e-07, "logits/chosen": -18.00971031188965, "logits/rejected": -17.00588607788086, "logps/chosen": -449.32861328125, "logps/rejected": -301.39678955078125, "loss": 0.4758, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.483576536178589, "rewards/margins": 1.2420622110366821, "rewards/rejected": 2.2415146827697754, "step": 27810 }, { "epoch": 1.2916105668786853, "grad_norm": 131.19012451171875, "learning_rate": 2.2254793630159245e-07, "logits/chosen": -18.376834869384766, "logits/rejected": -17.886274337768555, "logps/chosen": -408.103515625, "logps/rejected": -295.52288818359375, "loss": 0.9325, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7155749797821045, "rewards/margins": 0.8231135606765747, "rewards/rejected": 2.8924612998962402, "step": 27820 }, { "epoch": 1.2920748409861182, "grad_norm": 79.68292236328125, "learning_rate": 2.2252007985514647e-07, "logits/chosen": -18.774057388305664, "logits/rejected": -17.9179744720459, "logps/chosen": -363.77301025390625, "logps/rejected": -352.5286560058594, "loss": 0.9756, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.635591745376587, "rewards/margins": -0.0847310870885849, "rewards/rejected": 2.720322847366333, "step": 27830 }, { "epoch": 1.2925391150935512, "grad_norm": 126.14350891113281, "learning_rate": 2.2249222340870048e-07, "logits/chosen": -18.907011032104492, "logits/rejected": -17.620380401611328, "logps/chosen": -380.576416015625, "logps/rejected": -272.6709289550781, "loss": 0.5688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.800352096557617, "rewards/margins": 1.237444281578064, "rewards/rejected": 2.5629076957702637, "step": 27840 }, { "epoch": 1.2930033892009842, "grad_norm": 88.39179992675781, "learning_rate": 2.2246436696225453e-07, "logits/chosen": -18.6170711517334, "logits/rejected": -18.306522369384766, "logps/chosen": -322.81280517578125, "logps/rejected": -274.0256042480469, "loss": 1.1481, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.038041114807129, "rewards/margins": -0.37219542264938354, "rewards/rejected": 2.4102365970611572, "step": 27850 }, { "epoch": 1.2934676633084172, "grad_norm": 116.67240142822266, "learning_rate": 2.2243651051580854e-07, "logits/chosen": -18.778728485107422, "logits/rejected": -18.66804313659668, "logps/chosen": -334.660888671875, "logps/rejected": -342.1275329589844, "loss": 0.983, "rewards/accuracies": 0.5, "rewards/chosen": 2.4392383098602295, "rewards/margins": -0.17538940906524658, "rewards/rejected": 2.6146280765533447, "step": 27860 }, { "epoch": 1.2939319374158504, "grad_norm": 
17.50191307067871, "learning_rate": 2.2240865406936253e-07, "logits/chosen": -18.958702087402344, "logits/rejected": -18.36294937133789, "logps/chosen": -436.9043884277344, "logps/rejected": -443.7288513183594, "loss": 0.4496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.228573322296143, "rewards/margins": 0.8823297619819641, "rewards/rejected": 3.346242904663086, "step": 27870 }, { "epoch": 1.2943962115232832, "grad_norm": 73.03701782226562, "learning_rate": 2.2238079762291657e-07, "logits/chosen": -19.618661880493164, "logits/rejected": -19.388545989990234, "logps/chosen": -401.50811767578125, "logps/rejected": -361.41937255859375, "loss": 0.6783, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3612728118896484, "rewards/margins": 0.48215317726135254, "rewards/rejected": 2.879119634628296, "step": 27880 }, { "epoch": 1.2948604856307164, "grad_norm": 66.52942657470703, "learning_rate": 2.2235294117647058e-07, "logits/chosen": -18.796634674072266, "logits/rejected": -17.84119987487793, "logps/chosen": -377.42388916015625, "logps/rejected": -316.0799560546875, "loss": 0.4731, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.923901081085205, "rewards/margins": 0.790081799030304, "rewards/rejected": 2.133819580078125, "step": 27890 }, { "epoch": 1.2953247597381494, "grad_norm": 5.91404390335083, "learning_rate": 2.2232508473002457e-07, "logits/chosen": -18.801326751708984, "logits/rejected": -17.80815887451172, "logps/chosen": -508.3014221191406, "logps/rejected": -354.0302734375, "loss": 0.4613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.45463490486145, "rewards/margins": 1.047767162322998, "rewards/rejected": 2.406867504119873, "step": 27900 }, { "epoch": 1.2957890338455824, "grad_norm": 81.23104095458984, "learning_rate": 2.222972282835786e-07, "logits/chosen": -17.804319381713867, "logits/rejected": -17.299922943115234, "logps/chosen": -458.3938903808594, "logps/rejected": -358.39422607421875, "loss": 
0.5594, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6471469402313232, "rewards/margins": 0.6449285745620728, "rewards/rejected": 3.002218246459961, "step": 27910 }, { "epoch": 1.2962533079530154, "grad_norm": 16.84272003173828, "learning_rate": 2.2226937183713263e-07, "logits/chosen": -19.12656593322754, "logits/rejected": -18.125221252441406, "logps/chosen": -338.91754150390625, "logps/rejected": -285.02911376953125, "loss": 0.565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1276423931121826, "rewards/margins": 0.5742952823638916, "rewards/rejected": 2.553347110748291, "step": 27920 }, { "epoch": 1.2967175820604484, "grad_norm": 59.96636962890625, "learning_rate": 2.2224151539068667e-07, "logits/chosen": -17.970478057861328, "logits/rejected": -18.248409271240234, "logps/chosen": -303.86309814453125, "logps/rejected": -288.61663818359375, "loss": 0.9464, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3973076343536377, "rewards/margins": 0.49432048201560974, "rewards/rejected": 1.9029871225357056, "step": 27930 }, { "epoch": 1.2971818561678816, "grad_norm": 56.64808654785156, "learning_rate": 2.2221365894424066e-07, "logits/chosen": -18.271343231201172, "logits/rejected": -17.839792251586914, "logps/chosen": -352.44891357421875, "logps/rejected": -363.696044921875, "loss": 0.796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.962067127227783, "rewards/margins": 0.599806010723114, "rewards/rejected": 2.3622612953186035, "step": 27940 }, { "epoch": 1.2976461302753146, "grad_norm": 30.22040367126465, "learning_rate": 2.2218580249779467e-07, "logits/chosen": -19.334510803222656, "logits/rejected": -19.008106231689453, "logps/chosen": -369.18896484375, "logps/rejected": -338.49957275390625, "loss": 0.5204, "rewards/accuracies": 0.5, "rewards/chosen": 3.0387301445007324, "rewards/margins": 1.3444982767105103, "rewards/rejected": 1.6942317485809326, "step": 27950 }, { "epoch": 1.2981104043827476, "grad_norm": 
148.6195526123047, "learning_rate": 2.221579460513487e-07, "logits/chosen": -19.26273536682129, "logits/rejected": -18.131242752075195, "logps/chosen": -446.72381591796875, "logps/rejected": -337.8268127441406, "loss": 0.4876, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4446945190429688, "rewards/margins": 1.0649648904800415, "rewards/rejected": 2.379729747772217, "step": 27960 }, { "epoch": 1.2985746784901806, "grad_norm": 77.4954605102539, "learning_rate": 2.2213008960490273e-07, "logits/chosen": -18.176204681396484, "logits/rejected": -16.817909240722656, "logps/chosen": -469.5975646972656, "logps/rejected": -364.37054443359375, "loss": 0.4654, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.37756609916687, "rewards/margins": 1.0818160772323608, "rewards/rejected": 2.2957499027252197, "step": 27970 }, { "epoch": 1.2990389525976136, "grad_norm": 38.96803283691406, "learning_rate": 2.2210223315845674e-07, "logits/chosen": -18.165203094482422, "logits/rejected": -18.627687454223633, "logps/chosen": -464.7427673339844, "logps/rejected": -453.8006286621094, "loss": 1.4702, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9086718559265137, "rewards/margins": -0.6577306985855103, "rewards/rejected": 3.5664024353027344, "step": 27980 }, { "epoch": 1.2995032267050466, "grad_norm": 71.45086669921875, "learning_rate": 2.2207437671201075e-07, "logits/chosen": -18.81683921813965, "logits/rejected": -18.06954574584961, "logps/chosen": -366.89483642578125, "logps/rejected": -340.5971984863281, "loss": 0.4839, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0855154991149902, "rewards/margins": 0.6630929708480835, "rewards/rejected": 2.4224226474761963, "step": 27990 }, { "epoch": 1.2999675008124796, "grad_norm": 0.22346238791942596, "learning_rate": 2.220465202655648e-07, "logits/chosen": -19.582435607910156, "logits/rejected": -17.489532470703125, "logps/chosen": -335.1676330566406, "logps/rejected": -221.2667236328125, 
"loss": 0.6225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0415074825286865, "rewards/margins": 1.8276745080947876, "rewards/rejected": 1.2138333320617676, "step": 28000 }, { "epoch": 1.3004317749199128, "grad_norm": 18.38301658630371, "learning_rate": 2.220186638191188e-07, "logits/chosen": -18.501644134521484, "logits/rejected": -18.152753829956055, "logps/chosen": -330.6657409667969, "logps/rejected": -263.7333984375, "loss": 0.5209, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5427002906799316, "rewards/margins": 1.1678740978240967, "rewards/rejected": 1.3748260736465454, "step": 28010 }, { "epoch": 1.3008960490273458, "grad_norm": 8.917224884033203, "learning_rate": 2.219908073726728e-07, "logits/chosen": -19.9464111328125, "logits/rejected": -18.24738311767578, "logps/chosen": -409.95855712890625, "logps/rejected": -236.5019989013672, "loss": 0.2551, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.106350421905518, "rewards/margins": 2.311445474624634, "rewards/rejected": 1.7949049472808838, "step": 28020 }, { "epoch": 1.3013603231347788, "grad_norm": 262.11334228515625, "learning_rate": 2.2196295092622684e-07, "logits/chosen": -19.434877395629883, "logits/rejected": -18.76633644104004, "logps/chosen": -431.66949462890625, "logps/rejected": -371.25689697265625, "loss": 0.6261, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2296996116638184, "rewards/margins": 0.579118549823761, "rewards/rejected": 2.650581121444702, "step": 28030 }, { "epoch": 1.3018245972422118, "grad_norm": 90.24093627929688, "learning_rate": 2.2193509447978085e-07, "logits/chosen": -18.6387882232666, "logits/rejected": -18.043251037597656, "logps/chosen": -430.0155334472656, "logps/rejected": -345.1475524902344, "loss": 0.8814, "rewards/accuracies": 0.5, "rewards/chosen": 3.7266628742218018, "rewards/margins": 0.5573819875717163, "rewards/rejected": 3.169280529022217, "step": 28040 }, { "epoch": 1.3022888713496448, "grad_norm": 
6.231227874755859, "learning_rate": 2.219072380333349e-07, "logits/chosen": -19.026020050048828, "logits/rejected": -18.116640090942383, "logps/chosen": -373.3572082519531, "logps/rejected": -353.2697448730469, "loss": 0.5982, "rewards/accuracies": 0.5, "rewards/chosen": 2.689401865005493, "rewards/margins": 0.5319979190826416, "rewards/rejected": 2.1574037075042725, "step": 28050 }, { "epoch": 1.302753145457078, "grad_norm": 25.201345443725586, "learning_rate": 2.2187938158688888e-07, "logits/chosen": -19.297260284423828, "logits/rejected": -17.972461700439453, "logps/chosen": -484.5802307128906, "logps/rejected": -338.9808349609375, "loss": 0.3808, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.572056770324707, "rewards/margins": 1.8304970264434814, "rewards/rejected": 2.7415599822998047, "step": 28060 }, { "epoch": 1.3032174195645108, "grad_norm": 52.335262298583984, "learning_rate": 2.218515251404429e-07, "logits/chosen": -18.387157440185547, "logits/rejected": -17.332706451416016, "logps/chosen": -394.3747863769531, "logps/rejected": -298.96441650390625, "loss": 0.3864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9321093559265137, "rewards/margins": 1.1617166996002197, "rewards/rejected": 1.7703927755355835, "step": 28070 }, { "epoch": 1.303681693671944, "grad_norm": 20.778331756591797, "learning_rate": 2.2182366869399694e-07, "logits/chosen": -18.493450164794922, "logits/rejected": -16.983211517333984, "logps/chosen": -369.63201904296875, "logps/rejected": -219.46627807617188, "loss": 0.268, "rewards/accuracies": 1.0, "rewards/chosen": 3.0381648540496826, "rewards/margins": 1.540938377380371, "rewards/rejected": 1.497226595878601, "step": 28080 }, { "epoch": 1.304145967779377, "grad_norm": 1.1371862888336182, "learning_rate": 2.2179581224755093e-07, "logits/chosen": -18.851234436035156, "logits/rejected": -18.191408157348633, "logps/chosen": -399.19384765625, "logps/rejected": -338.95391845703125, "loss": 0.5831, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8329923152923584, "rewards/margins": 0.7476099729537964, "rewards/rejected": 2.0853824615478516, "step": 28090 }, { "epoch": 1.30461024188681, "grad_norm": 0.7304302453994751, "learning_rate": 2.2176795580110497e-07, "logits/chosen": -19.231201171875, "logits/rejected": -17.167531967163086, "logps/chosen": -537.7147827148438, "logps/rejected": -308.3214416503906, "loss": 0.3376, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.979269504547119, "rewards/margins": 1.943213701248169, "rewards/rejected": 2.0360560417175293, "step": 28100 }, { "epoch": 1.305074515994243, "grad_norm": 5.711526870727539, "learning_rate": 2.2174009935465898e-07, "logits/chosen": -18.241409301757812, "logits/rejected": -17.4190673828125, "logps/chosen": -356.47796630859375, "logps/rejected": -251.893310546875, "loss": 0.8801, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.767167568206787, "rewards/margins": 0.45625194907188416, "rewards/rejected": 2.310915946960449, "step": 28110 }, { "epoch": 1.305538790101676, "grad_norm": 13.99826717376709, "learning_rate": 2.21712242908213e-07, "logits/chosen": -19.030263900756836, "logits/rejected": -17.360654830932617, "logps/chosen": -547.4797973632812, "logps/rejected": -297.51251220703125, "loss": 0.8985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3036410808563232, "rewards/margins": 0.9928226470947266, "rewards/rejected": 2.3108181953430176, "step": 28120 }, { "epoch": 1.3060030642091092, "grad_norm": 41.611331939697266, "learning_rate": 2.21684386461767e-07, "logits/chosen": -18.186981201171875, "logits/rejected": -17.06688690185547, "logps/chosen": -462.9842224121094, "logps/rejected": -309.2450256347656, "loss": 0.6894, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1526217460632324, "rewards/margins": 1.4089651107788086, "rewards/rejected": 1.7436567544937134, "step": 28130 }, { "epoch": 1.3064673383165422, "grad_norm": 
41.341217041015625, "learning_rate": 2.2165653001532103e-07, "logits/chosen": -19.774250030517578, "logits/rejected": -18.995868682861328, "logps/chosen": -360.5492858886719, "logps/rejected": -363.95135498046875, "loss": 0.8736, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5420122146606445, "rewards/margins": 0.39668044447898865, "rewards/rejected": 2.145331859588623, "step": 28140 }, { "epoch": 1.3069316124239752, "grad_norm": 194.8011016845703, "learning_rate": 2.2162867356887507e-07, "logits/chosen": -18.462480545043945, "logits/rejected": -17.860876083374023, "logps/chosen": -397.3071594238281, "logps/rejected": -334.60614013671875, "loss": 0.8476, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3563122749328613, "rewards/margins": 0.4785836338996887, "rewards/rejected": 1.8777284622192383, "step": 28150 }, { "epoch": 1.3073958865314081, "grad_norm": 83.10103607177734, "learning_rate": 2.2160081712242908e-07, "logits/chosen": -19.446006774902344, "logits/rejected": -18.22208023071289, "logps/chosen": -410.55145263671875, "logps/rejected": -250.2841339111328, "loss": 0.3139, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.392019271850586, "rewards/margins": 2.060680866241455, "rewards/rejected": 1.33133864402771, "step": 28160 }, { "epoch": 1.3078601606388411, "grad_norm": 108.49002838134766, "learning_rate": 2.2157296067598307e-07, "logits/chosen": -19.01662826538086, "logits/rejected": -18.88408851623535, "logps/chosen": -398.9676208496094, "logps/rejected": -442.84393310546875, "loss": 1.0792, "rewards/accuracies": 0.5, "rewards/chosen": 2.7672171592712402, "rewards/margins": 0.11937534809112549, "rewards/rejected": 2.647841691970825, "step": 28170 }, { "epoch": 1.3083244347462741, "grad_norm": 18.628318786621094, "learning_rate": 2.215451042295371e-07, "logits/chosen": -18.777206420898438, "logits/rejected": -18.76643943786621, "logps/chosen": -298.10052490234375, "logps/rejected": -262.7229919433594, "loss": 
0.894, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.135866641998291, "rewards/margins": -0.07978622615337372, "rewards/rejected": 2.21565318107605, "step": 28180 }, { "epoch": 1.3087887088537071, "grad_norm": 9.857766151428223, "learning_rate": 2.2151724778309113e-07, "logits/chosen": -18.17005729675293, "logits/rejected": -17.481098175048828, "logps/chosen": -337.4897155761719, "logps/rejected": -208.7733917236328, "loss": 0.5226, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8722984790802, "rewards/margins": 1.9958007335662842, "rewards/rejected": 1.8764979839324951, "step": 28190 }, { "epoch": 1.3092529829611403, "grad_norm": 0.07863055169582367, "learning_rate": 2.2148939133664517e-07, "logits/chosen": -19.1539363861084, "logits/rejected": -18.503711700439453, "logps/chosen": -469.0298767089844, "logps/rejected": -438.79193115234375, "loss": 0.8244, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3017189502716064, "rewards/margins": 1.0353739261627197, "rewards/rejected": 2.2663447856903076, "step": 28200 }, { "epoch": 1.3097172570685733, "grad_norm": 7.726950645446777, "learning_rate": 2.2146153489019915e-07, "logits/chosen": -18.142324447631836, "logits/rejected": -17.334848403930664, "logps/chosen": -266.46856689453125, "logps/rejected": -201.94796752929688, "loss": 0.5516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.84761643409729, "rewards/margins": 0.7184447050094604, "rewards/rejected": 1.12917160987854, "step": 28210 }, { "epoch": 1.3101815311760063, "grad_norm": 55.61085510253906, "learning_rate": 2.2143367844375317e-07, "logits/chosen": -19.08820152282715, "logits/rejected": -18.249927520751953, "logps/chosen": -361.83636474609375, "logps/rejected": -265.5487365722656, "loss": 0.3843, "rewards/accuracies": 1.0, "rewards/chosen": 3.023120403289795, "rewards/margins": 0.9141624569892883, "rewards/rejected": 2.1089577674865723, "step": 28220 }, { "epoch": 1.3106458052834393, "grad_norm": 
12.621336936950684, "learning_rate": 2.214058219973072e-07, "logits/chosen": -18.685270309448242, "logits/rejected": -17.14278221130371, "logps/chosen": -405.8993225097656, "logps/rejected": -252.5281982421875, "loss": 0.351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5803771018981934, "rewards/margins": 1.9255355596542358, "rewards/rejected": 1.6548417806625366, "step": 28230 }, { "epoch": 1.3111100793908723, "grad_norm": 3.871091842651367, "learning_rate": 2.2137796555086122e-07, "logits/chosen": -18.833072662353516, "logits/rejected": -17.8428955078125, "logps/chosen": -459.39642333984375, "logps/rejected": -315.57281494140625, "loss": 0.3976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0301427841186523, "rewards/margins": 1.5282385349273682, "rewards/rejected": 1.5019042491912842, "step": 28240 }, { "epoch": 1.3115743534983055, "grad_norm": 5.365188121795654, "learning_rate": 2.2135010910441524e-07, "logits/chosen": -18.527738571166992, "logits/rejected": -17.492267608642578, "logps/chosen": -461.7811584472656, "logps/rejected": -298.25799560546875, "loss": 0.7122, "rewards/accuracies": 0.5, "rewards/chosen": 2.632660388946533, "rewards/margins": 0.594946563243866, "rewards/rejected": 2.0377135276794434, "step": 28250 }, { "epoch": 1.3120386276057383, "grad_norm": 175.09500122070312, "learning_rate": 2.2132225265796925e-07, "logits/chosen": -18.248422622680664, "logits/rejected": -17.03675079345703, "logps/chosen": -374.3241882324219, "logps/rejected": -235.9447021484375, "loss": 0.4914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.92274808883667, "rewards/margins": 1.1240862607955933, "rewards/rejected": 1.7986619472503662, "step": 28260 }, { "epoch": 1.3125029017131715, "grad_norm": 22.319482803344727, "learning_rate": 2.212943962115233e-07, "logits/chosen": -18.666362762451172, "logits/rejected": -18.367305755615234, "logps/chosen": -394.5975646972656, "logps/rejected": -372.402587890625, "loss": 0.6115, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8915042877197266, "rewards/margins": 1.0511116981506348, "rewards/rejected": 2.8403921127319336, "step": 28270 }, { "epoch": 1.3129671758206045, "grad_norm": 54.33168029785156, "learning_rate": 2.2126653976507728e-07, "logits/chosen": -18.42601776123047, "logits/rejected": -17.798358917236328, "logps/chosen": -315.980224609375, "logps/rejected": -294.33880615234375, "loss": 0.5654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.475585460662842, "rewards/margins": 1.133683681488037, "rewards/rejected": 1.3419017791748047, "step": 28280 }, { "epoch": 1.3134314499280375, "grad_norm": 2.222266435623169, "learning_rate": 2.212386833186313e-07, "logits/chosen": -18.03003692626953, "logits/rejected": -17.088186264038086, "logps/chosen": -381.41265869140625, "logps/rejected": -258.77008056640625, "loss": 0.4582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8300766944885254, "rewards/margins": 2.0050106048583984, "rewards/rejected": 1.825066328048706, "step": 28290 }, { "epoch": 1.3138957240354705, "grad_norm": 148.2580108642578, "learning_rate": 2.2121082687218534e-07, "logits/chosen": -18.59263038635254, "logits/rejected": -17.957223892211914, "logps/chosen": -371.9869384765625, "logps/rejected": -325.15740966796875, "loss": 0.5982, "rewards/accuracies": 0.5, "rewards/chosen": 3.189575672149658, "rewards/margins": 0.9058569073677063, "rewards/rejected": 2.283719062805176, "step": 28300 }, { "epoch": 1.3143599981429035, "grad_norm": 171.90895080566406, "learning_rate": 2.2118297042573935e-07, "logits/chosen": -18.833721160888672, "logits/rejected": -17.70280647277832, "logps/chosen": -446.091064453125, "logps/rejected": -381.2463684082031, "loss": 0.5662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.647195816040039, "rewards/margins": 0.8008850812911987, "rewards/rejected": 2.84631085395813, "step": 28310 }, { "epoch": 1.3148242722503367, "grad_norm": 
54.04427719116211, "learning_rate": 2.2115511397929334e-07, "logits/chosen": -18.205703735351562, "logits/rejected": -17.688629150390625, "logps/chosen": -404.22808837890625, "logps/rejected": -348.16986083984375, "loss": 0.8698, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7188897132873535, "rewards/margins": -0.028237616643309593, "rewards/rejected": 2.7471275329589844, "step": 28320 }, { "epoch": 1.3152885463577697, "grad_norm": 153.33888244628906, "learning_rate": 2.2112725753284738e-07, "logits/chosen": -19.575288772583008, "logits/rejected": -18.92156982421875, "logps/chosen": -418.3016052246094, "logps/rejected": -330.12933349609375, "loss": 0.4347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8334736824035645, "rewards/margins": 1.0818897485733032, "rewards/rejected": 2.751584053039551, "step": 28330 }, { "epoch": 1.3157528204652027, "grad_norm": 103.39321899414062, "learning_rate": 2.210994010864014e-07, "logits/chosen": -18.398218154907227, "logits/rejected": -17.262386322021484, "logps/chosen": -423.6227111816406, "logps/rejected": -244.2369842529297, "loss": 0.3918, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0643773078918457, "rewards/margins": 1.2559781074523926, "rewards/rejected": 1.8083995580673218, "step": 28340 }, { "epoch": 1.3162170945726357, "grad_norm": 51.118927001953125, "learning_rate": 2.2107154463995544e-07, "logits/chosen": -19.23685073852539, "logits/rejected": -20.177946090698242, "logps/chosen": -413.18951416015625, "logps/rejected": -395.29644775390625, "loss": 1.0109, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.9785866737365723, "rewards/margins": -0.4466514587402344, "rewards/rejected": 3.4252376556396484, "step": 28350 }, { "epoch": 1.3166813686800687, "grad_norm": 47.42267990112305, "learning_rate": 2.2104368819350943e-07, "logits/chosen": -18.704469680786133, "logits/rejected": -18.035106658935547, "logps/chosen": -383.81341552734375, "logps/rejected": 
-364.5076599121094, "loss": 0.6711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.241931438446045, "rewards/margins": 0.603988528251648, "rewards/rejected": 2.6379427909851074, "step": 28360 }, { "epoch": 1.3171456427875017, "grad_norm": 23.035959243774414, "learning_rate": 2.2101583174706344e-07, "logits/chosen": -17.823516845703125, "logits/rejected": -17.001110076904297, "logps/chosen": -402.74822998046875, "logps/rejected": -284.19097900390625, "loss": 0.7034, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3664233684539795, "rewards/margins": 1.3024975061416626, "rewards/rejected": 2.0639259815216064, "step": 28370 }, { "epoch": 1.3176099168949347, "grad_norm": 23.97551727294922, "learning_rate": 2.2098797530061748e-07, "logits/chosen": -18.13772964477539, "logits/rejected": -17.903783798217773, "logps/chosen": -401.85247802734375, "logps/rejected": -409.27783203125, "loss": 0.5804, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.860126256942749, "rewards/margins": 0.6592984795570374, "rewards/rejected": 2.2008278369903564, "step": 28380 }, { "epoch": 1.318074191002368, "grad_norm": 57.958290100097656, "learning_rate": 2.209601188541715e-07, "logits/chosen": -18.771625518798828, "logits/rejected": -18.681779861450195, "logps/chosen": -409.2705993652344, "logps/rejected": -375.47979736328125, "loss": 0.7241, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1982529163360596, "rewards/margins": 0.4064396023750305, "rewards/rejected": 2.791813373565674, "step": 28390 }, { "epoch": 1.3185384651098009, "grad_norm": 125.89427947998047, "learning_rate": 2.209322624077255e-07, "logits/chosen": -19.144287109375, "logits/rejected": -18.15513038635254, "logps/chosen": -338.28662109375, "logps/rejected": -297.14495849609375, "loss": 0.9202, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6598098278045654, "rewards/margins": 0.4820188581943512, "rewards/rejected": 2.177791118621826, "step": 28400 }, { 
"epoch": 1.3190027392172339, "grad_norm": 30.758129119873047, "learning_rate": 2.2090440596127952e-07, "logits/chosen": -18.928665161132812, "logits/rejected": -18.095531463623047, "logps/chosen": -409.7774353027344, "logps/rejected": -363.89324951171875, "loss": 0.5014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6629767417907715, "rewards/margins": 1.4394214153289795, "rewards/rejected": 2.223555564880371, "step": 28410 }, { "epoch": 1.3194670133246669, "grad_norm": 162.4693145751953, "learning_rate": 2.2087654951483357e-07, "logits/chosen": -18.0462703704834, "logits/rejected": -17.485904693603516, "logps/chosen": -418.90447998046875, "logps/rejected": -308.11932373046875, "loss": 0.6965, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5956878662109375, "rewards/margins": 0.9133466482162476, "rewards/rejected": 2.6823410987854004, "step": 28420 }, { "epoch": 1.3199312874320999, "grad_norm": 41.425331115722656, "learning_rate": 2.2084869306838758e-07, "logits/chosen": -19.452556610107422, "logits/rejected": -18.597057342529297, "logps/chosen": -429.8894958496094, "logps/rejected": -306.9667053222656, "loss": 0.669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.86187481880188, "rewards/margins": 1.0243951082229614, "rewards/rejected": 2.8374791145324707, "step": 28430 }, { "epoch": 1.3203955615395329, "grad_norm": 93.67683410644531, "learning_rate": 2.2082083662194157e-07, "logits/chosen": -18.23501968383789, "logits/rejected": -18.326793670654297, "logps/chosen": -401.16912841796875, "logps/rejected": -389.85650634765625, "loss": 0.9918, "rewards/accuracies": 0.5, "rewards/chosen": 2.6774497032165527, "rewards/margins": -0.06494620442390442, "rewards/rejected": 2.7423956394195557, "step": 28440 }, { "epoch": 1.3208598356469659, "grad_norm": 215.8942413330078, "learning_rate": 2.207929801754956e-07, "logits/chosen": -19.356029510498047, "logits/rejected": -18.905473709106445, "logps/chosen": -414.5328674316406, 
"logps/rejected": -440.9871520996094, "loss": 0.8478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5048980712890625, "rewards/margins": 0.4771643579006195, "rewards/rejected": 3.02773380279541, "step": 28450 }, { "epoch": 1.321324109754399, "grad_norm": 36.473873138427734, "learning_rate": 2.2076512372904962e-07, "logits/chosen": -18.991668701171875, "logits/rejected": -17.328798294067383, "logps/chosen": -361.50213623046875, "logps/rejected": -285.5621337890625, "loss": 0.5737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9901139736175537, "rewards/margins": 1.2513271570205688, "rewards/rejected": 1.7387869358062744, "step": 28460 }, { "epoch": 1.321788383861832, "grad_norm": 195.8057861328125, "learning_rate": 2.207372672826036e-07, "logits/chosen": -20.03726577758789, "logits/rejected": -18.22300910949707, "logps/chosen": -513.7213134765625, "logps/rejected": -329.56646728515625, "loss": 0.4409, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.960238218307495, "rewards/margins": 1.5450057983398438, "rewards/rejected": 2.4152324199676514, "step": 28470 }, { "epoch": 1.322252657969265, "grad_norm": 66.6654281616211, "learning_rate": 2.2070941083615765e-07, "logits/chosen": -18.90648078918457, "logits/rejected": -18.531545639038086, "logps/chosen": -429.6578063964844, "logps/rejected": -416.31170654296875, "loss": 0.7052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3707051277160645, "rewards/margins": 0.47745591402053833, "rewards/rejected": 2.893249034881592, "step": 28480 }, { "epoch": 1.322716932076698, "grad_norm": 158.84683227539062, "learning_rate": 2.2068155438971167e-07, "logits/chosen": -19.55318260192871, "logits/rejected": -18.580646514892578, "logps/chosen": -500.13482666015625, "logps/rejected": -337.34027099609375, "loss": 0.4212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.670180559158325, "rewards/margins": 1.2888524532318115, "rewards/rejected": 2.3813281059265137, 
"step": 28490 }, { "epoch": 1.323181206184131, "grad_norm": 43.09602737426758, "learning_rate": 2.206536979432657e-07, "logits/chosen": -18.533117294311523, "logits/rejected": -18.69110870361328, "logps/chosen": -358.23577880859375, "logps/rejected": -394.5953063964844, "loss": 0.9869, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0402824878692627, "rewards/margins": 0.07424850761890411, "rewards/rejected": 2.966033935546875, "step": 28500 }, { "epoch": 1.3236454802915643, "grad_norm": 103.08760833740234, "learning_rate": 2.206258414968197e-07, "logits/chosen": -18.29605484008789, "logits/rejected": -17.897682189941406, "logps/chosen": -384.5674743652344, "logps/rejected": -333.5448303222656, "loss": 0.7706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.281061887741089, "rewards/margins": 0.6543760895729065, "rewards/rejected": 2.626685857772827, "step": 28510 }, { "epoch": 1.324109754398997, "grad_norm": 33.65848159790039, "learning_rate": 2.2059798505037374e-07, "logits/chosen": -19.75954818725586, "logits/rejected": -19.40688133239746, "logps/chosen": -427.2232360839844, "logps/rejected": -351.33734130859375, "loss": 0.588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1955249309539795, "rewards/margins": 0.4222331941127777, "rewards/rejected": 2.773292064666748, "step": 28520 }, { "epoch": 1.3245740285064302, "grad_norm": 40.14455795288086, "learning_rate": 2.2057012860392775e-07, "logits/chosen": -18.386642456054688, "logits/rejected": -17.913246154785156, "logps/chosen": -371.561279296875, "logps/rejected": -292.1465148925781, "loss": 0.571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4341788291931152, "rewards/margins": 0.8450753092765808, "rewards/rejected": 2.589103937149048, "step": 28530 }, { "epoch": 1.3250383026138632, "grad_norm": 244.68638610839844, "learning_rate": 2.2054227215748177e-07, "logits/chosen": -19.48889923095703, "logits/rejected": -18.123077392578125, "logps/chosen": 
-363.6212158203125, "logps/rejected": -259.3393249511719, "loss": 0.7014, "rewards/accuracies": 0.5, "rewards/chosen": 2.9427714347839355, "rewards/margins": 0.8749790191650391, "rewards/rejected": 2.0677921772003174, "step": 28540 }, { "epoch": 1.3255025767212962, "grad_norm": 58.299171447753906, "learning_rate": 2.2051441571103578e-07, "logits/chosen": -18.475704193115234, "logits/rejected": -18.793176651000977, "logps/chosen": -410.79083251953125, "logps/rejected": -435.9335021972656, "loss": 1.1056, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4766063690185547, "rewards/margins": -0.21571770310401917, "rewards/rejected": 3.692324161529541, "step": 28550 }, { "epoch": 1.3259668508287292, "grad_norm": 12.340846061706543, "learning_rate": 2.204865592645898e-07, "logits/chosen": -17.711084365844727, "logits/rejected": -18.11713218688965, "logps/chosen": -251.97445678710938, "logps/rejected": -275.6487731933594, "loss": 1.2108, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9639761447906494, "rewards/margins": -0.3299641013145447, "rewards/rejected": 2.2939400672912598, "step": 28560 }, { "epoch": 1.3264311249361622, "grad_norm": 0.5199731588363647, "learning_rate": 2.2045870281814384e-07, "logits/chosen": -18.926496505737305, "logits/rejected": -17.952774047851562, "logps/chosen": -416.13165283203125, "logps/rejected": -317.09942626953125, "loss": 0.8848, "rewards/accuracies": 0.5, "rewards/chosen": 3.437565326690674, "rewards/margins": 0.9801050424575806, "rewards/rejected": 2.4574601650238037, "step": 28570 }, { "epoch": 1.3268953990435954, "grad_norm": 41.52384948730469, "learning_rate": 2.2043084637169785e-07, "logits/chosen": -19.093229293823242, "logits/rejected": -19.725746154785156, "logps/chosen": -381.49285888671875, "logps/rejected": -357.9566345214844, "loss": 0.6579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0685410499572754, "rewards/margins": 0.23132212460041046, "rewards/rejected": 
2.837218761444092, "step": 28580 }, { "epoch": 1.3273596731510284, "grad_norm": 155.6582794189453, "learning_rate": 2.2040298992525184e-07, "logits/chosen": -18.429523468017578, "logits/rejected": -18.146167755126953, "logps/chosen": -387.22247314453125, "logps/rejected": -325.8670349121094, "loss": 0.6515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.262880802154541, "rewards/margins": 0.6004262566566467, "rewards/rejected": 2.662454128265381, "step": 28590 }, { "epoch": 1.3278239472584614, "grad_norm": 34.8386344909668, "learning_rate": 2.2037513347880588e-07, "logits/chosen": -18.72378921508789, "logits/rejected": -19.27284049987793, "logps/chosen": -398.11138916015625, "logps/rejected": -411.0439453125, "loss": 0.7067, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.743722915649414, "rewards/margins": 0.427992045879364, "rewards/rejected": 3.315730571746826, "step": 28600 }, { "epoch": 1.3282882213658944, "grad_norm": 116.54947662353516, "learning_rate": 2.203472770323599e-07, "logits/chosen": -18.366506576538086, "logits/rejected": -17.94593048095703, "logps/chosen": -464.31317138671875, "logps/rejected": -382.51300048828125, "loss": 0.7731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.199111223220825, "rewards/margins": 0.9544559717178345, "rewards/rejected": 2.244655132293701, "step": 28610 }, { "epoch": 1.3287524954733274, "grad_norm": 39.5938835144043, "learning_rate": 2.2031942058591394e-07, "logits/chosen": -18.161495208740234, "logits/rejected": -18.157716751098633, "logps/chosen": -336.6044006347656, "logps/rejected": -375.7011413574219, "loss": 1.1612, "rewards/accuracies": 0.5, "rewards/chosen": 2.199439764022827, "rewards/margins": -0.41740989685058594, "rewards/rejected": 2.616849899291992, "step": 28620 }, { "epoch": 1.3292167695807604, "grad_norm": 87.4654312133789, "learning_rate": 2.2029156413946792e-07, "logits/chosen": -18.591686248779297, "logits/rejected": -16.72760772705078, "logps/chosen": 
-414.2057189941406, "logps/rejected": -235.77279663085938, "loss": 0.254, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.318664073944092, "rewards/margins": 2.3798441886901855, "rewards/rejected": 0.9388197660446167, "step": 28630 }, { "epoch": 1.3296810436881934, "grad_norm": 152.9416046142578, "learning_rate": 2.2026370769302194e-07, "logits/chosen": -18.289257049560547, "logits/rejected": -18.0025577545166, "logps/chosen": -410.3744201660156, "logps/rejected": -359.87115478515625, "loss": 1.057, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5373635292053223, "rewards/margins": -0.08885929733514786, "rewards/rejected": 2.626222848892212, "step": 28640 }, { "epoch": 1.3301453177956266, "grad_norm": 104.97730255126953, "learning_rate": 2.2023585124657598e-07, "logits/chosen": -18.682558059692383, "logits/rejected": -18.49915885925293, "logps/chosen": -358.6518249511719, "logps/rejected": -381.0821228027344, "loss": 1.1928, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6336731910705566, "rewards/margins": -0.16129560768604279, "rewards/rejected": 2.794968605041504, "step": 28650 }, { "epoch": 1.3306095919030596, "grad_norm": 104.14095306396484, "learning_rate": 2.2020799480012997e-07, "logits/chosen": -19.391578674316406, "logits/rejected": -18.48609161376953, "logps/chosen": -379.72259521484375, "logps/rejected": -338.1890563964844, "loss": 0.6771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0430335998535156, "rewards/margins": 0.7312837839126587, "rewards/rejected": 2.3117499351501465, "step": 28660 }, { "epoch": 1.3310738660104926, "grad_norm": 75.5672378540039, "learning_rate": 2.20180138353684e-07, "logits/chosen": -19.27365493774414, "logits/rejected": -17.226791381835938, "logps/chosen": -568.9161987304688, "logps/rejected": -293.4749450683594, "loss": 0.2367, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.321531295776367, "rewards/margins": 2.6626131534576416, "rewards/rejected": 
1.6589183807373047, "step": 28670 }, { "epoch": 1.3315381401179256, "grad_norm": 34.31483840942383, "learning_rate": 2.2015228190723802e-07, "logits/chosen": -17.825836181640625, "logits/rejected": -17.803525924682617, "logps/chosen": -382.405029296875, "logps/rejected": -371.92926025390625, "loss": 0.6786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9719767570495605, "rewards/margins": 0.5390729904174805, "rewards/rejected": 2.43290376663208, "step": 28680 }, { "epoch": 1.3320024142253586, "grad_norm": 154.83843994140625, "learning_rate": 2.2012442546079206e-07, "logits/chosen": -19.369935989379883, "logits/rejected": -19.146032333374023, "logps/chosen": -390.6966247558594, "logps/rejected": -342.2490234375, "loss": 0.8261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.088655471801758, "rewards/margins": 0.4573155343532562, "rewards/rejected": 2.6313395500183105, "step": 28690 }, { "epoch": 1.3324666883327918, "grad_norm": 17.833127975463867, "learning_rate": 2.2009656901434605e-07, "logits/chosen": -19.27680778503418, "logits/rejected": -19.66022300720215, "logps/chosen": -402.2232666015625, "logps/rejected": -371.2875671386719, "loss": 0.9325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0991885662078857, "rewards/margins": 0.1833408772945404, "rewards/rejected": 2.9158475399017334, "step": 28700 }, { "epoch": 1.3329309624402246, "grad_norm": 21.113027572631836, "learning_rate": 2.2006871256790007e-07, "logits/chosen": -18.05606460571289, "logits/rejected": -17.179527282714844, "logps/chosen": -352.4468688964844, "logps/rejected": -278.26934814453125, "loss": 0.5926, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.388962984085083, "rewards/margins": 0.549403190612793, "rewards/rejected": 1.83955979347229, "step": 28710 }, { "epoch": 1.3333952365476578, "grad_norm": 78.35485076904297, "learning_rate": 2.200408561214541e-07, "logits/chosen": -19.934490203857422, "logits/rejected": -18.722888946533203, 
"logps/chosen": -481.6392517089844, "logps/rejected": -311.1304931640625, "loss": 0.4681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.142870903015137, "rewards/margins": 1.7280426025390625, "rewards/rejected": 2.4148285388946533, "step": 28720 }, { "epoch": 1.3338595106550908, "grad_norm": 121.45140838623047, "learning_rate": 2.2001299967500812e-07, "logits/chosen": -18.265342712402344, "logits/rejected": -17.825828552246094, "logps/chosen": -327.79730224609375, "logps/rejected": -331.0824279785156, "loss": 0.8461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.55987811088562, "rewards/margins": 0.175016388297081, "rewards/rejected": 2.384861469268799, "step": 28730 }, { "epoch": 1.3343237847625238, "grad_norm": 223.79022216796875, "learning_rate": 2.199851432285621e-07, "logits/chosen": -18.448379516601562, "logits/rejected": -18.1511287689209, "logps/chosen": -342.58905029296875, "logps/rejected": -298.8192443847656, "loss": 0.5896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.878345012664795, "rewards/margins": 0.9933568239212036, "rewards/rejected": 1.8849881887435913, "step": 28740 }, { "epoch": 1.3347880588699568, "grad_norm": 91.49044799804688, "learning_rate": 2.1995728678211615e-07, "logits/chosen": -19.446044921875, "logits/rejected": -18.812593460083008, "logps/chosen": -352.377685546875, "logps/rejected": -340.855224609375, "loss": 0.4646, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.053713083267212, "rewards/margins": 0.9369089007377625, "rewards/rejected": 2.116804361343384, "step": 28750 }, { "epoch": 1.3352523329773898, "grad_norm": 166.72938537597656, "learning_rate": 2.1992943033567017e-07, "logits/chosen": -19.17569351196289, "logits/rejected": -18.254671096801758, "logps/chosen": -515.3980712890625, "logps/rejected": -356.4705810546875, "loss": 0.334, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.859100341796875, "rewards/margins": 1.7530567646026611, 
"rewards/rejected": 2.1060433387756348, "step": 28760 }, { "epoch": 1.335716607084823, "grad_norm": 55.229957580566406, "learning_rate": 2.199015738892242e-07, "logits/chosen": -19.245738983154297, "logits/rejected": -18.71575355529785, "logps/chosen": -482.6240234375, "logps/rejected": -440.71563720703125, "loss": 0.9432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8880648612976074, "rewards/margins": 0.156840518116951, "rewards/rejected": 3.731224775314331, "step": 28770 }, { "epoch": 1.336180881192256, "grad_norm": 56.29967498779297, "learning_rate": 2.198737174427782e-07, "logits/chosen": -19.935501098632812, "logits/rejected": -19.657337188720703, "logps/chosen": -538.4075927734375, "logps/rejected": -511.99822998046875, "loss": 1.0931, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.474536418914795, "rewards/margins": -0.36887356638908386, "rewards/rejected": 3.843410015106201, "step": 28780 }, { "epoch": 1.336645155299689, "grad_norm": 57.8193244934082, "learning_rate": 2.198458609963322e-07, "logits/chosen": -19.204036712646484, "logits/rejected": -18.139415740966797, "logps/chosen": -473.9835510253906, "logps/rejected": -358.65118408203125, "loss": 0.5305, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.054560661315918, "rewards/margins": 0.748202919960022, "rewards/rejected": 2.3063578605651855, "step": 28790 }, { "epoch": 1.337109429407122, "grad_norm": 24.36040687561035, "learning_rate": 2.1981800454988625e-07, "logits/chosen": -19.136571884155273, "logits/rejected": -18.2795352935791, "logps/chosen": -494.8760681152344, "logps/rejected": -371.9925537109375, "loss": 0.5121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.79522442817688, "rewards/margins": 1.2684812545776367, "rewards/rejected": 2.526742696762085, "step": 28800 }, { "epoch": 1.337573703514555, "grad_norm": 120.1703872680664, "learning_rate": 2.1979014810344026e-07, "logits/chosen": -18.887453079223633, "logits/rejected": 
-18.643051147460938, "logps/chosen": -423.77825927734375, "logps/rejected": -442.25909423828125, "loss": 0.7579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8903374671936035, "rewards/margins": 0.7031711339950562, "rewards/rejected": 3.1871657371520996, "step": 28810 }, { "epoch": 1.338037977621988, "grad_norm": 59.60015106201172, "learning_rate": 2.1976229165699428e-07, "logits/chosen": -19.55989646911621, "logits/rejected": -18.247325897216797, "logps/chosen": -480.6822204589844, "logps/rejected": -321.00445556640625, "loss": 0.3985, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.539099931716919, "rewards/margins": 0.8093400001525879, "rewards/rejected": 2.729759693145752, "step": 28820 }, { "epoch": 1.338502251729421, "grad_norm": 3.8381786346435547, "learning_rate": 2.197344352105483e-07, "logits/chosen": -18.862743377685547, "logits/rejected": -18.31973648071289, "logps/chosen": -451.603515625, "logps/rejected": -433.9456481933594, "loss": 0.9687, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.184809684753418, "rewards/margins": -0.08319129049777985, "rewards/rejected": 3.268000841140747, "step": 28830 }, { "epoch": 1.3389665258368542, "grad_norm": 8.230249404907227, "learning_rate": 2.1970657876410233e-07, "logits/chosen": -19.770854949951172, "logits/rejected": -19.067249298095703, "logps/chosen": -317.7867736816406, "logps/rejected": -238.7992706298828, "loss": 0.6274, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1834659576416016, "rewards/margins": 1.1227631568908691, "rewards/rejected": 2.0607028007507324, "step": 28840 }, { "epoch": 1.3394307999442872, "grad_norm": 133.1984405517578, "learning_rate": 2.1967872231765635e-07, "logits/chosen": -18.174495697021484, "logits/rejected": -17.76323127746582, "logps/chosen": -364.6875915527344, "logps/rejected": -294.20404052734375, "loss": 0.4142, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.206218719482422, "rewards/margins": 
1.0569955110549927, "rewards/rejected": 2.1492230892181396, "step": 28850 }, { "epoch": 1.3398950740517201, "grad_norm": 46.59417724609375, "learning_rate": 2.1965086587121034e-07, "logits/chosen": -18.503372192382812, "logits/rejected": -17.755619049072266, "logps/chosen": -385.0373840332031, "logps/rejected": -313.39251708984375, "loss": 0.8419, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6583518981933594, "rewards/margins": 0.8050632476806641, "rewards/rejected": 2.853288412094116, "step": 28860 }, { "epoch": 1.3403593481591531, "grad_norm": 13.012866020202637, "learning_rate": 2.1962300942476438e-07, "logits/chosen": -18.506593704223633, "logits/rejected": -17.698848724365234, "logps/chosen": -255.168212890625, "logps/rejected": -204.26300048828125, "loss": 0.4885, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4876275062561035, "rewards/margins": 0.9928483963012695, "rewards/rejected": 1.4947789907455444, "step": 28870 }, { "epoch": 1.3408236222665861, "grad_norm": 286.016357421875, "learning_rate": 2.195951529783184e-07, "logits/chosen": -18.932533264160156, "logits/rejected": -19.06705093383789, "logps/chosen": -363.6294860839844, "logps/rejected": -371.8272399902344, "loss": 0.9752, "rewards/accuracies": 0.5, "rewards/chosen": 2.9860424995422363, "rewards/margins": 0.022731900215148926, "rewards/rejected": 2.963310956954956, "step": 28880 }, { "epoch": 1.3412878963740194, "grad_norm": 0.8271262645721436, "learning_rate": 2.1956729653187238e-07, "logits/chosen": -19.691648483276367, "logits/rejected": -18.62979507446289, "logps/chosen": -476.56170654296875, "logps/rejected": -302.5474853515625, "loss": 0.4608, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9486746788024902, "rewards/margins": 1.8052774667739868, "rewards/rejected": 2.143397331237793, "step": 28890 }, { "epoch": 1.3417521704814521, "grad_norm": 48.80823516845703, "learning_rate": 2.1953944008542642e-07, "logits/chosen": -19.217952728271484, 
"logits/rejected": -19.060815811157227, "logps/chosen": -483.5846252441406, "logps/rejected": -435.599365234375, "loss": 0.6572, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.006729602813721, "rewards/margins": 0.5547218322753906, "rewards/rejected": 3.452007293701172, "step": 28900 }, { "epoch": 1.3422164445888853, "grad_norm": 17.785236358642578, "learning_rate": 2.1951158363898044e-07, "logits/chosen": -19.202735900878906, "logits/rejected": -18.271221160888672, "logps/chosen": -307.1611633300781, "logps/rejected": -180.32260131835938, "loss": 0.4077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.921555995941162, "rewards/margins": 1.6015418767929077, "rewards/rejected": 1.320014238357544, "step": 28910 }, { "epoch": 1.3426807186963183, "grad_norm": 11.747251510620117, "learning_rate": 2.1948372719253448e-07, "logits/chosen": -19.114009857177734, "logits/rejected": -18.267242431640625, "logps/chosen": -527.29345703125, "logps/rejected": -362.39031982421875, "loss": 1.0074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.67824125289917, "rewards/margins": 0.8295029401779175, "rewards/rejected": 2.848738193511963, "step": 28920 }, { "epoch": 1.3431449928037513, "grad_norm": 62.0667839050293, "learning_rate": 2.1945587074608847e-07, "logits/chosen": -18.530824661254883, "logits/rejected": -17.9027099609375, "logps/chosen": -439.25970458984375, "logps/rejected": -385.04156494140625, "loss": 0.5632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.861506223678589, "rewards/margins": 0.6193064451217651, "rewards/rejected": 2.2421998977661133, "step": 28930 }, { "epoch": 1.3436092669111843, "grad_norm": 52.65952682495117, "learning_rate": 2.194280142996425e-07, "logits/chosen": -20.51775550842285, "logits/rejected": -19.9945125579834, "logps/chosen": -409.6156311035156, "logps/rejected": -374.1469421386719, "loss": 0.7423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1186890602111816, 
"rewards/margins": 0.7013416290283203, "rewards/rejected": 2.4173474311828613, "step": 28940 }, { "epoch": 1.3440735410186173, "grad_norm": 120.14305877685547, "learning_rate": 2.1940015785319652e-07, "logits/chosen": -19.862611770629883, "logits/rejected": -18.828575134277344, "logps/chosen": -412.11712646484375, "logps/rejected": -313.7570495605469, "loss": 0.4299, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6782004833221436, "rewards/margins": 0.9486754536628723, "rewards/rejected": 1.7295249700546265, "step": 28950 }, { "epoch": 1.3445378151260505, "grad_norm": 34.42848205566406, "learning_rate": 2.1937230140675054e-07, "logits/chosen": -18.623470306396484, "logits/rejected": -18.04538917541504, "logps/chosen": -303.64990234375, "logps/rejected": -243.9746856689453, "loss": 0.6326, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.211181163787842, "rewards/margins": 0.24874305725097656, "rewards/rejected": 1.9624378681182861, "step": 28960 }, { "epoch": 1.3450020892334835, "grad_norm": 121.41637420654297, "learning_rate": 2.1934444496030455e-07, "logits/chosen": -19.081573486328125, "logits/rejected": -18.914175033569336, "logps/chosen": -354.19232177734375, "logps/rejected": -367.67816162109375, "loss": 0.6383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.023041009902954, "rewards/margins": 0.8763920664787292, "rewards/rejected": 2.14664888381958, "step": 28970 }, { "epoch": 1.3454663633409165, "grad_norm": 27.841793060302734, "learning_rate": 2.1931658851385856e-07, "logits/chosen": -18.216747283935547, "logits/rejected": -17.663469314575195, "logps/chosen": -303.28887939453125, "logps/rejected": -217.0441436767578, "loss": 0.6206, "rewards/accuracies": 0.5, "rewards/chosen": 2.509270191192627, "rewards/margins": 0.8230699300765991, "rewards/rejected": 1.6862001419067383, "step": 28980 }, { "epoch": 1.3459306374483495, "grad_norm": 51.92589569091797, "learning_rate": 2.192887320674126e-07, "logits/chosen": 
-19.029186248779297, "logits/rejected": -18.13226890563965, "logps/chosen": -375.0447692871094, "logps/rejected": -268.9963073730469, "loss": 0.4584, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5504274368286133, "rewards/margins": 0.8416740298271179, "rewards/rejected": 1.7087533473968506, "step": 28990 }, { "epoch": 1.3463949115557825, "grad_norm": 24.48133659362793, "learning_rate": 2.1926087562096662e-07, "logits/chosen": -18.221839904785156, "logits/rejected": -17.938068389892578, "logps/chosen": -279.5390625, "logps/rejected": -229.32870483398438, "loss": 0.6361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3235297203063965, "rewards/margins": 0.6255651712417603, "rewards/rejected": 1.6979644298553467, "step": 29000 }, { "epoch": 1.3468591856632155, "grad_norm": 6.752426624298096, "learning_rate": 2.192330191745206e-07, "logits/chosen": -19.045391082763672, "logits/rejected": -18.193660736083984, "logps/chosen": -323.764892578125, "logps/rejected": -223.86752319335938, "loss": 0.3967, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.821044445037842, "rewards/margins": 1.189025640487671, "rewards/rejected": 1.632018804550171, "step": 29010 }, { "epoch": 1.3473234597706485, "grad_norm": 131.06866455078125, "learning_rate": 2.1920516272807465e-07, "logits/chosen": -19.28817367553711, "logits/rejected": -18.810169219970703, "logps/chosen": -412.38946533203125, "logps/rejected": -364.71136474609375, "loss": 0.5009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.949460983276367, "rewards/margins": 0.9095239639282227, "rewards/rejected": 2.0399367809295654, "step": 29020 }, { "epoch": 1.3477877338780817, "grad_norm": 41.75251388549805, "learning_rate": 2.1917730628162866e-07, "logits/chosen": -18.896774291992188, "logits/rejected": -18.2164363861084, "logps/chosen": -416.0994567871094, "logps/rejected": -289.11358642578125, "loss": 0.4783, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
2.477504253387451, "rewards/margins": 0.7228946089744568, "rewards/rejected": 1.7546097040176392, "step": 29030 }, { "epoch": 1.3482520079855147, "grad_norm": 55.532554626464844, "learning_rate": 2.191494498351827e-07, "logits/chosen": -17.72832489013672, "logits/rejected": -17.671960830688477, "logps/chosen": -413.39398193359375, "logps/rejected": -386.6877746582031, "loss": 0.7261, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.112699031829834, "rewards/margins": 0.20078134536743164, "rewards/rejected": 2.911917209625244, "step": 29040 }, { "epoch": 1.3487162820929477, "grad_norm": 0.5708407163619995, "learning_rate": 2.191215933887367e-07, "logits/chosen": -20.001407623291016, "logits/rejected": -18.671213150024414, "logps/chosen": -464.57940673828125, "logps/rejected": -369.8418273925781, "loss": 0.3564, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.7431745529174805, "rewards/margins": 2.0187220573425293, "rewards/rejected": 2.7244529724121094, "step": 29050 }, { "epoch": 1.3491805562003807, "grad_norm": 19.456573486328125, "learning_rate": 2.190937369422907e-07, "logits/chosen": -18.174663543701172, "logits/rejected": -17.470294952392578, "logps/chosen": -418.6248474121094, "logps/rejected": -291.23150634765625, "loss": 0.4973, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9458847045898438, "rewards/margins": 0.9992318153381348, "rewards/rejected": 1.946653127670288, "step": 29060 }, { "epoch": 1.3496448303078137, "grad_norm": 21.111602783203125, "learning_rate": 2.1906588049584475e-07, "logits/chosen": -18.68710708618164, "logits/rejected": -18.207538604736328, "logps/chosen": -237.7649383544922, "logps/rejected": -242.80569458007812, "loss": 0.8362, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.40895676612854, "rewards/margins": 0.6773841977119446, "rewards/rejected": 1.7315725088119507, "step": 29070 }, { "epoch": 1.350109104415247, "grad_norm": 41.739891052246094, "learning_rate": 
2.1903802404939874e-07, "logits/chosen": -20.62190818786621, "logits/rejected": -18.537839889526367, "logps/chosen": -409.42132568359375, "logps/rejected": -240.6754608154297, "loss": 0.2602, "rewards/accuracies": 1.0, "rewards/chosen": 3.147063732147217, "rewards/margins": 1.6163488626480103, "rewards/rejected": 1.530714750289917, "step": 29080 }, { "epoch": 1.3505733785226797, "grad_norm": 43.84235382080078, "learning_rate": 2.1901016760295278e-07, "logits/chosen": -17.98377227783203, "logits/rejected": -17.378273010253906, "logps/chosen": -392.37506103515625, "logps/rejected": -430.9037170410156, "loss": 0.8581, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.259373426437378, "rewards/margins": 0.16877131164073944, "rewards/rejected": 3.090602397918701, "step": 29090 }, { "epoch": 1.3510376526301129, "grad_norm": 78.35427856445312, "learning_rate": 2.189823111565068e-07, "logits/chosen": -18.281633377075195, "logits/rejected": -17.868762969970703, "logps/chosen": -400.9496154785156, "logps/rejected": -282.5992126464844, "loss": 0.513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0909082889556885, "rewards/margins": 1.1197772026062012, "rewards/rejected": 1.9711313247680664, "step": 29100 }, { "epoch": 1.3515019267375459, "grad_norm": 63.53868103027344, "learning_rate": 2.1895445471006083e-07, "logits/chosen": -18.160892486572266, "logits/rejected": -17.93157958984375, "logps/chosen": -449.18487548828125, "logps/rejected": -355.22998046875, "loss": 0.7664, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1359915733337402, "rewards/margins": 0.290182501077652, "rewards/rejected": 2.8458094596862793, "step": 29110 }, { "epoch": 1.3519662008449789, "grad_norm": 10.4175443649292, "learning_rate": 2.1892659826361482e-07, "logits/chosen": -18.79602813720703, "logits/rejected": -17.479236602783203, "logps/chosen": -318.0421447753906, "logps/rejected": -237.39285278320312, "loss": 0.6024, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 2.6093521118164062, "rewards/margins": 1.0434736013412476, "rewards/rejected": 1.5658786296844482, "step": 29120 }, { "epoch": 1.3524304749524119, "grad_norm": 19.118478775024414, "learning_rate": 2.1889874181716884e-07, "logits/chosen": -18.026126861572266, "logits/rejected": -18.01729965209961, "logps/chosen": -423.038818359375, "logps/rejected": -392.66229248046875, "loss": 1.1114, "rewards/accuracies": 0.5, "rewards/chosen": 2.2488269805908203, "rewards/margins": 0.004901719279587269, "rewards/rejected": 2.243925094604492, "step": 29130 }, { "epoch": 1.3528947490598449, "grad_norm": 26.461143493652344, "learning_rate": 2.1887088537072288e-07, "logits/chosen": -18.99824333190918, "logits/rejected": -18.83814239501953, "logps/chosen": -409.13433837890625, "logps/rejected": -358.142822265625, "loss": 0.4396, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8928604125976562, "rewards/margins": 1.0529394149780273, "rewards/rejected": 2.8399205207824707, "step": 29140 }, { "epoch": 1.353359023167278, "grad_norm": 140.5572052001953, "learning_rate": 2.188430289242769e-07, "logits/chosen": -19.597915649414062, "logits/rejected": -18.669706344604492, "logps/chosen": -397.39349365234375, "logps/rejected": -313.09332275390625, "loss": 0.6594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0769364833831787, "rewards/margins": 0.6735245585441589, "rewards/rejected": 2.403411626815796, "step": 29150 }, { "epoch": 1.353823297274711, "grad_norm": 77.432861328125, "learning_rate": 2.1881517247783088e-07, "logits/chosen": -18.588720321655273, "logits/rejected": -18.51856231689453, "logps/chosen": -433.670654296875, "logps/rejected": -401.3857421875, "loss": 1.0113, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.708334445953369, "rewards/margins": -0.12276041507720947, "rewards/rejected": 2.831094741821289, "step": 29160 }, { "epoch": 1.354287571382144, "grad_norm": 13.069665908813477, "learning_rate": 
2.1878731603138492e-07, "logits/chosen": -19.45836639404297, "logits/rejected": -17.70534324645996, "logps/chosen": -369.9068298339844, "logps/rejected": -233.67776489257812, "loss": 0.329, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5976085662841797, "rewards/margins": 1.382576584815979, "rewards/rejected": 1.2150318622589111, "step": 29170 }, { "epoch": 1.354751845489577, "grad_norm": 8.182735443115234, "learning_rate": 2.1875945958493893e-07, "logits/chosen": -19.099483489990234, "logits/rejected": -18.957172393798828, "logps/chosen": -473.56988525390625, "logps/rejected": -442.80438232421875, "loss": 0.5551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.080452919006348, "rewards/margins": 1.1994519233703613, "rewards/rejected": 2.881000518798828, "step": 29180 }, { "epoch": 1.35521611959701, "grad_norm": 37.277767181396484, "learning_rate": 2.1873160313849298e-07, "logits/chosen": -18.865360260009766, "logits/rejected": -18.232112884521484, "logps/chosen": -470.66107177734375, "logps/rejected": -377.63861083984375, "loss": 0.5375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.338059425354004, "rewards/margins": 1.0226294994354248, "rewards/rejected": 3.3154296875, "step": 29190 }, { "epoch": 1.355680393704443, "grad_norm": 46.70858383178711, "learning_rate": 2.1870374669204696e-07, "logits/chosen": -18.357831954956055, "logits/rejected": -18.1740665435791, "logps/chosen": -323.24151611328125, "logps/rejected": -349.5395202636719, "loss": 0.4808, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2575249671936035, "rewards/margins": 0.594494640827179, "rewards/rejected": 1.6630302667617798, "step": 29200 }, { "epoch": 1.356144667811876, "grad_norm": 57.41835403442383, "learning_rate": 2.1867589024560098e-07, "logits/chosen": -18.843612670898438, "logits/rejected": -17.157785415649414, "logps/chosen": -425.53070068359375, "logps/rejected": -254.07626342773438, "loss": 0.6155, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.0658822059631348, "rewards/margins": 1.487370491027832, "rewards/rejected": 1.5785115957260132, "step": 29210 }, { "epoch": 1.3566089419193093, "grad_norm": 17.533891677856445, "learning_rate": 2.1864803379915502e-07, "logits/chosen": -17.978527069091797, "logits/rejected": -16.634769439697266, "logps/chosen": -453.04083251953125, "logps/rejected": -307.7094421386719, "loss": 0.5319, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5472664833068848, "rewards/margins": 1.1409231424331665, "rewards/rejected": 2.4063429832458496, "step": 29220 }, { "epoch": 1.3570732160267422, "grad_norm": 77.52613067626953, "learning_rate": 2.1862017735270903e-07, "logits/chosen": -18.388898849487305, "logits/rejected": -17.216327667236328, "logps/chosen": -336.0391540527344, "logps/rejected": -235.31661987304688, "loss": 0.5175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3994078636169434, "rewards/margins": 0.9607111811637878, "rewards/rejected": 1.4386968612670898, "step": 29230 }, { "epoch": 1.3575374901341752, "grad_norm": 8.806702613830566, "learning_rate": 2.1859232090626305e-07, "logits/chosen": -18.42814826965332, "logits/rejected": -17.1978816986084, "logps/chosen": -388.5585021972656, "logps/rejected": -255.10543823242188, "loss": 0.5919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7389590740203857, "rewards/margins": 0.9023340344429016, "rewards/rejected": 1.836625099182129, "step": 29240 }, { "epoch": 1.3580017642416082, "grad_norm": 41.887367248535156, "learning_rate": 2.1856446445981706e-07, "logits/chosen": -19.834592819213867, "logits/rejected": -19.677358627319336, "logps/chosen": -463.20037841796875, "logps/rejected": -398.4999694824219, "loss": 0.868, "rewards/accuracies": 0.5, "rewards/chosen": 3.6678624153137207, "rewards/margins": 0.01313935499638319, "rewards/rejected": 3.6547234058380127, "step": 29250 }, { "epoch": 1.3584660383490412, "grad_norm": 33.14532470703125, 
"learning_rate": 2.185366080133711e-07, "logits/chosen": -19.273723602294922, "logits/rejected": -19.220245361328125, "logps/chosen": -401.2506103515625, "logps/rejected": -437.79388427734375, "loss": 1.0915, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.683671474456787, "rewards/margins": -0.2599867284297943, "rewards/rejected": 3.9436581134796143, "step": 29260 }, { "epoch": 1.3589303124564742, "grad_norm": 1.6598714590072632, "learning_rate": 2.185087515669251e-07, "logits/chosen": -19.122175216674805, "logits/rejected": -17.444149017333984, "logps/chosen": -446.2255859375, "logps/rejected": -355.31622314453125, "loss": 0.5813, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.424448013305664, "rewards/margins": 1.153855800628662, "rewards/rejected": 2.270591974258423, "step": 29270 }, { "epoch": 1.3593945865639072, "grad_norm": 26.613801956176758, "learning_rate": 2.184808951204791e-07, "logits/chosen": -19.024232864379883, "logits/rejected": -18.657699584960938, "logps/chosen": -286.0924377441406, "logps/rejected": -309.492919921875, "loss": 0.7969, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8272969722747803, "rewards/margins": 0.5268742442131042, "rewards/rejected": 2.3004226684570312, "step": 29280 }, { "epoch": 1.3598588606713404, "grad_norm": 15.259061813354492, "learning_rate": 2.1845303867403315e-07, "logits/chosen": -18.98190689086914, "logits/rejected": -18.423009872436523, "logps/chosen": -418.39971923828125, "logps/rejected": -357.9434509277344, "loss": 0.3671, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.330596923828125, "rewards/margins": 1.2750736474990845, "rewards/rejected": 2.055522918701172, "step": 29290 }, { "epoch": 1.3603231347787734, "grad_norm": 9.081788063049316, "learning_rate": 2.1842518222758716e-07, "logits/chosen": -18.37343406677246, "logits/rejected": -17.93887710571289, "logps/chosen": -419.4845275878906, "logps/rejected": -260.4015197753906, "loss": 0.4745, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.904226303100586, "rewards/margins": 1.050032377243042, "rewards/rejected": 1.8541936874389648, "step": 29300 }, { "epoch": 1.3607874088862064, "grad_norm": 3.69948410987854, "learning_rate": 2.1839732578114115e-07, "logits/chosen": -18.252300262451172, "logits/rejected": -17.805801391601562, "logps/chosen": -346.3125915527344, "logps/rejected": -236.71875, "loss": 0.8112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2143962383270264, "rewards/margins": 1.0580459833145142, "rewards/rejected": 2.156350612640381, "step": 29310 }, { "epoch": 1.3612516829936394, "grad_norm": 61.38218688964844, "learning_rate": 2.183694693346952e-07, "logits/chosen": -18.868606567382812, "logits/rejected": -17.654516220092773, "logps/chosen": -368.5528869628906, "logps/rejected": -259.982177734375, "loss": 0.5545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.912134885787964, "rewards/margins": 1.0646506547927856, "rewards/rejected": 1.8474843502044678, "step": 29320 }, { "epoch": 1.3617159571010724, "grad_norm": 105.63043975830078, "learning_rate": 2.183416128882492e-07, "logits/chosen": -19.2537841796875, "logits/rejected": -17.32870864868164, "logps/chosen": -471.03607177734375, "logps/rejected": -268.20635986328125, "loss": 0.2924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.022729873657227, "rewards/margins": 2.3066625595092773, "rewards/rejected": 1.7160675525665283, "step": 29330 }, { "epoch": 1.3621802312085056, "grad_norm": 303.20220947265625, "learning_rate": 2.1831375644180325e-07, "logits/chosen": -18.94023895263672, "logits/rejected": -18.028051376342773, "logps/chosen": -366.088623046875, "logps/rejected": -269.584228515625, "loss": 0.7792, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.613854169845581, "rewards/margins": 0.5414530038833618, "rewards/rejected": 2.072401285171509, "step": 29340 }, { "epoch": 1.3626445053159384, "grad_norm": 
239.98226928710938, "learning_rate": 2.1828589999535723e-07, "logits/chosen": -19.318941116333008, "logits/rejected": -18.39339256286621, "logps/chosen": -434.883056640625, "logps/rejected": -450.85833740234375, "loss": 0.9393, "rewards/accuracies": 0.5, "rewards/chosen": 2.946876049041748, "rewards/margins": 0.3664192259311676, "rewards/rejected": 2.5804569721221924, "step": 29350 }, { "epoch": 1.3631087794233716, "grad_norm": 62.44658660888672, "learning_rate": 2.1825804354891128e-07, "logits/chosen": -19.523717880249023, "logits/rejected": -18.53289222717285, "logps/chosen": -349.57330322265625, "logps/rejected": -290.6649475097656, "loss": 0.5274, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.088571310043335, "rewards/margins": 0.8470881581306458, "rewards/rejected": 2.241483211517334, "step": 29360 }, { "epoch": 1.3635730535308046, "grad_norm": 74.12797546386719, "learning_rate": 2.182301871024653e-07, "logits/chosen": -18.85193634033203, "logits/rejected": -19.09113121032715, "logps/chosen": -290.62200927734375, "logps/rejected": -301.1548156738281, "loss": 1.1471, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.2775070667266846, "rewards/margins": -0.40112510323524475, "rewards/rejected": 2.6786322593688965, "step": 29370 }, { "epoch": 1.3640373276382376, "grad_norm": 53.645782470703125, "learning_rate": 2.182023306560193e-07, "logits/chosen": -21.079906463623047, "logits/rejected": -18.67386245727539, "logps/chosen": -464.4966735839844, "logps/rejected": -320.73809814453125, "loss": 0.58, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.993854522705078, "rewards/margins": 1.699715256690979, "rewards/rejected": 2.2941389083862305, "step": 29380 }, { "epoch": 1.3645016017456706, "grad_norm": 108.599609375, "learning_rate": 2.1817447420957332e-07, "logits/chosen": -18.799989700317383, "logits/rejected": -18.694204330444336, "logps/chosen": -333.71087646484375, "logps/rejected": -328.3238525390625, "loss": 0.8406, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1175003051757812, "rewards/margins": 0.48844075202941895, "rewards/rejected": 2.6290595531463623, "step": 29390 }, { "epoch": 1.3649658758531036, "grad_norm": 1.4206832647323608, "learning_rate": 2.1814661776312733e-07, "logits/chosen": -18.40603256225586, "logits/rejected": -17.345096588134766, "logps/chosen": -389.1285705566406, "logps/rejected": -280.5411682128906, "loss": 0.392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.06689977645874, "rewards/margins": 1.9275213479995728, "rewards/rejected": 2.139378309249878, "step": 29400 }, { "epoch": 1.3654301499605368, "grad_norm": 147.11961364746094, "learning_rate": 2.1811876131668138e-07, "logits/chosen": -19.086692810058594, "logits/rejected": -18.917652130126953, "logps/chosen": -375.8607482910156, "logps/rejected": -402.25836181640625, "loss": 1.0979, "rewards/accuracies": 0.5, "rewards/chosen": 3.186535120010376, "rewards/margins": -0.13029424846172333, "rewards/rejected": 3.316829204559326, "step": 29410 }, { "epoch": 1.3658944240679698, "grad_norm": 139.56138610839844, "learning_rate": 2.180909048702354e-07, "logits/chosen": -19.81371307373047, "logits/rejected": -19.510038375854492, "logps/chosen": -353.1777038574219, "logps/rejected": -309.864990234375, "loss": 0.6854, "rewards/accuracies": 0.5, "rewards/chosen": 3.464730739593506, "rewards/margins": 0.35050028562545776, "rewards/rejected": 3.114230155944824, "step": 29420 }, { "epoch": 1.3663586981754028, "grad_norm": 4.38023042678833, "learning_rate": 2.1806304842378938e-07, "logits/chosen": -19.093231201171875, "logits/rejected": -18.536855697631836, "logps/chosen": -446.93792724609375, "logps/rejected": -375.3482360839844, "loss": 0.7209, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.240142822265625, "rewards/margins": 0.5900976061820984, "rewards/rejected": 2.650045394897461, "step": 29430 }, { "epoch": 1.3668229722828358, "grad_norm": 64.46980285644531, 
"learning_rate": 2.1803519197734342e-07, "logits/chosen": -18.90654754638672, "logits/rejected": -17.709720611572266, "logps/chosen": -408.76544189453125, "logps/rejected": -297.1006774902344, "loss": 0.7232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6437840461730957, "rewards/margins": 1.1559727191925049, "rewards/rejected": 2.487811326980591, "step": 29440 }, { "epoch": 1.3672872463902688, "grad_norm": 17.1387882232666, "learning_rate": 2.1800733553089743e-07, "logits/chosen": -19.06556510925293, "logits/rejected": -18.357051849365234, "logps/chosen": -500.7901916503906, "logps/rejected": -380.48956298828125, "loss": 0.4286, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.0764241218566895, "rewards/margins": 1.5693823099136353, "rewards/rejected": 2.5070416927337646, "step": 29450 }, { "epoch": 1.3677515204977018, "grad_norm": 52.60224914550781, "learning_rate": 2.1797947908445142e-07, "logits/chosen": -19.477876663208008, "logits/rejected": -18.516483306884766, "logps/chosen": -350.8435974121094, "logps/rejected": -321.4552307128906, "loss": 0.7091, "rewards/accuracies": 0.5, "rewards/chosen": 2.3436882495880127, "rewards/margins": 0.06695874780416489, "rewards/rejected": 2.2767295837402344, "step": 29460 }, { "epoch": 1.3682157946051348, "grad_norm": 10.7445707321167, "learning_rate": 2.1795162263800546e-07, "logits/chosen": -18.361949920654297, "logits/rejected": -17.017410278320312, "logps/chosen": -423.700927734375, "logps/rejected": -251.14053344726562, "loss": 0.3689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.112014293670654, "rewards/margins": 2.2745893001556396, "rewards/rejected": 1.8374249935150146, "step": 29470 }, { "epoch": 1.368680068712568, "grad_norm": 36.34453582763672, "learning_rate": 2.1792376619155948e-07, "logits/chosen": -18.0250301361084, "logits/rejected": -18.171680450439453, "logps/chosen": -289.96673583984375, "logps/rejected": -308.49359130859375, "loss": 0.773, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0572001934051514, "rewards/margins": 0.17571142315864563, "rewards/rejected": 1.8814890384674072, "step": 29480 }, { "epoch": 1.369144342820001, "grad_norm": 71.15313720703125, "learning_rate": 2.1789590974511352e-07, "logits/chosen": -19.1596622467041, "logits/rejected": -18.3104248046875, "logps/chosen": -407.3363342285156, "logps/rejected": -358.1509094238281, "loss": 0.4341, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1691060066223145, "rewards/margins": 0.7893416881561279, "rewards/rejected": 2.3797643184661865, "step": 29490 }, { "epoch": 1.369608616927434, "grad_norm": 3.245072603225708, "learning_rate": 2.178680532986675e-07, "logits/chosen": -18.207805633544922, "logits/rejected": -17.480396270751953, "logps/chosen": -407.73260498046875, "logps/rejected": -301.8541564941406, "loss": 0.7316, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.35823392868042, "rewards/margins": 0.9497013092041016, "rewards/rejected": 2.4085326194763184, "step": 29500 }, { "epoch": 1.370072891034867, "grad_norm": 19.44862937927246, "learning_rate": 2.1784019685222155e-07, "logits/chosen": -18.68130874633789, "logits/rejected": -18.47892189025879, "logps/chosen": -341.0520935058594, "logps/rejected": -434.22454833984375, "loss": 1.7215, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1727747917175293, "rewards/margins": -1.1621894836425781, "rewards/rejected": 3.3349642753601074, "step": 29510 }, { "epoch": 1.3705371651423, "grad_norm": 32.589385986328125, "learning_rate": 2.1781234040577556e-07, "logits/chosen": -18.288990020751953, "logits/rejected": -17.08672523498535, "logps/chosen": -328.0555725097656, "logps/rejected": -164.3221893310547, "loss": 0.385, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9293887615203857, "rewards/margins": 1.2661300897598267, "rewards/rejected": 0.6632587313652039, "step": 29520 }, { "epoch": 1.3710014392497332, "grad_norm": 
70.17940521240234, "learning_rate": 2.177844839593296e-07, "logits/chosen": -19.029621124267578, "logits/rejected": -18.98578453063965, "logps/chosen": -444.0843200683594, "logps/rejected": -497.2239685058594, "loss": 0.5658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5261077880859375, "rewards/margins": 0.4275476336479187, "rewards/rejected": 3.098559856414795, "step": 29530 }, { "epoch": 1.371465713357166, "grad_norm": 0.2812369167804718, "learning_rate": 2.177566275128836e-07, "logits/chosen": -18.84025764465332, "logits/rejected": -17.91094970703125, "logps/chosen": -338.56439208984375, "logps/rejected": -252.9380340576172, "loss": 0.4699, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4784188270568848, "rewards/margins": 1.153193712234497, "rewards/rejected": 1.3252249956130981, "step": 29540 }, { "epoch": 1.3719299874645992, "grad_norm": 98.41946411132812, "learning_rate": 2.177287710664376e-07, "logits/chosen": -18.238290786743164, "logits/rejected": -17.299001693725586, "logps/chosen": -344.08843994140625, "logps/rejected": -236.5164337158203, "loss": 0.5343, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0146801471710205, "rewards/margins": 0.8793939352035522, "rewards/rejected": 2.1352860927581787, "step": 29550 }, { "epoch": 1.3723942615720321, "grad_norm": 7.315094470977783, "learning_rate": 2.1770091461999165e-07, "logits/chosen": -18.933034896850586, "logits/rejected": -17.78558921813965, "logps/chosen": -425.34906005859375, "logps/rejected": -281.421875, "loss": 0.3588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.916390895843506, "rewards/margins": 1.3225882053375244, "rewards/rejected": 1.5938029289245605, "step": 29560 }, { "epoch": 1.3728585356794651, "grad_norm": 86.9803695678711, "learning_rate": 2.1767305817354566e-07, "logits/chosen": -18.25552749633789, "logits/rejected": -17.571727752685547, "logps/chosen": -380.79998779296875, "logps/rejected": -285.92218017578125, "loss": 
0.745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.001023769378662, "rewards/margins": 0.30704185366630554, "rewards/rejected": 2.693981647491455, "step": 29570 }, { "epoch": 1.3733228097868981, "grad_norm": null, "learning_rate": 2.1764798737174428e-07, "logits/chosen": -18.701168060302734, "logits/rejected": -17.704769134521484, "logps/chosen": -427.58551025390625, "logps/rejected": -299.37701416015625, "loss": 0.6641, "rewards/accuracies": 0.5, "rewards/chosen": 2.9321627616882324, "rewards/margins": 0.4968239665031433, "rewards/rejected": 2.4353384971618652, "step": 29580 }, { "epoch": 1.3737870838943311, "grad_norm": 13.036747932434082, "learning_rate": 2.1762013092529827e-07, "logits/chosen": -18.88518714904785, "logits/rejected": -18.248899459838867, "logps/chosen": -324.2113342285156, "logps/rejected": -314.05596923828125, "loss": 1.0573, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.494004011154175, "rewards/margins": -0.014483213424682617, "rewards/rejected": 2.5084872245788574, "step": 29590 }, { "epoch": 1.3742513580017643, "grad_norm": 36.370277404785156, "learning_rate": 2.175922744788523e-07, "logits/chosen": -18.877445220947266, "logits/rejected": -18.196914672851562, "logps/chosen": -398.2958984375, "logps/rejected": -310.82427978515625, "loss": 0.6062, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4649271965026855, "rewards/margins": 0.5178825259208679, "rewards/rejected": 1.9470447301864624, "step": 29600 }, { "epoch": 1.3747156321091973, "grad_norm": 92.16556549072266, "learning_rate": 2.1756441803240633e-07, "logits/chosen": -18.390216827392578, "logits/rejected": -17.934640884399414, "logps/chosen": -283.7510986328125, "logps/rejected": -238.443115234375, "loss": 1.0814, "rewards/accuracies": 0.5, "rewards/chosen": 1.9438740015029907, "rewards/margins": -0.41278696060180664, "rewards/rejected": 2.356661319732666, "step": 29610 }, { "epoch": 1.3751799062166303, "grad_norm": 1.6485745906829834,
"learning_rate": 2.1753656158596037e-07, "logits/chosen": -19.147777557373047, "logits/rejected": -17.774927139282227, "logps/chosen": -374.7113037109375, "logps/rejected": -287.09326171875, "loss": 0.5431, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1971018314361572, "rewards/margins": 1.1834821701049805, "rewards/rejected": 2.0136191844940186, "step": 29620 }, { "epoch": 1.3756441803240633, "grad_norm": 30.19736671447754, "learning_rate": 2.1750870513951435e-07, "logits/chosen": -18.731281280517578, "logits/rejected": -18.184694290161133, "logps/chosen": -466.90625, "logps/rejected": -365.1131286621094, "loss": 0.4388, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8833916187286377, "rewards/margins": 1.5661230087280273, "rewards/rejected": 2.3172688484191895, "step": 29630 }, { "epoch": 1.3761084544314963, "grad_norm": 4.180136680603027, "learning_rate": 2.1748084869306837e-07, "logits/chosen": -18.51962661743164, "logits/rejected": -16.98849868774414, "logps/chosen": -307.0237731933594, "logps/rejected": -223.38961791992188, "loss": 0.6967, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7778689861297607, "rewards/margins": 1.2308189868927002, "rewards/rejected": 1.5470499992370605, "step": 29640 }, { "epoch": 1.3765727285389293, "grad_norm": 71.78685760498047, "learning_rate": 2.174529922466224e-07, "logits/chosen": -19.339643478393555, "logits/rejected": -17.84998321533203, "logps/chosen": -414.65460205078125, "logps/rejected": -240.83425903320312, "loss": 0.5232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7236900329589844, "rewards/margins": 1.9930477142333984, "rewards/rejected": 1.7306420803070068, "step": 29650 }, { "epoch": 1.3770370026463623, "grad_norm": 172.3892822265625, "learning_rate": 2.1742513580017642e-07, "logits/chosen": -19.650741577148438, "logits/rejected": -19.477806091308594, "logps/chosen": -392.6651306152344, "logps/rejected": -409.507080078125, "loss": 0.8722, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.3956475257873535, "rewards/margins": 0.68873530626297, "rewards/rejected": 2.7069122791290283, "step": 29660 }, { "epoch": 1.3775012767537955, "grad_norm": 87.76115417480469, "learning_rate": 2.173972793537304e-07, "logits/chosen": -18.776622772216797, "logits/rejected": -18.333446502685547, "logps/chosen": -368.565673828125, "logps/rejected": -363.0921325683594, "loss": 1.2025, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.0937564373016357, "rewards/margins": -0.44040852785110474, "rewards/rejected": 2.5341649055480957, "step": 29670 }, { "epoch": 1.3779655508612285, "grad_norm": 15.815740585327148, "learning_rate": 2.1736942290728445e-07, "logits/chosen": -18.683645248413086, "logits/rejected": -17.743743896484375, "logps/chosen": -401.4272766113281, "logps/rejected": -299.6070556640625, "loss": 0.7558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2355175018310547, "rewards/margins": 0.87274569272995, "rewards/rejected": 2.36277174949646, "step": 29680 }, { "epoch": 1.3784298249686615, "grad_norm": 108.14379119873047, "learning_rate": 2.1734156646083847e-07, "logits/chosen": -19.221424102783203, "logits/rejected": -18.322582244873047, "logps/chosen": -444.0738220214844, "logps/rejected": -368.585693359375, "loss": 0.4958, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6016693115234375, "rewards/margins": 0.8125306367874146, "rewards/rejected": 2.7891387939453125, "step": 29690 }, { "epoch": 1.3788940990760945, "grad_norm": 11.654343605041504, "learning_rate": 2.173137100143925e-07, "logits/chosen": -18.839706420898438, "logits/rejected": -18.26099395751953, "logps/chosen": -323.76458740234375, "logps/rejected": -297.32440185546875, "loss": 0.564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5313289165496826, "rewards/margins": 0.6698693037033081, "rewards/rejected": 1.861459732055664, "step": 29700 }, { "epoch": 1.3793583731835275, "grad_norm": 
26.17502212524414, "learning_rate": 2.172858535679465e-07, "logits/chosen": -18.994361877441406, "logits/rejected": -18.13008689880371, "logps/chosen": -491.9126892089844, "logps/rejected": -396.32916259765625, "loss": 0.5123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.035876274108887, "rewards/margins": 1.1533052921295166, "rewards/rejected": 2.8825716972351074, "step": 29710 }, { "epoch": 1.3798226472909607, "grad_norm": 80.95411682128906, "learning_rate": 2.172579971215005e-07, "logits/chosen": -18.930017471313477, "logits/rejected": -18.360172271728516, "logps/chosen": -372.22015380859375, "logps/rejected": -282.85223388671875, "loss": 0.6305, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.642779588699341, "rewards/margins": 0.5634415745735168, "rewards/rejected": 2.0793380737304688, "step": 29720 }, { "epoch": 1.3802869213983935, "grad_norm": 82.30634307861328, "learning_rate": 2.1723014067505455e-07, "logits/chosen": -19.720537185668945, "logits/rejected": -18.244367599487305, "logps/chosen": -535.9488525390625, "logps/rejected": -419.7588806152344, "loss": 0.2514, "rewards/accuracies": 1.0, "rewards/chosen": 4.528023719787598, "rewards/margins": 2.128714084625244, "rewards/rejected": 2.3993096351623535, "step": 29730 }, { "epoch": 1.3807511955058267, "grad_norm": 216.9219207763672, "learning_rate": 2.1720228422860854e-07, "logits/chosen": -19.78111457824707, "logits/rejected": -20.113956451416016, "logps/chosen": -455.1988220214844, "logps/rejected": -464.7568359375, "loss": 0.9755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4431777000427246, "rewards/margins": 0.092342808842659, "rewards/rejected": 3.350834608078003, "step": 29740 }, { "epoch": 1.3812154696132597, "grad_norm": 27.026914596557617, "learning_rate": 2.1717442778216258e-07, "logits/chosen": -19.25166893005371, "logits/rejected": -17.737241744995117, "logps/chosen": -325.48822021484375, "logps/rejected": -179.43609619140625, "loss": 0.3568, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.735969066619873, "rewards/margins": 1.7442706823349, "rewards/rejected": 0.9916985630989075, "step": 29750 }, { "epoch": 1.3816797437206927, "grad_norm": 47.191680908203125, "learning_rate": 2.171465713357166e-07, "logits/chosen": -20.638996124267578, "logits/rejected": -19.00017547607422, "logps/chosen": -358.62774658203125, "logps/rejected": -304.03033447265625, "loss": 0.4064, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.082068920135498, "rewards/margins": 1.3045194149017334, "rewards/rejected": 1.7775495052337646, "step": 29760 }, { "epoch": 1.3821440178281257, "grad_norm": 13.447686195373535, "learning_rate": 2.1711871488927064e-07, "logits/chosen": -19.34195327758789, "logits/rejected": -18.049711227416992, "logps/chosen": -405.6257019042969, "logps/rejected": -280.71929931640625, "loss": 0.4161, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.135754585266113, "rewards/margins": 1.7498859167099, "rewards/rejected": 2.385868549346924, "step": 29770 }, { "epoch": 1.3826082919355587, "grad_norm": 29.768840789794922, "learning_rate": 2.1709085844282463e-07, "logits/chosen": -18.408172607421875, "logits/rejected": -18.54257583618164, "logps/chosen": -374.8564147949219, "logps/rejected": -330.84228515625, "loss": 0.8497, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.524855852127075, "rewards/margins": 0.5662590861320496, "rewards/rejected": 2.958596706390381, "step": 29780 }, { "epoch": 1.383072566042992, "grad_norm": 14.007524490356445, "learning_rate": 2.1706300199637864e-07, "logits/chosen": -18.65437126159668, "logits/rejected": -16.945064544677734, "logps/chosen": -349.9344177246094, "logps/rejected": -200.16278076171875, "loss": 0.4293, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.584764242172241, "rewards/margins": 1.4684526920318604, "rewards/rejected": 1.11631178855896, "step": 29790 }, { "epoch": 1.3835368401504249, "grad_norm": 
62.30255889892578, "learning_rate": 2.1703514554993268e-07, "logits/chosen": -19.287704467773438, "logits/rejected": -18.734434127807617, "logps/chosen": -361.31634521484375, "logps/rejected": -279.55267333984375, "loss": 0.4482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0323050022125244, "rewards/margins": 0.8915418386459351, "rewards/rejected": 2.1407628059387207, "step": 29800 }, { "epoch": 1.3840011142578579, "grad_norm": 38.93742370605469, "learning_rate": 2.170072891034867e-07, "logits/chosen": -18.966171264648438, "logits/rejected": -18.413429260253906, "logps/chosen": -469.4644470214844, "logps/rejected": -489.9541931152344, "loss": 0.5768, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.934462070465088, "rewards/margins": 0.8529213070869446, "rewards/rejected": 3.081540584564209, "step": 29810 }, { "epoch": 1.3844653883652909, "grad_norm": 37.825496673583984, "learning_rate": 2.1697943265704068e-07, "logits/chosen": -18.783580780029297, "logits/rejected": -17.944440841674805, "logps/chosen": -415.083251953125, "logps/rejected": -291.3997802734375, "loss": 0.2406, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7747650146484375, "rewards/margins": 2.1561453342437744, "rewards/rejected": 1.6186193227767944, "step": 29820 }, { "epoch": 1.3849296624727239, "grad_norm": 151.9200439453125, "learning_rate": 2.1695157621059472e-07, "logits/chosen": -19.08462142944336, "logits/rejected": -17.964815139770508, "logps/chosen": -375.06549072265625, "logps/rejected": -321.22662353515625, "loss": 0.6085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5468382835388184, "rewards/margins": 1.1991374492645264, "rewards/rejected": 2.347701072692871, "step": 29830 }, { "epoch": 1.3853939365801569, "grad_norm": 144.53184509277344, "learning_rate": 2.1692371976414874e-07, "logits/chosen": -19.22673988342285, "logits/rejected": -19.10248374938965, "logps/chosen": -471.1382751464844, "logps/rejected": -441.19989013671875, 
"loss": 0.7045, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5813727378845215, "rewards/margins": 0.9360165596008301, "rewards/rejected": 2.6453566551208496, "step": 29840 }, { "epoch": 1.3858582106875899, "grad_norm": 3.7224767208099365, "learning_rate": 2.1689586331770278e-07, "logits/chosen": -18.666706085205078, "logits/rejected": -17.310840606689453, "logps/chosen": -474.2411193847656, "logps/rejected": -220.98928833007812, "loss": 0.2807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6799511909484863, "rewards/margins": 2.1609177589416504, "rewards/rejected": 1.5190330743789673, "step": 29850 }, { "epoch": 1.386322484795023, "grad_norm": 60.37713623046875, "learning_rate": 2.1686800687125677e-07, "logits/chosen": -19.66451072692871, "logits/rejected": -18.741365432739258, "logps/chosen": -480.26348876953125, "logps/rejected": -322.80841064453125, "loss": 0.518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.387385606765747, "rewards/margins": 0.9025447964668274, "rewards/rejected": 2.4848408699035645, "step": 29860 }, { "epoch": 1.386786758902456, "grad_norm": 20.419448852539062, "learning_rate": 2.168401504248108e-07, "logits/chosen": -19.861984252929688, "logits/rejected": -18.719655990600586, "logps/chosen": -452.6932678222656, "logps/rejected": -270.37701416015625, "loss": 0.3392, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.751042127609253, "rewards/margins": 1.6968157291412354, "rewards/rejected": 2.0542266368865967, "step": 29870 }, { "epoch": 1.387251033009889, "grad_norm": 241.2776336669922, "learning_rate": 2.1681229397836482e-07, "logits/chosen": -18.585060119628906, "logits/rejected": -17.99932098388672, "logps/chosen": -487.868896484375, "logps/rejected": -408.22137451171875, "loss": 0.4019, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9599545001983643, "rewards/margins": 1.5540404319763184, "rewards/rejected": 2.405914306640625, "step": 29880 }, { "epoch": 
1.387715307117322, "grad_norm": 12.443347930908203, "learning_rate": 2.1678443753191884e-07, "logits/chosen": -18.271053314208984, "logits/rejected": -17.137319564819336, "logps/chosen": -447.4683532714844, "logps/rejected": -301.6834411621094, "loss": 0.6232, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6422131061553955, "rewards/margins": 1.1298775672912598, "rewards/rejected": 2.5123353004455566, "step": 29890 }, { "epoch": 1.388179581224755, "grad_norm": 7.916622638702393, "learning_rate": 2.1675658108547285e-07, "logits/chosen": -17.951492309570312, "logits/rejected": -17.352676391601562, "logps/chosen": -247.9151611328125, "logps/rejected": -178.51461791992188, "loss": 0.2949, "rewards/accuracies": 1.0, "rewards/chosen": 2.5668272972106934, "rewards/margins": 1.4123724699020386, "rewards/rejected": 1.1544544696807861, "step": 29900 }, { "epoch": 1.3886438553321883, "grad_norm": 112.47512817382812, "learning_rate": 2.1672872463902687e-07, "logits/chosen": -18.97573471069336, "logits/rejected": -18.24471664428711, "logps/chosen": -448.868408203125, "logps/rejected": -267.1368713378906, "loss": 0.3999, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.316346645355225, "rewards/margins": 1.7407478094100952, "rewards/rejected": 2.575598955154419, "step": 29910 }, { "epoch": 1.389108129439621, "grad_norm": 11.405890464782715, "learning_rate": 2.167008681925809e-07, "logits/chosen": -19.063617706298828, "logits/rejected": -17.79415512084961, "logps/chosen": -423.66412353515625, "logps/rejected": -325.7701110839844, "loss": 0.5839, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3943984508514404, "rewards/margins": 1.1095603704452515, "rewards/rejected": 2.2848381996154785, "step": 29920 }, { "epoch": 1.3895724035470542, "grad_norm": 3.7606401443481445, "learning_rate": 2.166730117461349e-07, "logits/chosen": -20.09295082092285, "logits/rejected": -19.28870391845703, "logps/chosen": -384.51593017578125, "logps/rejected": 
-291.743896484375, "loss": 0.5299, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.235483169555664, "rewards/margins": 1.04762601852417, "rewards/rejected": 2.187857151031494, "step": 29930 }, { "epoch": 1.3900366776544872, "grad_norm": 30.174436569213867, "learning_rate": 2.166451552996889e-07, "logits/chosen": -19.05132484436035, "logits/rejected": -19.387432098388672, "logps/chosen": -282.75146484375, "logps/rejected": -279.8640441894531, "loss": 0.9869, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.934635877609253, "rewards/margins": -0.3059437572956085, "rewards/rejected": 2.240579843521118, "step": 29940 }, { "epoch": 1.3905009517619202, "grad_norm": 25.665355682373047, "learning_rate": 2.1661729885324295e-07, "logits/chosen": -18.924596786499023, "logits/rejected": -18.183635711669922, "logps/chosen": -327.17974853515625, "logps/rejected": -249.9462127685547, "loss": 0.5353, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.186641216278076, "rewards/margins": 1.3501992225646973, "rewards/rejected": 1.836442232131958, "step": 29950 }, { "epoch": 1.3909652258693532, "grad_norm": 52.16489791870117, "learning_rate": 2.1658944240679697e-07, "logits/chosen": -19.557880401611328, "logits/rejected": -18.801414489746094, "logps/chosen": -654.1825561523438, "logps/rejected": -448.40008544921875, "loss": 0.5048, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.7136125564575195, "rewards/margins": 1.6373252868652344, "rewards/rejected": 3.0762877464294434, "step": 29960 }, { "epoch": 1.3914294999767862, "grad_norm": 98.30146026611328, "learning_rate": 2.1656158596035095e-07, "logits/chosen": -19.327457427978516, "logits/rejected": -18.589786529541016, "logps/chosen": -481.00274658203125, "logps/rejected": -386.2810363769531, "loss": 0.6556, "rewards/accuracies": 0.5, "rewards/chosen": 3.8645107746124268, "rewards/margins": 0.6878716349601746, "rewards/rejected": 3.1766390800476074, "step": 29970 }, { "epoch": 
1.3918937740842194, "grad_norm": 79.9285659790039, "learning_rate": 2.16533729513905e-07, "logits/chosen": -18.288286209106445, "logits/rejected": -17.881511688232422, "logps/chosen": -389.784423828125, "logps/rejected": -322.14227294921875, "loss": 0.8044, "rewards/accuracies": 0.5, "rewards/chosen": 3.4613242149353027, "rewards/margins": 0.37947598099708557, "rewards/rejected": 3.081848621368408, "step": 29980 }, { "epoch": 1.3923580481916524, "grad_norm": 127.48667907714844, "learning_rate": 2.16505873067459e-07, "logits/chosen": -19.249839782714844, "logits/rejected": -18.994657516479492, "logps/chosen": -424.0177307128906, "logps/rejected": -412.29315185546875, "loss": 1.0393, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.690835475921631, "rewards/margins": 0.17280130088329315, "rewards/rejected": 3.5180344581604004, "step": 29990 }, { "epoch": 1.3928223222990854, "grad_norm": 16.137311935424805, "learning_rate": 2.1647801662101305e-07, "logits/chosen": -19.105106353759766, "logits/rejected": -18.644485473632812, "logps/chosen": -425.0948181152344, "logps/rejected": -355.034912109375, "loss": 0.5257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.238217830657959, "rewards/margins": 0.810856819152832, "rewards/rejected": 2.427361011505127, "step": 30000 }, { "epoch": 1.3932865964065184, "grad_norm": 237.29803466796875, "learning_rate": 2.1645016017456704e-07, "logits/chosen": -18.566349029541016, "logits/rejected": -18.541540145874023, "logps/chosen": -470.81500244140625, "logps/rejected": -444.9124450683594, "loss": 0.6631, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7472128868103027, "rewards/margins": 0.42416781187057495, "rewards/rejected": 3.323045015335083, "step": 30010 }, { "epoch": 1.3937508705139514, "grad_norm": 21.350698471069336, "learning_rate": 2.1642230372812108e-07, "logits/chosen": -19.36065673828125, "logits/rejected": -18.453998565673828, "logps/chosen": -483.1890563964844, "logps/rejected": 
-351.0934143066406, "loss": 0.6381, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6960220336914062, "rewards/margins": 0.48663797974586487, "rewards/rejected": 3.209383726119995, "step": 30020 }, { "epoch": 1.3942151446213844, "grad_norm": 11.237133979797363, "learning_rate": 2.163944472816751e-07, "logits/chosen": -19.559024810791016, "logits/rejected": -18.874752044677734, "logps/chosen": -575.4073486328125, "logps/rejected": -496.91717529296875, "loss": 0.3399, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.06679630279541, "rewards/margins": 1.3535988330841064, "rewards/rejected": 3.7131972312927246, "step": 30030 }, { "epoch": 1.3946794187288174, "grad_norm": 9.948872566223145, "learning_rate": 2.1636659083522914e-07, "logits/chosen": -19.146900177001953, "logits/rejected": -18.56302833557129, "logps/chosen": -329.54132080078125, "logps/rejected": -289.7119445800781, "loss": 0.4798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6464245319366455, "rewards/margins": 0.9568520784378052, "rewards/rejected": 1.6895725727081299, "step": 30040 }, { "epoch": 1.3951436928362506, "grad_norm": 93.6950912475586, "learning_rate": 2.1633873438878312e-07, "logits/chosen": -19.589876174926758, "logits/rejected": -19.15996742248535, "logps/chosen": -515.2745971679688, "logps/rejected": -406.37115478515625, "loss": 0.5154, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.743074893951416, "rewards/margins": 0.72612464427948, "rewards/rejected": 3.016950845718384, "step": 30050 }, { "epoch": 1.3956079669436836, "grad_norm": 86.11325073242188, "learning_rate": 2.1631087794233714e-07, "logits/chosen": -18.53494644165039, "logits/rejected": -18.538068771362305, "logps/chosen": -398.68890380859375, "logps/rejected": -319.44073486328125, "loss": 0.3167, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.659053087234497, "rewards/margins": 1.5415773391723633, "rewards/rejected": 2.117475748062134, "step": 30060 }, { 
"epoch": 1.3960722410511166, "grad_norm": 42.36782455444336, "learning_rate": 2.1628302149589118e-07, "logits/chosen": -18.975627899169922, "logits/rejected": -18.524288177490234, "logps/chosen": -382.4299011230469, "logps/rejected": -337.7994079589844, "loss": 0.4971, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1623001098632812, "rewards/margins": 0.8786735534667969, "rewards/rejected": 2.2836265563964844, "step": 30070 }, { "epoch": 1.3965365151585496, "grad_norm": 85.7218017578125, "learning_rate": 2.162551650494452e-07, "logits/chosen": -18.74747657775879, "logits/rejected": -18.311031341552734, "logps/chosen": -474.5835876464844, "logps/rejected": -446.178466796875, "loss": 0.6615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.587785243988037, "rewards/margins": 0.21506543457508087, "rewards/rejected": 3.3727200031280518, "step": 30080 }, { "epoch": 1.3970007892659826, "grad_norm": 1.5988469123840332, "learning_rate": 2.1622730860299918e-07, "logits/chosen": -20.15912628173828, "logits/rejected": -19.097797393798828, "logps/chosen": -343.5076904296875, "logps/rejected": -260.2078552246094, "loss": 0.5977, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5812315940856934, "rewards/margins": 0.7806487083435059, "rewards/rejected": 1.8005831241607666, "step": 30090 }, { "epoch": 1.3974650633734156, "grad_norm": 2.9121642112731934, "learning_rate": 2.1619945215655322e-07, "logits/chosen": -18.9376277923584, "logits/rejected": -18.4240779876709, "logps/chosen": -306.42816162109375, "logps/rejected": -213.95693969726562, "loss": 0.5385, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6989355087280273, "rewards/margins": 0.9692170023918152, "rewards/rejected": 1.729718565940857, "step": 30100 }, { "epoch": 1.3979293374808486, "grad_norm": 58.98095703125, "learning_rate": 2.1617159571010724e-07, "logits/chosen": -19.4155330657959, "logits/rejected": -18.31798553466797, "logps/chosen": -378.1003112792969, 
"logps/rejected": -281.5301208496094, "loss": 0.3107, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.950918674468994, "rewards/margins": 1.8858463764190674, "rewards/rejected": 2.065072536468506, "step": 30110 }, { "epoch": 1.3983936115882818, "grad_norm": 46.478023529052734, "learning_rate": 2.1614373926366125e-07, "logits/chosen": -19.45513343811035, "logits/rejected": -19.01869773864746, "logps/chosen": -482.2958984375, "logps/rejected": -382.83697509765625, "loss": 0.9112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.1670403480529785, "rewards/margins": 0.34156617522239685, "rewards/rejected": 3.825474262237549, "step": 30120 }, { "epoch": 1.3988578856957148, "grad_norm": 11.811318397521973, "learning_rate": 2.1611588281721527e-07, "logits/chosen": -19.053869247436523, "logits/rejected": -18.31026268005371, "logps/chosen": -388.1091613769531, "logps/rejected": -339.0503845214844, "loss": 0.8796, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8436496257781982, "rewards/margins": 0.12387460470199585, "rewards/rejected": 2.7197749614715576, "step": 30130 }, { "epoch": 1.3993221598031478, "grad_norm": 77.0843734741211, "learning_rate": 2.1608802637076928e-07, "logits/chosen": -19.582307815551758, "logits/rejected": -18.884883880615234, "logps/chosen": -424.7140197753906, "logps/rejected": -340.05487060546875, "loss": 0.4675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6347572803497314, "rewards/margins": 0.9956742525100708, "rewards/rejected": 1.639082908630371, "step": 30140 }, { "epoch": 1.3997864339105808, "grad_norm": 144.8376007080078, "learning_rate": 2.1606016992432332e-07, "logits/chosen": -19.304166793823242, "logits/rejected": -18.192474365234375, "logps/chosen": -481.79608154296875, "logps/rejected": -402.29412841796875, "loss": 0.4583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.103021621704102, "rewards/margins": 1.4632513523101807, "rewards/rejected": 2.639770269393921, 
"step": 30150 }, { "epoch": 1.4002507080180138, "grad_norm": 12.279963493347168, "learning_rate": 2.160323134778773e-07, "logits/chosen": -19.42600440979004, "logits/rejected": -18.378307342529297, "logps/chosen": -305.8888854980469, "logps/rejected": -258.9755859375, "loss": 0.4877, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.900524377822876, "rewards/margins": 0.9589322805404663, "rewards/rejected": 1.9415918588638306, "step": 30160 }, { "epoch": 1.400714982125447, "grad_norm": 56.0628662109375, "learning_rate": 2.1600445703143135e-07, "logits/chosen": -18.87811851501465, "logits/rejected": -18.585906982421875, "logps/chosen": -379.08673095703125, "logps/rejected": -336.75323486328125, "loss": 0.6958, "rewards/accuracies": 0.5, "rewards/chosen": 3.129654884338379, "rewards/margins": 0.29996877908706665, "rewards/rejected": 2.829686403274536, "step": 30170 }, { "epoch": 1.4011792562328798, "grad_norm": 14.179998397827148, "learning_rate": 2.1597660058498537e-07, "logits/chosen": -19.60101890563965, "logits/rejected": -19.168581008911133, "logps/chosen": -453.45166015625, "logps/rejected": -378.76959228515625, "loss": 0.5296, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9612789154052734, "rewards/margins": 1.0625327825546265, "rewards/rejected": 2.8987460136413574, "step": 30180 }, { "epoch": 1.401643530340313, "grad_norm": 94.59288024902344, "learning_rate": 2.159487441385394e-07, "logits/chosen": -18.62833023071289, "logits/rejected": -17.835308074951172, "logps/chosen": -481.1324768066406, "logps/rejected": -355.89068603515625, "loss": 0.6692, "rewards/accuracies": 0.5, "rewards/chosen": 2.770813465118408, "rewards/margins": 0.5650327801704407, "rewards/rejected": 2.2057807445526123, "step": 30190 }, { "epoch": 1.402107804447746, "grad_norm": 273.76312255859375, "learning_rate": 2.159208876920934e-07, "logits/chosen": -18.385112762451172, "logits/rejected": -18.233842849731445, "logps/chosen": -244.95455932617188, 
"logps/rejected": -326.41925048828125, "loss": 1.0777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1771860122680664, "rewards/margins": 0.4376205801963806, "rewards/rejected": 1.7395654916763306, "step": 30200 }, { "epoch": 1.402572078555179, "grad_norm": 73.00267028808594, "learning_rate": 2.158930312456474e-07, "logits/chosen": -19.275440216064453, "logits/rejected": -18.65443229675293, "logps/chosen": -416.37200927734375, "logps/rejected": -351.2878723144531, "loss": 0.737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.497600555419922, "rewards/margins": 0.529503345489502, "rewards/rejected": 2.96809720993042, "step": 30210 }, { "epoch": 1.403036352662612, "grad_norm": 26.774595260620117, "learning_rate": 2.1586517479920145e-07, "logits/chosen": -20.125410079956055, "logits/rejected": -18.36282730102539, "logps/chosen": -384.41363525390625, "logps/rejected": -269.8265380859375, "loss": 0.5099, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9730255603790283, "rewards/margins": 1.0806033611297607, "rewards/rejected": 1.892422080039978, "step": 30220 }, { "epoch": 1.403500626770045, "grad_norm": 3.418604612350464, "learning_rate": 2.1583731835275546e-07, "logits/chosen": -18.0338134765625, "logits/rejected": -17.230831146240234, "logps/chosen": -290.20263671875, "logps/rejected": -236.13003540039062, "loss": 1.3471, "rewards/accuracies": 0.5, "rewards/chosen": 2.2579638957977295, "rewards/margins": 0.0032348395325243473, "rewards/rejected": 2.2547292709350586, "step": 30230 }, { "epoch": 1.4039649008774782, "grad_norm": 4.536510467529297, "learning_rate": 2.1580946190630945e-07, "logits/chosen": -19.277843475341797, "logits/rejected": -17.874469757080078, "logps/chosen": -473.6182556152344, "logps/rejected": -348.76153564453125, "loss": 0.5484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.421899795532227, "rewards/margins": 1.2237402200698853, "rewards/rejected": 3.198159694671631, "step": 30240 }, { 
"epoch": 1.4044291749849112, "grad_norm": 43.320701599121094, "learning_rate": 2.157816054598635e-07, "logits/chosen": -18.51997947692871, "logits/rejected": -18.00876235961914, "logps/chosen": -408.54046630859375, "logps/rejected": -315.5846862792969, "loss": 0.873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2799313068389893, "rewards/margins": 0.4536685049533844, "rewards/rejected": 2.826262950897217, "step": 30250 }, { "epoch": 1.4048934490923441, "grad_norm": 161.53184509277344, "learning_rate": 2.157537490134175e-07, "logits/chosen": -18.461938858032227, "logits/rejected": -18.536827087402344, "logps/chosen": -409.905517578125, "logps/rejected": -388.7012939453125, "loss": 0.7821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.225792646408081, "rewards/margins": 0.09831857681274414, "rewards/rejected": 3.127474069595337, "step": 30260 }, { "epoch": 1.4053577231997771, "grad_norm": 121.23445892333984, "learning_rate": 2.1572589256697155e-07, "logits/chosen": -18.841230392456055, "logits/rejected": -17.830177307128906, "logps/chosen": -351.96868896484375, "logps/rejected": -283.4669189453125, "loss": 0.592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5695853233337402, "rewards/margins": 1.1254349946975708, "rewards/rejected": 2.444150447845459, "step": 30270 }, { "epoch": 1.4058219973072101, "grad_norm": 116.5450668334961, "learning_rate": 2.1569803612052554e-07, "logits/chosen": -18.491941452026367, "logits/rejected": -18.735328674316406, "logps/chosen": -423.50628662109375, "logps/rejected": -444.4085388183594, "loss": 0.5546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7478713989257812, "rewards/margins": 0.431566059589386, "rewards/rejected": 2.31630539894104, "step": 30280 }, { "epoch": 1.4062862714146431, "grad_norm": 136.8144989013672, "learning_rate": 2.1567017967407958e-07, "logits/chosen": -18.48670196533203, "logits/rejected": -18.473712921142578, "logps/chosen": -407.34100341796875, 
"logps/rejected": -448.1941833496094, "loss": 1.0134, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.033433198928833, "rewards/margins": -0.15816381573677063, "rewards/rejected": 3.1915972232818604, "step": 30290 }, { "epoch": 1.4067505455220761, "grad_norm": 107.54627227783203, "learning_rate": 2.156423232276336e-07, "logits/chosen": -19.312097549438477, "logits/rejected": -18.636707305908203, "logps/chosen": -418.84124755859375, "logps/rejected": -337.72418212890625, "loss": 0.5212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4612534046173096, "rewards/margins": 0.6540559530258179, "rewards/rejected": 2.807197332382202, "step": 30300 }, { "epoch": 1.4072148196295093, "grad_norm": 81.1791000366211, "learning_rate": 2.1561446678118758e-07, "logits/chosen": -18.554445266723633, "logits/rejected": -17.487037658691406, "logps/chosen": -416.1539611816406, "logps/rejected": -270.814208984375, "loss": 0.3279, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9739744663238525, "rewards/margins": 1.4173383712768555, "rewards/rejected": 1.5566362142562866, "step": 30310 }, { "epoch": 1.4076790937369423, "grad_norm": 173.60137939453125, "learning_rate": 2.1558661033474162e-07, "logits/chosen": -19.937362670898438, "logits/rejected": -19.161468505859375, "logps/chosen": -393.92352294921875, "logps/rejected": -415.8291015625, "loss": 0.6393, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.10050368309021, "rewards/margins": 0.4581625461578369, "rewards/rejected": 2.642341136932373, "step": 30320 }, { "epoch": 1.4081433678443753, "grad_norm": 8.81104564666748, "learning_rate": 2.1555875388829564e-07, "logits/chosen": -19.172361373901367, "logits/rejected": -17.297595977783203, "logps/chosen": -444.8700256347656, "logps/rejected": -327.0988464355469, "loss": 0.2956, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.449258804321289, "rewards/margins": 1.3299466371536255, "rewards/rejected": 2.119311809539795, 
"step": 30330 }, { "epoch": 1.4086076419518083, "grad_norm": 120.92118072509766, "learning_rate": 2.1553089744184968e-07, "logits/chosen": -19.216650009155273, "logits/rejected": -18.840633392333984, "logps/chosen": -346.8013000488281, "logps/rejected": -309.8416748046875, "loss": 0.8814, "rewards/accuracies": 0.5, "rewards/chosen": 2.7781031131744385, "rewards/margins": 0.190212219953537, "rewards/rejected": 2.587891101837158, "step": 30340 }, { "epoch": 1.4090719160592413, "grad_norm": 139.32464599609375, "learning_rate": 2.1550304099540367e-07, "logits/chosen": -19.571197509765625, "logits/rejected": -17.364099502563477, "logps/chosen": -510.52496337890625, "logps/rejected": -244.7283172607422, "loss": 0.2194, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.381070613861084, "rewards/margins": 2.659907102584839, "rewards/rejected": 1.7211635112762451, "step": 30350 }, { "epoch": 1.4095361901666745, "grad_norm": 1.3637388944625854, "learning_rate": 2.1547518454895768e-07, "logits/chosen": -19.065597534179688, "logits/rejected": -18.45979881286621, "logps/chosen": -515.509033203125, "logps/rejected": -422.90277099609375, "loss": 0.4963, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.026388645172119, "rewards/margins": 1.0922739505767822, "rewards/rejected": 2.934114933013916, "step": 30360 }, { "epoch": 1.4100004642741073, "grad_norm": 112.6546630859375, "learning_rate": 2.1544732810251172e-07, "logits/chosen": -19.775026321411133, "logits/rejected": -19.928943634033203, "logps/chosen": -385.7771911621094, "logps/rejected": -370.84893798828125, "loss": 0.7918, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.7414519786834717, "rewards/margins": -0.03299137204885483, "rewards/rejected": 2.7744433879852295, "step": 30370 }, { "epoch": 1.4104647383815405, "grad_norm": 25.055992126464844, "learning_rate": 2.1541947165606574e-07, "logits/chosen": -19.10369873046875, "logits/rejected": -18.80929946899414, "logps/chosen": 
-358.08648681640625, "logps/rejected": -301.8377685546875, "loss": 0.5755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.175405740737915, "rewards/margins": 0.8129722476005554, "rewards/rejected": 2.362433433532715, "step": 30380 }, { "epoch": 1.4109290124889735, "grad_norm": 50.44813919067383, "learning_rate": 2.1539161520961972e-07, "logits/chosen": -18.82535171508789, "logits/rejected": -17.306062698364258, "logps/chosen": -453.3929748535156, "logps/rejected": -277.6980895996094, "loss": 0.2933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.541496753692627, "rewards/margins": 1.9404217004776, "rewards/rejected": 1.6010745763778687, "step": 30390 }, { "epoch": 1.4113932865964065, "grad_norm": 2.9309911727905273, "learning_rate": 2.1536375876317376e-07, "logits/chosen": -18.762897491455078, "logits/rejected": -18.040781021118164, "logps/chosen": -472.2914123535156, "logps/rejected": -436.743408203125, "loss": 0.9301, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.273608207702637, "rewards/margins": 0.4957260191440582, "rewards/rejected": 3.7778820991516113, "step": 30400 }, { "epoch": 1.4118575607038395, "grad_norm": 4.600278377532959, "learning_rate": 2.1533590231672778e-07, "logits/chosen": -18.168636322021484, "logits/rejected": -18.09583282470703, "logps/chosen": -441.2406311035156, "logps/rejected": -409.83099365234375, "loss": 0.876, "rewards/accuracies": 0.5, "rewards/chosen": 3.256448745727539, "rewards/margins": 0.7713836431503296, "rewards/rejected": 2.485065221786499, "step": 30410 }, { "epoch": 1.4123218348112725, "grad_norm": 41.79450225830078, "learning_rate": 2.1530804587028182e-07, "logits/chosen": -19.086727142333984, "logits/rejected": -18.303234100341797, "logps/chosen": -372.8151550292969, "logps/rejected": -332.64508056640625, "loss": 0.7604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.095141887664795, "rewards/margins": 0.5879902243614197, "rewards/rejected": 2.5071518421173096, 
"step": 30420 }, { "epoch": 1.4127861089187057, "grad_norm": 40.9100341796875, "learning_rate": 2.152801894238358e-07, "logits/chosen": -19.786853790283203, "logits/rejected": -18.665502548217773, "logps/chosen": -494.7361755371094, "logps/rejected": -389.93646240234375, "loss": 0.4939, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.886404037475586, "rewards/margins": 0.7617363929748535, "rewards/rejected": 3.1246676445007324, "step": 30430 }, { "epoch": 1.4132503830261387, "grad_norm": 29.04396629333496, "learning_rate": 2.1525233297738985e-07, "logits/chosen": -18.18996810913086, "logits/rejected": -16.505985260009766, "logps/chosen": -384.51422119140625, "logps/rejected": -221.7262725830078, "loss": 0.2979, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1628525257110596, "rewards/margins": 2.0878894329071045, "rewards/rejected": 1.074962854385376, "step": 30440 }, { "epoch": 1.4137146571335717, "grad_norm": 101.08829498291016, "learning_rate": 2.1522447653094386e-07, "logits/chosen": -20.86639404296875, "logits/rejected": -18.12090492248535, "logps/chosen": -563.90478515625, "logps/rejected": -338.28045654296875, "loss": 0.4269, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.303948402404785, "rewards/margins": 1.8795521259307861, "rewards/rejected": 2.42439603805542, "step": 30450 }, { "epoch": 1.4141789312410047, "grad_norm": 89.09854888916016, "learning_rate": 2.151966200844979e-07, "logits/chosen": -18.869609832763672, "logits/rejected": -16.92403793334961, "logps/chosen": -407.6612854003906, "logps/rejected": -343.4502868652344, "loss": 0.8079, "rewards/accuracies": 0.5, "rewards/chosen": 3.349086284637451, "rewards/margins": 1.0158292055130005, "rewards/rejected": 2.3332574367523193, "step": 30460 }, { "epoch": 1.4146432053484377, "grad_norm": 30.458316802978516, "learning_rate": 2.151687636380519e-07, "logits/chosen": -18.897396087646484, "logits/rejected": -19.169034957885742, "logps/chosen": -350.79144287109375, 
"logps/rejected": -320.25762939453125, "loss": 0.8765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.505530595779419, "rewards/margins": 0.1632152795791626, "rewards/rejected": 3.342315673828125, "step": 30470 }, { "epoch": 1.4151074794558707, "grad_norm": 54.561561584472656, "learning_rate": 2.151409071916059e-07, "logits/chosen": -19.133441925048828, "logits/rejected": -18.672870635986328, "logps/chosen": -405.4425354003906, "logps/rejected": -396.0234680175781, "loss": 0.6501, "rewards/accuracies": 0.5, "rewards/chosen": 3.4083964824676514, "rewards/margins": 0.220180481672287, "rewards/rejected": 3.188216209411621, "step": 30480 }, { "epoch": 1.4155717535633037, "grad_norm": 36.74042510986328, "learning_rate": 2.1511305074515995e-07, "logits/chosen": -19.017324447631836, "logits/rejected": -18.96945571899414, "logps/chosen": -464.15106201171875, "logps/rejected": -353.16864013671875, "loss": 1.0747, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2085347175598145, "rewards/margins": 0.25610584020614624, "rewards/rejected": 2.9524285793304443, "step": 30490 }, { "epoch": 1.4160360276707369, "grad_norm": 289.79302978515625, "learning_rate": 2.1508519429871394e-07, "logits/chosen": -19.230892181396484, "logits/rejected": -19.233840942382812, "logps/chosen": -396.79998779296875, "logps/rejected": -383.0151062011719, "loss": 1.2908, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.700287103652954, "rewards/margins": -0.44653430581092834, "rewards/rejected": 3.1468214988708496, "step": 30500 }, { "epoch": 1.4165003017781699, "grad_norm": 54.69047546386719, "learning_rate": 2.1505733785226795e-07, "logits/chosen": -19.502620697021484, "logits/rejected": -18.744495391845703, "logps/chosen": -436.73992919921875, "logps/rejected": -342.57452392578125, "loss": 0.6948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5249881744384766, "rewards/margins": 0.7174922823905945, "rewards/rejected": 2.8074960708618164, 
"step": 30510 }, { "epoch": 1.4169645758856029, "grad_norm": 38.53491973876953, "learning_rate": 2.15029481405822e-07, "logits/chosen": -18.852018356323242, "logits/rejected": -17.85368537902832, "logps/chosen": -325.4930114746094, "logps/rejected": -285.13616943359375, "loss": 0.4868, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7550644874572754, "rewards/margins": 1.597496747970581, "rewards/rejected": 1.1575677394866943, "step": 30520 }, { "epoch": 1.4174288499930359, "grad_norm": 33.541255950927734, "learning_rate": 2.15001624959376e-07, "logits/chosen": -19.121450424194336, "logits/rejected": -18.433948516845703, "logps/chosen": -486.16180419921875, "logps/rejected": -360.6485900878906, "loss": 0.3991, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6281025409698486, "rewards/margins": 1.1961947679519653, "rewards/rejected": 2.431908130645752, "step": 30530 }, { "epoch": 1.4178931241004689, "grad_norm": 189.54942321777344, "learning_rate": 2.1497376851293002e-07, "logits/chosen": -18.438770294189453, "logits/rejected": -18.816926956176758, "logps/chosen": -407.88409423828125, "logps/rejected": -402.49969482421875, "loss": 1.1326, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.185976505279541, "rewards/margins": -0.16913601756095886, "rewards/rejected": 3.3551125526428223, "step": 30540 }, { "epoch": 1.418357398207902, "grad_norm": 50.431522369384766, "learning_rate": 2.1494591206648404e-07, "logits/chosen": -20.00625228881836, "logits/rejected": -18.368505477905273, "logps/chosen": -446.4697265625, "logps/rejected": -319.592529296875, "loss": 0.5482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3426895141601562, "rewards/margins": 1.057180643081665, "rewards/rejected": 2.2855091094970703, "step": 30550 }, { "epoch": 1.4188216723153348, "grad_norm": 118.71295928955078, "learning_rate": 2.1491805562003805e-07, "logits/chosen": -18.70086097717285, "logits/rejected": -17.254154205322266, "logps/chosen": 
-405.9654541015625, "logps/rejected": -294.88006591796875, "loss": 0.5746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2527682781219482, "rewards/margins": 0.83241206407547, "rewards/rejected": 2.420356273651123, "step": 30560 }, { "epoch": 1.419285946422768, "grad_norm": 18.67329978942871, "learning_rate": 2.148901991735921e-07, "logits/chosen": -19.537242889404297, "logits/rejected": -18.374706268310547, "logps/chosen": -334.4112243652344, "logps/rejected": -268.7576904296875, "loss": 0.7905, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8478498458862305, "rewards/margins": 0.6284763813018799, "rewards/rejected": 2.2193734645843506, "step": 30570 }, { "epoch": 1.419750220530201, "grad_norm": 16.742210388183594, "learning_rate": 2.1486234272714608e-07, "logits/chosen": -18.610576629638672, "logits/rejected": -18.134510040283203, "logps/chosen": -551.4684448242188, "logps/rejected": -399.18865966796875, "loss": 0.5935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4938607215881348, "rewards/margins": 0.7525818943977356, "rewards/rejected": 2.7412784099578857, "step": 30580 }, { "epoch": 1.420214494637634, "grad_norm": 16.58062744140625, "learning_rate": 2.1483448628070012e-07, "logits/chosen": -19.02858543395996, "logits/rejected": -18.27298355102539, "logps/chosen": -400.31036376953125, "logps/rejected": -362.6719055175781, "loss": 0.5499, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2576937675476074, "rewards/margins": 0.6232720613479614, "rewards/rejected": 2.6344218254089355, "step": 30590 }, { "epoch": 1.420678768745067, "grad_norm": 188.33358764648438, "learning_rate": 2.1480662983425413e-07, "logits/chosen": -19.68341636657715, "logits/rejected": -18.84290885925293, "logps/chosen": -448.6011657714844, "logps/rejected": -380.1344299316406, "loss": 0.7097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.69840931892395, "rewards/margins": 0.89063560962677, "rewards/rejected": 
2.8077733516693115, "step": 30600 }, { "epoch": 1.4211430428525, "grad_norm": 39.55568313598633, "learning_rate": 2.1477877338780818e-07, "logits/chosen": -19.167531967163086, "logits/rejected": -18.67820930480957, "logps/chosen": -385.8390197753906, "logps/rejected": -291.5157470703125, "loss": 0.7216, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3705527782440186, "rewards/margins": 0.36907631158828735, "rewards/rejected": 3.001476287841797, "step": 30610 }, { "epoch": 1.4216073169599333, "grad_norm": 188.00811767578125, "learning_rate": 2.1475091694136216e-07, "logits/chosen": -18.967884063720703, "logits/rejected": -18.938512802124023, "logps/chosen": -391.47296142578125, "logps/rejected": -395.44024658203125, "loss": 1.2856, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0370640754699707, "rewards/margins": -0.23041772842407227, "rewards/rejected": 3.2674813270568848, "step": 30620 }, { "epoch": 1.4220715910673662, "grad_norm": 57.22439956665039, "learning_rate": 2.1472306049491618e-07, "logits/chosen": -18.085302352905273, "logits/rejected": -18.184091567993164, "logps/chosen": -352.548583984375, "logps/rejected": -331.7186584472656, "loss": 0.8823, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.34139084815979, "rewards/margins": 0.15016044676303864, "rewards/rejected": 2.191230297088623, "step": 30630 }, { "epoch": 1.4225358651747992, "grad_norm": 152.47866821289062, "learning_rate": 2.1469520404847022e-07, "logits/chosen": -19.737079620361328, "logits/rejected": -19.00112533569336, "logps/chosen": -566.1017456054688, "logps/rejected": -440.615234375, "loss": 0.6008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.1572370529174805, "rewards/margins": 0.5590494275093079, "rewards/rejected": 3.598187208175659, "step": 30640 }, { "epoch": 1.4230001392822322, "grad_norm": 30.682723999023438, "learning_rate": 2.1466734760202423e-07, "logits/chosen": -19.353307723999023, "logits/rejected": 
-18.050094604492188, "logps/chosen": -503.51507568359375, "logps/rejected": -417.93890380859375, "loss": 0.5955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.396381378173828, "rewards/margins": 0.42755383253097534, "rewards/rejected": 2.968827486038208, "step": 30650 }, { "epoch": 1.4234644133896652, "grad_norm": 311.8100280761719, "learning_rate": 2.1463949115557822e-07, "logits/chosen": -18.360885620117188, "logits/rejected": -17.15652084350586, "logps/chosen": -337.0533142089844, "logps/rejected": -284.9842529296875, "loss": 0.8989, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.037682056427002, "rewards/margins": 0.5466169118881226, "rewards/rejected": 2.491065502166748, "step": 30660 }, { "epoch": 1.4239286874970982, "grad_norm": 55.28303527832031, "learning_rate": 2.1461163470913226e-07, "logits/chosen": -18.666038513183594, "logits/rejected": -18.425033569335938, "logps/chosen": -361.08795166015625, "logps/rejected": -323.78802490234375, "loss": 0.6103, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6543679237365723, "rewards/margins": 0.5325452089309692, "rewards/rejected": 2.1218228340148926, "step": 30670 }, { "epoch": 1.4243929616045312, "grad_norm": 3.224905490875244, "learning_rate": 2.1458377826268628e-07, "logits/chosen": -18.763042449951172, "logits/rejected": -17.904634475708008, "logps/chosen": -490.62945556640625, "logps/rejected": -364.50775146484375, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.597586154937744, "rewards/margins": 1.2828137874603271, "rewards/rejected": 2.314772129058838, "step": 30680 }, { "epoch": 1.4248572357119644, "grad_norm": 97.25707244873047, "learning_rate": 2.1455592181624032e-07, "logits/chosen": -19.46441078186035, "logits/rejected": -18.694454193115234, "logps/chosen": -362.5240478515625, "logps/rejected": -304.1510009765625, "loss": 0.6751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4582858085632324, "rewards/margins": 
0.2508373260498047, "rewards/rejected": 2.2074484825134277, "step": 30690 }, { "epoch": 1.4253215098193974, "grad_norm": 38.27247619628906, "learning_rate": 2.145280653697943e-07, "logits/chosen": -18.24656105041504, "logits/rejected": -17.57164192199707, "logps/chosen": -315.0071105957031, "logps/rejected": -238.6331329345703, "loss": 0.4225, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4300405979156494, "rewards/margins": 0.8538557291030884, "rewards/rejected": 1.576184630393982, "step": 30700 }, { "epoch": 1.4257857839268304, "grad_norm": 15.19472885131836, "learning_rate": 2.1450020892334835e-07, "logits/chosen": -19.577133178710938, "logits/rejected": -17.998149871826172, "logps/chosen": -366.17913818359375, "logps/rejected": -261.64434814453125, "loss": 0.3611, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2828369140625, "rewards/margins": 1.3314764499664307, "rewards/rejected": 1.9513603448867798, "step": 30710 }, { "epoch": 1.4262500580342634, "grad_norm": 62.08136749267578, "learning_rate": 2.1447235247690236e-07, "logits/chosen": -18.893627166748047, "logits/rejected": -17.729358673095703, "logps/chosen": -404.19036865234375, "logps/rejected": -239.53628540039062, "loss": 0.4455, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.032423973083496, "rewards/margins": 1.3937116861343384, "rewards/rejected": 2.6387126445770264, "step": 30720 }, { "epoch": 1.4267143321416964, "grad_norm": 5.762147903442383, "learning_rate": 2.1444449603045635e-07, "logits/chosen": -19.52786636352539, "logits/rejected": -19.477706909179688, "logps/chosen": -448.7197265625, "logps/rejected": -338.1002197265625, "loss": 0.6122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.202441215515137, "rewards/margins": 1.1722503900527954, "rewards/rejected": 3.030190944671631, "step": 30730 }, { "epoch": 1.4271786062491296, "grad_norm": 163.32713317871094, "learning_rate": 2.144166395840104e-07, "logits/chosen": -18.237781524658203, 
"logits/rejected": -17.529434204101562, "logps/chosen": -317.45953369140625, "logps/rejected": -247.33773803710938, "loss": 0.961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7853004932403564, "rewards/margins": 0.34790557622909546, "rewards/rejected": 2.437394618988037, "step": 30740 }, { "epoch": 1.4276428803565624, "grad_norm": 176.49801635742188, "learning_rate": 2.143887831375644e-07, "logits/chosen": -18.59865951538086, "logits/rejected": -18.612462997436523, "logps/chosen": -350.3814392089844, "logps/rejected": -386.9535217285156, "loss": 1.0827, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.500220537185669, "rewards/margins": -0.19320285320281982, "rewards/rejected": 2.693423271179199, "step": 30750 }, { "epoch": 1.4281071544639956, "grad_norm": 92.27560424804688, "learning_rate": 2.1436092669111845e-07, "logits/chosen": -19.08095932006836, "logits/rejected": -19.0074405670166, "logps/chosen": -374.78302001953125, "logps/rejected": -390.9848937988281, "loss": 0.6997, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.814563035964966, "rewards/margins": 0.3354368209838867, "rewards/rejected": 2.479126214981079, "step": 30760 }, { "epoch": 1.4285714285714286, "grad_norm": 2.4592647552490234, "learning_rate": 2.1433307024467244e-07, "logits/chosen": -19.552953720092773, "logits/rejected": -18.48456382751465, "logps/chosen": -377.6977233886719, "logps/rejected": -270.7965393066406, "loss": 0.6551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6100642681121826, "rewards/margins": 1.2475498914718628, "rewards/rejected": 2.3625142574310303, "step": 30770 }, { "epoch": 1.4290357026788616, "grad_norm": 15.11758041381836, "learning_rate": 2.1430521379822645e-07, "logits/chosen": -19.027345657348633, "logits/rejected": -18.216442108154297, "logps/chosen": -391.05291748046875, "logps/rejected": -321.17266845703125, "loss": 0.6762, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.19769024848938, 
"rewards/margins": 0.8421894907951355, "rewards/rejected": 2.3555006980895996, "step": 30780 }, { "epoch": 1.4294999767862946, "grad_norm": 66.84606170654297, "learning_rate": 2.142773573517805e-07, "logits/chosen": -17.594390869140625, "logits/rejected": -18.120708465576172, "logps/chosen": -295.8765563964844, "logps/rejected": -346.28167724609375, "loss": 1.0671, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.9165012836456299, "rewards/margins": -0.2899152338504791, "rewards/rejected": 2.206416368484497, "step": 30790 }, { "epoch": 1.4299642508937276, "grad_norm": 36.36235046386719, "learning_rate": 2.142495009053345e-07, "logits/chosen": -19.43178367614746, "logits/rejected": -19.41025733947754, "logps/chosen": -485.1173400878906, "logps/rejected": -388.6316833496094, "loss": 0.6578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.014115333557129, "rewards/margins": 0.9510852098464966, "rewards/rejected": 3.0630297660827637, "step": 30800 }, { "epoch": 1.4304285250011608, "grad_norm": 39.63438034057617, "learning_rate": 2.142216444588885e-07, "logits/chosen": -20.251216888427734, "logits/rejected": -19.979305267333984, "logps/chosen": -406.12603759765625, "logps/rejected": -458.48187255859375, "loss": 0.9292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.392211437225342, "rewards/margins": -0.0859391912817955, "rewards/rejected": 3.4781508445739746, "step": 30810 }, { "epoch": 1.4308927991085938, "grad_norm": 71.96223449707031, "learning_rate": 2.1419378801244253e-07, "logits/chosen": -19.389984130859375, "logits/rejected": -18.466228485107422, "logps/chosen": -346.25628662109375, "logps/rejected": -323.39404296875, "loss": 0.5918, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4682819843292236, "rewards/margins": 0.8073173761367798, "rewards/rejected": 1.6609643697738647, "step": 30820 }, { "epoch": 1.4313570732160268, "grad_norm": 72.9444351196289, "learning_rate": 2.1416593156599655e-07, 
"logits/chosen": -18.975339889526367, "logits/rejected": -18.427284240722656, "logps/chosen": -355.695068359375, "logps/rejected": -339.4527587890625, "loss": 0.5671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0158305168151855, "rewards/margins": 0.5016501545906067, "rewards/rejected": 2.5141801834106445, "step": 30830 }, { "epoch": 1.4318213473234598, "grad_norm": 1.1828664541244507, "learning_rate": 2.141380751195506e-07, "logits/chosen": -19.55453109741211, "logits/rejected": -18.923480987548828, "logps/chosen": -442.84246826171875, "logps/rejected": -376.7352600097656, "loss": 0.4212, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8251564502716064, "rewards/margins": 1.162549614906311, "rewards/rejected": 2.662606716156006, "step": 30840 }, { "epoch": 1.4322856214308928, "grad_norm": 55.8885612487793, "learning_rate": 2.1411021867310458e-07, "logits/chosen": -19.283418655395508, "logits/rejected": -18.080074310302734, "logps/chosen": -436.0279846191406, "logps/rejected": -306.58502197265625, "loss": 0.4574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4272003173828125, "rewards/margins": 1.221531867980957, "rewards/rejected": 2.2056684494018555, "step": 30850 }, { "epoch": 1.4327498955383258, "grad_norm": 101.94696807861328, "learning_rate": 2.1408236222665862e-07, "logits/chosen": -19.514299392700195, "logits/rejected": -19.04006576538086, "logps/chosen": -445.131591796875, "logps/rejected": -443.52935791015625, "loss": 1.2353, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.279449462890625, "rewards/margins": 0.08841594308614731, "rewards/rejected": 3.191033124923706, "step": 30860 }, { "epoch": 1.4332141696457588, "grad_norm": 18.121116638183594, "learning_rate": 2.1405450578021263e-07, "logits/chosen": -18.910612106323242, "logits/rejected": -17.781198501586914, "logps/chosen": -432.9476623535156, "logps/rejected": -303.2439880371094, "loss": 0.326, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 3.7476394176483154, "rewards/margins": 1.5094051361083984, "rewards/rejected": 2.238234043121338, "step": 30870 }, { "epoch": 1.433678443753192, "grad_norm": 131.35519409179688, "learning_rate": 2.1402664933376667e-07, "logits/chosen": -18.386676788330078, "logits/rejected": -17.825910568237305, "logps/chosen": -447.07843017578125, "logps/rejected": -395.8018798828125, "loss": 0.9331, "rewards/accuracies": 0.5, "rewards/chosen": 3.4749228954315186, "rewards/margins": -0.02101854607462883, "rewards/rejected": 3.495941162109375, "step": 30880 }, { "epoch": 1.434142717860625, "grad_norm": 102.2863998413086, "learning_rate": 2.1399879288732066e-07, "logits/chosen": -17.755359649658203, "logits/rejected": -17.92082977294922, "logps/chosen": -279.90045166015625, "logps/rejected": -336.8531494140625, "loss": 0.7471, "rewards/accuracies": 0.5, "rewards/chosen": 2.6435744762420654, "rewards/margins": 0.3951590955257416, "rewards/rejected": 2.248415470123291, "step": 30890 }, { "epoch": 1.434606991968058, "grad_norm": 148.34890747070312, "learning_rate": 2.1397093644087468e-07, "logits/chosen": -19.613910675048828, "logits/rejected": -19.155107498168945, "logps/chosen": -448.86212158203125, "logps/rejected": -387.27984619140625, "loss": 0.7235, "rewards/accuracies": 0.5, "rewards/chosen": 3.455962657928467, "rewards/margins": 0.515151858329773, "rewards/rejected": 2.9408109188079834, "step": 30900 }, { "epoch": 1.435071266075491, "grad_norm": 120.22882080078125, "learning_rate": 2.1394307999442872e-07, "logits/chosen": -19.561931610107422, "logits/rejected": -18.725202560424805, "logps/chosen": -385.38909912109375, "logps/rejected": -326.765625, "loss": 0.62, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.965379476547241, "rewards/margins": 0.7834299206733704, "rewards/rejected": 2.1819498538970947, "step": 30910 }, { "epoch": 1.435535540182924, "grad_norm": 6.653685092926025, "learning_rate": 2.139152235479827e-07, "logits/chosen": 
-18.177249908447266, "logits/rejected": -17.06012535095215, "logps/chosen": -395.53326416015625, "logps/rejected": -282.26641845703125, "loss": 0.5467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.93149995803833, "rewards/margins": 0.796520471572876, "rewards/rejected": 2.134979724884033, "step": 30920 }, { "epoch": 1.435999814290357, "grad_norm": 0.3582251965999603, "learning_rate": 2.1388736710153672e-07, "logits/chosen": -19.175294876098633, "logits/rejected": -18.116052627563477, "logps/chosen": -467.016357421875, "logps/rejected": -363.0018615722656, "loss": 0.3661, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.237278461456299, "rewards/margins": 1.9235175848007202, "rewards/rejected": 2.313760995864868, "step": 30930 }, { "epoch": 1.43646408839779, "grad_norm": 0.4079313278198242, "learning_rate": 2.1385951065509076e-07, "logits/chosen": -18.9514217376709, "logits/rejected": -18.605663299560547, "logps/chosen": -393.9719543457031, "logps/rejected": -291.7840270996094, "loss": 0.7007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2257766723632812, "rewards/margins": 1.0650814771652222, "rewards/rejected": 2.1606950759887695, "step": 30940 }, { "epoch": 1.4369283625052232, "grad_norm": 1.412319302558899, "learning_rate": 2.1383165420864478e-07, "logits/chosen": -18.578256607055664, "logits/rejected": -17.934751510620117, "logps/chosen": -363.4771423339844, "logps/rejected": -342.7862243652344, "loss": 0.8589, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.597590923309326, "rewards/margins": 0.3861697018146515, "rewards/rejected": 2.211421489715576, "step": 30950 }, { "epoch": 1.4373926366126561, "grad_norm": 42.357765197753906, "learning_rate": 2.138037977621988e-07, "logits/chosen": -18.91590118408203, "logits/rejected": -19.27737808227539, "logps/chosen": -290.43377685546875, "logps/rejected": -292.0521545410156, "loss": 0.7509, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 
2.291897773742676, "rewards/margins": -0.012684166431427002, "rewards/rejected": 2.304581880569458, "step": 30960 }, { "epoch": 1.4378569107200891, "grad_norm": 18.440135955810547, "learning_rate": 2.137759413157528e-07, "logits/chosen": -19.07929229736328, "logits/rejected": -18.33388900756836, "logps/chosen": -404.82391357421875, "logps/rejected": -335.3253479003906, "loss": 0.8183, "rewards/accuracies": 0.5, "rewards/chosen": 2.502500057220459, "rewards/margins": 0.13999122381210327, "rewards/rejected": 2.362508773803711, "step": 30970 }, { "epoch": 1.4383211848275221, "grad_norm": 61.057613372802734, "learning_rate": 2.1374808486930682e-07, "logits/chosen": -19.39630126953125, "logits/rejected": -18.60800552368164, "logps/chosen": -379.75152587890625, "logps/rejected": -297.6242980957031, "loss": 0.501, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8255743980407715, "rewards/margins": 0.9897764921188354, "rewards/rejected": 1.835797905921936, "step": 30980 }, { "epoch": 1.4387854589349551, "grad_norm": 54.09918212890625, "learning_rate": 2.1372022842286086e-07, "logits/chosen": -18.580278396606445, "logits/rejected": -18.03652572631836, "logps/chosen": -341.8562927246094, "logps/rejected": -254.889892578125, "loss": 0.4908, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2716779708862305, "rewards/margins": 1.0419981479644775, "rewards/rejected": 3.229679822921753, "step": 30990 }, { "epoch": 1.4392497330423883, "grad_norm": 39.677371978759766, "learning_rate": 2.1369237197641485e-07, "logits/chosen": -19.223108291625977, "logits/rejected": -18.278301239013672, "logps/chosen": -436.2518005371094, "logps/rejected": -336.88360595703125, "loss": 0.5554, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5230064392089844, "rewards/margins": 1.0379618406295776, "rewards/rejected": 2.485044479370117, "step": 31000 }, { "epoch": 1.4397140071498211, "grad_norm": 1.204954981803894, "learning_rate": 2.136645155299689e-07, 
"logits/chosen": -19.254261016845703, "logits/rejected": -17.73097038269043, "logps/chosen": -473.1946716308594, "logps/rejected": -263.7926940917969, "loss": 0.1754, "rewards/accuracies": 1.0, "rewards/chosen": 4.19044828414917, "rewards/margins": 2.404883861541748, "rewards/rejected": 1.7855647802352905, "step": 31010 }, { "epoch": 1.4401782812572543, "grad_norm": 1.8076775074005127, "learning_rate": 2.136366590835229e-07, "logits/chosen": -19.73749542236328, "logits/rejected": -18.254886627197266, "logps/chosen": -495.751220703125, "logps/rejected": -382.2938537597656, "loss": 0.6555, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.8632073402404785, "rewards/margins": 1.5298163890838623, "rewards/rejected": 3.333390712738037, "step": 31020 }, { "epoch": 1.4406425553646873, "grad_norm": 35.142948150634766, "learning_rate": 2.1360880263707695e-07, "logits/chosen": -18.95307159423828, "logits/rejected": -18.649442672729492, "logps/chosen": -462.5252380371094, "logps/rejected": -432.7227478027344, "loss": 0.6596, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.834545850753784, "rewards/margins": 0.6525692343711853, "rewards/rejected": 3.1819770336151123, "step": 31030 }, { "epoch": 1.4411068294721203, "grad_norm": 110.81912994384766, "learning_rate": 2.1358094619063093e-07, "logits/chosen": -18.470874786376953, "logits/rejected": -18.356765747070312, "logps/chosen": -416.95733642578125, "logps/rejected": -416.78009033203125, "loss": 1.1179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1108779907226562, "rewards/margins": -0.005366182420402765, "rewards/rejected": 3.116243839263916, "step": 31040 }, { "epoch": 1.4415711035795533, "grad_norm": 6.988501071929932, "learning_rate": 2.1355308974418495e-07, "logits/chosen": -18.657894134521484, "logits/rejected": -17.92580223083496, "logps/chosen": -432.01092529296875, "logps/rejected": -308.6299743652344, "loss": 0.5866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
3.7006430625915527, "rewards/margins": 1.3034963607788086, "rewards/rejected": 2.397146701812744, "step": 31050 }, { "epoch": 1.4420353776869863, "grad_norm": 48.96125030517578, "learning_rate": 2.13525233297739e-07, "logits/chosen": -18.713687896728516, "logits/rejected": -17.570154190063477, "logps/chosen": -304.27545166015625, "logps/rejected": -226.4508056640625, "loss": 0.4569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.372093439102173, "rewards/margins": 0.9022883176803589, "rewards/rejected": 1.4698054790496826, "step": 31060 }, { "epoch": 1.4424996517944195, "grad_norm": 96.33695220947266, "learning_rate": 2.13497376851293e-07, "logits/chosen": -18.057106018066406, "logits/rejected": -17.865863800048828, "logps/chosen": -437.137939453125, "logps/rejected": -419.23480224609375, "loss": 0.8761, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1731808185577393, "rewards/margins": 0.32625752687454224, "rewards/rejected": 2.846923351287842, "step": 31070 }, { "epoch": 1.4429639259018525, "grad_norm": 26.58260726928711, "learning_rate": 2.13469520404847e-07, "logits/chosen": -18.666902542114258, "logits/rejected": -18.15328598022461, "logps/chosen": -278.23822021484375, "logps/rejected": -245.16659545898438, "loss": 0.6325, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.000223159790039, "rewards/margins": 0.7911642789840698, "rewards/rejected": 2.2090587615966797, "step": 31080 }, { "epoch": 1.4434282000092855, "grad_norm": 14.101838111877441, "learning_rate": 2.1344166395840103e-07, "logits/chosen": -19.03179359436035, "logits/rejected": -16.98752212524414, "logps/chosen": -373.00775146484375, "logps/rejected": -188.13232421875, "loss": 0.3553, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.756908893585205, "rewards/margins": 1.6449880599975586, "rewards/rejected": 2.1119205951690674, "step": 31090 }, { "epoch": 1.4438924741167185, "grad_norm": 58.59067916870117, "learning_rate": 2.1341380751195505e-07, 
"logits/chosen": -19.400053024291992, "logits/rejected": -18.32903289794922, "logps/chosen": -444.55206298828125, "logps/rejected": -306.185302734375, "loss": 0.3948, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8389458656311035, "rewards/margins": 1.6463817358016968, "rewards/rejected": 2.192564010620117, "step": 31100 }, { "epoch": 1.4443567482241515, "grad_norm": 37.82542037963867, "learning_rate": 2.1338595106550906e-07, "logits/chosen": -18.810810089111328, "logits/rejected": -17.93515968322754, "logps/chosen": -345.7723388671875, "logps/rejected": -321.14495849609375, "loss": 0.8503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.64817214012146, "rewards/margins": 0.4254678189754486, "rewards/rejected": 2.2227046489715576, "step": 31110 }, { "epoch": 1.4448210223315845, "grad_norm": 125.54338836669922, "learning_rate": 2.1335809461906308e-07, "logits/chosen": -18.89140510559082, "logits/rejected": -17.781606674194336, "logps/chosen": -444.861328125, "logps/rejected": -322.4167785644531, "loss": 0.4992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2894890308380127, "rewards/margins": 0.7899216413497925, "rewards/rejected": 2.4995672702789307, "step": 31120 }, { "epoch": 1.4452852964390175, "grad_norm": 0.278324693441391, "learning_rate": 2.1333023817261712e-07, "logits/chosen": -17.907390594482422, "logits/rejected": -17.29288673400879, "logps/chosen": -456.38604736328125, "logps/rejected": -406.9249267578125, "loss": 1.0072, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.133455753326416, "rewards/margins": 0.24813878536224365, "rewards/rejected": 2.885317325592041, "step": 31130 }, { "epoch": 1.4457495705464507, "grad_norm": 68.9345474243164, "learning_rate": 2.1330238172617113e-07, "logits/chosen": -18.420391082763672, "logits/rejected": -17.597339630126953, "logps/chosen": -438.5440368652344, "logps/rejected": -314.65093994140625, "loss": 0.72, "rewards/accuracies": 0.5, "rewards/chosen": 
3.4925239086151123, "rewards/margins": 0.9926430583000183, "rewards/rejected": 2.4998810291290283, "step": 31140 }, { "epoch": 1.4462138446538837, "grad_norm": 54.28596878051758, "learning_rate": 2.1327452527972512e-07, "logits/chosen": -19.244531631469727, "logits/rejected": -17.699459075927734, "logps/chosen": -422.6610412597656, "logps/rejected": -247.08566284179688, "loss": 0.4843, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.934516191482544, "rewards/margins": 1.7923336029052734, "rewards/rejected": 2.1421823501586914, "step": 31150 }, { "epoch": 1.4466781187613167, "grad_norm": 134.2545166015625, "learning_rate": 2.1324666883327916e-07, "logits/chosen": -18.484729766845703, "logits/rejected": -18.560138702392578, "logps/chosen": -366.94293212890625, "logps/rejected": -415.2256774902344, "loss": 0.7602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.179381847381592, "rewards/margins": 0.07391967624425888, "rewards/rejected": 3.105462074279785, "step": 31160 }, { "epoch": 1.4471423928687497, "grad_norm": 59.02168655395508, "learning_rate": 2.1321881238683318e-07, "logits/chosen": -19.42913055419922, "logits/rejected": -18.541515350341797, "logps/chosen": -352.16766357421875, "logps/rejected": -238.88998413085938, "loss": 0.5875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.037307024002075, "rewards/margins": 0.9208566546440125, "rewards/rejected": 2.116450548171997, "step": 31170 }, { "epoch": 1.4476066669761827, "grad_norm": 14.305400848388672, "learning_rate": 2.1319095594038722e-07, "logits/chosen": -18.664094924926758, "logits/rejected": -17.290985107421875, "logps/chosen": -463.223876953125, "logps/rejected": -299.34344482421875, "loss": 0.4302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.071376323699951, "rewards/margins": 1.772030234336853, "rewards/rejected": 2.2993462085723877, "step": 31180 }, { "epoch": 1.448070941083616, "grad_norm": 166.69195556640625, "learning_rate": 
2.131630994939412e-07, "logits/chosen": -18.68081283569336, "logits/rejected": -18.094894409179688, "logps/chosen": -470.07763671875, "logps/rejected": -427.8248596191406, "loss": 1.0144, "rewards/accuracies": 0.5, "rewards/chosen": 3.153407335281372, "rewards/margins": 0.15257346630096436, "rewards/rejected": 3.0008339881896973, "step": 31190 }, { "epoch": 1.4485352151910487, "grad_norm": 49.64976119995117, "learning_rate": 2.1313524304749522e-07, "logits/chosen": -18.257265090942383, "logits/rejected": -17.826501846313477, "logps/chosen": -365.1450500488281, "logps/rejected": -345.0062255859375, "loss": 0.6481, "rewards/accuracies": 0.5, "rewards/chosen": 2.962763547897339, "rewards/margins": 0.9549379348754883, "rewards/rejected": 2.0078256130218506, "step": 31200 }, { "epoch": 1.4489994892984819, "grad_norm": 7.16885232925415, "learning_rate": 2.1310738660104926e-07, "logits/chosen": -18.432188034057617, "logits/rejected": -17.295209884643555, "logps/chosen": -434.8053283691406, "logps/rejected": -339.47052001953125, "loss": 0.4516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.673722505569458, "rewards/margins": 1.3712455034255981, "rewards/rejected": 2.302476644515991, "step": 31210 }, { "epoch": 1.4494637634059149, "grad_norm": 77.23054504394531, "learning_rate": 2.1307953015460327e-07, "logits/chosen": -18.700551986694336, "logits/rejected": -18.26772689819336, "logps/chosen": -395.53204345703125, "logps/rejected": -362.16546630859375, "loss": 0.5363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8145580291748047, "rewards/margins": 0.6538265943527222, "rewards/rejected": 2.160731554031372, "step": 31220 }, { "epoch": 1.4499280375133479, "grad_norm": 33.753238677978516, "learning_rate": 2.1305167370815726e-07, "logits/chosen": -19.369672775268555, "logits/rejected": -17.15384292602539, "logps/chosen": -397.2786865234375, "logps/rejected": -202.6220703125, "loss": 0.2305, "rewards/accuracies": 1.0, "rewards/chosen": 
2.859112501144409, "rewards/margins": 1.6771643161773682, "rewards/rejected": 1.1819484233856201, "step": 31230 }, { "epoch": 1.4503923116207809, "grad_norm": 5.9811601638793945, "learning_rate": 2.130238172617113e-07, "logits/chosen": -18.904050827026367, "logits/rejected": -17.590595245361328, "logps/chosen": -435.1163635253906, "logps/rejected": -260.87445068359375, "loss": 0.3594, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8414881229400635, "rewards/margins": 1.3185356855392456, "rewards/rejected": 1.5229520797729492, "step": 31240 }, { "epoch": 1.4508565857282139, "grad_norm": 8.987343788146973, "learning_rate": 2.1299596081526532e-07, "logits/chosen": -18.151399612426758, "logits/rejected": -16.771400451660156, "logps/chosen": -486.4283142089844, "logps/rejected": -288.5191955566406, "loss": 0.3231, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.210798263549805, "rewards/margins": 2.0490355491638184, "rewards/rejected": 2.161762237548828, "step": 31250 }, { "epoch": 1.451320859835647, "grad_norm": 48.61117172241211, "learning_rate": 2.1296810436881936e-07, "logits/chosen": -19.68301773071289, "logits/rejected": -18.70281982421875, "logps/chosen": -381.7328796386719, "logps/rejected": -275.73394775390625, "loss": 0.3859, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.499697208404541, "rewards/margins": 1.5712642669677734, "rewards/rejected": 1.9284330606460571, "step": 31260 }, { "epoch": 1.45178513394308, "grad_norm": 7.0188188552856445, "learning_rate": 2.1294024792237335e-07, "logits/chosen": -18.556306838989258, "logits/rejected": -18.270038604736328, "logps/chosen": -371.43572998046875, "logps/rejected": -351.2893371582031, "loss": 0.6618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.104801893234253, "rewards/margins": 0.44411778450012207, "rewards/rejected": 2.660684108734131, "step": 31270 }, { "epoch": 1.452249408050513, "grad_norm": 90.78608703613281, "learning_rate": 
2.129123914759274e-07, "logits/chosen": -17.77803611755371, "logits/rejected": -17.195682525634766, "logps/chosen": -375.3921203613281, "logps/rejected": -305.67254638671875, "loss": 1.0092, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.171731948852539, "rewards/margins": 1.424429178237915, "rewards/rejected": 2.747303009033203, "step": 31280 }, { "epoch": 1.452713682157946, "grad_norm": 66.97635650634766, "learning_rate": 2.128845350294814e-07, "logits/chosen": -18.911815643310547, "logits/rejected": -18.126794815063477, "logps/chosen": -359.08258056640625, "logps/rejected": -278.51043701171875, "loss": 0.8595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.100726366043091, "rewards/margins": 1.24873948097229, "rewards/rejected": 1.8519868850708008, "step": 31290 }, { "epoch": 1.453177956265379, "grad_norm": 169.75657653808594, "learning_rate": 2.128566785830354e-07, "logits/chosen": -18.86652183532715, "logits/rejected": -18.113819122314453, "logps/chosen": -468.9393615722656, "logps/rejected": -369.50140380859375, "loss": 0.7814, "rewards/accuracies": 0.5, "rewards/chosen": 3.432337522506714, "rewards/margins": 0.40325164794921875, "rewards/rejected": 3.0290865898132324, "step": 31300 }, { "epoch": 1.453642230372812, "grad_norm": 175.7132568359375, "learning_rate": 2.1282882213658943e-07, "logits/chosen": -19.035091400146484, "logits/rejected": -18.587955474853516, "logps/chosen": -517.6220703125, "logps/rejected": -453.25579833984375, "loss": 0.7909, "rewards/accuracies": 0.5, "rewards/chosen": 3.49079966545105, "rewards/margins": 0.48546481132507324, "rewards/rejected": 3.0053353309631348, "step": 31310 }, { "epoch": 1.454106504480245, "grad_norm": 26.599292755126953, "learning_rate": 2.1280096569014345e-07, "logits/chosen": -19.171674728393555, "logits/rejected": -18.236724853515625, "logps/chosen": -300.6485900878906, "logps/rejected": -203.62429809570312, "loss": 0.4625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
2.935027837753296, "rewards/margins": 1.617553949356079, "rewards/rejected": 1.317474126815796, "step": 31320 }, { "epoch": 1.4545707785876782, "grad_norm": 274.4673767089844, "learning_rate": 2.127731092436975e-07, "logits/chosen": -18.43194580078125, "logits/rejected": -18.808780670166016, "logps/chosen": -356.3583068847656, "logps/rejected": -339.11199951171875, "loss": 0.9392, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0106568336486816, "rewards/margins": 0.11022262275218964, "rewards/rejected": 2.9004337787628174, "step": 31330 }, { "epoch": 1.4550350526951112, "grad_norm": 4.783423900604248, "learning_rate": 2.1274525279725148e-07, "logits/chosen": -19.61923599243164, "logits/rejected": -17.687183380126953, "logps/chosen": -418.8388671875, "logps/rejected": -238.7437286376953, "loss": 0.4079, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.298846244812012, "rewards/margins": 2.119661808013916, "rewards/rejected": 2.1791839599609375, "step": 31340 }, { "epoch": 1.4554993268025442, "grad_norm": 34.2986946105957, "learning_rate": 2.127173963508055e-07, "logits/chosen": -19.38425064086914, "logits/rejected": -18.244688034057617, "logps/chosen": -395.56561279296875, "logps/rejected": -316.39825439453125, "loss": 0.4692, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4983317852020264, "rewards/margins": 1.2774406671524048, "rewards/rejected": 2.220891237258911, "step": 31350 }, { "epoch": 1.4559636009099772, "grad_norm": 185.3156280517578, "learning_rate": 2.1268953990435953e-07, "logits/chosen": -18.356647491455078, "logits/rejected": -17.747583389282227, "logps/chosen": -553.4156494140625, "logps/rejected": -425.04791259765625, "loss": 0.5996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.662110328674316, "rewards/margins": 1.0308544635772705, "rewards/rejected": 3.631256103515625, "step": 31360 }, { "epoch": 1.4564278750174102, "grad_norm": 148.72425842285156, "learning_rate": 2.1266168345791355e-07, 
"logits/chosen": -19.254627227783203, "logits/rejected": -18.351795196533203, "logps/chosen": -367.17852783203125, "logps/rejected": -269.63140869140625, "loss": 0.8579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.504779100418091, "rewards/margins": 1.3805619478225708, "rewards/rejected": 2.1242167949676514, "step": 31370 }, { "epoch": 1.4568921491248434, "grad_norm": 14.098735809326172, "learning_rate": 2.1263382701146756e-07, "logits/chosen": -19.01089096069336, "logits/rejected": -17.874317169189453, "logps/chosen": -324.0998840332031, "logps/rejected": -218.3816680908203, "loss": 0.4684, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.205105781555176, "rewards/margins": 1.2992788553237915, "rewards/rejected": 0.9058265686035156, "step": 31380 }, { "epoch": 1.4573564232322762, "grad_norm": 0.89913010597229, "learning_rate": 2.1260597056502157e-07, "logits/chosen": -17.52408218383789, "logits/rejected": -17.70737648010254, "logps/chosen": -362.01544189453125, "logps/rejected": -349.31463623046875, "loss": 0.9875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5338802337646484, "rewards/margins": 0.4237957000732422, "rewards/rejected": 2.110084295272827, "step": 31390 }, { "epoch": 1.4578206973397094, "grad_norm": 26.045263290405273, "learning_rate": 2.125781141185756e-07, "logits/chosen": -18.558643341064453, "logits/rejected": -17.385189056396484, "logps/chosen": -472.6109924316406, "logps/rejected": -369.8470153808594, "loss": 0.5226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.442699432373047, "rewards/margins": 0.8874349594116211, "rewards/rejected": 2.5552642345428467, "step": 31400 }, { "epoch": 1.4582849714471424, "grad_norm": 18.77790641784668, "learning_rate": 2.1255025767212963e-07, "logits/chosen": -19.166339874267578, "logits/rejected": -17.830934524536133, "logps/chosen": -572.6649169921875, "logps/rejected": -447.67767333984375, "loss": 0.3224, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 4.943158149719238, "rewards/margins": 1.8882509469985962, "rewards/rejected": 3.0549070835113525, "step": 31410 }, { "epoch": 1.4587492455545754, "grad_norm": 106.08882904052734, "learning_rate": 2.1252240122568362e-07, "logits/chosen": -18.521394729614258, "logits/rejected": -18.81052589416504, "logps/chosen": -382.6044006347656, "logps/rejected": -430.1932067871094, "loss": 0.9879, "rewards/accuracies": 0.5, "rewards/chosen": 3.172635078430176, "rewards/margins": -0.13050436973571777, "rewards/rejected": 3.3031392097473145, "step": 31420 }, { "epoch": 1.4592135196620084, "grad_norm": 8.022910118103027, "learning_rate": 2.1249454477923766e-07, "logits/chosen": -20.34395980834961, "logits/rejected": -19.432518005371094, "logps/chosen": -433.6748046875, "logps/rejected": -338.0575866699219, "loss": 0.4837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.144651412963867, "rewards/margins": 0.9210284352302551, "rewards/rejected": 3.2236227989196777, "step": 31430 }, { "epoch": 1.4596777937694414, "grad_norm": 43.96832275390625, "learning_rate": 2.1246668833279167e-07, "logits/chosen": -19.699813842773438, "logits/rejected": -18.922700881958008, "logps/chosen": -377.16204833984375, "logps/rejected": -314.60919189453125, "loss": 0.5039, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.117643356323242, "rewards/margins": 0.7881288528442383, "rewards/rejected": 2.329514980316162, "step": 31440 }, { "epoch": 1.4601420678768746, "grad_norm": 65.98356628417969, "learning_rate": 2.1243883188634571e-07, "logits/chosen": -19.233150482177734, "logits/rejected": -18.91326904296875, "logps/chosen": -372.7154235839844, "logps/rejected": -329.2430725097656, "loss": 0.658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1225218772888184, "rewards/margins": 0.43528109788894653, "rewards/rejected": 2.6872410774230957, "step": 31450 }, { "epoch": 1.4606063419843076, "grad_norm": 18.4799747467041, "learning_rate": 
2.124109754398997e-07, "logits/chosen": -18.617935180664062, "logits/rejected": -17.89756202697754, "logps/chosen": -339.0461730957031, "logps/rejected": -261.34808349609375, "loss": 0.5928, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2839043140411377, "rewards/margins": 1.1308459043502808, "rewards/rejected": 1.1530582904815674, "step": 31460 }, { "epoch": 1.4610706160917406, "grad_norm": 156.4451904296875, "learning_rate": 2.1238311899345372e-07, "logits/chosen": -18.855710983276367, "logits/rejected": -17.316062927246094, "logps/chosen": -552.7117309570312, "logps/rejected": -351.58978271484375, "loss": 0.5408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.564549922943115, "rewards/margins": 1.6415355205535889, "rewards/rejected": 2.9230144023895264, "step": 31470 }, { "epoch": 1.4615348901991736, "grad_norm": 96.45660400390625, "learning_rate": 2.1235526254700776e-07, "logits/chosen": -19.200843811035156, "logits/rejected": -18.150562286376953, "logps/chosen": -346.53857421875, "logps/rejected": -274.49798583984375, "loss": 0.4174, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8711111545562744, "rewards/margins": 0.9183575510978699, "rewards/rejected": 1.9527537822723389, "step": 31480 }, { "epoch": 1.4619991643066066, "grad_norm": 208.32138061523438, "learning_rate": 2.1232740610056175e-07, "logits/chosen": -18.414188385009766, "logits/rejected": -18.323680877685547, "logps/chosen": -305.6812438964844, "logps/rejected": -268.3454284667969, "loss": 1.0875, "rewards/accuracies": 0.5, "rewards/chosen": 1.865216612815857, "rewards/margins": -0.3616372346878052, "rewards/rejected": 2.226854085922241, "step": 31490 }, { "epoch": 1.4624634384140396, "grad_norm": 34.27886962890625, "learning_rate": 2.1229954965411576e-07, "logits/chosen": -19.181554794311523, "logits/rejected": -19.186939239501953, "logps/chosen": -402.8001708984375, "logps/rejected": -398.20574951171875, "loss": 0.8418, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.2583065032958984, "rewards/margins": 0.2599563002586365, "rewards/rejected": 2.9983506202697754, "step": 31500 }, { "epoch": 1.4629277125214726, "grad_norm": 166.47381591796875, "learning_rate": 2.122716932076698e-07, "logits/chosen": -18.39030647277832, "logits/rejected": -18.037899017333984, "logps/chosen": -383.35931396484375, "logps/rejected": -304.0367126464844, "loss": 0.5871, "rewards/accuracies": 0.5, "rewards/chosen": 3.6184494495391846, "rewards/margins": 0.9949640035629272, "rewards/rejected": 2.6234850883483887, "step": 31510 }, { "epoch": 1.4633919866289058, "grad_norm": 11.391522407531738, "learning_rate": 2.1224383676122382e-07, "logits/chosen": -18.15885353088379, "logits/rejected": -17.988054275512695, "logps/chosen": -367.9764709472656, "logps/rejected": -347.25567626953125, "loss": 0.898, "rewards/accuracies": 0.5, "rewards/chosen": 2.6614773273468018, "rewards/margins": 0.34797200560569763, "rewards/rejected": 2.3135054111480713, "step": 31520 }, { "epoch": 1.4638562607363388, "grad_norm": 146.01748657226562, "learning_rate": 2.1221598031477783e-07, "logits/chosen": -18.73305320739746, "logits/rejected": -17.32056999206543, "logps/chosen": -519.9589233398438, "logps/rejected": -380.17864990234375, "loss": 0.7475, "rewards/accuracies": 0.5, "rewards/chosen": 3.5682175159454346, "rewards/margins": 0.2324577271938324, "rewards/rejected": 3.3357596397399902, "step": 31530 }, { "epoch": 1.4643205348437718, "grad_norm": 5.412684917449951, "learning_rate": 2.1218812386833185e-07, "logits/chosen": -18.5390625, "logits/rejected": -16.59983253479004, "logps/chosen": -438.0204162597656, "logps/rejected": -220.4698486328125, "loss": 0.3373, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.383845090866089, "rewards/margins": 1.7521499395370483, "rewards/rejected": 1.6316951513290405, "step": 31540 }, { "epoch": 1.4647848089512048, "grad_norm": 0.9457202553749084, "learning_rate": 2.1216026742188589e-07, 
"logits/chosen": -18.73915672302246, "logits/rejected": -18.12425422668457, "logps/chosen": -330.10601806640625, "logps/rejected": -287.0003967285156, "loss": 0.6128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9914369583129883, "rewards/margins": 0.7754064798355103, "rewards/rejected": 2.2160303592681885, "step": 31550 }, { "epoch": 1.4652490830586378, "grad_norm": 153.05612182617188, "learning_rate": 2.121324109754399e-07, "logits/chosen": -19.02902603149414, "logits/rejected": -18.50606918334961, "logps/chosen": -477.1405334472656, "logps/rejected": -462.15191650390625, "loss": 0.608, "rewards/accuracies": 0.5, "rewards/chosen": 3.920654296875, "rewards/margins": 0.5822749137878418, "rewards/rejected": 3.338379383087158, "step": 31560 }, { "epoch": 1.465713357166071, "grad_norm": 83.16970825195312, "learning_rate": 2.121045545289939e-07, "logits/chosen": -18.46982192993164, "logits/rejected": -17.760868072509766, "logps/chosen": -375.1323547363281, "logps/rejected": -226.1862030029297, "loss": 0.5198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9894745349884033, "rewards/margins": 1.46928870677948, "rewards/rejected": 1.5201858282089233, "step": 31570 }, { "epoch": 1.4661776312735038, "grad_norm": 239.10958862304688, "learning_rate": 2.1207669808254793e-07, "logits/chosen": -18.586917877197266, "logits/rejected": -18.0333251953125, "logps/chosen": -353.64849853515625, "logps/rejected": -341.02545166015625, "loss": 0.7291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8099184036254883, "rewards/margins": 0.5513668656349182, "rewards/rejected": 2.258551597595215, "step": 31580 }, { "epoch": 1.466641905380937, "grad_norm": 224.96392822265625, "learning_rate": 2.1204884163610194e-07, "logits/chosen": -19.046100616455078, "logits/rejected": -18.662675857543945, "logps/chosen": -267.6133728027344, "logps/rejected": -311.3339538574219, "loss": 0.7876, "rewards/accuracies": 0.5, "rewards/chosen": 3.0687203407287598, 
"rewards/margins": 0.7173009514808655, "rewards/rejected": 2.351419448852539, "step": 31590 }, { "epoch": 1.46710617948837, "grad_norm": 23.20682716369629, "learning_rate": 2.1202098518965599e-07, "logits/chosen": -18.703153610229492, "logits/rejected": -18.033336639404297, "logps/chosen": -404.27227783203125, "logps/rejected": -321.659423828125, "loss": 0.637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.072145938873291, "rewards/margins": 0.5765140652656555, "rewards/rejected": 2.4956319332122803, "step": 31600 }, { "epoch": 1.467570453595803, "grad_norm": 145.2144775390625, "learning_rate": 2.1199312874320997e-07, "logits/chosen": -18.773494720458984, "logits/rejected": -17.737659454345703, "logps/chosen": -351.8387756347656, "logps/rejected": -277.92999267578125, "loss": 0.6743, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7940266132354736, "rewards/margins": 0.38422325253486633, "rewards/rejected": 2.4098029136657715, "step": 31610 }, { "epoch": 1.468034727703236, "grad_norm": 22.084121704101562, "learning_rate": 2.11965272296764e-07, "logits/chosen": -19.248287200927734, "logits/rejected": -17.881990432739258, "logps/chosen": -505.6507873535156, "logps/rejected": -366.71002197265625, "loss": 0.6409, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.178927421569824, "rewards/margins": 1.3936647176742554, "rewards/rejected": 2.7852625846862793, "step": 31620 }, { "epoch": 1.468499001810669, "grad_norm": 140.398681640625, "learning_rate": 2.1193741585031803e-07, "logits/chosen": -19.30231285095215, "logits/rejected": -18.186059951782227, "logps/chosen": -317.81207275390625, "logps/rejected": -187.50857543945312, "loss": 0.4057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.584311008453369, "rewards/margins": 1.1916615962982178, "rewards/rejected": 1.3926492929458618, "step": 31630 }, { "epoch": 1.4689632759181022, "grad_norm": 2.9456186294555664, "learning_rate": 2.1190955940387204e-07, "logits/chosen": 
-18.212963104248047, "logits/rejected": -17.611019134521484, "logps/chosen": -409.5840148925781, "logps/rejected": -346.69659423828125, "loss": 0.5257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6715450286865234, "rewards/margins": 1.1232500076293945, "rewards/rejected": 2.548295021057129, "step": 31640 }, { "epoch": 1.4694275500255352, "grad_norm": 14.02881145477295, "learning_rate": 2.1188170295742603e-07, "logits/chosen": -18.98651695251465, "logits/rejected": -17.90765953063965, "logps/chosen": -445.97747802734375, "logps/rejected": -334.396240234375, "loss": 0.8439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5607669353485107, "rewards/margins": 1.193893551826477, "rewards/rejected": 2.3668735027313232, "step": 31650 }, { "epoch": 1.4698918241329681, "grad_norm": 34.76118087768555, "learning_rate": 2.1185384651098007e-07, "logits/chosen": -20.228126525878906, "logits/rejected": -19.340869903564453, "logps/chosen": -337.2420654296875, "logps/rejected": -333.57220458984375, "loss": 0.9237, "rewards/accuracies": 0.5, "rewards/chosen": 3.3789074420928955, "rewards/margins": -0.14819744229316711, "rewards/rejected": 3.5271048545837402, "step": 31660 }, { "epoch": 1.4703560982404011, "grad_norm": 164.2078094482422, "learning_rate": 2.118259900645341e-07, "logits/chosen": -18.985313415527344, "logits/rejected": -18.320213317871094, "logps/chosen": -461.432373046875, "logps/rejected": -354.83807373046875, "loss": 0.7193, "rewards/accuracies": 0.5, "rewards/chosen": 3.78741192817688, "rewards/margins": 0.7950037717819214, "rewards/rejected": 2.992408275604248, "step": 31670 }, { "epoch": 1.4708203723478341, "grad_norm": 179.88612365722656, "learning_rate": 2.1179813361808813e-07, "logits/chosen": -18.646804809570312, "logits/rejected": -18.31888198852539, "logps/chosen": -436.1060485839844, "logps/rejected": -403.44024658203125, "loss": 0.6744, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.460601806640625, 
"rewards/margins": 0.780954122543335, "rewards/rejected": 2.679647922515869, "step": 31680 }, { "epoch": 1.4712846464552671, "grad_norm": 44.85462188720703, "learning_rate": 2.1177027717164212e-07, "logits/chosen": -18.144933700561523, "logits/rejected": -17.54291343688965, "logps/chosen": -361.5083923339844, "logps/rejected": -222.04989624023438, "loss": 0.3635, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0860300064086914, "rewards/margins": 1.5275356769561768, "rewards/rejected": 1.5584943294525146, "step": 31690 }, { "epoch": 1.4717489205627001, "grad_norm": 40.414093017578125, "learning_rate": 2.1174242072519616e-07, "logits/chosen": -19.138187408447266, "logits/rejected": -18.08989715576172, "logps/chosen": -523.5676879882812, "logps/rejected": -325.5196228027344, "loss": 0.4436, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.597092866897583, "rewards/margins": 1.4752393960952759, "rewards/rejected": 2.1218531131744385, "step": 31700 }, { "epoch": 1.4722131946701333, "grad_norm": 146.35679626464844, "learning_rate": 2.1171456427875017e-07, "logits/chosen": -18.79773712158203, "logits/rejected": -18.601905822753906, "logps/chosen": -271.2618713378906, "logps/rejected": -235.1287078857422, "loss": 0.8378, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.154503345489502, "rewards/margins": 0.27407732605934143, "rewards/rejected": 1.8804256916046143, "step": 31710 }, { "epoch": 1.4726774687775663, "grad_norm": 62.988975524902344, "learning_rate": 2.1168670783230416e-07, "logits/chosen": -18.404449462890625, "logits/rejected": -17.910961151123047, "logps/chosen": -365.5957336425781, "logps/rejected": -294.51666259765625, "loss": 0.4243, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6541244983673096, "rewards/margins": 1.4349048137664795, "rewards/rejected": 2.219219923019409, "step": 31720 }, { "epoch": 1.4731417428849993, "grad_norm": 143.2077178955078, "learning_rate": 2.116588513858582e-07, 
"logits/chosen": -18.80143165588379, "logits/rejected": -17.984127044677734, "logps/chosen": -487.33612060546875, "logps/rejected": -408.66119384765625, "loss": 0.6211, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.468784809112549, "rewards/margins": 1.0512502193450928, "rewards/rejected": 2.417534589767456, "step": 31730 }, { "epoch": 1.4736060169924323, "grad_norm": 143.3271484375, "learning_rate": 2.1163099493941222e-07, "logits/chosen": -18.47483253479004, "logits/rejected": -17.686357498168945, "logps/chosen": -447.4363708496094, "logps/rejected": -338.72589111328125, "loss": 0.556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.493001937866211, "rewards/margins": 1.0576775074005127, "rewards/rejected": 2.4353249073028564, "step": 31740 }, { "epoch": 1.4740702910998653, "grad_norm": 38.790279388427734, "learning_rate": 2.1160313849296626e-07, "logits/chosen": -18.177921295166016, "logits/rejected": -17.849699020385742, "logps/chosen": -433.83184814453125, "logps/rejected": -351.4552917480469, "loss": 0.6689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2594125270843506, "rewards/margins": 0.26151150465011597, "rewards/rejected": 1.9979007244110107, "step": 31750 }, { "epoch": 1.4745345652072983, "grad_norm": 28.489038467407227, "learning_rate": 2.1157528204652024e-07, "logits/chosen": -18.354236602783203, "logits/rejected": -18.242229461669922, "logps/chosen": -328.3037109375, "logps/rejected": -282.9887390136719, "loss": 0.4247, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.221308946609497, "rewards/margins": 0.9458174705505371, "rewards/rejected": 2.275491237640381, "step": 31760 }, { "epoch": 1.4749988393147313, "grad_norm": 22.39974021911621, "learning_rate": 2.1154742560007426e-07, "logits/chosen": -19.04440689086914, "logits/rejected": -18.928495407104492, "logps/chosen": -264.0907287597656, "logps/rejected": -290.51727294921875, "loss": 0.5727, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 1.9099960327148438, "rewards/margins": 0.6280524134635925, "rewards/rejected": 1.2819435596466064, "step": 31770 }, { "epoch": 1.4754631134221645, "grad_norm": 77.76996612548828, "learning_rate": 2.115195691536283e-07, "logits/chosen": -19.810287475585938, "logits/rejected": -18.70578384399414, "logps/chosen": -372.34075927734375, "logps/rejected": -281.498046875, "loss": 0.3875, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5629844665527344, "rewards/margins": 1.0580036640167236, "rewards/rejected": 2.504981279373169, "step": 31780 }, { "epoch": 1.4759273875295975, "grad_norm": 8.493764877319336, "learning_rate": 2.1149171270718231e-07, "logits/chosen": -19.026172637939453, "logits/rejected": -18.01761245727539, "logps/chosen": -355.6767883300781, "logps/rejected": -240.7828826904297, "loss": 0.4868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.215749740600586, "rewards/margins": 0.8074578046798706, "rewards/rejected": 1.4082920551300049, "step": 31790 }, { "epoch": 1.4763916616370305, "grad_norm": 19.59007453918457, "learning_rate": 2.1146385626073633e-07, "logits/chosen": -19.26629066467285, "logits/rejected": -18.550260543823242, "logps/chosen": -490.33380126953125, "logps/rejected": -363.84490966796875, "loss": 0.5015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.731894016265869, "rewards/margins": 0.9770382046699524, "rewards/rejected": 2.7548558712005615, "step": 31800 }, { "epoch": 1.4768559357444635, "grad_norm": 3.480436325073242, "learning_rate": 2.1143599981429034e-07, "logits/chosen": -18.952709197998047, "logits/rejected": -18.018722534179688, "logps/chosen": -440.58837890625, "logps/rejected": -245.7522430419922, "loss": 0.3877, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.220568895339966, "rewards/margins": 1.5341732501983643, "rewards/rejected": 1.6863956451416016, "step": 31810 }, { "epoch": 1.4773202098518965, "grad_norm": 40.208106994628906, "learning_rate": 
2.1140814336784436e-07, "logits/chosen": -19.20012664794922, "logits/rejected": -18.509361267089844, "logps/chosen": -448.07623291015625, "logps/rejected": -321.4855041503906, "loss": 0.3892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7631335258483887, "rewards/margins": 1.2501555681228638, "rewards/rejected": 2.5129780769348145, "step": 31820 }, { "epoch": 1.4777844839593297, "grad_norm": 64.1181869506836, "learning_rate": 2.113802869213984e-07, "logits/chosen": -18.081134796142578, "logits/rejected": -16.94150161743164, "logps/chosen": -306.98065185546875, "logps/rejected": -200.52096557617188, "loss": 0.5443, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.261584997177124, "rewards/margins": 0.9296100735664368, "rewards/rejected": 1.331974983215332, "step": 31830 }, { "epoch": 1.4782487580667625, "grad_norm": 61.11497497558594, "learning_rate": 2.113524304749524e-07, "logits/chosen": -17.839317321777344, "logits/rejected": -18.268510818481445, "logps/chosen": -302.12274169921875, "logps/rejected": -382.5479431152344, "loss": 0.7979, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7557477951049805, "rewards/margins": 0.2829124927520752, "rewards/rejected": 2.4728353023529053, "step": 31840 }, { "epoch": 1.4787130321741957, "grad_norm": 56.128150939941406, "learning_rate": 2.1132457402850643e-07, "logits/chosen": -18.362558364868164, "logits/rejected": -17.274765014648438, "logps/chosen": -281.2065124511719, "logps/rejected": -192.17530822753906, "loss": 0.5504, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9874870777130127, "rewards/margins": 1.196192741394043, "rewards/rejected": 1.7912943363189697, "step": 31850 }, { "epoch": 1.4791773062816287, "grad_norm": 113.47637939453125, "learning_rate": 2.1129671758206044e-07, "logits/chosen": -18.569721221923828, "logits/rejected": -17.620141983032227, "logps/chosen": -329.9216613769531, "logps/rejected": -213.73123168945312, "loss": 0.6214, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.919567346572876, "rewards/margins": 1.276624321937561, "rewards/rejected": 1.6429431438446045, "step": 31860 }, { "epoch": 1.4796415803890617, "grad_norm": 0.1763678640127182, "learning_rate": 2.1126886113561448e-07, "logits/chosen": -19.380525588989258, "logits/rejected": -18.06902503967285, "logps/chosen": -409.3337707519531, "logps/rejected": -309.06298828125, "loss": 0.6965, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.479628324508667, "rewards/margins": 1.140235185623169, "rewards/rejected": 2.339393377304077, "step": 31870 }, { "epoch": 1.4801058544964947, "grad_norm": 0.9959450364112854, "learning_rate": 2.1124100468916847e-07, "logits/chosen": -18.668289184570312, "logits/rejected": -16.59616470336914, "logps/chosen": -356.752197265625, "logps/rejected": -140.07708740234375, "loss": 0.3411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4042389392852783, "rewards/margins": 2.2413930892944336, "rewards/rejected": 1.1628457307815552, "step": 31880 }, { "epoch": 1.4805701286039277, "grad_norm": 94.6642074584961, "learning_rate": 2.1121314824272249e-07, "logits/chosen": -18.888858795166016, "logits/rejected": -18.59483528137207, "logps/chosen": -400.61248779296875, "logps/rejected": -325.49847412109375, "loss": 0.4989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.693765640258789, "rewards/margins": 0.7005402445793152, "rewards/rejected": 1.9932256937026978, "step": 31890 }, { "epoch": 1.4810344027113609, "grad_norm": 25.578296661376953, "learning_rate": 2.1118529179627653e-07, "logits/chosen": -18.30672836303711, "logits/rejected": -16.852197647094727, "logps/chosen": -404.9605712890625, "logps/rejected": -224.24459838867188, "loss": 0.2894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9503684043884277, "rewards/margins": 1.7420856952667236, "rewards/rejected": 1.208282709121704, "step": 31900 }, { "epoch": 1.4814986768187939, "grad_norm": 
114.03887176513672, "learning_rate": 2.1115743534983052e-07, "logits/chosen": -20.021068572998047, "logits/rejected": -19.135414123535156, "logps/chosen": -477.789794921875, "logps/rejected": -286.3478088378906, "loss": 0.4499, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.107333183288574, "rewards/margins": 1.330810785293579, "rewards/rejected": 2.776522636413574, "step": 31910 }, { "epoch": 1.4819629509262269, "grad_norm": 4.519590854644775, "learning_rate": 2.1112957890338453e-07, "logits/chosen": -17.590591430664062, "logits/rejected": -17.857553482055664, "logps/chosen": -357.8473205566406, "logps/rejected": -326.89544677734375, "loss": 0.7531, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.974241018295288, "rewards/margins": 0.5543407797813416, "rewards/rejected": 2.419900417327881, "step": 31920 }, { "epoch": 1.4824272250336599, "grad_norm": 27.642366409301758, "learning_rate": 2.1110172245693857e-07, "logits/chosen": -18.944217681884766, "logits/rejected": -18.88043785095215, "logps/chosen": -375.4426574707031, "logps/rejected": -333.1239013671875, "loss": 0.5668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2829177379608154, "rewards/margins": 0.7264534831047058, "rewards/rejected": 2.556464433670044, "step": 31930 }, { "epoch": 1.4828914991410929, "grad_norm": 135.9470977783203, "learning_rate": 2.1107386601049259e-07, "logits/chosen": -18.139951705932617, "logits/rejected": -17.99228286743164, "logps/chosen": -297.38958740234375, "logps/rejected": -294.812255859375, "loss": 0.8217, "rewards/accuracies": 0.5, "rewards/chosen": 2.0754623413085938, "rewards/margins": 0.09959053993225098, "rewards/rejected": 1.9758716821670532, "step": 31940 }, { "epoch": 1.4833557732485259, "grad_norm": 57.59391403198242, "learning_rate": 2.110460095640466e-07, "logits/chosen": -18.333864212036133, "logits/rejected": -17.810901641845703, "logps/chosen": -423.36163330078125, "logps/rejected": -333.8910827636719, "loss": 0.5661, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.439739227294922, "rewards/margins": 1.3923494815826416, "rewards/rejected": 2.047389507293701, "step": 31950 }, { "epoch": 1.4838200473559588, "grad_norm": 54.566226959228516, "learning_rate": 2.1101815311760061e-07, "logits/chosen": -18.252174377441406, "logits/rejected": -18.01648712158203, "logps/chosen": -373.86468505859375, "logps/rejected": -375.55938720703125, "loss": 0.7748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5570030212402344, "rewards/margins": 0.631710410118103, "rewards/rejected": 1.9252923727035522, "step": 31960 }, { "epoch": 1.484284321463392, "grad_norm": 27.401453018188477, "learning_rate": 2.1099029667115466e-07, "logits/chosen": -18.45860481262207, "logits/rejected": -17.849349975585938, "logps/chosen": -387.83282470703125, "logps/rejected": -338.82958984375, "loss": 0.5761, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.849886417388916, "rewards/margins": 0.525818943977356, "rewards/rejected": 2.3240675926208496, "step": 31970 }, { "epoch": 1.484748595570825, "grad_norm": 4.585447311401367, "learning_rate": 2.1096244022470867e-07, "logits/chosen": -19.08420181274414, "logits/rejected": -18.882667541503906, "logps/chosen": -395.29058837890625, "logps/rejected": -332.2726745605469, "loss": 0.6045, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.818058729171753, "rewards/margins": 1.1992661952972412, "rewards/rejected": 2.6187925338745117, "step": 31980 }, { "epoch": 1.485212869678258, "grad_norm": 84.33760833740234, "learning_rate": 2.1093458377826266e-07, "logits/chosen": -18.673370361328125, "logits/rejected": -18.004131317138672, "logps/chosen": -394.7291564941406, "logps/rejected": -356.1526184082031, "loss": 0.3358, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8594677448272705, "rewards/margins": 1.1772472858428955, "rewards/rejected": 1.682220458984375, "step": 31990 }, { "epoch": 1.485677143785691, "grad_norm": 
3.5362675189971924, "learning_rate": 2.109067273318167e-07, "logits/chosen": -19.695003509521484, "logits/rejected": -18.407638549804688, "logps/chosen": -423.36016845703125, "logps/rejected": -298.4048767089844, "loss": 0.4157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.379158973693848, "rewards/margins": 2.031909465789795, "rewards/rejected": 2.3472495079040527, "step": 32000 }, { "epoch": 1.486141417893124, "grad_norm": 118.71344757080078, "learning_rate": 2.1087887088537071e-07, "logits/chosen": -20.151430130004883, "logits/rejected": -18.47553825378418, "logps/chosen": -505.3460998535156, "logps/rejected": -384.57952880859375, "loss": 0.2535, "rewards/accuracies": 1.0, "rewards/chosen": 4.516932964324951, "rewards/margins": 1.9200128316879272, "rewards/rejected": 2.5969197750091553, "step": 32010 }, { "epoch": 1.4866056920005573, "grad_norm": 97.80682373046875, "learning_rate": 2.1085101443892476e-07, "logits/chosen": -18.40633201599121, "logits/rejected": -18.21989631652832, "logps/chosen": -437.06298828125, "logps/rejected": -413.82415771484375, "loss": 0.7467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8848276138305664, "rewards/margins": 0.14313966035842896, "rewards/rejected": 2.7416882514953613, "step": 32020 }, { "epoch": 1.48706996610799, "grad_norm": 39.740386962890625, "learning_rate": 2.1082315799247874e-07, "logits/chosen": -17.621227264404297, "logits/rejected": -17.3441104888916, "logps/chosen": -333.95147705078125, "logps/rejected": -299.27410888671875, "loss": 0.5776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.225306987762451, "rewards/margins": 0.5954494476318359, "rewards/rejected": 1.6298576593399048, "step": 32030 }, { "epoch": 1.4875342402154232, "grad_norm": 32.638404846191406, "learning_rate": 2.1079530154603276e-07, "logits/chosen": -18.168102264404297, "logits/rejected": -17.197473526000977, "logps/chosen": -352.47357177734375, "logps/rejected": -256.353759765625, "loss": 0.6524, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.442536354064941, "rewards/margins": 1.5884571075439453, "rewards/rejected": 2.854079246520996, "step": 32040 }, { "epoch": 1.4879985143228562, "grad_norm": 22.205957412719727, "learning_rate": 2.107674450995868e-07, "logits/chosen": -18.96663475036621, "logits/rejected": -18.030290603637695, "logps/chosen": -446.12469482421875, "logps/rejected": -325.06121826171875, "loss": 0.3898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3929905891418457, "rewards/margins": 1.2708046436309814, "rewards/rejected": 2.122185468673706, "step": 32050 }, { "epoch": 1.4884627884302892, "grad_norm": 65.06891632080078, "learning_rate": 2.107395886531408e-07, "logits/chosen": -17.77471160888672, "logits/rejected": -17.016063690185547, "logps/chosen": -352.8072204589844, "logps/rejected": -207.2900848388672, "loss": 0.5299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.647521495819092, "rewards/margins": 0.9769002795219421, "rewards/rejected": 1.6706212759017944, "step": 32060 }, { "epoch": 1.4889270625377222, "grad_norm": 8.695642471313477, "learning_rate": 2.107117322066948e-07, "logits/chosen": -19.473392486572266, "logits/rejected": -18.462888717651367, "logps/chosen": -445.9365234375, "logps/rejected": -320.3125, "loss": 0.4091, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5257420539855957, "rewards/margins": 1.5967953205108643, "rewards/rejected": 1.9289467334747314, "step": 32070 }, { "epoch": 1.4893913366451552, "grad_norm": 166.99085998535156, "learning_rate": 2.1068387576024884e-07, "logits/chosen": -19.224124908447266, "logits/rejected": -19.318992614746094, "logps/chosen": -353.12969970703125, "logps/rejected": -364.01025390625, "loss": 0.788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7014412879943848, "rewards/margins": 0.21048060059547424, "rewards/rejected": 2.4909608364105225, "step": 32080 }, { "epoch": 1.4898556107525884, "grad_norm": 
13.333600997924805, "learning_rate": 2.1065601931380286e-07, "logits/chosen": -19.287246704101562, "logits/rejected": -18.362430572509766, "logps/chosen": -433.13214111328125, "logps/rejected": -308.59259033203125, "loss": 0.4528, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8710479736328125, "rewards/margins": 1.0580999851226807, "rewards/rejected": 2.812948226928711, "step": 32090 }, { "epoch": 1.4903198848600214, "grad_norm": 2.0360233783721924, "learning_rate": 2.1062816286735687e-07, "logits/chosen": -19.872455596923828, "logits/rejected": -18.608535766601562, "logps/chosen": -395.46929931640625, "logps/rejected": -274.39178466796875, "loss": 0.2262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.946260929107666, "rewards/margins": 2.1600501537323, "rewards/rejected": 1.7862106561660767, "step": 32100 }, { "epoch": 1.4907841589674544, "grad_norm": 205.63626098632812, "learning_rate": 2.1060030642091089e-07, "logits/chosen": -18.023813247680664, "logits/rejected": -18.202749252319336, "logps/chosen": -297.730224609375, "logps/rejected": -317.35589599609375, "loss": 1.0473, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.301769733428955, "rewards/margins": -0.12068437039852142, "rewards/rejected": 2.4224541187286377, "step": 32110 }, { "epoch": 1.4912484330748874, "grad_norm": 11.448025703430176, "learning_rate": 2.1057244997446493e-07, "logits/chosen": -18.83749008178711, "logits/rejected": -17.930217742919922, "logps/chosen": -465.823974609375, "logps/rejected": -365.36773681640625, "loss": 0.4404, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7742016315460205, "rewards/margins": 1.0479509830474854, "rewards/rejected": 2.7262508869171143, "step": 32120 }, { "epoch": 1.4917127071823204, "grad_norm": 51.33997344970703, "learning_rate": 2.1054459352801894e-07, "logits/chosen": -18.226503372192383, "logits/rejected": -19.053993225097656, "logps/chosen": -458.38983154296875, "logps/rejected": 
-457.89385986328125, "loss": 0.8319, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.9529082775115967, "rewards/margins": 0.24659550189971924, "rewards/rejected": 3.706312894821167, "step": 32130 }, { "epoch": 1.4921769812897534, "grad_norm": 67.89631652832031, "learning_rate": 2.1051673708157293e-07, "logits/chosen": -18.603925704956055, "logits/rejected": -18.998098373413086, "logps/chosen": -324.39111328125, "logps/rejected": -399.1962585449219, "loss": 1.2093, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6478912830352783, "rewards/margins": -0.36853352189064026, "rewards/rejected": 3.0164246559143066, "step": 32140 }, { "epoch": 1.4926412553971864, "grad_norm": 194.43643188476562, "learning_rate": 2.1048888063512697e-07, "logits/chosen": -18.602827072143555, "logits/rejected": -17.664865493774414, "logps/chosen": -530.50146484375, "logps/rejected": -327.4306945800781, "loss": 0.408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.258762359619141, "rewards/margins": 1.6096878051757812, "rewards/rejected": 2.6490743160247803, "step": 32150 }, { "epoch": 1.4931055295046196, "grad_norm": 31.824323654174805, "learning_rate": 2.1046102418868099e-07, "logits/chosen": -18.51937484741211, "logits/rejected": -18.429128646850586, "logps/chosen": -334.601318359375, "logps/rejected": -285.6993408203125, "loss": 0.6477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.690619945526123, "rewards/margins": 0.3956661820411682, "rewards/rejected": 2.2949538230895996, "step": 32160 }, { "epoch": 1.4935698036120526, "grad_norm": 100.06538391113281, "learning_rate": 2.1043316774223503e-07, "logits/chosen": -20.198610305786133, "logits/rejected": -18.91159439086914, "logps/chosen": -369.33453369140625, "logps/rejected": -282.6819152832031, "loss": 0.4931, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0828142166137695, "rewards/margins": 1.1890108585357666, "rewards/rejected": 1.893803596496582, "step": 32170 }, { 
"epoch": 1.4940340777194856, "grad_norm": 7.65358304977417, "learning_rate": 2.1040809694043362e-07, "logits/chosen": -18.597368240356445, "logits/rejected": -17.644432067871094, "logps/chosen": -387.85516357421875, "logps/rejected": -232.73239135742188, "loss": 0.5147, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3142077922821045, "rewards/margins": 0.9569799304008484, "rewards/rejected": 2.3572275638580322, "step": 32180 }, { "epoch": 1.4944983518269186, "grad_norm": 30.31566047668457, "learning_rate": 2.1038024049398764e-07, "logits/chosen": -19.467241287231445, "logits/rejected": -19.491405487060547, "logps/chosen": -385.15606689453125, "logps/rejected": -382.197998046875, "loss": 0.6492, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.116562604904175, "rewards/margins": 0.3278457224369049, "rewards/rejected": 2.7887167930603027, "step": 32190 }, { "epoch": 1.4949626259343516, "grad_norm": 8.92059326171875, "learning_rate": 2.1035238404754165e-07, "logits/chosen": -18.306087493896484, "logits/rejected": -17.518299102783203, "logps/chosen": -344.2046813964844, "logps/rejected": -247.56600952148438, "loss": 0.6328, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7209103107452393, "rewards/margins": 1.0305802822113037, "rewards/rejected": 1.6903297901153564, "step": 32200 }, { "epoch": 1.4954269000417848, "grad_norm": 54.59208297729492, "learning_rate": 2.103245276010957e-07, "logits/chosen": -18.375789642333984, "logits/rejected": -17.129528045654297, "logps/chosen": -408.1043701171875, "logps/rejected": -286.9118957519531, "loss": 0.5998, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7476818561553955, "rewards/margins": 1.6423715353012085, "rewards/rejected": 2.1053104400634766, "step": 32210 }, { "epoch": 1.4958911741492176, "grad_norm": 88.34966278076172, "learning_rate": 2.102966711546497e-07, "logits/chosen": -18.83460807800293, "logits/rejected": -17.252342224121094, "logps/chosen": 
-522.2815551757812, "logps/rejected": -341.63848876953125, "loss": 0.1754, "rewards/accuracies": 1.0, "rewards/chosen": 4.797511577606201, "rewards/margins": 2.64493989944458, "rewards/rejected": 2.1525719165802, "step": 32220 }, { "epoch": 1.4963554482566508, "grad_norm": 65.9505386352539, "learning_rate": 2.102688147082037e-07, "logits/chosen": -18.80202293395996, "logits/rejected": -17.605518341064453, "logps/chosen": -404.31005859375, "logps/rejected": -311.26611328125, "loss": 0.4279, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.266315460205078, "rewards/margins": 1.1271860599517822, "rewards/rejected": 3.139129400253296, "step": 32230 }, { "epoch": 1.4968197223640838, "grad_norm": 40.06132507324219, "learning_rate": 2.1024095826175773e-07, "logits/chosen": -18.990453720092773, "logits/rejected": -19.048246383666992, "logps/chosen": -416.19293212890625, "logps/rejected": -455.5299377441406, "loss": 1.2344, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.843287467956543, "rewards/margins": -0.13902291655540466, "rewards/rejected": 2.9823105335235596, "step": 32240 }, { "epoch": 1.4972839964715168, "grad_norm": 13.106121063232422, "learning_rate": 2.1021310181531175e-07, "logits/chosen": -18.151084899902344, "logits/rejected": -18.214492797851562, "logps/chosen": -435.8291015625, "logps/rejected": -379.43255615234375, "loss": 1.1175, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8894944190979004, "rewards/margins": -0.02049267292022705, "rewards/rejected": 2.909986972808838, "step": 32250 }, { "epoch": 1.4977482705789498, "grad_norm": 76.28713989257812, "learning_rate": 2.101852453688658e-07, "logits/chosen": -18.185237884521484, "logits/rejected": -17.789491653442383, "logps/chosen": -301.2671203613281, "logps/rejected": -278.38348388671875, "loss": 0.8393, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.093703031539917, "rewards/margins": 0.3192726969718933, "rewards/rejected": 2.774430513381958, 
"step": 32260 }, { "epoch": 1.4982125446863828, "grad_norm": 0.36958813667297363, "learning_rate": 2.1015738892241978e-07, "logits/chosen": -18.726192474365234, "logits/rejected": -17.930160522460938, "logps/chosen": -312.7028503417969, "logps/rejected": -242.51547241210938, "loss": 0.638, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.668710708618164, "rewards/margins": 0.951636791229248, "rewards/rejected": 1.7170740365982056, "step": 32270 }, { "epoch": 1.498676818793816, "grad_norm": 44.196197509765625, "learning_rate": 2.101295324759738e-07, "logits/chosen": -18.451826095581055, "logits/rejected": -17.521026611328125, "logps/chosen": -354.4380798339844, "logps/rejected": -216.4543914794922, "loss": 0.6976, "rewards/accuracies": 0.5, "rewards/chosen": 2.8313846588134766, "rewards/margins": 0.9104493260383606, "rewards/rejected": 1.9209353923797607, "step": 32280 }, { "epoch": 1.499141092901249, "grad_norm": 127.10201263427734, "learning_rate": 2.1010167602952783e-07, "logits/chosen": -19.414207458496094, "logits/rejected": -18.647438049316406, "logps/chosen": -447.70477294921875, "logps/rejected": -312.01202392578125, "loss": 0.5087, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.808232069015503, "rewards/margins": 0.9370228052139282, "rewards/rejected": 2.871209144592285, "step": 32290 }, { "epoch": 1.499605367008682, "grad_norm": 33.96914291381836, "learning_rate": 2.1007381958308185e-07, "logits/chosen": -18.275386810302734, "logits/rejected": -17.571735382080078, "logps/chosen": -501.4388732910156, "logps/rejected": -381.1626892089844, "loss": 0.7163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.598264217376709, "rewards/margins": 0.725361168384552, "rewards/rejected": 2.872903347015381, "step": 32300 }, { "epoch": 1.500069641116115, "grad_norm": 15.195165634155273, "learning_rate": 2.1004596313663586e-07, "logits/chosen": -17.381359100341797, "logits/rejected": -17.535673141479492, "logps/chosen": 
-224.92459106445312, "logps/rejected": -260.0594482421875, "loss": 0.6307, "rewards/accuracies": 0.5, "rewards/chosen": 2.280564785003662, "rewards/margins": 0.5711962580680847, "rewards/rejected": 1.7093684673309326, "step": 32310 }, { "epoch": 1.500533915223548, "grad_norm": 48.178897857666016, "learning_rate": 2.1001810669018988e-07, "logits/chosen": -19.227798461914062, "logits/rejected": -17.81370735168457, "logps/chosen": -498.32110595703125, "logps/rejected": -383.6756896972656, "loss": 0.4602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3763020038604736, "rewards/margins": 1.1917277574539185, "rewards/rejected": 2.1845743656158447, "step": 32320 }, { "epoch": 1.5009981893309812, "grad_norm": 32.455684661865234, "learning_rate": 2.0999025024374392e-07, "logits/chosen": -19.0673828125, "logits/rejected": -18.240137100219727, "logps/chosen": -375.3047790527344, "logps/rejected": -255.36026000976562, "loss": 0.3619, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.971766471862793, "rewards/margins": 1.3477554321289062, "rewards/rejected": 1.6240110397338867, "step": 32330 }, { "epoch": 1.501462463438414, "grad_norm": 43.30402374267578, "learning_rate": 2.099623937972979e-07, "logits/chosen": -19.387046813964844, "logits/rejected": -18.320505142211914, "logps/chosen": -437.8603515625, "logps/rejected": -332.65155029296875, "loss": 0.3698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.552841901779175, "rewards/margins": 1.2760899066925049, "rewards/rejected": 2.276752233505249, "step": 32340 }, { "epoch": 1.5019267375458472, "grad_norm": 0.7543087005615234, "learning_rate": 2.0993453735085192e-07, "logits/chosen": -19.178707122802734, "logits/rejected": -17.68756103515625, "logps/chosen": -453.24176025390625, "logps/rejected": -286.2154846191406, "loss": 0.288, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.094583511352539, "rewards/margins": 2.5631942749023438, "rewards/rejected": 1.5313891172409058, 
"step": 32350 }, { "epoch": 1.5023910116532802, "grad_norm": 66.69542694091797, "learning_rate": 2.0990668090440596e-07, "logits/chosen": -19.527706146240234, "logits/rejected": -19.083309173583984, "logps/chosen": -482.0126953125, "logps/rejected": -390.60650634765625, "loss": 0.6855, "rewards/accuracies": 0.5, "rewards/chosen": 3.430382251739502, "rewards/margins": 0.5807373523712158, "rewards/rejected": 2.849644422531128, "step": 32360 }, { "epoch": 1.5028552857607131, "grad_norm": 56.87986373901367, "learning_rate": 2.0987882445795998e-07, "logits/chosen": -18.21600341796875, "logits/rejected": -17.614065170288086, "logps/chosen": -430.70477294921875, "logps/rejected": -351.2572326660156, "loss": 0.692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.162388563156128, "rewards/margins": 0.9966734051704407, "rewards/rejected": 2.165715217590332, "step": 32370 }, { "epoch": 1.5033195598681461, "grad_norm": 71.15475463867188, "learning_rate": 2.0985096801151396e-07, "logits/chosen": -19.21660614013672, "logits/rejected": -17.678190231323242, "logps/chosen": -454.55859375, "logps/rejected": -293.3631591796875, "loss": 0.4541, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.179387331008911, "rewards/margins": 1.1204665899276733, "rewards/rejected": 2.058920383453369, "step": 32380 }, { "epoch": 1.5037838339755791, "grad_norm": 116.09486389160156, "learning_rate": 2.09823111565068e-07, "logits/chosen": -18.21773910522461, "logits/rejected": -17.90970230102539, "logps/chosen": -312.3570556640625, "logps/rejected": -282.77484130859375, "loss": 1.2683, "rewards/accuracies": 0.5, "rewards/chosen": 2.8538031578063965, "rewards/margins": 0.6198646426200867, "rewards/rejected": 2.233938694000244, "step": 32390 }, { "epoch": 1.5042481080830123, "grad_norm": 19.405433654785156, "learning_rate": 2.0979525511862202e-07, "logits/chosen": -19.1851863861084, "logits/rejected": -18.396013259887695, "logps/chosen": -318.78741455078125, "logps/rejected": 
-272.30694580078125, "loss": 0.7639, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0596396923065186, "rewards/margins": 0.9222111701965332, "rewards/rejected": 2.1374282836914062, "step": 32400 }, { "epoch": 1.5047123821904451, "grad_norm": 25.40256118774414, "learning_rate": 2.0976739867217606e-07, "logits/chosen": -19.494098663330078, "logits/rejected": -17.74026870727539, "logps/chosen": -497.7744140625, "logps/rejected": -355.1526184082031, "loss": 0.7849, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.491548538208008, "rewards/margins": 1.0439584255218506, "rewards/rejected": 2.447589874267578, "step": 32410 }, { "epoch": 1.5051766562978783, "grad_norm": 27.08336067199707, "learning_rate": 2.0973954222573005e-07, "logits/chosen": -20.012531280517578, "logits/rejected": -19.31220245361328, "logps/chosen": -471.82073974609375, "logps/rejected": -407.93072509765625, "loss": 0.8248, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5510685443878174, "rewards/margins": 0.651725172996521, "rewards/rejected": 2.899343967437744, "step": 32420 }, { "epoch": 1.5056409304053113, "grad_norm": 7.354687690734863, "learning_rate": 2.0971168577928406e-07, "logits/chosen": -18.336477279663086, "logits/rejected": -17.405393600463867, "logps/chosen": -345.4977722167969, "logps/rejected": -254.4864959716797, "loss": 0.7035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2204031944274902, "rewards/margins": 0.6971264481544495, "rewards/rejected": 1.5232768058776855, "step": 32430 }, { "epoch": 1.5061052045127443, "grad_norm": 85.47212982177734, "learning_rate": 2.096838293328381e-07, "logits/chosen": -19.81570816040039, "logits/rejected": -18.69538116455078, "logps/chosen": -417.725341796875, "logps/rejected": -364.5670471191406, "loss": 0.4892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8954672813415527, "rewards/margins": 0.6881855130195618, "rewards/rejected": 2.2072815895080566, "step": 32440 }, { 
"epoch": 1.5065694786201773, "grad_norm": 33.01730728149414, "learning_rate": 2.0965597288639212e-07, "logits/chosen": -18.06290054321289, "logits/rejected": -17.06270980834961, "logps/chosen": -367.26104736328125, "logps/rejected": -268.3866271972656, "loss": 0.2969, "rewards/accuracies": 1.0, "rewards/chosen": 3.445483684539795, "rewards/margins": 1.5769214630126953, "rewards/rejected": 1.8685623407363892, "step": 32450 }, { "epoch": 1.5070337527276103, "grad_norm": 123.19740295410156, "learning_rate": 2.0962811643994613e-07, "logits/chosen": -19.105384826660156, "logits/rejected": -16.987348556518555, "logps/chosen": -370.5514221191406, "logps/rejected": -250.1671905517578, "loss": 0.5171, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.162654399871826, "rewards/margins": 1.097172498703003, "rewards/rejected": 2.0654819011688232, "step": 32460 }, { "epoch": 1.5074980268350435, "grad_norm": 96.12068176269531, "learning_rate": 2.0960025999350015e-07, "logits/chosen": -18.456396102905273, "logits/rejected": -17.852540969848633, "logps/chosen": -427.79315185546875, "logps/rejected": -407.62115478515625, "loss": 0.9684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.149643898010254, "rewards/margins": 0.10525672137737274, "rewards/rejected": 3.044387102127075, "step": 32470 }, { "epoch": 1.5079623009424763, "grad_norm": 2.967818260192871, "learning_rate": 2.095724035470542e-07, "logits/chosen": -19.245838165283203, "logits/rejected": -19.080612182617188, "logps/chosen": -474.6007385253906, "logps/rejected": -416.4165954589844, "loss": 0.7491, "rewards/accuracies": 0.5, "rewards/chosen": 4.430130958557129, "rewards/margins": 1.2070280313491821, "rewards/rejected": 3.2231032848358154, "step": 32480 }, { "epoch": 1.5084265750499095, "grad_norm": 184.3001708984375, "learning_rate": 2.095445471006082e-07, "logits/chosen": -18.023290634155273, "logits/rejected": -18.06120491027832, "logps/chosen": -323.1230773925781, "logps/rejected": 
-355.4336242675781, "loss": 1.0029, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8451268672943115, "rewards/margins": -0.12145330756902695, "rewards/rejected": 2.9665799140930176, "step": 32490 }, { "epoch": 1.5088908491573425, "grad_norm": 38.405250549316406, "learning_rate": 2.095166906541622e-07, "logits/chosen": -18.08063316345215, "logits/rejected": -17.215192794799805, "logps/chosen": -333.1247253417969, "logps/rejected": -275.8087463378906, "loss": 0.6529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.761085271835327, "rewards/margins": 0.7495024800300598, "rewards/rejected": 2.0115833282470703, "step": 32500 }, { "epoch": 1.5093551232647755, "grad_norm": 66.85020446777344, "learning_rate": 2.0948883420771623e-07, "logits/chosen": -18.73407554626465, "logits/rejected": -17.879690170288086, "logps/chosen": -326.59710693359375, "logps/rejected": -258.3645324707031, "loss": 0.4837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5355024337768555, "rewards/margins": 1.2263827323913574, "rewards/rejected": 1.3091195821762085, "step": 32510 }, { "epoch": 1.5098193973722087, "grad_norm": 53.338443756103516, "learning_rate": 2.0946097776127025e-07, "logits/chosen": -18.93426513671875, "logits/rejected": -16.917469024658203, "logps/chosen": -548.61767578125, "logps/rejected": -369.82305908203125, "loss": 0.2245, "rewards/accuracies": 1.0, "rewards/chosen": 3.980931520462036, "rewards/margins": 2.4430432319641113, "rewards/rejected": 1.5378878116607666, "step": 32520 }, { "epoch": 1.5102836714796415, "grad_norm": 32.27399444580078, "learning_rate": 2.094331213148243e-07, "logits/chosen": -18.905380249023438, "logits/rejected": -17.996427536010742, "logps/chosen": -369.13836669921875, "logps/rejected": -276.7691650390625, "loss": 0.5161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9598705768585205, "rewards/margins": 1.9618898630142212, "rewards/rejected": 1.9979808330535889, "step": 32530 }, { "epoch": 
1.5107479455870747, "grad_norm": 203.1125946044922, "learning_rate": 2.0940526486837828e-07, "logits/chosen": -18.217479705810547, "logits/rejected": -16.927799224853516, "logps/chosen": -427.50506591796875, "logps/rejected": -267.7503356933594, "loss": 0.496, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.181838035583496, "rewards/margins": 1.5196634531021118, "rewards/rejected": 1.6621748208999634, "step": 32540 }, { "epoch": 1.5112122196945075, "grad_norm": 64.62944793701172, "learning_rate": 2.093774084219323e-07, "logits/chosen": -19.40239906311035, "logits/rejected": -17.855838775634766, "logps/chosen": -386.50823974609375, "logps/rejected": -353.29656982421875, "loss": 1.2353, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.543095350265503, "rewards/margins": -0.3886553645133972, "rewards/rejected": 2.931750774383545, "step": 32550 }, { "epoch": 1.5116764938019407, "grad_norm": 179.4225311279297, "learning_rate": 2.0934955197548633e-07, "logits/chosen": -17.81824493408203, "logits/rejected": -18.353239059448242, "logps/chosen": -437.9884338378906, "logps/rejected": -448.9615173339844, "loss": 1.1529, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.333604335784912, "rewards/margins": -0.08740997314453125, "rewards/rejected": 3.4210143089294434, "step": 32560 }, { "epoch": 1.5121407679093737, "grad_norm": 184.1869659423828, "learning_rate": 2.0932169552904032e-07, "logits/chosen": -18.60018539428711, "logits/rejected": -18.48033905029297, "logps/chosen": -323.35443115234375, "logps/rejected": -305.7093811035156, "loss": 0.8591, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4485394954681396, "rewards/margins": 0.2522831857204437, "rewards/rejected": 2.196256160736084, "step": 32570 }, { "epoch": 1.5126050420168067, "grad_norm": 40.071868896484375, "learning_rate": 2.0929383908259436e-07, "logits/chosen": -18.593963623046875, "logits/rejected": -17.913951873779297, "logps/chosen": -351.489501953125, 
"logps/rejected": -339.4510803222656, "loss": 0.3575, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.968008279800415, "rewards/margins": 1.0405112504959106, "rewards/rejected": 1.9274966716766357, "step": 32580 }, { "epoch": 1.51306931612424, "grad_norm": 60.88216018676758, "learning_rate": 2.0926598263614838e-07, "logits/chosen": -18.827428817749023, "logits/rejected": -17.963390350341797, "logps/chosen": -357.6772766113281, "logps/rejected": -340.0765075683594, "loss": 0.9978, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.951894998550415, "rewards/margins": 0.01609520986676216, "rewards/rejected": 2.9357995986938477, "step": 32590 }, { "epoch": 1.5135335902316727, "grad_norm": 65.54947662353516, "learning_rate": 2.092381261897024e-07, "logits/chosen": -19.282981872558594, "logits/rejected": -18.425411224365234, "logps/chosen": -412.93798828125, "logps/rejected": -323.0135803222656, "loss": 0.6562, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.97845196723938, "rewards/margins": 0.5173943042755127, "rewards/rejected": 2.461057662963867, "step": 32600 }, { "epoch": 1.5139978643391059, "grad_norm": 192.49208068847656, "learning_rate": 2.092102697432564e-07, "logits/chosen": -17.9434871673584, "logits/rejected": -16.791515350341797, "logps/chosen": -330.6258239746094, "logps/rejected": -220.5065460205078, "loss": 0.7218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.156371593475342, "rewards/margins": 0.5510016679763794, "rewards/rejected": 1.6053701639175415, "step": 32610 }, { "epoch": 1.5144621384465389, "grad_norm": 75.2369613647461, "learning_rate": 2.0918241329681042e-07, "logits/chosen": -18.418094635009766, "logits/rejected": -17.473737716674805, "logps/chosen": -410.786865234375, "logps/rejected": -264.18212890625, "loss": 0.5942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1425607204437256, "rewards/margins": 1.0685746669769287, "rewards/rejected": 2.073986530303955, "step": 32620 
}, { "epoch": 1.5149264125539719, "grad_norm": 39.52604675292969, "learning_rate": 2.0915455685036446e-07, "logits/chosen": -18.565439224243164, "logits/rejected": -17.77413558959961, "logps/chosen": -443.9519958496094, "logps/rejected": -396.5599060058594, "loss": 0.5206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2833733558654785, "rewards/margins": 0.9220040440559387, "rewards/rejected": 2.3613691329956055, "step": 32630 }, { "epoch": 1.5153906866614049, "grad_norm": 44.79337692260742, "learning_rate": 2.0912670040391847e-07, "logits/chosen": -18.008529663085938, "logits/rejected": -18.326940536499023, "logps/chosen": -355.9733581542969, "logps/rejected": -356.1192321777344, "loss": 1.0844, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.745164394378662, "rewards/margins": -0.10679116100072861, "rewards/rejected": 2.8519554138183594, "step": 32640 }, { "epoch": 1.5158549607688379, "grad_norm": 118.66249084472656, "learning_rate": 2.0909884395747246e-07, "logits/chosen": -18.608154296875, "logits/rejected": -18.69158172607422, "logps/chosen": -338.1162109375, "logps/rejected": -339.6867980957031, "loss": 0.736, "rewards/accuracies": 0.5, "rewards/chosen": 2.2011067867279053, "rewards/margins": 0.5252484679222107, "rewards/rejected": 1.6758582592010498, "step": 32650 }, { "epoch": 1.516319234876271, "grad_norm": 33.96427917480469, "learning_rate": 2.090709875110265e-07, "logits/chosen": -17.647689819335938, "logits/rejected": -17.012928009033203, "logps/chosen": -466.5406799316406, "logps/rejected": -331.5102844238281, "loss": 0.78, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1701231002807617, "rewards/margins": 1.071875810623169, "rewards/rejected": 2.0982470512390137, "step": 32660 }, { "epoch": 1.5167835089837038, "grad_norm": 63.16472244262695, "learning_rate": 2.0904313106458052e-07, "logits/chosen": -17.967435836791992, "logits/rejected": -17.56739044189453, "logps/chosen": -325.23583984375, "logps/rejected": 
-289.45782470703125, "loss": 0.4103, "rewards/accuracies": 1.0, "rewards/chosen": 2.913527488708496, "rewards/margins": 0.9745838046073914, "rewards/rejected": 1.93894362449646, "step": 32670 }, { "epoch": 1.517247783091137, "grad_norm": 143.75021362304688, "learning_rate": 2.0901527461813456e-07, "logits/chosen": -18.101289749145508, "logits/rejected": -17.831249237060547, "logps/chosen": -400.1369323730469, "logps/rejected": -343.0171813964844, "loss": 0.933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.637425184249878, "rewards/margins": 0.5578073263168335, "rewards/rejected": 3.079617977142334, "step": 32680 }, { "epoch": 1.51771205719857, "grad_norm": 3.994138479232788, "learning_rate": 2.0898741817168855e-07, "logits/chosen": -18.53281021118164, "logits/rejected": -18.131641387939453, "logps/chosen": -321.77545166015625, "logps/rejected": -277.22747802734375, "loss": 0.8227, "rewards/accuracies": 0.5, "rewards/chosen": 2.6631274223327637, "rewards/margins": 0.4596555829048157, "rewards/rejected": 2.2034716606140137, "step": 32690 }, { "epoch": 1.518176331306003, "grad_norm": 15.491704940795898, "learning_rate": 2.0895956172524256e-07, "logits/chosen": -18.57919692993164, "logits/rejected": -16.840038299560547, "logps/chosen": -386.4084167480469, "logps/rejected": -184.3760986328125, "loss": 0.252, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.493546962738037, "rewards/margins": 2.0907247066497803, "rewards/rejected": 1.4028223752975464, "step": 32700 }, { "epoch": 1.518640605413436, "grad_norm": 10.38105297088623, "learning_rate": 2.089317052787966e-07, "logits/chosen": -19.341938018798828, "logits/rejected": -19.09701156616211, "logps/chosen": -382.9600524902344, "logps/rejected": -364.7884216308594, "loss": 0.5334, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7337448596954346, "rewards/margins": 0.8503837585449219, "rewards/rejected": 2.8833603858947754, "step": 32710 }, { "epoch": 1.519104879520869, 
"grad_norm": 63.694210052490234, "learning_rate": 2.0890384883235062e-07, "logits/chosen": -19.46230697631836, "logits/rejected": -18.666351318359375, "logps/chosen": -499.80462646484375, "logps/rejected": -412.28717041015625, "loss": 0.4247, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.882455348968506, "rewards/margins": 1.323918104171753, "rewards/rejected": 2.558537244796753, "step": 32720 }, { "epoch": 1.5195691536283022, "grad_norm": 1.436821460723877, "learning_rate": 2.0887599238590463e-07, "logits/chosen": -18.447175979614258, "logits/rejected": -18.54888153076172, "logps/chosen": -395.806884765625, "logps/rejected": -406.7031555175781, "loss": 0.7742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7674221992492676, "rewards/margins": 0.4640451967716217, "rewards/rejected": 3.3033766746520996, "step": 32730 }, { "epoch": 1.520033427735735, "grad_norm": 99.82229614257812, "learning_rate": 2.0884813593945865e-07, "logits/chosen": -19.609806060791016, "logits/rejected": -19.749296188354492, "logps/chosen": -417.00927734375, "logps/rejected": -430.65435791015625, "loss": 0.9911, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6553587913513184, "rewards/margins": 0.266653448343277, "rewards/rejected": 3.3887054920196533, "step": 32740 }, { "epoch": 1.5204977018431682, "grad_norm": 28.282886505126953, "learning_rate": 2.088202794930127e-07, "logits/chosen": -18.118896484375, "logits/rejected": -18.060077667236328, "logps/chosen": -394.130859375, "logps/rejected": -322.52996826171875, "loss": 0.7105, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.265024423599243, "rewards/margins": 0.20161566138267517, "rewards/rejected": 2.063408613204956, "step": 32750 }, { "epoch": 1.5209619759506012, "grad_norm": 143.8983612060547, "learning_rate": 2.0879242304656668e-07, "logits/chosen": -19.167606353759766, "logits/rejected": -18.343997955322266, "logps/chosen": -485.37432861328125, "logps/rejected": 
-369.66278076171875, "loss": 0.4865, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.184731960296631, "rewards/margins": 1.2815676927566528, "rewards/rejected": 2.9031646251678467, "step": 32760 }, { "epoch": 1.5214262500580342, "grad_norm": 27.42194175720215, "learning_rate": 2.087645666001207e-07, "logits/chosen": -18.765274047851562, "logits/rejected": -19.28223991394043, "logps/chosen": -324.36248779296875, "logps/rejected": -276.67498779296875, "loss": 0.7084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.242696523666382, "rewards/margins": 0.3608773350715637, "rewards/rejected": 1.8818190097808838, "step": 32770 }, { "epoch": 1.5218905241654674, "grad_norm": 50.5468635559082, "learning_rate": 2.0873671015367473e-07, "logits/chosen": -19.213537216186523, "logits/rejected": -17.823659896850586, "logps/chosen": -391.77764892578125, "logps/rejected": -202.89259338378906, "loss": 0.2351, "rewards/accuracies": 1.0, "rewards/chosen": 3.0560834407806396, "rewards/margins": 1.8613179922103882, "rewards/rejected": 1.194765329360962, "step": 32780 }, { "epoch": 1.5223547982729002, "grad_norm": 40.599090576171875, "learning_rate": 2.0870885370722875e-07, "logits/chosen": -18.642515182495117, "logits/rejected": -17.277469635009766, "logps/chosen": -389.1434631347656, "logps/rejected": -336.1129455566406, "loss": 0.7875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6957626342773438, "rewards/margins": 0.18079212307929993, "rewards/rejected": 2.514970302581787, "step": 32790 }, { "epoch": 1.5228190723803334, "grad_norm": 139.18313598632812, "learning_rate": 2.0868099726078273e-07, "logits/chosen": -19.165882110595703, "logits/rejected": -18.668678283691406, "logps/chosen": -404.66717529296875, "logps/rejected": -359.3720703125, "loss": 0.6643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.870079517364502, "rewards/margins": 0.674296498298645, "rewards/rejected": 2.1957831382751465, "step": 32800 }, { "epoch": 
1.5232833464877664, "grad_norm": 214.218505859375, "learning_rate": 2.0865314081433677e-07, "logits/chosen": -18.278671264648438, "logits/rejected": -16.939498901367188, "logps/chosen": -423.34063720703125, "logps/rejected": -324.7577819824219, "loss": 0.6802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.485480785369873, "rewards/margins": 0.8424087762832642, "rewards/rejected": 2.6430726051330566, "step": 32810 }, { "epoch": 1.5237476205951994, "grad_norm": 64.00043487548828, "learning_rate": 2.086252843678908e-07, "logits/chosen": -18.995960235595703, "logits/rejected": -18.580366134643555, "logps/chosen": -410.9532775878906, "logps/rejected": -438.2635192871094, "loss": 0.7632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3397247791290283, "rewards/margins": 0.23881717026233673, "rewards/rejected": 3.100907802581787, "step": 32820 }, { "epoch": 1.5242118947026324, "grad_norm": 30.0983943939209, "learning_rate": 2.0859742792144483e-07, "logits/chosen": -19.64239501953125, "logits/rejected": -18.30855941772461, "logps/chosen": -414.0636291503906, "logps/rejected": -320.4020080566406, "loss": 0.3903, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.119312286376953, "rewards/margins": 1.3573287725448608, "rewards/rejected": 2.7619831562042236, "step": 32830 }, { "epoch": 1.5246761688100654, "grad_norm": 79.05756378173828, "learning_rate": 2.0856957147499882e-07, "logits/chosen": -18.346710205078125, "logits/rejected": -17.183115005493164, "logps/chosen": -426.4327697753906, "logps/rejected": -283.82086181640625, "loss": 0.4207, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5127804279327393, "rewards/margins": 1.2594858407974243, "rewards/rejected": 2.2532944679260254, "step": 32840 }, { "epoch": 1.5251404429174986, "grad_norm": 1.9019081592559814, "learning_rate": 2.0854171502855283e-07, "logits/chosen": -18.614606857299805, "logits/rejected": -17.835744857788086, "logps/chosen": -361.36761474609375, 
"logps/rejected": -350.90899658203125, "loss": 0.5867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7495009899139404, "rewards/margins": 1.1534860134124756, "rewards/rejected": 2.5960147380828857, "step": 32850 }, { "epoch": 1.5256047170249314, "grad_norm": 197.17262268066406, "learning_rate": 2.0851385858210687e-07, "logits/chosen": -18.493175506591797, "logits/rejected": -17.787460327148438, "logps/chosen": -434.06793212890625, "logps/rejected": -376.43865966796875, "loss": 0.9276, "rewards/accuracies": 0.5, "rewards/chosen": 3.7803826332092285, "rewards/margins": 0.33033856749534607, "rewards/rejected": 3.4500439167022705, "step": 32860 }, { "epoch": 1.5260689911323646, "grad_norm": 19.8391056060791, "learning_rate": 2.084860021356609e-07, "logits/chosen": -17.995023727416992, "logits/rejected": -17.51653480529785, "logps/chosen": -317.34234619140625, "logps/rejected": -306.0332336425781, "loss": 0.7048, "rewards/accuracies": 0.5, "rewards/chosen": 2.8697006702423096, "rewards/margins": 0.6706327199935913, "rewards/rejected": 2.1990678310394287, "step": 32870 }, { "epoch": 1.5265332652397976, "grad_norm": 188.56777954101562, "learning_rate": 2.084581456892149e-07, "logits/chosen": -18.099618911743164, "logits/rejected": -18.171192169189453, "logps/chosen": -419.5267028808594, "logps/rejected": -371.83953857421875, "loss": 0.931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.151548385620117, "rewards/margins": 0.23006577789783478, "rewards/rejected": 2.921482801437378, "step": 32880 }, { "epoch": 1.5269975393472306, "grad_norm": 69.99440002441406, "learning_rate": 2.0843028924276892e-07, "logits/chosen": -18.235599517822266, "logits/rejected": -17.46596908569336, "logps/chosen": -523.2996826171875, "logps/rejected": -383.6531066894531, "loss": 0.4819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.593722105026245, "rewards/margins": 1.1946436166763306, "rewards/rejected": 2.399078607559204, "step": 32890 }, { 
"epoch": 1.5274618134546636, "grad_norm": 14.591495513916016, "learning_rate": 2.0840243279632296e-07, "logits/chosen": -18.806133270263672, "logits/rejected": -17.906009674072266, "logps/chosen": -346.477294921875, "logps/rejected": -318.7830505371094, "loss": 0.7382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.257776975631714, "rewards/margins": 0.9001725316047668, "rewards/rejected": 2.357604503631592, "step": 32900 }, { "epoch": 1.5279260875620966, "grad_norm": 8.484469413757324, "learning_rate": 2.0837457634987697e-07, "logits/chosen": -19.068822860717773, "logits/rejected": -18.302738189697266, "logps/chosen": -386.30206298828125, "logps/rejected": -291.1158752441406, "loss": 0.4749, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.722160816192627, "rewards/margins": 1.2145490646362305, "rewards/rejected": 2.5076117515563965, "step": 32910 }, { "epoch": 1.5283903616695298, "grad_norm": 60.192138671875, "learning_rate": 2.0834671990343096e-07, "logits/chosen": -18.472152709960938, "logits/rejected": -17.82516098022461, "logps/chosen": -248.8355255126953, "logps/rejected": -198.1175537109375, "loss": 0.6963, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1914987564086914, "rewards/margins": 0.5992485880851746, "rewards/rejected": 1.592250108718872, "step": 32920 }, { "epoch": 1.5288546357769626, "grad_norm": 26.80160903930664, "learning_rate": 2.08318863456985e-07, "logits/chosen": -18.538646697998047, "logits/rejected": -18.049144744873047, "logps/chosen": -461.0924377441406, "logps/rejected": -341.2920837402344, "loss": 0.7716, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.550856351852417, "rewards/margins": 1.136267900466919, "rewards/rejected": 2.414588451385498, "step": 32930 }, { "epoch": 1.5293189098843958, "grad_norm": 7.347090721130371, "learning_rate": 2.0829100701053902e-07, "logits/chosen": -19.68670082092285, "logits/rejected": -18.033782958984375, "logps/chosen": -376.55413818359375, 
"logps/rejected": -278.172607421875, "loss": 0.482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5455868244171143, "rewards/margins": 1.7208236455917358, "rewards/rejected": 1.824763298034668, "step": 32940 }, { "epoch": 1.5297831839918288, "grad_norm": 16.28730010986328, "learning_rate": 2.08263150564093e-07, "logits/chosen": -18.7012882232666, "logits/rejected": -17.69197654724121, "logps/chosen": -385.9765625, "logps/rejected": -271.73760986328125, "loss": 0.4281, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7242305278778076, "rewards/margins": 0.7353874444961548, "rewards/rejected": 1.9888432025909424, "step": 32950 }, { "epoch": 1.5302474580992618, "grad_norm": 52.97993087768555, "learning_rate": 2.0823529411764705e-07, "logits/chosen": -19.604909896850586, "logits/rejected": -18.05776596069336, "logps/chosen": -316.3229675292969, "logps/rejected": -246.70816040039062, "loss": 0.3621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.649596691131592, "rewards/margins": 1.8744865655899048, "rewards/rejected": 1.7751100063323975, "step": 32960 }, { "epoch": 1.530711732206695, "grad_norm": 163.8486328125, "learning_rate": 2.0820743767120106e-07, "logits/chosen": -18.88108253479004, "logits/rejected": -18.570016860961914, "logps/chosen": -515.8074951171875, "logps/rejected": -479.290283203125, "loss": 0.556, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.169939994812012, "rewards/margins": 0.7716645002365112, "rewards/rejected": 3.398275852203369, "step": 32970 }, { "epoch": 1.5311760063141278, "grad_norm": 129.21746826171875, "learning_rate": 2.081795812247551e-07, "logits/chosen": -19.14445686340332, "logits/rejected": -18.76540184020996, "logps/chosen": -473.13421630859375, "logps/rejected": -503.34918212890625, "loss": 0.5065, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.004490852355957, "rewards/margins": 0.9227266311645508, "rewards/rejected": 2.0817644596099854, "step": 32980 }, { 
"epoch": 1.531640280421561, "grad_norm": 0.20188309252262115, "learning_rate": 2.081517247783091e-07, "logits/chosen": -18.542497634887695, "logits/rejected": -17.70680809020996, "logps/chosen": -482.0413513183594, "logps/rejected": -328.97552490234375, "loss": 0.6406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3005478382110596, "rewards/margins": 0.8370424509048462, "rewards/rejected": 2.463505506515503, "step": 32990 }, { "epoch": 1.532104554528994, "grad_norm": 110.06053924560547, "learning_rate": 2.0812386833186313e-07, "logits/chosen": -18.642866134643555, "logits/rejected": -18.669343948364258, "logps/chosen": -474.21734619140625, "logps/rejected": -522.3370361328125, "loss": 0.8634, "rewards/accuracies": 0.5, "rewards/chosen": 3.7704830169677734, "rewards/margins": -0.17974238097667694, "rewards/rejected": 3.950225353240967, "step": 33000 }, { "epoch": 1.532568828636427, "grad_norm": 34.3568115234375, "learning_rate": 2.0809601188541714e-07, "logits/chosen": -18.724143981933594, "logits/rejected": -18.837078094482422, "logps/chosen": -349.33709716796875, "logps/rejected": -257.6308288574219, "loss": 0.8596, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5834801197052, "rewards/margins": -0.03282642364501953, "rewards/rejected": 2.616306781768799, "step": 33010 }, { "epoch": 1.53303310274386, "grad_norm": 52.90473175048828, "learning_rate": 2.0806815543897116e-07, "logits/chosen": -19.134143829345703, "logits/rejected": -19.22058868408203, "logps/chosen": -383.1363220214844, "logps/rejected": -373.6663513183594, "loss": 0.7082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.999040126800537, "rewards/margins": 0.3321065604686737, "rewards/rejected": 3.666933536529541, "step": 33020 }, { "epoch": 1.533497376851293, "grad_norm": 32.076568603515625, "learning_rate": 2.0804029899252517e-07, "logits/chosen": -18.21120834350586, "logits/rejected": -17.5289363861084, "logps/chosen": -386.82196044921875, 
"logps/rejected": -253.24649047851562, "loss": 0.3785, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.599552631378174, "rewards/margins": 1.3829030990600586, "rewards/rejected": 1.2166495323181152, "step": 33030 }, { "epoch": 1.5339616509587262, "grad_norm": 14.878469467163086, "learning_rate": 2.080124425460792e-07, "logits/chosen": -19.54071617126465, "logits/rejected": -18.495342254638672, "logps/chosen": -279.4892578125, "logps/rejected": -190.6057891845703, "loss": 0.4088, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.669499635696411, "rewards/margins": 1.3092067241668701, "rewards/rejected": 1.3602930307388306, "step": 33040 }, { "epoch": 1.534425925066159, "grad_norm": 52.40470504760742, "learning_rate": 2.0798458609963323e-07, "logits/chosen": -18.210716247558594, "logits/rejected": -17.44923973083496, "logps/chosen": -530.399658203125, "logps/rejected": -374.9842834472656, "loss": 0.8985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8611278533935547, "rewards/margins": 0.9408364295959473, "rewards/rejected": 2.9202911853790283, "step": 33050 }, { "epoch": 1.5348901991735922, "grad_norm": 114.12065887451172, "learning_rate": 2.0795672965318724e-07, "logits/chosen": -18.47536849975586, "logits/rejected": -18.186960220336914, "logps/chosen": -320.5271911621094, "logps/rejected": -244.06924438476562, "loss": 0.759, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.241616725921631, "rewards/margins": 0.6129941940307617, "rewards/rejected": 1.6286224126815796, "step": 33060 }, { "epoch": 1.5353544732810251, "grad_norm": 100.51203155517578, "learning_rate": 2.0792887320674123e-07, "logits/chosen": -18.997615814208984, "logits/rejected": -18.058080673217773, "logps/chosen": -412.74664306640625, "logps/rejected": -308.04962158203125, "loss": 0.4434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.423819065093994, "rewards/margins": 1.7429115772247314, "rewards/rejected": 1.6809072494506836, 
"step": 33070 }, { "epoch": 1.5358187473884581, "grad_norm": 254.78204345703125, "learning_rate": 2.0790101676029527e-07, "logits/chosen": -18.195499420166016, "logits/rejected": -18.452497482299805, "logps/chosen": -346.1689453125, "logps/rejected": -353.29461669921875, "loss": 0.8627, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.920447826385498, "rewards/margins": 0.2712958753108978, "rewards/rejected": 2.6491518020629883, "step": 33080 }, { "epoch": 1.5362830214958911, "grad_norm": 56.16845703125, "learning_rate": 2.078731603138493e-07, "logits/chosen": -17.781654357910156, "logits/rejected": -17.326677322387695, "logps/chosen": -277.1260070800781, "logps/rejected": -222.4650115966797, "loss": 0.5942, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.536625862121582, "rewards/margins": 0.8567218780517578, "rewards/rejected": 1.6799042224884033, "step": 33090 }, { "epoch": 1.5367472956033241, "grad_norm": 94.71257019042969, "learning_rate": 2.0784530386740333e-07, "logits/chosen": -18.309528350830078, "logits/rejected": -18.06032371520996, "logps/chosen": -374.4906921386719, "logps/rejected": -337.6249084472656, "loss": 1.0778, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.20617413520813, "rewards/margins": -0.13658197224140167, "rewards/rejected": 3.3427562713623047, "step": 33100 }, { "epoch": 1.5372115697107573, "grad_norm": 69.77498626708984, "learning_rate": 2.0781744742095732e-07, "logits/chosen": -19.68067741394043, "logits/rejected": -19.383451461791992, "logps/chosen": -408.81488037109375, "logps/rejected": -365.10028076171875, "loss": 0.6191, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.245779037475586, "rewards/margins": 0.8227829933166504, "rewards/rejected": 2.4229960441589355, "step": 33110 }, { "epoch": 1.5376758438181901, "grad_norm": 65.05476379394531, "learning_rate": 2.0778959097451133e-07, "logits/chosen": -19.48164176940918, "logits/rejected": -18.185047149658203, "logps/chosen": 
-444.0657653808594, "logps/rejected": -297.63104248046875, "loss": 0.3248, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.648533582687378, "rewards/margins": 1.5457350015640259, "rewards/rejected": 2.1027989387512207, "step": 33120 }, { "epoch": 1.5381401179256233, "grad_norm": 320.4989013671875, "learning_rate": 2.0776173452806537e-07, "logits/chosen": -17.984020233154297, "logits/rejected": -18.12226104736328, "logps/chosen": -384.50335693359375, "logps/rejected": -391.6968078613281, "loss": 1.2126, "rewards/accuracies": 0.5, "rewards/chosen": 2.905500888824463, "rewards/margins": -0.06348326057195663, "rewards/rejected": 2.9689841270446777, "step": 33130 }, { "epoch": 1.5386043920330563, "grad_norm": 28.47395133972168, "learning_rate": 2.0773387808161936e-07, "logits/chosen": -17.751840591430664, "logits/rejected": -17.50528335571289, "logps/chosen": -362.19903564453125, "logps/rejected": -325.7688293457031, "loss": 0.7926, "rewards/accuracies": 0.5, "rewards/chosen": 2.1757946014404297, "rewards/margins": 0.31245437264442444, "rewards/rejected": 1.863340139389038, "step": 33140 }, { "epoch": 1.5390686661404893, "grad_norm": 69.96183013916016, "learning_rate": 2.077060216351734e-07, "logits/chosen": -18.604333877563477, "logits/rejected": -18.291244506835938, "logps/chosen": -381.3096618652344, "logps/rejected": -319.40045166015625, "loss": 0.6949, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4493021965026855, "rewards/margins": 0.6577877998352051, "rewards/rejected": 2.7915146350860596, "step": 33150 }, { "epoch": 1.5395329402479225, "grad_norm": 246.18641662597656, "learning_rate": 2.0767816518872742e-07, "logits/chosen": -19.177284240722656, "logits/rejected": -18.586212158203125, "logps/chosen": -318.4656982421875, "logps/rejected": -289.238037109375, "loss": 0.7469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.673438549041748, "rewards/margins": 0.9720079302787781, "rewards/rejected": 2.7014307975769043, 
"step": 33160 }, { "epoch": 1.5399972143553553, "grad_norm": 141.15228271484375, "learning_rate": 2.0765030874228146e-07, "logits/chosen": -18.255338668823242, "logits/rejected": -16.679641723632812, "logps/chosen": -448.38037109375, "logps/rejected": -252.81198120117188, "loss": 0.3204, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.143197774887085, "rewards/margins": 1.8026371002197266, "rewards/rejected": 1.3405606746673584, "step": 33170 }, { "epoch": 1.5404614884627885, "grad_norm": 97.58763885498047, "learning_rate": 2.0762245229583544e-07, "logits/chosen": -18.158159255981445, "logits/rejected": -17.34158706665039, "logps/chosen": -554.1194458007812, "logps/rejected": -351.8876037597656, "loss": 0.4607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4111106395721436, "rewards/margins": 1.2228792905807495, "rewards/rejected": 2.1882317066192627, "step": 33180 }, { "epoch": 1.5409257625702215, "grad_norm": 31.941909790039062, "learning_rate": 2.0759459584938946e-07, "logits/chosen": -19.44301414489746, "logits/rejected": -18.46877670288086, "logps/chosen": -370.09783935546875, "logps/rejected": -252.0382843017578, "loss": 0.5396, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.317456007003784, "rewards/margins": 1.284106969833374, "rewards/rejected": 2.033348798751831, "step": 33190 }, { "epoch": 1.5413900366776545, "grad_norm": 111.08565521240234, "learning_rate": 2.075667394029435e-07, "logits/chosen": -18.54563331604004, "logits/rejected": -17.318195343017578, "logps/chosen": -422.29351806640625, "logps/rejected": -305.68304443359375, "loss": 0.9348, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8068981170654297, "rewards/margins": 0.46243876218795776, "rewards/rejected": 3.344459056854248, "step": 33200 }, { "epoch": 1.5418543107850875, "grad_norm": 74.38764190673828, "learning_rate": 2.0753888295649751e-07, "logits/chosen": -18.428396224975586, "logits/rejected": -18.124103546142578, "logps/chosen": 
-422.17205810546875, "logps/rejected": -423.68951416015625, "loss": 0.921, "rewards/accuracies": 0.5, "rewards/chosen": 4.081051826477051, "rewards/margins": 0.4036688208580017, "rewards/rejected": 3.6773829460144043, "step": 33210 }, { "epoch": 1.5423185848925205, "grad_norm": 147.60496520996094, "learning_rate": 2.075110265100515e-07, "logits/chosen": -17.98990821838379, "logits/rejected": -18.183879852294922, "logps/chosen": -387.07891845703125, "logps/rejected": -408.90057373046875, "loss": 0.9995, "rewards/accuracies": 0.5, "rewards/chosen": 2.9655508995056152, "rewards/margins": -0.31131526827812195, "rewards/rejected": 3.2768657207489014, "step": 33220 }, { "epoch": 1.5427828589999537, "grad_norm": 172.44198608398438, "learning_rate": 2.0748317006360554e-07, "logits/chosen": -18.223649978637695, "logits/rejected": -18.001602172851562, "logps/chosen": -357.7863464355469, "logps/rejected": -276.02276611328125, "loss": 0.7865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.435960054397583, "rewards/margins": 0.5647575259208679, "rewards/rejected": 1.8712027072906494, "step": 33230 }, { "epoch": 1.5432471331073865, "grad_norm": 45.714595794677734, "learning_rate": 2.0745531361715956e-07, "logits/chosen": -18.521480560302734, "logits/rejected": -18.264972686767578, "logps/chosen": -428.3661193847656, "logps/rejected": -349.8975524902344, "loss": 0.5067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6097054481506348, "rewards/margins": 0.7812966108322144, "rewards/rejected": 1.828409194946289, "step": 33240 }, { "epoch": 1.5437114072148197, "grad_norm": 138.6682891845703, "learning_rate": 2.074274571707136e-07, "logits/chosen": -18.20187759399414, "logits/rejected": -17.827194213867188, "logps/chosen": -330.3162536621094, "logps/rejected": -259.968994140625, "loss": 0.6773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7856173515319824, "rewards/margins": 0.8436403274536133, "rewards/rejected": 1.9419772624969482, 
"step": 33250 }, { "epoch": 1.5441756813222527, "grad_norm": 129.20263671875, "learning_rate": 2.073996007242676e-07, "logits/chosen": -18.839641571044922, "logits/rejected": -17.52267074584961, "logps/chosen": -406.374755859375, "logps/rejected": -273.1787414550781, "loss": 0.6966, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.11535382270813, "rewards/margins": 1.0019502639770508, "rewards/rejected": 2.1134033203125, "step": 33260 }, { "epoch": 1.5446399554296857, "grad_norm": 130.80230712890625, "learning_rate": 2.073717442778216e-07, "logits/chosen": -18.942960739135742, "logits/rejected": -17.676334381103516, "logps/chosen": -441.56585693359375, "logps/rejected": -286.1493835449219, "loss": 0.817, "rewards/accuracies": 0.5, "rewards/chosen": 3.4951980113983154, "rewards/margins": 1.0112230777740479, "rewards/rejected": 2.4839751720428467, "step": 33270 }, { "epoch": 1.5451042295371187, "grad_norm": 113.1387939453125, "learning_rate": 2.0734388783137564e-07, "logits/chosen": -19.26471710205078, "logits/rejected": -18.87708282470703, "logps/chosen": -423.76348876953125, "logps/rejected": -398.79278564453125, "loss": 0.7344, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8924179077148438, "rewards/margins": 0.2541280686855316, "rewards/rejected": 2.6382899284362793, "step": 33280 }, { "epoch": 1.5455685036445517, "grad_norm": 2.2887089252471924, "learning_rate": 2.0731603138492966e-07, "logits/chosen": -18.562877655029297, "logits/rejected": -18.459121704101562, "logps/chosen": -473.6947326660156, "logps/rejected": -379.1242980957031, "loss": 0.5673, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.025462627410889, "rewards/margins": 0.8106240034103394, "rewards/rejected": 3.214838743209839, "step": 33290 }, { "epoch": 1.5460327777519849, "grad_norm": 163.89842224121094, "learning_rate": 2.0728817493848367e-07, "logits/chosen": -18.245296478271484, "logits/rejected": -18.409313201904297, "logps/chosen": 
-327.12615966796875, "logps/rejected": -326.2602233886719, "loss": 1.3481, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5336201190948486, "rewards/margins": -0.3901331424713135, "rewards/rejected": 2.923753261566162, "step": 33300 }, { "epoch": 1.5464970518594177, "grad_norm": 29.223236083984375, "learning_rate": 2.072603184920377e-07, "logits/chosen": -18.76334571838379, "logits/rejected": -18.645435333251953, "logps/chosen": -394.95806884765625, "logps/rejected": -357.6246032714844, "loss": 0.4666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.571528911590576, "rewards/margins": 1.1360821723937988, "rewards/rejected": 2.4354472160339355, "step": 33310 }, { "epoch": 1.5469613259668509, "grad_norm": 8.797460556030273, "learning_rate": 2.0723246204559173e-07, "logits/chosen": -18.602510452270508, "logits/rejected": -17.819740295410156, "logps/chosen": -491.804443359375, "logps/rejected": -452.6858825683594, "loss": 0.8664, "rewards/accuracies": 0.5, "rewards/chosen": 4.03234338760376, "rewards/margins": 0.24009709060192108, "rewards/rejected": 3.7922463417053223, "step": 33320 }, { "epoch": 1.5474256000742839, "grad_norm": 6.976042747497559, "learning_rate": 2.0720460559914572e-07, "logits/chosen": -18.731586456298828, "logits/rejected": -18.343502044677734, "logps/chosen": -318.9454040527344, "logps/rejected": -291.8978576660156, "loss": 0.9777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6684346199035645, "rewards/margins": 0.4314401149749756, "rewards/rejected": 2.2369942665100098, "step": 33330 }, { "epoch": 1.5478898741817169, "grad_norm": 103.79195404052734, "learning_rate": 2.0717674915269973e-07, "logits/chosen": -18.405086517333984, "logits/rejected": -18.339380264282227, "logps/chosen": -242.113037109375, "logps/rejected": -222.52511596679688, "loss": 0.5427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8167788982391357, "rewards/margins": 0.5115436315536499, "rewards/rejected": 
1.3052351474761963, "step": 33340 }, { "epoch": 1.54835414828915, "grad_norm": 42.676822662353516, "learning_rate": 2.0714889270625377e-07, "logits/chosen": -19.572351455688477, "logits/rejected": -18.61165428161621, "logps/chosen": -349.23846435546875, "logps/rejected": -307.5798645019531, "loss": 1.0372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.841745615005493, "rewards/margins": 0.22341246902942657, "rewards/rejected": 2.618333101272583, "step": 33350 }, { "epoch": 1.5488184223965829, "grad_norm": 16.133485794067383, "learning_rate": 2.0712103625980779e-07, "logits/chosen": -17.685626983642578, "logits/rejected": -17.007591247558594, "logps/chosen": -306.4634704589844, "logps/rejected": -224.48025512695312, "loss": 0.7701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4427828788757324, "rewards/margins": 0.39641493558883667, "rewards/rejected": 2.04636812210083, "step": 33360 }, { "epoch": 1.549282696504016, "grad_norm": 5.466521263122559, "learning_rate": 2.0709317981336177e-07, "logits/chosen": -17.740087509155273, "logits/rejected": -17.04755973815918, "logps/chosen": -264.2160339355469, "logps/rejected": -189.7963104248047, "loss": 0.3519, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.2397663593292236, "rewards/margins": 1.1830981969833374, "rewards/rejected": 1.0566679239273071, "step": 33370 }, { "epoch": 1.5497469706114488, "grad_norm": 53.56270980834961, "learning_rate": 2.0706532336691582e-07, "logits/chosen": -19.896514892578125, "logits/rejected": -17.635295867919922, "logps/chosen": -380.5441589355469, "logps/rejected": -288.0305480957031, "loss": 0.6404, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.774688243865967, "rewards/margins": 0.9431379437446594, "rewards/rejected": 1.8315509557724, "step": 33380 }, { "epoch": 1.550211244718882, "grad_norm": 2.4230034351348877, "learning_rate": 2.0703746692046983e-07, "logits/chosen": -18.861652374267578, "logits/rejected": 
-18.358715057373047, "logps/chosen": -411.7398986816406, "logps/rejected": -388.190185546875, "loss": 1.0421, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1965670585632324, "rewards/margins": 0.48299533128738403, "rewards/rejected": 2.713571548461914, "step": 33390 }, { "epoch": 1.550675518826315, "grad_norm": 1.6325263977050781, "learning_rate": 2.0700961047402387e-07, "logits/chosen": -19.607309341430664, "logits/rejected": -18.745939254760742, "logps/chosen": -360.87579345703125, "logps/rejected": -313.7952880859375, "loss": 0.6758, "rewards/accuracies": 0.5, "rewards/chosen": 3.621351718902588, "rewards/margins": 0.9307981729507446, "rewards/rejected": 2.6905531883239746, "step": 33400 }, { "epoch": 1.551139792933748, "grad_norm": 6.46872615814209, "learning_rate": 2.0698175402757786e-07, "logits/chosen": -18.237911224365234, "logits/rejected": -17.63632583618164, "logps/chosen": -362.6887512207031, "logps/rejected": -309.53387451171875, "loss": 0.7461, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6750662326812744, "rewards/margins": 0.5875824093818665, "rewards/rejected": 2.0874831676483154, "step": 33410 }, { "epoch": 1.5516040670411813, "grad_norm": 0.638778567314148, "learning_rate": 2.069538975811319e-07, "logits/chosen": -18.64181137084961, "logits/rejected": -17.509750366210938, "logps/chosen": -540.9931640625, "logps/rejected": -359.69671630859375, "loss": 0.4271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6028380393981934, "rewards/margins": 1.6494989395141602, "rewards/rejected": 1.9533392190933228, "step": 33420 }, { "epoch": 1.552068341148614, "grad_norm": 5.609134197235107, "learning_rate": 2.0692604113468591e-07, "logits/chosen": -18.23862075805664, "logits/rejected": -16.955459594726562, "logps/chosen": -331.3687438964844, "logps/rejected": -253.77011108398438, "loss": 0.2788, "rewards/accuracies": 1.0, "rewards/chosen": 2.8523640632629395, "rewards/margins": 2.116687536239624, 
"rewards/rejected": 0.7356761693954468, "step": 33430 }, { "epoch": 1.5525326152560472, "grad_norm": 87.27363586425781, "learning_rate": 2.0689818468823993e-07, "logits/chosen": -19.795724868774414, "logits/rejected": -18.902206420898438, "logps/chosen": -465.40594482421875, "logps/rejected": -381.4197998046875, "loss": 0.5848, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7194480895996094, "rewards/margins": 0.6182305812835693, "rewards/rejected": 3.101217746734619, "step": 33440 }, { "epoch": 1.5529968893634802, "grad_norm": 38.389034271240234, "learning_rate": 2.0687032824179394e-07, "logits/chosen": -19.023151397705078, "logits/rejected": -17.569747924804688, "logps/chosen": -517.3570556640625, "logps/rejected": -377.842529296875, "loss": 0.324, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.164554595947266, "rewards/margins": 2.114783763885498, "rewards/rejected": 2.049771308898926, "step": 33450 }, { "epoch": 1.5534611634709132, "grad_norm": 26.040979385375977, "learning_rate": 2.0684247179534796e-07, "logits/chosen": -19.282331466674805, "logits/rejected": -18.549394607543945, "logps/chosen": -352.25885009765625, "logps/rejected": -285.6727294921875, "loss": 0.5783, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.222320556640625, "rewards/margins": 0.6818190813064575, "rewards/rejected": 2.540501832962036, "step": 33460 }, { "epoch": 1.5539254375783462, "grad_norm": 2.1835899353027344, "learning_rate": 2.06814615348902e-07, "logits/chosen": -18.55392837524414, "logits/rejected": -17.10625648498535, "logps/chosen": -450.53436279296875, "logps/rejected": -244.25131225585938, "loss": 0.4451, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.804763078689575, "rewards/margins": 2.550140619277954, "rewards/rejected": 1.254622220993042, "step": 33470 }, { "epoch": 1.5543897116857792, "grad_norm": 51.24613952636719, "learning_rate": 2.0678675890245601e-07, "logits/chosen": -19.3887882232666, "logits/rejected": 
-18.88701057434082, "logps/chosen": -321.51678466796875, "logps/rejected": -262.1858825683594, "loss": 0.4967, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.26884388923645, "rewards/margins": 1.176896333694458, "rewards/rejected": 2.091947078704834, "step": 33480 }, { "epoch": 1.5548539857932124, "grad_norm": 53.95956039428711, "learning_rate": 2.0675890245601e-07, "logits/chosen": -18.38228988647461, "logits/rejected": -18.787181854248047, "logps/chosen": -390.09222412109375, "logps/rejected": -340.5804748535156, "loss": 0.9439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3047523498535156, "rewards/margins": 0.20764899253845215, "rewards/rejected": 3.0971035957336426, "step": 33490 }, { "epoch": 1.5553182599006452, "grad_norm": 15.703048706054688, "learning_rate": 2.0673104600956404e-07, "logits/chosen": -18.608047485351562, "logits/rejected": -17.433462142944336, "logps/chosen": -522.7510375976562, "logps/rejected": -363.1702575683594, "loss": 0.5441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.897855281829834, "rewards/margins": 0.839570164680481, "rewards/rejected": 3.0582854747772217, "step": 33500 }, { "epoch": 1.5557825340080784, "grad_norm": 22.887672424316406, "learning_rate": 2.0670318956311806e-07, "logits/chosen": -18.532344818115234, "logits/rejected": -18.51837730407715, "logps/chosen": -370.5166320800781, "logps/rejected": -377.4362487792969, "loss": 1.1469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2153396606445312, "rewards/margins": 0.38006076216697693, "rewards/rejected": 2.8352789878845215, "step": 33510 }, { "epoch": 1.5562468081155114, "grad_norm": 6.945438861846924, "learning_rate": 2.066753331166721e-07, "logits/chosen": -18.125825881958008, "logits/rejected": -17.01415252685547, "logps/chosen": -420.8365783691406, "logps/rejected": -257.1830749511719, "loss": 0.4754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8420023918151855, "rewards/margins": 
2.079333543777466, "rewards/rejected": 1.762669324874878, "step": 33520 }, { "epoch": 1.5567110822229444, "grad_norm": 145.0572967529297, "learning_rate": 2.0664747667022609e-07, "logits/chosen": -17.640962600708008, "logits/rejected": -17.357999801635742, "logps/chosen": -328.82568359375, "logps/rejected": -225.5938262939453, "loss": 0.9379, "rewards/accuracies": 0.5, "rewards/chosen": 1.8798887729644775, "rewards/margins": 0.1490807980298996, "rewards/rejected": 1.730807900428772, "step": 33530 }, { "epoch": 1.5571753563303774, "grad_norm": 40.853084564208984, "learning_rate": 2.066196202237801e-07, "logits/chosen": -19.017498016357422, "logits/rejected": -17.97379493713379, "logps/chosen": -471.861572265625, "logps/rejected": -357.85894775390625, "loss": 0.3954, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.716308116912842, "rewards/margins": 1.4580323696136475, "rewards/rejected": 2.2582757472991943, "step": 33540 }, { "epoch": 1.5576396304378104, "grad_norm": 69.66326141357422, "learning_rate": 2.0659176377733414e-07, "logits/chosen": -19.533283233642578, "logits/rejected": -17.783031463623047, "logps/chosen": -460.96710205078125, "logps/rejected": -302.90399169921875, "loss": 0.4023, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8125171661376953, "rewards/margins": 1.7970879077911377, "rewards/rejected": 2.015429735183716, "step": 33550 }, { "epoch": 1.5581039045452436, "grad_norm": 63.82131576538086, "learning_rate": 2.0656390733088813e-07, "logits/chosen": -19.127918243408203, "logits/rejected": -18.070842742919922, "logps/chosen": -505.6175842285156, "logps/rejected": -359.453857421875, "loss": 0.3682, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.651280164718628, "rewards/margins": 1.2770233154296875, "rewards/rejected": 2.3742563724517822, "step": 33560 }, { "epoch": 1.5585681786526764, "grad_norm": 145.7748565673828, "learning_rate": 2.0653605088444217e-07, "logits/chosen": -18.802703857421875, 
"logits/rejected": -17.111339569091797, "logps/chosen": -349.45916748046875, "logps/rejected": -182.50582885742188, "loss": 0.5196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8721423149108887, "rewards/margins": 1.3441601991653442, "rewards/rejected": 1.5279823541641235, "step": 33570 }, { "epoch": 1.5590324527601096, "grad_norm": 24.683034896850586, "learning_rate": 2.0650819443799619e-07, "logits/chosen": -18.167644500732422, "logits/rejected": -17.157588958740234, "logps/chosen": -414.9259338378906, "logps/rejected": -342.33001708984375, "loss": 0.6496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2730724811553955, "rewards/margins": 1.0323832035064697, "rewards/rejected": 2.2406890392303467, "step": 33580 }, { "epoch": 1.5594967268675426, "grad_norm": 64.07908630371094, "learning_rate": 2.0648033799155023e-07, "logits/chosen": -18.25819206237793, "logits/rejected": -17.559534072875977, "logps/chosen": -298.71002197265625, "logps/rejected": -232.8734130859375, "loss": 0.5278, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3077518939971924, "rewards/margins": 0.8629506826400757, "rewards/rejected": 1.444800853729248, "step": 33590 }, { "epoch": 1.5599610009749756, "grad_norm": 58.71228790283203, "learning_rate": 2.0645248154510421e-07, "logits/chosen": -18.803218841552734, "logits/rejected": -17.446659088134766, "logps/chosen": -414.97344970703125, "logps/rejected": -263.36212158203125, "loss": 0.4591, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.551578998565674, "rewards/margins": 1.4708768129348755, "rewards/rejected": 2.080702066421509, "step": 33600 }, { "epoch": 1.5604252750824088, "grad_norm": 33.60044860839844, "learning_rate": 2.0642462509865823e-07, "logits/chosen": -19.161958694458008, "logits/rejected": -18.795602798461914, "logps/chosen": -441.28192138671875, "logps/rejected": -383.0051574707031, "loss": 0.616, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.051673889160156, 
"rewards/margins": 0.9688360095024109, "rewards/rejected": 3.082838296890259, "step": 33610 }, { "epoch": 1.5608895491898416, "grad_norm": 72.45935821533203, "learning_rate": 2.0639676865221227e-07, "logits/chosen": -19.179725646972656, "logits/rejected": -19.373111724853516, "logps/chosen": -388.652099609375, "logps/rejected": -385.35784912109375, "loss": 1.0927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.616502285003662, "rewards/margins": -0.09591229259967804, "rewards/rejected": 2.712414503097534, "step": 33620 }, { "epoch": 1.5613538232972748, "grad_norm": 2.407761573791504, "learning_rate": 2.0636891220576628e-07, "logits/chosen": -18.96854591369629, "logits/rejected": -18.0816650390625, "logps/chosen": -336.3412780761719, "logps/rejected": -319.6753845214844, "loss": 0.6847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1128170490264893, "rewards/margins": 0.5070264935493469, "rewards/rejected": 2.605790615081787, "step": 33630 }, { "epoch": 1.5618180974047078, "grad_norm": 57.068275451660156, "learning_rate": 2.0634105575932027e-07, "logits/chosen": -18.376995086669922, "logits/rejected": -18.18465805053711, "logps/chosen": -355.7687072753906, "logps/rejected": -328.62469482421875, "loss": 1.1458, "rewards/accuracies": 0.5, "rewards/chosen": 2.365722417831421, "rewards/margins": 0.0010859966278076172, "rewards/rejected": 2.364636182785034, "step": 33640 }, { "epoch": 1.5622823715121408, "grad_norm": 188.8404541015625, "learning_rate": 2.0631319931287431e-07, "logits/chosen": -18.487804412841797, "logits/rejected": -17.9038143157959, "logps/chosen": -434.70880126953125, "logps/rejected": -381.6064758300781, "loss": 0.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.622328519821167, "rewards/margins": 1.0712690353393555, "rewards/rejected": 2.5510597229003906, "step": 33650 }, { "epoch": 1.5627466456195738, "grad_norm": 129.4510498046875, "learning_rate": 2.0628534286642833e-07, "logits/chosen": 
-18.527965545654297, "logits/rejected": -17.426837921142578, "logps/chosen": -378.3535461425781, "logps/rejected": -255.8159942626953, "loss": 0.4584, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4548301696777344, "rewards/margins": 1.3122014999389648, "rewards/rejected": 2.1426289081573486, "step": 33660 }, { "epoch": 1.5632109197270068, "grad_norm": 147.61732482910156, "learning_rate": 2.0625748641998237e-07, "logits/chosen": -18.82114601135254, "logits/rejected": -17.63865089416504, "logps/chosen": -428.40179443359375, "logps/rejected": -319.5028381347656, "loss": 0.4648, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.742220401763916, "rewards/margins": 1.2132558822631836, "rewards/rejected": 2.5289645195007324, "step": 33670 }, { "epoch": 1.56367519383444, "grad_norm": 183.8931884765625, "learning_rate": 2.0622962997353636e-07, "logits/chosen": -19.410680770874023, "logits/rejected": -18.061904907226562, "logps/chosen": -353.718994140625, "logps/rejected": -238.7028350830078, "loss": 0.8353, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2169857025146484, "rewards/margins": 0.9326190948486328, "rewards/rejected": 2.2843663692474365, "step": 33680 }, { "epoch": 1.5641394679418728, "grad_norm": 53.29520034790039, "learning_rate": 2.0620177352709037e-07, "logits/chosen": -20.467729568481445, "logits/rejected": -18.924306869506836, "logps/chosen": -419.5311584472656, "logps/rejected": -262.43682861328125, "loss": 0.3344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.176417827606201, "rewards/margins": 1.5345605611801147, "rewards/rejected": 1.641857385635376, "step": 33690 }, { "epoch": 1.564603742049306, "grad_norm": 7.806001663208008, "learning_rate": 2.061739170806444e-07, "logits/chosen": -19.613231658935547, "logits/rejected": -18.862953186035156, "logps/chosen": -402.8099060058594, "logps/rejected": -428.54132080078125, "loss": 0.5673, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
3.2543632984161377, "rewards/margins": 0.8918434381484985, "rewards/rejected": 2.362520217895508, "step": 33700 }, { "epoch": 1.565068016156739, "grad_norm": 94.18276977539062, "learning_rate": 2.0614606063419843e-07, "logits/chosen": -19.08316993713379, "logits/rejected": -18.460411071777344, "logps/chosen": -313.9703674316406, "logps/rejected": -268.8712463378906, "loss": 0.7907, "rewards/accuracies": 0.5, "rewards/chosen": 2.7697691917419434, "rewards/margins": 0.593826413154602, "rewards/rejected": 2.175942897796631, "step": 33710 }, { "epoch": 1.565532290264172, "grad_norm": 105.0770263671875, "learning_rate": 2.0611820418775244e-07, "logits/chosen": -18.855440139770508, "logits/rejected": -18.350963592529297, "logps/chosen": -528.20361328125, "logps/rejected": -437.45806884765625, "loss": 0.4071, "rewards/accuracies": 1.0, "rewards/chosen": 3.3372561931610107, "rewards/margins": 0.9269682168960571, "rewards/rejected": 2.410288095474243, "step": 33720 }, { "epoch": 1.565996564371605, "grad_norm": 79.84143829345703, "learning_rate": 2.0609034774130646e-07, "logits/chosen": -17.9367733001709, "logits/rejected": -18.604032516479492, "logps/chosen": -340.99322509765625, "logps/rejected": -398.4159851074219, "loss": 0.8515, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.292550802230835, "rewards/margins": -0.10697569698095322, "rewards/rejected": 2.399526357650757, "step": 33730 }, { "epoch": 1.566460838479038, "grad_norm": 6.460231781005859, "learning_rate": 2.060624912948605e-07, "logits/chosen": -18.207056045532227, "logits/rejected": -17.499374389648438, "logps/chosen": -330.7304382324219, "logps/rejected": -243.55087280273438, "loss": 0.7362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2271995544433594, "rewards/margins": 0.7099331617355347, "rewards/rejected": 1.5172661542892456, "step": 33740 }, { "epoch": 1.5669251125864712, "grad_norm": 59.59355163574219, "learning_rate": 2.0603463484841449e-07, "logits/chosen": 
-19.604015350341797, "logits/rejected": -19.869394302368164, "logps/chosen": -314.8617858886719, "logps/rejected": -331.9550476074219, "loss": 0.8573, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.22997784614563, "rewards/margins": 0.1421341747045517, "rewards/rejected": 3.087843894958496, "step": 33750 }, { "epoch": 1.567389386693904, "grad_norm": 8.53792667388916, "learning_rate": 2.060067784019685e-07, "logits/chosen": -18.672161102294922, "logits/rejected": -17.880537033081055, "logps/chosen": -443.6175842285156, "logps/rejected": -421.89495849609375, "loss": 0.7486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.089171409606934, "rewards/margins": 0.7683635950088501, "rewards/rejected": 3.320807933807373, "step": 33760 }, { "epoch": 1.5678536608013371, "grad_norm": 6.120398044586182, "learning_rate": 2.0597892195552254e-07, "logits/chosen": -19.284465789794922, "logits/rejected": -18.186717987060547, "logps/chosen": -438.5340270996094, "logps/rejected": -291.4791259765625, "loss": 0.3816, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9803853034973145, "rewards/margins": 1.5138288736343384, "rewards/rejected": 2.4665567874908447, "step": 33770 }, { "epoch": 1.5683179349087701, "grad_norm": 66.95885467529297, "learning_rate": 2.0595106550907656e-07, "logits/chosen": -17.89944076538086, "logits/rejected": -17.522113800048828, "logps/chosen": -347.79510498046875, "logps/rejected": -281.1344909667969, "loss": 0.9597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6658902168273926, "rewards/margins": 0.39827531576156616, "rewards/rejected": 2.2676148414611816, "step": 33780 }, { "epoch": 1.5687822090162031, "grad_norm": 18.066837310791016, "learning_rate": 2.0592320906263054e-07, "logits/chosen": -17.785947799682617, "logits/rejected": -17.350414276123047, "logps/chosen": -314.0287170410156, "logps/rejected": -286.4931640625, "loss": 0.568, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
2.8783209323883057, "rewards/margins": 0.8016917109489441, "rewards/rejected": 2.0766289234161377, "step": 33790 }, { "epoch": 1.5692464831236363, "grad_norm": 43.871360778808594, "learning_rate": 2.0589535261618458e-07, "logits/chosen": -17.90509796142578, "logits/rejected": -17.032543182373047, "logps/chosen": -385.5576171875, "logps/rejected": -264.37884521484375, "loss": 0.4559, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8425452709198, "rewards/margins": 1.4864660501480103, "rewards/rejected": 1.3560791015625, "step": 33800 }, { "epoch": 1.5697107572310691, "grad_norm": 36.48344802856445, "learning_rate": 2.058674961697386e-07, "logits/chosen": -18.795238494873047, "logits/rejected": -18.051944732666016, "logps/chosen": -298.48004150390625, "logps/rejected": -269.57110595703125, "loss": 0.7614, "rewards/accuracies": 0.5, "rewards/chosen": 2.483839511871338, "rewards/margins": 0.3587523400783539, "rewards/rejected": 2.125087261199951, "step": 33810 }, { "epoch": 1.5701750313385023, "grad_norm": 0.2637750506401062, "learning_rate": 2.0583963972329264e-07, "logits/chosen": -18.54775619506836, "logits/rejected": -18.84430503845215, "logps/chosen": -360.868896484375, "logps/rejected": -304.6419982910156, "loss": 1.4139, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.371258497238159, "rewards/margins": -0.30143946409225464, "rewards/rejected": 3.6726982593536377, "step": 33820 }, { "epoch": 1.5706393054459353, "grad_norm": 70.11478424072266, "learning_rate": 2.0581178327684663e-07, "logits/chosen": -18.39120864868164, "logits/rejected": -17.806081771850586, "logps/chosen": -405.8638000488281, "logps/rejected": -395.13671875, "loss": 0.5199, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.877617597579956, "rewards/margins": 1.0618091821670532, "rewards/rejected": 2.8158087730407715, "step": 33830 }, { "epoch": 1.5711035795533683, "grad_norm": 18.60264015197754, "learning_rate": 2.0578392683040067e-07, "logits/chosen": 
-18.39912986755371, "logits/rejected": -18.4443416595459, "logps/chosen": -379.6445007324219, "logps/rejected": -349.74090576171875, "loss": 0.9032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2482478618621826, "rewards/margins": 0.628312885761261, "rewards/rejected": 2.6199350357055664, "step": 33840 }, { "epoch": 1.5715678536608013, "grad_norm": 200.84201049804688, "learning_rate": 2.0575607038395468e-07, "logits/chosen": -18.408655166625977, "logits/rejected": -18.15765380859375, "logps/chosen": -405.3492126464844, "logps/rejected": -343.0188293457031, "loss": 1.1642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8519885540008545, "rewards/margins": 0.1948210895061493, "rewards/rejected": 2.657167434692383, "step": 33850 }, { "epoch": 1.5720321277682343, "grad_norm": 32.78207015991211, "learning_rate": 2.057282139375087e-07, "logits/chosen": -19.407312393188477, "logits/rejected": -17.396406173706055, "logps/chosen": -494.52978515625, "logps/rejected": -303.0321960449219, "loss": 0.2851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.365832328796387, "rewards/margins": 2.3235151767730713, "rewards/rejected": 2.0423169136047363, "step": 33860 }, { "epoch": 1.5724964018756675, "grad_norm": 7.459222793579102, "learning_rate": 2.057003574910627e-07, "logits/chosen": -18.199481964111328, "logits/rejected": -17.09638214111328, "logps/chosen": -367.36639404296875, "logps/rejected": -268.2817687988281, "loss": 0.5136, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.774932384490967, "rewards/margins": 1.2466530799865723, "rewards/rejected": 1.5282795429229736, "step": 33870 }, { "epoch": 1.5729606759831003, "grad_norm": 195.2440185546875, "learning_rate": 2.0567250104461673e-07, "logits/chosen": -19.13372802734375, "logits/rejected": -18.307886123657227, "logps/chosen": -406.8074951171875, "logps/rejected": -318.58819580078125, "loss": 0.7982, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 
2.849698066711426, "rewards/margins": 0.12471567094326019, "rewards/rejected": 2.724982261657715, "step": 33880 }, { "epoch": 1.5734249500905335, "grad_norm": 83.43683624267578, "learning_rate": 2.0564464459817077e-07, "logits/chosen": -18.961261749267578, "logits/rejected": -17.839763641357422, "logps/chosen": -438.5703125, "logps/rejected": -336.4915771484375, "loss": 0.3069, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.378603935241699, "rewards/margins": 1.9123468399047852, "rewards/rejected": 2.4662575721740723, "step": 33890 }, { "epoch": 1.5738892241979665, "grad_norm": 191.3098907470703, "learning_rate": 2.0561678815172478e-07, "logits/chosen": -19.115297317504883, "logits/rejected": -18.85813331604004, "logps/chosen": -321.4049377441406, "logps/rejected": -295.66900634765625, "loss": 1.0801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5844950675964355, "rewards/margins": 0.00900723971426487, "rewards/rejected": 2.5754878520965576, "step": 33900 }, { "epoch": 1.5743534983053995, "grad_norm": 50.74823760986328, "learning_rate": 2.0558893170527877e-07, "logits/chosen": -18.575096130371094, "logits/rejected": -18.089990615844727, "logps/chosen": -360.4518127441406, "logps/rejected": -342.39044189453125, "loss": 0.6364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3390262126922607, "rewards/margins": 0.6440386772155762, "rewards/rejected": 2.6949870586395264, "step": 33910 }, { "epoch": 1.5748177724128325, "grad_norm": 2.8497486114501953, "learning_rate": 2.055610752588328e-07, "logits/chosen": -18.456661224365234, "logits/rejected": -17.437610626220703, "logps/chosen": -336.9118347167969, "logps/rejected": -249.4173583984375, "loss": 0.5704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1067519187927246, "rewards/margins": 0.8977065086364746, "rewards/rejected": 2.209045886993408, "step": 33920 }, { "epoch": 1.5752820465202655, "grad_norm": 276.16387939453125, "learning_rate": 
2.0553321881238683e-07, "logits/chosen": -18.667491912841797, "logits/rejected": -17.744365692138672, "logps/chosen": -437.2724609375, "logps/rejected": -355.29339599609375, "loss": 0.3234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0797905921936035, "rewards/margins": 1.9984257221221924, "rewards/rejected": 2.081364870071411, "step": 33930 }, { "epoch": 1.5757463206276987, "grad_norm": 121.80931854248047, "learning_rate": 2.0550536236594081e-07, "logits/chosen": -19.32973289489746, "logits/rejected": -18.544580459594727, "logps/chosen": -426.3417053222656, "logps/rejected": -388.6052551269531, "loss": 0.7806, "rewards/accuracies": 0.5, "rewards/chosen": 3.9445769786834717, "rewards/margins": 0.49046626687049866, "rewards/rejected": 3.454110622406006, "step": 33940 }, { "epoch": 1.5762105947351315, "grad_norm": 40.88711929321289, "learning_rate": 2.0547750591949486e-07, "logits/chosen": -19.13651466369629, "logits/rejected": -19.383766174316406, "logps/chosen": -450.87811279296875, "logps/rejected": -510.34808349609375, "loss": 0.9869, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.529005527496338, "rewards/margins": -0.3819650709629059, "rewards/rejected": 3.9109702110290527, "step": 33950 }, { "epoch": 1.5766748688425647, "grad_norm": 54.044517517089844, "learning_rate": 2.0544964947304887e-07, "logits/chosen": -19.13181495666504, "logits/rejected": -19.105016708374023, "logps/chosen": -278.7088928222656, "logps/rejected": -325.8041076660156, "loss": 0.9184, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.250387191772461, "rewards/margins": -0.1563488245010376, "rewards/rejected": 2.406735897064209, "step": 33960 }, { "epoch": 1.5771391429499977, "grad_norm": 87.6791763305664, "learning_rate": 2.054217930266029e-07, "logits/chosen": -18.49193000793457, "logits/rejected": -18.709680557250977, "logps/chosen": -345.61016845703125, "logps/rejected": -389.49078369140625, "loss": 1.2747, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 2.5619750022888184, "rewards/margins": -0.43559661507606506, "rewards/rejected": 2.9975714683532715, "step": 33970 }, { "epoch": 1.5776034170574307, "grad_norm": 95.7432861328125, "learning_rate": 2.053939365801569e-07, "logits/chosen": -18.7819766998291, "logits/rejected": -18.495380401611328, "logps/chosen": -347.95281982421875, "logps/rejected": -313.5370788574219, "loss": 0.8323, "rewards/accuracies": 0.5, "rewards/chosen": 2.509753704071045, "rewards/margins": 0.1386178582906723, "rewards/rejected": 2.371135950088501, "step": 33980 }, { "epoch": 1.578067691164864, "grad_norm": 10.420802116394043, "learning_rate": 2.0536608013371094e-07, "logits/chosen": -19.43425941467285, "logits/rejected": -17.750682830810547, "logps/chosen": -516.9180908203125, "logps/rejected": -330.0797119140625, "loss": 0.4149, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.176805019378662, "rewards/margins": 1.970526933670044, "rewards/rejected": 2.206278085708618, "step": 33990 }, { "epoch": 1.5785319652722967, "grad_norm": 19.159875869750977, "learning_rate": 2.0533822368726495e-07, "logits/chosen": -19.86745834350586, "logits/rejected": -18.780345916748047, "logps/chosen": -365.31134033203125, "logps/rejected": -347.155029296875, "loss": 0.5779, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9109668731689453, "rewards/margins": 0.5887622237205505, "rewards/rejected": 2.322204351425171, "step": 34000 }, { "epoch": 1.5789962393797299, "grad_norm": 4.738236427307129, "learning_rate": 2.05310367240819e-07, "logits/chosen": -18.498987197875977, "logits/rejected": -17.812999725341797, "logps/chosen": -474.1861877441406, "logps/rejected": -373.979736328125, "loss": 1.3221, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6935324668884277, "rewards/margins": 0.5361109972000122, "rewards/rejected": 3.157421588897705, "step": 34010 }, { "epoch": 1.5794605134871629, "grad_norm": 36.19586944580078, "learning_rate": 
2.0528251079437298e-07, "logits/chosen": -19.33303451538086, "logits/rejected": -17.788164138793945, "logps/chosen": -332.26617431640625, "logps/rejected": -252.617431640625, "loss": 0.2651, "rewards/accuracies": 1.0, "rewards/chosen": 3.8644118309020996, "rewards/margins": 1.5101945400238037, "rewards/rejected": 2.354217290878296, "step": 34020 }, { "epoch": 1.5799247875945959, "grad_norm": 151.05953979492188, "learning_rate": 2.05254654347927e-07, "logits/chosen": -19.181495666503906, "logits/rejected": -19.302642822265625, "logps/chosen": -375.572265625, "logps/rejected": -316.8586730957031, "loss": 0.8116, "rewards/accuracies": 0.5, "rewards/chosen": 2.631676197052002, "rewards/margins": -0.012660229578614235, "rewards/rejected": 2.6443369388580322, "step": 34030 }, { "epoch": 1.5803890617020289, "grad_norm": 66.25054931640625, "learning_rate": 2.0522679790148104e-07, "logits/chosen": -19.01058006286621, "logits/rejected": -18.464481353759766, "logps/chosen": -519.698974609375, "logps/rejected": -400.0267028808594, "loss": 0.5804, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.273211479187012, "rewards/margins": 0.8209875822067261, "rewards/rejected": 3.4522242546081543, "step": 34040 }, { "epoch": 1.5808533358094619, "grad_norm": 28.08803939819336, "learning_rate": 2.0519894145503505e-07, "logits/chosen": -19.225887298583984, "logits/rejected": -18.467823028564453, "logps/chosen": -385.13470458984375, "logps/rejected": -297.43255615234375, "loss": 0.4496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.153721809387207, "rewards/margins": 1.6517740488052368, "rewards/rejected": 2.5019478797912598, "step": 34050 }, { "epoch": 1.581317609916895, "grad_norm": 92.06427764892578, "learning_rate": 2.0517108500858904e-07, "logits/chosen": -18.88395881652832, "logits/rejected": -18.105077743530273, "logps/chosen": -427.2893981933594, "logps/rejected": -336.71759033203125, "loss": 0.4316, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 3.7034904956817627, "rewards/margins": 1.3148610591888428, "rewards/rejected": 2.38862943649292, "step": 34060 }, { "epoch": 1.5817818840243278, "grad_norm": 64.57770538330078, "learning_rate": 2.0514322856214308e-07, "logits/chosen": -18.480979919433594, "logits/rejected": -17.7481746673584, "logps/chosen": -356.96392822265625, "logps/rejected": -288.5724792480469, "loss": 0.398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.662675619125366, "rewards/margins": 1.155321717262268, "rewards/rejected": 1.5073540210723877, "step": 34070 }, { "epoch": 1.582246158131761, "grad_norm": 18.012006759643555, "learning_rate": 2.051153721156971e-07, "logits/chosen": -19.015850067138672, "logits/rejected": -18.12911605834961, "logps/chosen": -367.9447021484375, "logps/rejected": -269.18841552734375, "loss": 0.3819, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.232577800750732, "rewards/margins": 1.2895430326461792, "rewards/rejected": 2.9430348873138428, "step": 34080 }, { "epoch": 1.582710432239194, "grad_norm": 5.629517555236816, "learning_rate": 2.0508751566925114e-07, "logits/chosen": -19.355745315551758, "logits/rejected": -19.312170028686523, "logps/chosen": -312.3344421386719, "logps/rejected": -397.0398864746094, "loss": 0.5776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1067185401916504, "rewards/margins": 0.6287627816200256, "rewards/rejected": 2.4779558181762695, "step": 34090 }, { "epoch": 1.583174706346627, "grad_norm": 139.7032470703125, "learning_rate": 2.0505965922280513e-07, "logits/chosen": -18.312646865844727, "logits/rejected": -17.966327667236328, "logps/chosen": -411.53717041015625, "logps/rejected": -566.9678344726562, "loss": 0.7202, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5562713146209717, "rewards/margins": 0.6158910989761353, "rewards/rejected": 2.940380573272705, "step": 34100 }, { "epoch": 1.58363898045406, "grad_norm": 19.47532844543457, "learning_rate": 
2.0503180277635914e-07, "logits/chosen": -18.870403289794922, "logits/rejected": -18.171049118041992, "logps/chosen": -425.26544189453125, "logps/rejected": -327.0419616699219, "loss": 0.6402, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.660654067993164, "rewards/margins": 0.4728925824165344, "rewards/rejected": 2.1877615451812744, "step": 34110 }, { "epoch": 1.584103254561493, "grad_norm": 56.64629364013672, "learning_rate": 2.0500394632991318e-07, "logits/chosen": -18.50223159790039, "logits/rejected": -17.625986099243164, "logps/chosen": -497.82037353515625, "logps/rejected": -382.86016845703125, "loss": 0.7863, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.131269931793213, "rewards/margins": 0.6766934394836426, "rewards/rejected": 3.4545772075653076, "step": 34120 }, { "epoch": 1.5845675286689263, "grad_norm": 38.5906867980957, "learning_rate": 2.0497608988346717e-07, "logits/chosen": -19.80911636352539, "logits/rejected": -19.29229736328125, "logps/chosen": -396.8951721191406, "logps/rejected": -346.88427734375, "loss": 1.2266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.426050901412964, "rewards/margins": -0.4502859115600586, "rewards/rejected": 3.8763370513916016, "step": 34130 }, { "epoch": 1.585031802776359, "grad_norm": 39.63736343383789, "learning_rate": 2.049482334370212e-07, "logits/chosen": -19.05086898803711, "logits/rejected": -17.770549774169922, "logps/chosen": -415.9427795410156, "logps/rejected": -328.8171081542969, "loss": 0.3419, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.138574600219727, "rewards/margins": 1.4106143712997437, "rewards/rejected": 2.7279603481292725, "step": 34140 }, { "epoch": 1.5854960768837922, "grad_norm": 80.16109466552734, "learning_rate": 2.0492037699057523e-07, "logits/chosen": -18.37374496459961, "logits/rejected": -17.273799896240234, "logps/chosen": -414.65625, "logps/rejected": -326.6180114746094, "loss": 0.3778, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 3.5992496013641357, "rewards/margins": 1.885685682296753, "rewards/rejected": 1.7135636806488037, "step": 34150 }, { "epoch": 1.5859603509912252, "grad_norm": 44.295997619628906, "learning_rate": 2.0489252054412927e-07, "logits/chosen": -19.353580474853516, "logits/rejected": -18.75269317626953, "logps/chosen": -353.57763671875, "logps/rejected": -318.5888977050781, "loss": 0.8135, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.704103469848633, "rewards/margins": 0.2428295612335205, "rewards/rejected": 2.4612739086151123, "step": 34160 }, { "epoch": 1.5864246250986582, "grad_norm": 36.09355545043945, "learning_rate": 2.0486466409768325e-07, "logits/chosen": -19.229543685913086, "logits/rejected": -17.550350189208984, "logps/chosen": -305.327880859375, "logps/rejected": -216.0465545654297, "loss": 0.8537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.667942762374878, "rewards/margins": 0.6355425715446472, "rewards/rejected": 2.032400608062744, "step": 34170 }, { "epoch": 1.5868888992060914, "grad_norm": 18.458093643188477, "learning_rate": 2.0483680765123727e-07, "logits/chosen": -19.50823974609375, "logits/rejected": -19.014169692993164, "logps/chosen": -317.6608581542969, "logps/rejected": -279.2811584472656, "loss": 0.7312, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.86391282081604, "rewards/margins": 0.8002327680587769, "rewards/rejected": 2.0636801719665527, "step": 34180 }, { "epoch": 1.5873531733135242, "grad_norm": 72.05484008789062, "learning_rate": 2.048089512047913e-07, "logits/chosen": -19.181699752807617, "logits/rejected": -17.0494384765625, "logps/chosen": -524.06640625, "logps/rejected": -263.8971252441406, "loss": 0.3502, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.750826358795166, "rewards/margins": 2.4002647399902344, "rewards/rejected": 2.3505616188049316, "step": 34190 }, { "epoch": 1.5878174474209574, "grad_norm": 19.656272888183594, 
"learning_rate": 2.0478109475834532e-07, "logits/chosen": -20.169034957885742, "logits/rejected": -18.435501098632812, "logps/chosen": -378.57183837890625, "logps/rejected": -232.93960571289062, "loss": 0.3399, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6421921253204346, "rewards/margins": 1.68170166015625, "rewards/rejected": 1.9604904651641846, "step": 34200 }, { "epoch": 1.5882817215283902, "grad_norm": 45.00692367553711, "learning_rate": 2.047532383118993e-07, "logits/chosen": -19.733783721923828, "logits/rejected": -19.071590423583984, "logps/chosen": -477.7001037597656, "logps/rejected": -372.84930419921875, "loss": 0.4173, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9041199684143066, "rewards/margins": 0.9214576482772827, "rewards/rejected": 2.982661724090576, "step": 34210 }, { "epoch": 1.5887459956358234, "grad_norm": 184.51470947265625, "learning_rate": 2.0472538186545335e-07, "logits/chosen": -18.659963607788086, "logits/rejected": -17.523029327392578, "logps/chosen": -485.09552001953125, "logps/rejected": -378.4510192871094, "loss": 0.4858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8598289489746094, "rewards/margins": 1.3713138103485107, "rewards/rejected": 2.4885151386260986, "step": 34220 }, { "epoch": 1.5892102697432564, "grad_norm": 127.7906723022461, "learning_rate": 2.0469752541900737e-07, "logits/chosen": -18.818683624267578, "logits/rejected": -18.770177841186523, "logps/chosen": -399.36968994140625, "logps/rejected": -395.4251708984375, "loss": 1.0344, "rewards/accuracies": 0.5, "rewards/chosen": 3.463444471359253, "rewards/margins": 0.23070594668388367, "rewards/rejected": 3.232738494873047, "step": 34230 }, { "epoch": 1.5896745438506894, "grad_norm": 184.80589294433594, "learning_rate": 2.046696689725614e-07, "logits/chosen": -19.14675521850586, "logits/rejected": -18.67194366455078, "logps/chosen": -464.273681640625, "logps/rejected": -460.6474609375, "loss": 0.6262, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.660748243331909, "rewards/margins": 0.9704427719116211, "rewards/rejected": 2.690305233001709, "step": 34240 }, { "epoch": 1.5901388179581226, "grad_norm": 7.387089252471924, "learning_rate": 2.046418125261154e-07, "logits/chosen": -19.726154327392578, "logits/rejected": -18.047080993652344, "logps/chosen": -444.97698974609375, "logps/rejected": -305.55889892578125, "loss": 0.8074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.608396530151367, "rewards/margins": 1.1885669231414795, "rewards/rejected": 2.4198296070098877, "step": 34250 }, { "epoch": 1.5906030920655554, "grad_norm": 111.9561996459961, "learning_rate": 2.0461395607966944e-07, "logits/chosen": -18.826343536376953, "logits/rejected": -17.662212371826172, "logps/chosen": -449.86456298828125, "logps/rejected": -328.974609375, "loss": 0.534, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.962092161178589, "rewards/margins": 1.1778138875961304, "rewards/rejected": 1.7842782735824585, "step": 34260 }, { "epoch": 1.5910673661729886, "grad_norm": 23.100059509277344, "learning_rate": 2.0458609963322345e-07, "logits/chosen": -19.51658058166504, "logits/rejected": -18.688358306884766, "logps/chosen": -412.5420837402344, "logps/rejected": -333.4090576171875, "loss": 0.4581, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.034125328063965, "rewards/margins": 0.9667463302612305, "rewards/rejected": 2.0673792362213135, "step": 34270 }, { "epoch": 1.5915316402804216, "grad_norm": 61.71820068359375, "learning_rate": 2.0455824318677747e-07, "logits/chosen": -18.940628051757812, "logits/rejected": -18.841541290283203, "logps/chosen": -453.7027282714844, "logps/rejected": -386.6666564941406, "loss": 0.7805, "rewards/accuracies": 0.5, "rewards/chosen": 3.6690127849578857, "rewards/margins": 0.5018407106399536, "rewards/rejected": 3.167172431945801, "step": 34280 }, { "epoch": 1.5919959143878546, "grad_norm": 
1.5482383966445923, "learning_rate": 2.0453038674033148e-07, "logits/chosen": -19.528850555419922, "logits/rejected": -18.600744247436523, "logps/chosen": -493.74078369140625, "logps/rejected": -344.26629638671875, "loss": 0.6134, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.026215076446533, "rewards/margins": 1.423169493675232, "rewards/rejected": 2.6030452251434326, "step": 34290 }, { "epoch": 1.5924601884952876, "grad_norm": 173.55247497558594, "learning_rate": 2.045025302938855e-07, "logits/chosen": -19.545251846313477, "logits/rejected": -17.858633041381836, "logps/chosen": -342.83953857421875, "logps/rejected": -256.13665771484375, "loss": 0.5516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.737034559249878, "rewards/margins": 0.8835641741752625, "rewards/rejected": 1.8534702062606812, "step": 34300 }, { "epoch": 1.5929244626027206, "grad_norm": 83.11688232421875, "learning_rate": 2.0447467384743954e-07, "logits/chosen": -18.52859115600586, "logits/rejected": -17.33735466003418, "logps/chosen": -474.7610778808594, "logps/rejected": -274.3460998535156, "loss": 0.4509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.022778034210205, "rewards/margins": 1.721975326538086, "rewards/rejected": 2.30080246925354, "step": 34310 }, { "epoch": 1.5933887367101538, "grad_norm": 84.6607666015625, "learning_rate": 2.0444681740099353e-07, "logits/chosen": -18.926834106445312, "logits/rejected": -17.907024383544922, "logps/chosen": -523.2222290039062, "logps/rejected": -368.67010498046875, "loss": 0.4223, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.201035737991333, "rewards/margins": 0.9568225145339966, "rewards/rejected": 2.244213104248047, "step": 34320 }, { "epoch": 1.5938530108175866, "grad_norm": 17.444839477539062, "learning_rate": 2.0441896095454754e-07, "logits/chosen": -18.805410385131836, "logits/rejected": -17.9559268951416, "logps/chosen": -417.352783203125, "logps/rejected": -362.24072265625, "loss": 
1.067, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.032707691192627, "rewards/margins": 0.8731997609138489, "rewards/rejected": 3.1595077514648438, "step": 34330 }, { "epoch": 1.5943172849250198, "grad_norm": 221.21572875976562, "learning_rate": 2.0439110450810158e-07, "logits/chosen": -19.124988555908203, "logits/rejected": -18.554874420166016, "logps/chosen": -425.8104553222656, "logps/rejected": -371.62615966796875, "loss": 0.8924, "rewards/accuracies": 0.5, "rewards/chosen": 2.63999605178833, "rewards/margins": -0.10064878314733505, "rewards/rejected": 2.740644693374634, "step": 34340 }, { "epoch": 1.5947815590324528, "grad_norm": 185.01913452148438, "learning_rate": 2.043632480616556e-07, "logits/chosen": -19.166288375854492, "logits/rejected": -18.816015243530273, "logps/chosen": -369.8789367675781, "logps/rejected": -400.4031066894531, "loss": 0.9062, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.392259120941162, "rewards/margins": 0.36930328607559204, "rewards/rejected": 3.022956132888794, "step": 34350 }, { "epoch": 1.5952458331398858, "grad_norm": 0.833630383014679, "learning_rate": 2.0433539161520958e-07, "logits/chosen": -19.2005615234375, "logits/rejected": -18.385784149169922, "logps/chosen": -369.1055908203125, "logps/rejected": -295.8487854003906, "loss": 0.3146, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5330958366394043, "rewards/margins": 1.6977885961532593, "rewards/rejected": 1.8353074789047241, "step": 34360 }, { "epoch": 1.5957101072473188, "grad_norm": 1.2945746183395386, "learning_rate": 2.0430753516876362e-07, "logits/chosen": -18.695526123046875, "logits/rejected": -18.213838577270508, "logps/chosen": -360.1579895019531, "logps/rejected": -293.4370422363281, "loss": 0.7529, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0342347621917725, "rewards/margins": 0.4385504722595215, "rewards/rejected": 2.595684051513672, "step": 34370 }, { "epoch": 1.5961743813547518, 
"grad_norm": 197.6814727783203, "learning_rate": 2.0427967872231764e-07, "logits/chosen": -18.890993118286133, "logits/rejected": -18.17624282836914, "logps/chosen": -379.19281005859375, "logps/rejected": -317.38433837890625, "loss": 0.4281, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4207355976104736, "rewards/margins": 1.079309344291687, "rewards/rejected": 2.341426372528076, "step": 34380 }, { "epoch": 1.596638655462185, "grad_norm": 56.11317825317383, "learning_rate": 2.0425182227587168e-07, "logits/chosen": -17.610523223876953, "logits/rejected": -17.326200485229492, "logps/chosen": -269.89727783203125, "logps/rejected": -268.3345947265625, "loss": 0.615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.036909818649292, "rewards/margins": 0.5372781753540039, "rewards/rejected": 1.4996318817138672, "step": 34390 }, { "epoch": 1.5971029295696177, "grad_norm": 10.273056030273438, "learning_rate": 2.0422396582942567e-07, "logits/chosen": -19.845706939697266, "logits/rejected": -18.614437103271484, "logps/chosen": -318.4114990234375, "logps/rejected": -244.6644287109375, "loss": 0.6183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.790165662765503, "rewards/margins": 0.5436261892318726, "rewards/rejected": 2.246539354324341, "step": 34400 }, { "epoch": 1.597567203677051, "grad_norm": 55.30665969848633, "learning_rate": 2.041961093829797e-07, "logits/chosen": -18.135169982910156, "logits/rejected": -18.325469970703125, "logps/chosen": -309.6497802734375, "logps/rejected": -254.0937957763672, "loss": 0.541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2730679512023926, "rewards/margins": 0.5778157114982605, "rewards/rejected": 1.6952524185180664, "step": 34410 }, { "epoch": 1.598031477784484, "grad_norm": 186.67825317382812, "learning_rate": 2.0416825293653372e-07, "logits/chosen": -18.694536209106445, "logits/rejected": -18.952442169189453, "logps/chosen": -413.0503845214844, "logps/rejected": 
-457.90899658203125, "loss": 1.2328, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.737377643585205, "rewards/margins": -0.7424208521842957, "rewards/rejected": 3.4797985553741455, "step": 34420 }, { "epoch": 1.598495751891917, "grad_norm": 41.1046257019043, "learning_rate": 2.0414039649008776e-07, "logits/chosen": -18.067195892333984, "logits/rejected": -17.704614639282227, "logps/chosen": -420.1632385253906, "logps/rejected": -352.19183349609375, "loss": 0.582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8510196208953857, "rewards/margins": 0.6103066205978394, "rewards/rejected": 3.240713119506836, "step": 34430 }, { "epoch": 1.5989600259993502, "grad_norm": 3.9877536296844482, "learning_rate": 2.0411254004364175e-07, "logits/chosen": -20.366825103759766, "logits/rejected": -17.914026260375977, "logps/chosen": -397.296630859375, "logps/rejected": -247.2284393310547, "loss": 0.5647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.305276870727539, "rewards/margins": 1.6391099691390991, "rewards/rejected": 1.6661670207977295, "step": 34440 }, { "epoch": 1.599424300106783, "grad_norm": 16.410188674926758, "learning_rate": 2.0408468359719577e-07, "logits/chosen": -18.4251651763916, "logits/rejected": -18.054363250732422, "logps/chosen": -298.5863952636719, "logps/rejected": -261.885498046875, "loss": 1.1235, "rewards/accuracies": 0.5, "rewards/chosen": 2.2416653633117676, "rewards/margins": -0.15670588612556458, "rewards/rejected": 2.3983712196350098, "step": 34450 }, { "epoch": 1.5998885742142162, "grad_norm": 115.3127212524414, "learning_rate": 2.040596127953944e-07, "logits/chosen": -18.001813888549805, "logits/rejected": -18.87063980102539, "logps/chosen": -347.72454833984375, "logps/rejected": -473.9303283691406, "loss": 1.3818, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6836726665496826, "rewards/margins": -0.46897298097610474, "rewards/rejected": 4.152645587921143, "step": 34460 }, { "epoch": 
1.6003528483216491, "grad_norm": 20.710660934448242, "learning_rate": 2.040317563489484e-07, "logits/chosen": -19.304180145263672, "logits/rejected": -18.003015518188477, "logps/chosen": -365.2534484863281, "logps/rejected": -296.8533935546875, "loss": 0.3031, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.930544376373291, "rewards/margins": 1.7992208003997803, "rewards/rejected": 2.1313233375549316, "step": 34470 }, { "epoch": 1.6008171224290821, "grad_norm": 170.98114013671875, "learning_rate": 2.0400389990250244e-07, "logits/chosen": -19.982192993164062, "logits/rejected": -18.81207275390625, "logps/chosen": -326.4530334472656, "logps/rejected": -241.4821319580078, "loss": 0.6602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0444931983947754, "rewards/margins": 0.5683609247207642, "rewards/rejected": 1.4761319160461426, "step": 34480 }, { "epoch": 1.6012813965365151, "grad_norm": 193.75047302246094, "learning_rate": 2.0397604345605643e-07, "logits/chosen": -18.74833106994629, "logits/rejected": -17.72452735900879, "logps/chosen": -343.5632019042969, "logps/rejected": -279.6390686035156, "loss": 0.6891, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.588019847869873, "rewards/margins": 1.469292402267456, "rewards/rejected": 2.118727684020996, "step": 34490 }, { "epoch": 1.6017456706439481, "grad_norm": 41.27764129638672, "learning_rate": 2.0394818700961047e-07, "logits/chosen": -19.094013214111328, "logits/rejected": -18.795976638793945, "logps/chosen": -351.2781677246094, "logps/rejected": -363.9027404785156, "loss": 0.8526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.672063112258911, "rewards/margins": 0.15834596753120422, "rewards/rejected": 2.5137171745300293, "step": 34500 }, { "epoch": 1.6022099447513813, "grad_norm": 22.05039405822754, "learning_rate": 2.039203305631645e-07, "logits/chosen": -19.836589813232422, "logits/rejected": -18.445634841918945, "logps/chosen": -363.76727294921875, 
"logps/rejected": -223.0837860107422, "loss": 0.3592, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1836371421813965, "rewards/margins": 1.3054174184799194, "rewards/rejected": 1.8782202005386353, "step": 34510 }, { "epoch": 1.6026742188588141, "grad_norm": 8.171966552734375, "learning_rate": 2.0389247411671853e-07, "logits/chosen": -19.41565704345703, "logits/rejected": -19.321807861328125, "logps/chosen": -423.6470642089844, "logps/rejected": -425.99603271484375, "loss": 0.7839, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2265326976776123, "rewards/margins": 0.6671551465988159, "rewards/rejected": 2.559377670288086, "step": 34520 }, { "epoch": 1.6031384929662473, "grad_norm": 27.654447555541992, "learning_rate": 2.0386461767027252e-07, "logits/chosen": -19.386661529541016, "logits/rejected": -18.85165023803711, "logps/chosen": -403.29742431640625, "logps/rejected": -332.2912292480469, "loss": 0.631, "rewards/accuracies": 0.5, "rewards/chosen": 2.5704636573791504, "rewards/margins": 0.42524704337120056, "rewards/rejected": 2.145216941833496, "step": 34530 }, { "epoch": 1.6036027670736803, "grad_norm": 98.37350463867188, "learning_rate": 2.0383676122382653e-07, "logits/chosen": -19.430606842041016, "logits/rejected": -18.846364974975586, "logps/chosen": -396.71905517578125, "logps/rejected": -303.1951599121094, "loss": 0.6234, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.199899196624756, "rewards/margins": 0.9114203453063965, "rewards/rejected": 2.2884788513183594, "step": 34540 }, { "epoch": 1.6040670411811133, "grad_norm": 39.878719329833984, "learning_rate": 2.0380890477738057e-07, "logits/chosen": -18.10243797302246, "logits/rejected": -17.91094970703125, "logps/chosen": -253.5084991455078, "logps/rejected": -218.3627166748047, "loss": 0.4295, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9568020105361938, "rewards/margins": 0.9931610822677612, "rewards/rejected": 0.9636405110359192, "step": 34550 
}, { "epoch": 1.6045313152885463, "grad_norm": 149.73817443847656, "learning_rate": 2.037810483309346e-07, "logits/chosen": -20.21042251586914, "logits/rejected": -19.915908813476562, "logps/chosen": -539.6642456054688, "logps/rejected": -442.271240234375, "loss": 0.5673, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.943414211273193, "rewards/margins": 1.5968334674835205, "rewards/rejected": 3.346580982208252, "step": 34560 }, { "epoch": 1.6049955893959793, "grad_norm": 46.647315979003906, "learning_rate": 2.0375319188448857e-07, "logits/chosen": -19.304004669189453, "logits/rejected": -17.81346893310547, "logps/chosen": -502.8960876464844, "logps/rejected": -367.5567626953125, "loss": 0.321, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.24149227142334, "rewards/margins": 1.589404821395874, "rewards/rejected": 2.6520869731903076, "step": 34570 }, { "epoch": 1.6054598635034125, "grad_norm": 52.35768127441406, "learning_rate": 2.0372533543804262e-07, "logits/chosen": -19.657373428344727, "logits/rejected": -18.48063087463379, "logps/chosen": -363.13702392578125, "logps/rejected": -288.7007141113281, "loss": 0.8433, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.399099826812744, "rewards/margins": 0.20983080565929413, "rewards/rejected": 2.1892690658569336, "step": 34580 }, { "epoch": 1.6059241376108453, "grad_norm": 29.7714900970459, "learning_rate": 2.0369747899159663e-07, "logits/chosen": -18.584793090820312, "logits/rejected": -17.772836685180664, "logps/chosen": -366.4189453125, "logps/rejected": -239.0480499267578, "loss": 0.3933, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.348048686981201, "rewards/margins": 1.4802830219268799, "rewards/rejected": 1.8677656650543213, "step": 34590 }, { "epoch": 1.6063884117182785, "grad_norm": 78.44898986816406, "learning_rate": 2.0366962254515064e-07, "logits/chosen": -19.556495666503906, "logits/rejected": -18.444168090820312, "logps/chosen": -519.1331787109375, 
"logps/rejected": -420.124755859375, "loss": 0.5141, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.854283571243286, "rewards/margins": 1.302603840827942, "rewards/rejected": 2.551679849624634, "step": 34600 }, { "epoch": 1.6068526858257115, "grad_norm": 92.13692474365234, "learning_rate": 2.0364176609870466e-07, "logits/chosen": -19.00323486328125, "logits/rejected": -18.569120407104492, "logps/chosen": -281.0039367675781, "logps/rejected": -312.21319580078125, "loss": 0.9945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2275424003601074, "rewards/margins": -0.10125680267810822, "rewards/rejected": 2.328799247741699, "step": 34610 }, { "epoch": 1.6073169599331445, "grad_norm": 149.86598205566406, "learning_rate": 2.0361390965225867e-07, "logits/chosen": -18.655555725097656, "logits/rejected": -17.70585823059082, "logps/chosen": -513.67529296875, "logps/rejected": -331.199951171875, "loss": 0.3613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.066997051239014, "rewards/margins": 1.7604519128799438, "rewards/rejected": 2.306544780731201, "step": 34620 }, { "epoch": 1.6077812340405777, "grad_norm": 131.86488342285156, "learning_rate": 2.0358605320581272e-07, "logits/chosen": -18.737396240234375, "logits/rejected": -17.76751708984375, "logps/chosen": -395.6728820800781, "logps/rejected": -276.1379699707031, "loss": 0.244, "rewards/accuracies": 1.0, "rewards/chosen": 2.9722094535827637, "rewards/margins": 1.6067641973495483, "rewards/rejected": 1.3654451370239258, "step": 34630 }, { "epoch": 1.6082455081480105, "grad_norm": 119.78832244873047, "learning_rate": 2.035581967593667e-07, "logits/chosen": -18.318355560302734, "logits/rejected": -17.411453247070312, "logps/chosen": -397.7091064453125, "logps/rejected": -297.88970947265625, "loss": 0.6487, "rewards/accuracies": 0.5, "rewards/chosen": 3.5469894409179688, "rewards/margins": 1.355994462966919, "rewards/rejected": 2.19099497795105, "step": 34640 }, { "epoch": 
1.6087097822554437, "grad_norm": 184.19839477539062, "learning_rate": 2.0353034031292074e-07, "logits/chosen": -19.09261703491211, "logits/rejected": -19.152591705322266, "logps/chosen": -394.60650634765625, "logps/rejected": -387.8609924316406, "loss": 0.5662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.55474853515625, "rewards/margins": 0.8068276643753052, "rewards/rejected": 2.747920513153076, "step": 34650 }, { "epoch": 1.6091740563628767, "grad_norm": 0.7911074161529541, "learning_rate": 2.0350248386647476e-07, "logits/chosen": -18.880800247192383, "logits/rejected": -17.52134895324707, "logps/chosen": -359.0381774902344, "logps/rejected": -240.1748046875, "loss": 0.5872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6947808265686035, "rewards/margins": 1.1739381551742554, "rewards/rejected": 1.5208425521850586, "step": 34660 }, { "epoch": 1.6096383304703097, "grad_norm": 6.056875705718994, "learning_rate": 2.034746274200288e-07, "logits/chosen": -19.31474494934082, "logits/rejected": -18.588420867919922, "logps/chosen": -456.2001037597656, "logps/rejected": -341.9753112792969, "loss": 0.2735, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7628226280212402, "rewards/margins": 1.7725902795791626, "rewards/rejected": 1.990233063697815, "step": 34670 }, { "epoch": 1.6101026045777427, "grad_norm": 61.64647674560547, "learning_rate": 2.034467709735828e-07, "logits/chosen": -18.074199676513672, "logits/rejected": -18.691425323486328, "logps/chosen": -300.97100830078125, "logps/rejected": -349.854248046875, "loss": 0.7653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.945465087890625, "rewards/margins": 0.2520391345024109, "rewards/rejected": 2.6934256553649902, "step": 34680 }, { "epoch": 1.6105668786851757, "grad_norm": 60.64046096801758, "learning_rate": 2.034189145271368e-07, "logits/chosen": -20.787734985351562, "logits/rejected": -19.027450561523438, "logps/chosen": -394.39459228515625, 
"logps/rejected": -236.8543243408203, "loss": 0.1975, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.440085411071777, "rewards/margins": 2.5809638500213623, "rewards/rejected": 1.859121322631836, "step": 34690 }, { "epoch": 1.611031152792609, "grad_norm": 35.03410339355469, "learning_rate": 2.0339105808069084e-07, "logits/chosen": -19.01145362854004, "logits/rejected": -19.200170516967773, "logps/chosen": -279.59820556640625, "logps/rejected": -310.986572265625, "loss": 1.0757, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6142077445983887, "rewards/margins": -0.13854405283927917, "rewards/rejected": 2.7527518272399902, "step": 34700 }, { "epoch": 1.6114954269000417, "grad_norm": 228.39157104492188, "learning_rate": 2.0336320163424486e-07, "logits/chosen": -20.012319564819336, "logits/rejected": -19.012191772460938, "logps/chosen": -405.25604248046875, "logps/rejected": -274.7823791503906, "loss": 0.6353, "rewards/accuracies": 0.5, "rewards/chosen": 3.2895798683166504, "rewards/margins": 1.386216163635254, "rewards/rejected": 1.903363585472107, "step": 34710 }, { "epoch": 1.6119597010074749, "grad_norm": 56.102481842041016, "learning_rate": 2.0333534518779885e-07, "logits/chosen": -19.010475158691406, "logits/rejected": -18.379228591918945, "logps/chosen": -468.9864807128906, "logps/rejected": -383.7703857421875, "loss": 0.6522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.308851718902588, "rewards/margins": 1.3108280897140503, "rewards/rejected": 2.998023271560669, "step": 34720 }, { "epoch": 1.6124239751149079, "grad_norm": 126.7437515258789, "learning_rate": 2.033074887413529e-07, "logits/chosen": -18.357559204101562, "logits/rejected": -18.377315521240234, "logps/chosen": -491.03533935546875, "logps/rejected": -447.935791015625, "loss": 1.3359, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.915715456008911, "rewards/margins": -0.458101361989975, "rewards/rejected": 4.373816967010498, "step": 34730 
}, { "epoch": 1.6128882492223409, "grad_norm": 36.582035064697266, "learning_rate": 2.032796322949069e-07, "logits/chosen": -19.308002471923828, "logits/rejected": -17.997066497802734, "logps/chosen": -422.7821350097656, "logps/rejected": -306.5050354003906, "loss": 0.4464, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.0701069831848145, "rewards/margins": 1.513213038444519, "rewards/rejected": 2.556893825531006, "step": 34740 }, { "epoch": 1.6133525233297739, "grad_norm": 65.24774932861328, "learning_rate": 2.0325177584846094e-07, "logits/chosen": -18.833757400512695, "logits/rejected": -19.143468856811523, "logps/chosen": -349.17999267578125, "logps/rejected": -390.7877502441406, "loss": 0.854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5619977712631226, "rewards/margins": -0.1265159547328949, "rewards/rejected": 1.6885137557983398, "step": 34750 }, { "epoch": 1.6138167974372069, "grad_norm": 0.6247054934501648, "learning_rate": 2.0322391940201493e-07, "logits/chosen": -19.510242462158203, "logits/rejected": -17.919816970825195, "logps/chosen": -443.6184997558594, "logps/rejected": -314.037841796875, "loss": 0.7007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5932857990264893, "rewards/margins": 1.267325758934021, "rewards/rejected": 2.325960159301758, "step": 34760 }, { "epoch": 1.61428107154464, "grad_norm": 52.85428237915039, "learning_rate": 2.0319606295556897e-07, "logits/chosen": -19.080650329589844, "logits/rejected": -18.286087036132812, "logps/chosen": -285.5682373046875, "logps/rejected": -233.760986328125, "loss": 0.3161, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.238591432571411, "rewards/margins": 1.36594557762146, "rewards/rejected": 1.872645616531372, "step": 34770 }, { "epoch": 1.6147453456520728, "grad_norm": 58.94129943847656, "learning_rate": 2.0316820650912299e-07, "logits/chosen": -18.53312110900879, "logits/rejected": -18.653240203857422, "logps/chosen": 
-317.48272705078125, "logps/rejected": -324.0579833984375, "loss": 1.1615, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.95908784866333, "rewards/margins": 0.31534355878829956, "rewards/rejected": 2.6437439918518066, "step": 34780 }, { "epoch": 1.615209619759506, "grad_norm": 76.51573944091797, "learning_rate": 2.0314035006267697e-07, "logits/chosen": -20.640066146850586, "logits/rejected": -19.22425651550293, "logps/chosen": -462.3716735839844, "logps/rejected": -297.70758056640625, "loss": 0.2342, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7058472633361816, "rewards/margins": 1.7815431356430054, "rewards/rejected": 1.9243040084838867, "step": 34790 }, { "epoch": 1.615673893866939, "grad_norm": 211.59613037109375, "learning_rate": 2.0311249361623102e-07, "logits/chosen": -19.45594596862793, "logits/rejected": -18.60232162475586, "logps/chosen": -329.0508728027344, "logps/rejected": -318.4164733886719, "loss": 1.1124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2352371215820312, "rewards/margins": 0.33313050866127014, "rewards/rejected": 2.902106761932373, "step": 34800 }, { "epoch": 1.616138167974372, "grad_norm": 111.67070770263672, "learning_rate": 2.0308463716978503e-07, "logits/chosen": -19.017236709594727, "logits/rejected": -19.27398681640625, "logps/chosen": -513.6177978515625, "logps/rejected": -520.932373046875, "loss": 1.134, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.685080051422119, "rewards/margins": -0.6151472926139832, "rewards/rejected": 3.300226926803589, "step": 34810 }, { "epoch": 1.6166024420818053, "grad_norm": 236.28929138183594, "learning_rate": 2.0305678072333907e-07, "logits/chosen": -18.06161117553711, "logits/rejected": -18.124141693115234, "logps/chosen": -350.6637268066406, "logps/rejected": -342.41607666015625, "loss": 1.0584, "rewards/accuracies": 0.5, "rewards/chosen": 3.3419430255889893, "rewards/margins": 0.48308712244033813, "rewards/rejected": 
2.858855962753296, "step": 34820 }, { "epoch": 1.617066716189238, "grad_norm": 26.977439880371094, "learning_rate": 2.0302892427689306e-07, "logits/chosen": -20.198123931884766, "logits/rejected": -18.976408004760742, "logps/chosen": -472.20037841796875, "logps/rejected": -382.481689453125, "loss": 0.3372, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.785888195037842, "rewards/margins": 1.6871347427368164, "rewards/rejected": 3.0987539291381836, "step": 34830 }, { "epoch": 1.6175309902966712, "grad_norm": 16.452598571777344, "learning_rate": 2.0300106783044707e-07, "logits/chosen": -18.883480072021484, "logits/rejected": -18.276620864868164, "logps/chosen": -392.4532165527344, "logps/rejected": -345.58062744140625, "loss": 0.9296, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6465365886688232, "rewards/margins": 0.6526879668235779, "rewards/rejected": 2.9938488006591797, "step": 34840 }, { "epoch": 1.617995264404104, "grad_norm": 85.3320083618164, "learning_rate": 2.0297321138400111e-07, "logits/chosen": -19.188159942626953, "logits/rejected": -18.43178939819336, "logps/chosen": -512.5355224609375, "logps/rejected": -388.7176818847656, "loss": 0.7591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6961636543273926, "rewards/margins": 0.8764761686325073, "rewards/rejected": 2.819687604904175, "step": 34850 }, { "epoch": 1.6184595385115372, "grad_norm": 123.63417053222656, "learning_rate": 2.0294535493755513e-07, "logits/chosen": -18.988895416259766, "logits/rejected": -19.43523406982422, "logps/chosen": -389.684326171875, "logps/rejected": -379.00238037109375, "loss": 1.0346, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5983970165252686, "rewards/margins": 0.3248656392097473, "rewards/rejected": 3.273531675338745, "step": 34860 }, { "epoch": 1.6189238126189702, "grad_norm": 1.603014588356018, "learning_rate": 2.0291749849110912e-07, "logits/chosen": -18.838293075561523, "logits/rejected": 
-18.4399471282959, "logps/chosen": -390.92755126953125, "logps/rejected": -303.97650146484375, "loss": 0.5344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.905783176422119, "rewards/margins": 1.4910647869110107, "rewards/rejected": 2.4147181510925293, "step": 34870 }, { "epoch": 1.6193880867264032, "grad_norm": 100.3228759765625, "learning_rate": 2.0288964204466316e-07, "logits/chosen": -19.18766975402832, "logits/rejected": -18.19119644165039, "logps/chosen": -395.6658630371094, "logps/rejected": -264.734375, "loss": 0.3822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6157500743865967, "rewards/margins": 1.3480122089385986, "rewards/rejected": 2.267738103866577, "step": 34880 }, { "epoch": 1.6198523608338364, "grad_norm": 34.6096305847168, "learning_rate": 2.0286178559821717e-07, "logits/chosen": -18.10299301147461, "logits/rejected": -17.43246841430664, "logps/chosen": -396.82574462890625, "logps/rejected": -323.8291320800781, "loss": 0.3656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1392605304718018, "rewards/margins": 1.306282639503479, "rewards/rejected": 1.8329778909683228, "step": 34890 }, { "epoch": 1.6203166349412692, "grad_norm": 34.69496536254883, "learning_rate": 2.0283392915177121e-07, "logits/chosen": -18.96083641052246, "logits/rejected": -18.180755615234375, "logps/chosen": -367.3414611816406, "logps/rejected": -258.303466796875, "loss": 0.4121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4888527393341064, "rewards/margins": 1.518223524093628, "rewards/rejected": 1.970629096031189, "step": 34900 }, { "epoch": 1.6207809090487024, "grad_norm": 120.740966796875, "learning_rate": 2.028060727053252e-07, "logits/chosen": -18.791797637939453, "logits/rejected": -17.517932891845703, "logps/chosen": -414.49090576171875, "logps/rejected": -310.3055114746094, "loss": 0.5227, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.562716007232666, "rewards/margins": 0.7947182655334473, 
"rewards/rejected": 1.7679977416992188, "step": 34910 }, { "epoch": 1.6212451831561354, "grad_norm": 15.393279075622559, "learning_rate": 2.0277821625887924e-07, "logits/chosen": -19.885459899902344, "logits/rejected": -19.200233459472656, "logps/chosen": -413.47528076171875, "logps/rejected": -381.00225830078125, "loss": 0.6369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0308265686035156, "rewards/margins": 0.6644647121429443, "rewards/rejected": 2.3663620948791504, "step": 34920 }, { "epoch": 1.6217094572635684, "grad_norm": 55.13956069946289, "learning_rate": 2.0275035981243326e-07, "logits/chosen": -18.065746307373047, "logits/rejected": -18.61532974243164, "logps/chosen": -309.3695983886719, "logps/rejected": -367.0119323730469, "loss": 1.3158, "rewards/accuracies": 0.5, "rewards/chosen": 2.272512674331665, "rewards/margins": -0.4459146559238434, "rewards/rejected": 2.7184276580810547, "step": 34930 }, { "epoch": 1.6221737313710014, "grad_norm": 65.96365356445312, "learning_rate": 2.027225033659873e-07, "logits/chosen": -19.70210838317871, "logits/rejected": -18.138080596923828, "logps/chosen": -391.216796875, "logps/rejected": -285.4339294433594, "loss": 0.7596, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2313599586486816, "rewards/margins": 1.0396859645843506, "rewards/rejected": 2.191673755645752, "step": 34940 }, { "epoch": 1.6226380054784344, "grad_norm": 52.95037078857422, "learning_rate": 2.0269464691954129e-07, "logits/chosen": -18.563671112060547, "logits/rejected": -17.796188354492188, "logps/chosen": -331.3917541503906, "logps/rejected": -265.41534423828125, "loss": 0.5895, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.777374744415283, "rewards/margins": 0.8226083517074585, "rewards/rejected": 1.9547662734985352, "step": 34950 }, { "epoch": 1.6231022795858676, "grad_norm": 60.35563278198242, "learning_rate": 2.026667904730953e-07, "logits/chosen": -19.125837326049805, "logits/rejected": 
-18.88254737854004, "logps/chosen": -365.0944519042969, "logps/rejected": -316.1874084472656, "loss": 0.4756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9575743675231934, "rewards/margins": 0.919925332069397, "rewards/rejected": 2.037648916244507, "step": 34960 }, { "epoch": 1.6235665536933004, "grad_norm": 204.7823486328125, "learning_rate": 2.0263893402664934e-07, "logits/chosen": -18.702075958251953, "logits/rejected": -18.28249740600586, "logps/chosen": -371.835205078125, "logps/rejected": -304.49871826171875, "loss": 0.9925, "rewards/accuracies": 0.5, "rewards/chosen": 3.172081708908081, "rewards/margins": 0.7900556325912476, "rewards/rejected": 2.3820266723632812, "step": 34970 }, { "epoch": 1.6240308278007336, "grad_norm": 3.974675416946411, "learning_rate": 2.0261107758020333e-07, "logits/chosen": -19.509462356567383, "logits/rejected": -17.943567276000977, "logps/chosen": -341.0163269042969, "logps/rejected": -217.82601928710938, "loss": 0.4846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.020164966583252, "rewards/margins": 1.7668508291244507, "rewards/rejected": 1.2533142566680908, "step": 34980 }, { "epoch": 1.6244951019081666, "grad_norm": 122.02130126953125, "learning_rate": 2.0258322113375734e-07, "logits/chosen": -19.655899047851562, "logits/rejected": -18.904569625854492, "logps/chosen": -426.8374938964844, "logps/rejected": -344.736083984375, "loss": 0.7066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.922091245651245, "rewards/margins": 0.6663147211074829, "rewards/rejected": 2.2557766437530518, "step": 34990 }, { "epoch": 1.6249593760155996, "grad_norm": 33.63351058959961, "learning_rate": 2.0255536468731139e-07, "logits/chosen": -19.54570960998535, "logits/rejected": -19.019744873046875, "logps/chosen": -474.68017578125, "logps/rejected": -397.8294982910156, "loss": 0.4215, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.332703590393066, "rewards/margins": 1.8111979961395264, 
"rewards/rejected": 2.521505117416382, "step": 35000 }, { "epoch": 1.6254236501230328, "grad_norm": 140.64671325683594, "learning_rate": 2.025275082408654e-07, "logits/chosen": -19.048404693603516, "logits/rejected": -18.266948699951172, "logps/chosen": -395.89154052734375, "logps/rejected": -301.91619873046875, "loss": 0.7605, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1754517555236816, "rewards/margins": 0.5868180990219116, "rewards/rejected": 2.5886337757110596, "step": 35010 }, { "epoch": 1.6258879242304656, "grad_norm": 50.75466537475586, "learning_rate": 2.0249965179441941e-07, "logits/chosen": -20.15439796447754, "logits/rejected": -19.11518096923828, "logps/chosen": -375.44866943359375, "logps/rejected": -268.61944580078125, "loss": 0.329, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7479827404022217, "rewards/margins": 1.371585488319397, "rewards/rejected": 2.3763973712921143, "step": 35020 }, { "epoch": 1.6263521983378988, "grad_norm": 35.00197982788086, "learning_rate": 2.0247179534797343e-07, "logits/chosen": -19.065387725830078, "logits/rejected": -18.425457000732422, "logps/chosen": -424.6468200683594, "logps/rejected": -473.3949279785156, "loss": 0.9631, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.566237688064575, "rewards/margins": 0.3580080568790436, "rewards/rejected": 3.2082297801971436, "step": 35030 }, { "epoch": 1.6268164724453316, "grad_norm": 2.843416452407837, "learning_rate": 2.0244393890152744e-07, "logits/chosen": -18.191967010498047, "logits/rejected": -17.493480682373047, "logps/chosen": -439.0061950683594, "logps/rejected": -312.9763488769531, "loss": 0.4618, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4255173206329346, "rewards/margins": 1.4186080694198608, "rewards/rejected": 2.0069093704223633, "step": 35040 }, { "epoch": 1.6272807465527648, "grad_norm": 64.41692352294922, "learning_rate": 2.0241608245508148e-07, "logits/chosen": -19.19042205810547, 
"logits/rejected": -18.93695068359375, "logps/chosen": -405.0580139160156, "logps/rejected": -384.32098388671875, "loss": 0.6227, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.98046875, "rewards/margins": 0.38281330466270447, "rewards/rejected": 3.5976555347442627, "step": 35050 }, { "epoch": 1.6277450206601978, "grad_norm": 103.39664459228516, "learning_rate": 2.0238822600863547e-07, "logits/chosen": -20.28557014465332, "logits/rejected": -19.054460525512695, "logps/chosen": -552.2449340820312, "logps/rejected": -442.27789306640625, "loss": 0.3967, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.557260990142822, "rewards/margins": 1.3003679513931274, "rewards/rejected": 3.2568931579589844, "step": 35060 }, { "epoch": 1.6282092947676308, "grad_norm": 72.55452728271484, "learning_rate": 2.0236036956218951e-07, "logits/chosen": -20.67200469970703, "logits/rejected": -19.649185180664062, "logps/chosen": -480.65557861328125, "logps/rejected": -395.3108215332031, "loss": 0.6386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.749089479446411, "rewards/margins": 1.0060206651687622, "rewards/rejected": 2.743068218231201, "step": 35070 }, { "epoch": 1.628673568875064, "grad_norm": 172.5098419189453, "learning_rate": 2.0233251311574353e-07, "logits/chosen": -18.591665267944336, "logits/rejected": -17.188556671142578, "logps/chosen": -340.1399841308594, "logps/rejected": -259.76654052734375, "loss": 0.6043, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5599026679992676, "rewards/margins": 0.8117591738700867, "rewards/rejected": 1.7481437921524048, "step": 35080 }, { "epoch": 1.6291378429824968, "grad_norm": 94.66056060791016, "learning_rate": 2.0230465666929757e-07, "logits/chosen": -18.68153953552246, "logits/rejected": -18.607982635498047, "logps/chosen": -394.69464111328125, "logps/rejected": -377.048583984375, "loss": 0.9987, "rewards/accuracies": 0.5, "rewards/chosen": 3.4990413188934326, "rewards/margins": 
0.25842171907424927, "rewards/rejected": 3.240619659423828, "step": 35090 }, { "epoch": 1.62960211708993, "grad_norm": 165.86940002441406, "learning_rate": 2.0227680022285156e-07, "logits/chosen": -18.73582649230957, "logits/rejected": -18.865198135375977, "logps/chosen": -369.93218994140625, "logps/rejected": -392.10601806640625, "loss": 0.9149, "rewards/accuracies": 0.5, "rewards/chosen": 2.4099316596984863, "rewards/margins": 0.14256621897220612, "rewards/rejected": 2.267364978790283, "step": 35100 }, { "epoch": 1.630066391197363, "grad_norm": 27.338212966918945, "learning_rate": 2.0224894377640557e-07, "logits/chosen": -17.920612335205078, "logits/rejected": -17.071735382080078, "logps/chosen": -413.2915954589844, "logps/rejected": -243.0310821533203, "loss": 0.6104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.561145067214966, "rewards/margins": 0.9355882406234741, "rewards/rejected": 1.6255569458007812, "step": 35110 }, { "epoch": 1.630530665304796, "grad_norm": 37.5009765625, "learning_rate": 2.022210873299596e-07, "logits/chosen": -19.38271713256836, "logits/rejected": -18.384891510009766, "logps/chosen": -330.4400634765625, "logps/rejected": -297.6503601074219, "loss": 0.3838, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.246677875518799, "rewards/margins": 1.1186667680740356, "rewards/rejected": 2.1280109882354736, "step": 35120 }, { "epoch": 1.630994939412229, "grad_norm": 68.57245635986328, "learning_rate": 2.0219323088351363e-07, "logits/chosen": -19.813922882080078, "logits/rejected": -19.256366729736328, "logps/chosen": -361.8562316894531, "logps/rejected": -368.63836669921875, "loss": 0.5741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1859092712402344, "rewards/margins": 0.6933329701423645, "rewards/rejected": 2.4925763607025146, "step": 35130 }, { "epoch": 1.631459213519662, "grad_norm": 6.485719203948975, "learning_rate": 2.0216537443706762e-07, "logits/chosen": -18.41254234313965, 
"logits/rejected": -19.167842864990234, "logps/chosen": -254.13134765625, "logps/rejected": -329.7611999511719, "loss": 1.1932, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.1775550842285156, "rewards/margins": -0.42584162950515747, "rewards/rejected": 2.6033968925476074, "step": 35140 }, { "epoch": 1.6319234876270952, "grad_norm": 30.21062660217285, "learning_rate": 2.0213751799062166e-07, "logits/chosen": -19.23236083984375, "logits/rejected": -18.36354637145996, "logps/chosen": -351.64837646484375, "logps/rejected": -192.64675903320312, "loss": 0.9988, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2416296005249023, "rewards/margins": 0.7761621475219727, "rewards/rejected": 1.4654672145843506, "step": 35150 }, { "epoch": 1.632387761734528, "grad_norm": 90.02214050292969, "learning_rate": 2.0210966154417567e-07, "logits/chosen": -19.551610946655273, "logits/rejected": -19.69797134399414, "logps/chosen": -302.35699462890625, "logps/rejected": -320.0830078125, "loss": 1.1103, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2525649070739746, "rewards/margins": -0.11946399509906769, "rewards/rejected": 2.3720285892486572, "step": 35160 }, { "epoch": 1.6328520358419611, "grad_norm": 28.240345001220703, "learning_rate": 2.0208180509772969e-07, "logits/chosen": -17.569181442260742, "logits/rejected": -17.297931671142578, "logps/chosen": -282.16107177734375, "logps/rejected": -208.67822265625, "loss": 0.3713, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3634679317474365, "rewards/margins": 1.140202522277832, "rewards/rejected": 1.2232654094696045, "step": 35170 }, { "epoch": 1.6333163099493941, "grad_norm": 26.57573890686035, "learning_rate": 2.020539486512837e-07, "logits/chosen": -18.161388397216797, "logits/rejected": -18.163311004638672, "logps/chosen": -342.95391845703125, "logps/rejected": -285.59954833984375, "loss": 0.918, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0621325969696045, 
"rewards/margins": 0.05784636735916138, "rewards/rejected": 2.004286289215088, "step": 35180 }, { "epoch": 1.6337805840568271, "grad_norm": 102.81233978271484, "learning_rate": 2.0202609220483774e-07, "logits/chosen": -18.33846092224121, "logits/rejected": -18.67259407043457, "logps/chosen": -388.84222412109375, "logps/rejected": -494.13348388671875, "loss": 0.7574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5882885456085205, "rewards/margins": 0.4878184199333191, "rewards/rejected": 3.1004700660705566, "step": 35190 }, { "epoch": 1.6342448581642601, "grad_norm": 55.77485275268555, "learning_rate": 2.0199823575839176e-07, "logits/chosen": -18.547069549560547, "logits/rejected": -18.058155059814453, "logps/chosen": -338.08453369140625, "logps/rejected": -243.2981719970703, "loss": 0.831, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.483508348464966, "rewards/margins": 0.5434225797653198, "rewards/rejected": 1.9400854110717773, "step": 35200 }, { "epoch": 1.6347091322716931, "grad_norm": 7.8886494636535645, "learning_rate": 2.0197037931194574e-07, "logits/chosen": -19.112993240356445, "logits/rejected": -17.948040008544922, "logps/chosen": -433.7574157714844, "logps/rejected": -287.3011474609375, "loss": 0.718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.98909330368042, "rewards/margins": 1.0814834833145142, "rewards/rejected": 1.9076099395751953, "step": 35210 }, { "epoch": 1.6351734063791263, "grad_norm": 36.233985900878906, "learning_rate": 2.0194252286549978e-07, "logits/chosen": -19.572118759155273, "logits/rejected": -18.52564811706543, "logps/chosen": -515.1915283203125, "logps/rejected": -418.89178466796875, "loss": 0.5675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8008530139923096, "rewards/margins": 1.1370424032211304, "rewards/rejected": 2.6638104915618896, "step": 35220 }, { "epoch": 1.635637680486559, "grad_norm": 54.67341232299805, "learning_rate": 2.019146664190538e-07, 
"logits/chosen": -19.34733009338379, "logits/rejected": -18.074207305908203, "logps/chosen": -464.76751708984375, "logps/rejected": -365.80224609375, "loss": 0.8092, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6324551105499268, "rewards/margins": 0.9281859397888184, "rewards/rejected": 2.7042689323425293, "step": 35230 }, { "epoch": 1.6361019545939923, "grad_norm": 34.091827392578125, "learning_rate": 2.0188680997260784e-07, "logits/chosen": -19.117271423339844, "logits/rejected": -18.595144271850586, "logps/chosen": -300.6875305175781, "logps/rejected": -249.2833709716797, "loss": 0.9009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0899696350097656, "rewards/margins": 0.23985068500041962, "rewards/rejected": 1.850118637084961, "step": 35240 }, { "epoch": 1.6365662287014253, "grad_norm": 101.6327133178711, "learning_rate": 2.0185895352616183e-07, "logits/chosen": -19.524499893188477, "logits/rejected": -19.126728057861328, "logps/chosen": -275.2115783691406, "logps/rejected": -229.47708129882812, "loss": 0.911, "rewards/accuracies": 0.5, "rewards/chosen": 1.7810325622558594, "rewards/margins": 0.221779465675354, "rewards/rejected": 1.5592530965805054, "step": 35250 }, { "epoch": 1.6370305028088583, "grad_norm": 1.6383333206176758, "learning_rate": 2.0183109707971584e-07, "logits/chosen": -18.55113983154297, "logits/rejected": -17.672489166259766, "logps/chosen": -338.4887390136719, "logps/rejected": -251.4482879638672, "loss": 0.3262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5308144092559814, "rewards/margins": 1.6015799045562744, "rewards/rejected": 1.9292348623275757, "step": 35260 }, { "epoch": 1.6374947769162915, "grad_norm": 250.4969940185547, "learning_rate": 2.0180324063326988e-07, "logits/chosen": -19.023466110229492, "logits/rejected": -19.887842178344727, "logps/chosen": -471.1878967285156, "logps/rejected": -487.033935546875, "loss": 1.0484, "rewards/accuracies": 0.5, "rewards/chosen": 
3.934633255004883, "rewards/margins": 0.0626080185174942, "rewards/rejected": 3.87202525138855, "step": 35270 }, { "epoch": 1.6379590510237243, "grad_norm": 5.268354892730713, "learning_rate": 2.017753841868239e-07, "logits/chosen": -18.666879653930664, "logits/rejected": -17.541749954223633, "logps/chosen": -387.97430419921875, "logps/rejected": -256.80548095703125, "loss": 0.4724, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7207419872283936, "rewards/margins": 1.0862705707550049, "rewards/rejected": 1.6344711780548096, "step": 35280 }, { "epoch": 1.6384233251311575, "grad_norm": 15.389337539672852, "learning_rate": 2.0174752774037789e-07, "logits/chosen": -18.091440200805664, "logits/rejected": -18.309045791625977, "logps/chosen": -233.41201782226562, "logps/rejected": -289.3629150390625, "loss": 0.9043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.131155252456665, "rewards/margins": 0.37726113200187683, "rewards/rejected": 1.7538944482803345, "step": 35290 }, { "epoch": 1.6388875992385905, "grad_norm": 39.796630859375, "learning_rate": 2.0171967129393193e-07, "logits/chosen": -18.652835845947266, "logits/rejected": -18.81204605102539, "logps/chosen": -366.41656494140625, "logps/rejected": -364.5201110839844, "loss": 0.736, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.334312915802002, "rewards/margins": 0.02867899462580681, "rewards/rejected": 2.305634021759033, "step": 35300 }, { "epoch": 1.6393518733460235, "grad_norm": 3.280697822570801, "learning_rate": 2.0169181484748594e-07, "logits/chosen": -18.41750144958496, "logits/rejected": -17.59612464904785, "logps/chosen": -315.3785095214844, "logps/rejected": -250.2706298828125, "loss": 0.5764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2734627723693848, "rewards/margins": 1.5301384925842285, "rewards/rejected": 1.7433242797851562, "step": 35310 }, { "epoch": 1.6398161474534565, "grad_norm": 50.94623565673828, "learning_rate": 
2.0166395840103998e-07, "logits/chosen": -18.86223793029785, "logits/rejected": -17.146406173706055, "logps/chosen": -399.07830810546875, "logps/rejected": -280.4588317871094, "loss": 0.6794, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9987895488739014, "rewards/margins": 1.180121898651123, "rewards/rejected": 1.8186676502227783, "step": 35320 }, { "epoch": 1.6402804215608895, "grad_norm": 16.383163452148438, "learning_rate": 2.0163610195459397e-07, "logits/chosen": -18.73811912536621, "logits/rejected": -17.491687774658203, "logps/chosen": -489.9315490722656, "logps/rejected": -335.25811767578125, "loss": 0.5087, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.025382995605469, "rewards/margins": 1.4990848302841187, "rewards/rejected": 2.5262985229492188, "step": 35330 }, { "epoch": 1.6407446956683227, "grad_norm": 148.38612365722656, "learning_rate": 2.01608245508148e-07, "logits/chosen": -19.022253036499023, "logits/rejected": -19.11349105834961, "logps/chosen": -524.7828369140625, "logps/rejected": -439.38470458984375, "loss": 0.8416, "rewards/accuracies": 0.5, "rewards/chosen": 3.386890411376953, "rewards/margins": 0.1715337336063385, "rewards/rejected": 3.2153563499450684, "step": 35340 }, { "epoch": 1.6412089697757555, "grad_norm": 35.75102615356445, "learning_rate": 2.0158038906170203e-07, "logits/chosen": -18.948261260986328, "logits/rejected": -18.236221313476562, "logps/chosen": -484.46246337890625, "logps/rejected": -372.1646728515625, "loss": 0.4456, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7412428855895996, "rewards/margins": 1.3740012645721436, "rewards/rejected": 2.367241382598877, "step": 35350 }, { "epoch": 1.6416732438831887, "grad_norm": 15.235627174377441, "learning_rate": 2.0155253261525607e-07, "logits/chosen": -19.660999298095703, "logits/rejected": -18.81353187561035, "logps/chosen": -353.44354248046875, "logps/rejected": -341.6640625, "loss": 0.6729, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 3.106503963470459, "rewards/margins": 0.6367896795272827, "rewards/rejected": 2.469714403152466, "step": 35360 }, { "epoch": 1.6421375179906217, "grad_norm": 16.97924041748047, "learning_rate": 2.0152467616881006e-07, "logits/chosen": -19.56239891052246, "logits/rejected": -19.344181060791016, "logps/chosen": -443.252197265625, "logps/rejected": -385.7247009277344, "loss": 0.8238, "rewards/accuracies": 0.5, "rewards/chosen": 3.6185097694396973, "rewards/margins": 0.16044124960899353, "rewards/rejected": 3.458068370819092, "step": 35370 }, { "epoch": 1.6426017920980547, "grad_norm": 27.271251678466797, "learning_rate": 2.0149681972236407e-07, "logits/chosen": -18.024227142333984, "logits/rejected": -17.6854190826416, "logps/chosen": -313.5819396972656, "logps/rejected": -260.4505310058594, "loss": 0.6949, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1758294105529785, "rewards/margins": 0.5416935086250305, "rewards/rejected": 1.6341358423233032, "step": 35380 }, { "epoch": 1.6430660662054877, "grad_norm": 4.716803550720215, "learning_rate": 2.014689632759181e-07, "logits/chosen": -19.945934295654297, "logits/rejected": -18.832019805908203, "logps/chosen": -480.1521911621094, "logps/rejected": -289.4368896484375, "loss": 0.7096, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2817142009735107, "rewards/margins": 1.217495083808899, "rewards/rejected": 2.0642189979553223, "step": 35390 }, { "epoch": 1.6435303403129207, "grad_norm": 106.42182922363281, "learning_rate": 2.014411068294721e-07, "logits/chosen": -19.534618377685547, "logits/rejected": -18.962373733520508, "logps/chosen": -481.1641540527344, "logps/rejected": -385.1488342285156, "loss": 0.4544, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.981875419616699, "rewards/margins": 0.896920382976532, "rewards/rejected": 2.0849547386169434, "step": 35400 }, { "epoch": 1.6439946144203539, "grad_norm": 62.02156448364258, "learning_rate": 
2.0141325038302611e-07, "logits/chosen": -17.86865234375, "logits/rejected": -17.50469398498535, "logps/chosen": -316.24078369140625, "logps/rejected": -291.3163757324219, "loss": 0.7531, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4068093299865723, "rewards/margins": 0.3335267901420593, "rewards/rejected": 2.0732827186584473, "step": 35410 }, { "epoch": 1.6444588885277867, "grad_norm": 110.8995590209961, "learning_rate": 2.0138539393658015e-07, "logits/chosen": -19.00838851928711, "logits/rejected": -18.43697166442871, "logps/chosen": -469.7964782714844, "logps/rejected": -400.145263671875, "loss": 0.703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1476120948791504, "rewards/margins": 0.7766801118850708, "rewards/rejected": 2.370932102203369, "step": 35420 }, { "epoch": 1.6449231626352199, "grad_norm": 113.14839935302734, "learning_rate": 2.0135753749013417e-07, "logits/chosen": -18.855083465576172, "logits/rejected": -18.19021987915039, "logps/chosen": -394.5973205566406, "logps/rejected": -264.1786193847656, "loss": 0.5151, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.2343528270721436, "rewards/margins": 0.7403705716133118, "rewards/rejected": 1.4939820766448975, "step": 35430 }, { "epoch": 1.6453874367426529, "grad_norm": 99.66244506835938, "learning_rate": 2.0132968104368818e-07, "logits/chosen": -20.760456085205078, "logits/rejected": -19.702953338623047, "logps/chosen": -430.25811767578125, "logps/rejected": -314.2684631347656, "loss": 0.4722, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.862138032913208, "rewards/margins": 1.1517150402069092, "rewards/rejected": 1.7104228734970093, "step": 35440 }, { "epoch": 1.6458517108500859, "grad_norm": 11.771214485168457, "learning_rate": 2.013018245972422e-07, "logits/chosen": -19.058116912841797, "logits/rejected": -17.720096588134766, "logps/chosen": -508.41046142578125, "logps/rejected": -315.7432556152344, "loss": 0.3641, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.8649799823760986, "rewards/margins": 1.5811620950698853, "rewards/rejected": 2.283817768096924, "step": 35450 }, { "epoch": 1.646315984957519, "grad_norm": 116.4475326538086, "learning_rate": 2.012739681507962e-07, "logits/chosen": -18.30768394470215, "logits/rejected": -18.087488174438477, "logps/chosen": -319.7944641113281, "logps/rejected": -342.48028564453125, "loss": 1.1857, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0331385135650635, "rewards/margins": 0.1456930935382843, "rewards/rejected": 2.8874452114105225, "step": 35460 }, { "epoch": 1.6467802590649518, "grad_norm": 199.67552185058594, "learning_rate": 2.0124611170435025e-07, "logits/chosen": -19.847957611083984, "logits/rejected": -18.63269805908203, "logps/chosen": -327.1537170410156, "logps/rejected": -299.61541748046875, "loss": 0.6023, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.094494342803955, "rewards/margins": 1.209887146949768, "rewards/rejected": 2.8846070766448975, "step": 35470 }, { "epoch": 1.647244533172385, "grad_norm": 113.70222473144531, "learning_rate": 2.0121825525790424e-07, "logits/chosen": -18.279739379882812, "logits/rejected": -18.625341415405273, "logps/chosen": -331.3253479003906, "logps/rejected": -322.49249267578125, "loss": 0.5788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.036842107772827, "rewards/margins": 0.5519052743911743, "rewards/rejected": 2.4849371910095215, "step": 35480 }, { "epoch": 1.647708807279818, "grad_norm": 54.098426818847656, "learning_rate": 2.0119039881145828e-07, "logits/chosen": -18.860151290893555, "logits/rejected": -17.81053352355957, "logps/chosen": -441.57965087890625, "logps/rejected": -351.20867919921875, "loss": 0.6707, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5106029510498047, "rewards/margins": 0.9742438197135925, "rewards/rejected": 2.5363590717315674, "step": 35490 }, { "epoch": 1.648173081387251, "grad_norm": 
141.19496154785156, "learning_rate": 2.011625423650123e-07, "logits/chosen": -20.586000442504883, "logits/rejected": -19.916812896728516, "logps/chosen": -483.76031494140625, "logps/rejected": -384.76336669921875, "loss": 0.3758, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.821114540100098, "rewards/margins": 1.2106115818023682, "rewards/rejected": 3.6105027198791504, "step": 35500 }, { "epoch": 1.648637355494684, "grad_norm": 80.35440826416016, "learning_rate": 2.0113468591856634e-07, "logits/chosen": -19.696563720703125, "logits/rejected": -19.21995735168457, "logps/chosen": -490.6822814941406, "logps/rejected": -461.6526794433594, "loss": 0.6774, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7470321655273438, "rewards/margins": 0.35491520166397095, "rewards/rejected": 3.3921170234680176, "step": 35510 }, { "epoch": 1.649101629602117, "grad_norm": 171.31396484375, "learning_rate": 2.0110682947212033e-07, "logits/chosen": -18.522109985351562, "logits/rejected": -18.986194610595703, "logps/chosen": -388.4756164550781, "logps/rejected": -379.72113037109375, "loss": 1.5159, "rewards/accuracies": 0.5, "rewards/chosen": 2.928917407989502, "rewards/margins": -0.5817351341247559, "rewards/rejected": 3.5106520652770996, "step": 35520 }, { "epoch": 1.6495659037095503, "grad_norm": 89.31449890136719, "learning_rate": 2.0107897302567434e-07, "logits/chosen": -18.252155303955078, "logits/rejected": -19.060176849365234, "logps/chosen": -405.98443603515625, "logps/rejected": -485.3518981933594, "loss": 0.9994, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3953471183776855, "rewards/margins": -0.23136024177074432, "rewards/rejected": 3.6267075538635254, "step": 35530 }, { "epoch": 1.650030177816983, "grad_norm": 24.75301170349121, "learning_rate": 2.0105111657922838e-07, "logits/chosen": -19.611221313476562, "logits/rejected": -19.080310821533203, "logps/chosen": -385.8776550292969, "logps/rejected": -314.8069763183594, "loss": 
1.0016, "rewards/accuracies": 0.5, "rewards/chosen": 3.4224791526794434, "rewards/margins": 0.225897878408432, "rewards/rejected": 3.1965808868408203, "step": 35540 }, { "epoch": 1.6504944519244162, "grad_norm": 58.24736022949219, "learning_rate": 2.010232601327824e-07, "logits/chosen": -18.566160202026367, "logits/rejected": -18.03409767150879, "logps/chosen": -341.03973388671875, "logps/rejected": -257.91229248046875, "loss": 0.5102, "rewards/accuracies": 0.5, "rewards/chosen": 2.4439492225646973, "rewards/margins": 1.0506391525268555, "rewards/rejected": 1.3933100700378418, "step": 35550 }, { "epoch": 1.6509587260318492, "grad_norm": 51.090084075927734, "learning_rate": 2.0099540368633638e-07, "logits/chosen": -19.707006454467773, "logits/rejected": -17.782133102416992, "logps/chosen": -387.67620849609375, "logps/rejected": -229.451171875, "loss": 0.4016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.614712953567505, "rewards/margins": 1.6011669635772705, "rewards/rejected": 2.0135459899902344, "step": 35560 }, { "epoch": 1.6514230001392822, "grad_norm": 61.38330078125, "learning_rate": 2.0096754723989043e-07, "logits/chosen": -18.35040855407715, "logits/rejected": -17.471567153930664, "logps/chosen": -373.4967346191406, "logps/rejected": -297.3498229980469, "loss": 0.5842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.823594808578491, "rewards/margins": 0.91346675157547, "rewards/rejected": 1.9101282358169556, "step": 35570 }, { "epoch": 1.6518872742467152, "grad_norm": 11.687095642089844, "learning_rate": 2.0093969079344444e-07, "logits/chosen": -18.858028411865234, "logits/rejected": -17.57402229309082, "logps/chosen": -459.146240234375, "logps/rejected": -311.1773376464844, "loss": 1.2216, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5934650897979736, "rewards/margins": 0.5379816293716431, "rewards/rejected": 3.05548357963562, "step": 35580 }, { "epoch": 1.6523515483541482, "grad_norm": 20.67938232421875, 
"learning_rate": 2.0091183434699845e-07, "logits/chosen": -19.653989791870117, "logits/rejected": -18.54340362548828, "logps/chosen": -320.89813232421875, "logps/rejected": -323.0989074707031, "loss": 0.5742, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9274089336395264, "rewards/margins": 1.2577155828475952, "rewards/rejected": 1.6696932315826416, "step": 35590 }, { "epoch": 1.6528158224615814, "grad_norm": 46.13583755493164, "learning_rate": 2.0088397790055247e-07, "logits/chosen": -18.49881362915039, "logits/rejected": -18.232213973999023, "logps/chosen": -311.1493835449219, "logps/rejected": -288.4012451171875, "loss": 0.6591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6795907020568848, "rewards/margins": 0.5321474075317383, "rewards/rejected": 2.1474437713623047, "step": 35600 }, { "epoch": 1.6532800965690142, "grad_norm": 1.692268967628479, "learning_rate": 2.008561214541065e-07, "logits/chosen": -18.03647232055664, "logits/rejected": -17.523113250732422, "logps/chosen": -364.01336669921875, "logps/rejected": -286.216552734375, "loss": 0.5956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7627615928649902, "rewards/margins": 1.5116562843322754, "rewards/rejected": 2.251105308532715, "step": 35610 }, { "epoch": 1.6537443706764474, "grad_norm": 7.2197184562683105, "learning_rate": 2.0082826500766052e-07, "logits/chosen": -19.07146453857422, "logits/rejected": -17.495086669921875, "logps/chosen": -514.8306884765625, "logps/rejected": -273.8348388671875, "loss": 0.2251, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.061300754547119, "rewards/margins": 2.274381637573242, "rewards/rejected": 1.7869192361831665, "step": 35620 }, { "epoch": 1.6542086447838804, "grad_norm": 115.03982543945312, "learning_rate": 2.008004085612145e-07, "logits/chosen": -19.404163360595703, "logits/rejected": -18.465734481811523, "logps/chosen": -289.7460021972656, "logps/rejected": -195.86265563964844, "loss": 0.3328, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.641878604888916, "rewards/margins": 1.481076955795288, "rewards/rejected": 1.1608011722564697, "step": 35630 }, { "epoch": 1.6546729188913134, "grad_norm": 17.949861526489258, "learning_rate": 2.0077255211476855e-07, "logits/chosen": -19.35140609741211, "logits/rejected": -18.549684524536133, "logps/chosen": -424.4662170410156, "logps/rejected": -336.0352478027344, "loss": 0.4727, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3492445945739746, "rewards/margins": 1.4074842929840088, "rewards/rejected": 1.9417603015899658, "step": 35640 }, { "epoch": 1.6551371929987466, "grad_norm": 2.8651387691497803, "learning_rate": 2.0074469566832257e-07, "logits/chosen": -18.374286651611328, "logits/rejected": -17.5600528717041, "logps/chosen": -309.46539306640625, "logps/rejected": -261.7848815917969, "loss": 0.7046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0571560859680176, "rewards/margins": 1.1198484897613525, "rewards/rejected": 1.9373077154159546, "step": 35650 }, { "epoch": 1.6556014671061794, "grad_norm": 19.983779907226562, "learning_rate": 2.007168392218766e-07, "logits/chosen": -18.844091415405273, "logits/rejected": -18.908889770507812, "logps/chosen": -419.5735778808594, "logps/rejected": -415.19329833984375, "loss": 1.0451, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.224492311477661, "rewards/margins": 0.022423435002565384, "rewards/rejected": 3.20206880569458, "step": 35660 }, { "epoch": 1.6560657412136126, "grad_norm": 2.4836156368255615, "learning_rate": 2.006889827754306e-07, "logits/chosen": -19.01352882385254, "logits/rejected": -17.484262466430664, "logps/chosen": -373.2889404296875, "logps/rejected": -228.9513702392578, "loss": 0.5158, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.43452525138855, "rewards/margins": 1.905815839767456, "rewards/rejected": 1.528709053993225, "step": 35670 }, { "epoch": 1.6565300153210454, 
"grad_norm": 0.6666108965873718, "learning_rate": 2.006611263289846e-07, "logits/chosen": -18.664825439453125, "logits/rejected": -18.869430541992188, "logps/chosen": -262.47900390625, "logps/rejected": -272.9476013183594, "loss": 1.5259, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3303418159484863, "rewards/margins": -0.5736010670661926, "rewards/rejected": 2.903942823410034, "step": 35680 }, { "epoch": 1.6569942894284786, "grad_norm": 60.70173263549805, "learning_rate": 2.0063326988253865e-07, "logits/chosen": -18.181501388549805, "logits/rejected": -18.39080810546875, "logps/chosen": -394.9651794433594, "logps/rejected": -344.11761474609375, "loss": 0.931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.78554105758667, "rewards/margins": 0.24083784222602844, "rewards/rejected": 2.5447030067443848, "step": 35690 }, { "epoch": 1.6574585635359116, "grad_norm": 71.09652709960938, "learning_rate": 2.0060541343609267e-07, "logits/chosen": -18.244495391845703, "logits/rejected": -18.045703887939453, "logps/chosen": -365.01141357421875, "logps/rejected": -367.44873046875, "loss": 0.5616, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1320223808288574, "rewards/margins": 0.7930802702903748, "rewards/rejected": 2.338942050933838, "step": 35700 }, { "epoch": 1.6579228376433446, "grad_norm": 120.14126586914062, "learning_rate": 2.0057755698964666e-07, "logits/chosen": -18.456375122070312, "logits/rejected": -17.016639709472656, "logps/chosen": -410.58447265625, "logps/rejected": -328.568115234375, "loss": 0.473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9917373657226562, "rewards/margins": 1.7745929956436157, "rewards/rejected": 2.21714448928833, "step": 35710 }, { "epoch": 1.6583871117507778, "grad_norm": 36.88850402832031, "learning_rate": 2.005497005432007e-07, "logits/chosen": -19.550321578979492, "logits/rejected": -17.401275634765625, "logps/chosen": -356.77777099609375, "logps/rejected": 
-247.7661590576172, "loss": 0.4293, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2402853965759277, "rewards/margins": 1.52352774143219, "rewards/rejected": 1.7167580127716064, "step": 35720 }, { "epoch": 1.6588513858582106, "grad_norm": 46.172279357910156, "learning_rate": 2.005218440967547e-07, "logits/chosen": -17.74198341369629, "logits/rejected": -17.636837005615234, "logps/chosen": -366.4601135253906, "logps/rejected": -340.6084899902344, "loss": 1.0131, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0500757694244385, "rewards/margins": 0.3735351264476776, "rewards/rejected": 2.6765406131744385, "step": 35730 }, { "epoch": 1.6593156599656438, "grad_norm": 139.88194274902344, "learning_rate": 2.0049398765030875e-07, "logits/chosen": -18.78530502319336, "logits/rejected": -18.280826568603516, "logps/chosen": -382.5887145996094, "logps/rejected": -363.6029968261719, "loss": 0.4495, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.479236125946045, "rewards/margins": 1.1005653142929077, "rewards/rejected": 2.3786706924438477, "step": 35740 }, { "epoch": 1.6597799340730768, "grad_norm": 252.60443115234375, "learning_rate": 2.0046613120386274e-07, "logits/chosen": -19.11468505859375, "logits/rejected": -19.23185920715332, "logps/chosen": -433.36260986328125, "logps/rejected": -355.65728759765625, "loss": 1.1272, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.1129913330078125, "rewards/margins": 0.5793956518173218, "rewards/rejected": 3.533595561981201, "step": 35750 }, { "epoch": 1.6602442081805098, "grad_norm": 68.25569152832031, "learning_rate": 2.0043827475741678e-07, "logits/chosen": -18.659099578857422, "logits/rejected": -18.651124954223633, "logps/chosen": -396.2124938964844, "logps/rejected": -362.15899658203125, "loss": 0.5686, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6698544025421143, "rewards/margins": 0.4904589056968689, "rewards/rejected": 2.1793949604034424, "step": 35760 }, 
{ "epoch": 1.6607084822879428, "grad_norm": 119.62630462646484, "learning_rate": 2.004104183109708e-07, "logits/chosen": -18.08233070373535, "logits/rejected": -17.83824920654297, "logps/chosen": -370.9309387207031, "logps/rejected": -652.652099609375, "loss": 0.6831, "rewards/accuracies": 0.5, "rewards/chosen": 3.4507203102111816, "rewards/margins": 1.1222546100616455, "rewards/rejected": 2.328465700149536, "step": 35770 }, { "epoch": 1.6611727563953758, "grad_norm": 38.486473083496094, "learning_rate": 2.0038256186452478e-07, "logits/chosen": -18.797443389892578, "logits/rejected": -17.627140045166016, "logps/chosen": -422.0061950683594, "logps/rejected": -293.87774658203125, "loss": 0.2487, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5957400798797607, "rewards/margins": 1.9158942699432373, "rewards/rejected": 1.6798458099365234, "step": 35780 }, { "epoch": 1.661637030502809, "grad_norm": 3.040489435195923, "learning_rate": 2.0035470541807882e-07, "logits/chosen": -20.26808738708496, "logits/rejected": -19.471942901611328, "logps/chosen": -378.0035095214844, "logps/rejected": -337.8227844238281, "loss": 0.6657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.09691858291626, "rewards/margins": 1.2225143909454346, "rewards/rejected": 2.8744044303894043, "step": 35790 }, { "epoch": 1.6621013046102417, "grad_norm": 175.1843719482422, "learning_rate": 2.0032684897163284e-07, "logits/chosen": -19.977096557617188, "logits/rejected": -19.801767349243164, "logps/chosen": -363.2142639160156, "logps/rejected": -282.9759521484375, "loss": 0.6147, "rewards/accuracies": 0.5, "rewards/chosen": 3.516486406326294, "rewards/margins": 0.8088085055351257, "rewards/rejected": 2.7076778411865234, "step": 35800 }, { "epoch": 1.662565578717675, "grad_norm": 61.71678161621094, "learning_rate": 2.0029899252518688e-07, "logits/chosen": -18.012069702148438, "logits/rejected": -18.32286262512207, "logps/chosen": -313.5522155761719, "logps/rejected": 
-289.663330078125, "loss": 0.665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1213345527648926, "rewards/margins": 0.49815019965171814, "rewards/rejected": 2.6231844425201416, "step": 35810 }, { "epoch": 1.663029852825108, "grad_norm": 103.33431243896484, "learning_rate": 2.0027113607874087e-07, "logits/chosen": -18.159992218017578, "logits/rejected": -17.844905853271484, "logps/chosen": -292.07354736328125, "logps/rejected": -255.81082153320312, "loss": 0.7461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7810215950012207, "rewards/margins": 0.5993285179138184, "rewards/rejected": 2.1816933155059814, "step": 35820 }, { "epoch": 1.663494126932541, "grad_norm": 21.221277236938477, "learning_rate": 2.0024327963229488e-07, "logits/chosen": -19.506637573242188, "logits/rejected": -17.950214385986328, "logps/chosen": -395.99835205078125, "logps/rejected": -257.74786376953125, "loss": 0.4498, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8348402976989746, "rewards/margins": 1.4379475116729736, "rewards/rejected": 1.3968923091888428, "step": 35830 }, { "epoch": 1.6639584010399742, "grad_norm": 79.09835815429688, "learning_rate": 2.0021542318584892e-07, "logits/chosen": -18.93807029724121, "logits/rejected": -18.459667205810547, "logps/chosen": -471.8812561035156, "logps/rejected": -396.77001953125, "loss": 0.4537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2625434398651123, "rewards/margins": 0.8587539792060852, "rewards/rejected": 2.4037890434265137, "step": 35840 }, { "epoch": 1.664422675147407, "grad_norm": 7.522374629974365, "learning_rate": 2.0018756673940294e-07, "logits/chosen": -18.393299102783203, "logits/rejected": -18.000120162963867, "logps/chosen": -360.05487060546875, "logps/rejected": -248.5336151123047, "loss": 0.7431, "rewards/accuracies": 0.5, "rewards/chosen": 2.7593045234680176, "rewards/margins": 0.6920340657234192, "rewards/rejected": 2.067270517349243, "step": 35850 }, { "epoch": 
1.6648869492548402, "grad_norm": 0.440950483083725, "learning_rate": 2.0015971029295695e-07, "logits/chosen": -19.602882385253906, "logits/rejected": -18.103900909423828, "logps/chosen": -412.36358642578125, "logps/rejected": -362.8974609375, "loss": 0.7412, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.450812578201294, "rewards/margins": 0.8454698324203491, "rewards/rejected": 2.6053428649902344, "step": 35860 }, { "epoch": 1.665351223362273, "grad_norm": 160.9451446533203, "learning_rate": 2.0013185384651097e-07, "logits/chosen": -18.386857986450195, "logits/rejected": -17.208984375, "logps/chosen": -276.40045166015625, "logps/rejected": -186.29226684570312, "loss": 0.5045, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3783364295959473, "rewards/margins": 1.0482107400894165, "rewards/rejected": 1.3301258087158203, "step": 35870 }, { "epoch": 1.6658154974697061, "grad_norm": 264.9822998046875, "learning_rate": 2.0010399740006498e-07, "logits/chosen": -18.3896541595459, "logits/rejected": -18.995820999145508, "logps/chosen": -303.8365783691406, "logps/rejected": -403.99432373046875, "loss": 1.8447, "rewards/accuracies": 0.5, "rewards/chosen": 2.6029722690582275, "rewards/margins": -1.0220203399658203, "rewards/rejected": 3.624992847442627, "step": 35880 }, { "epoch": 1.6662797715771391, "grad_norm": 4.961690425872803, "learning_rate": 2.0007614095361902e-07, "logits/chosen": -19.110179901123047, "logits/rejected": -18.723148345947266, "logps/chosen": -308.020263671875, "logps/rejected": -286.78643798828125, "loss": 0.6867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4203872680664062, "rewards/margins": 0.8691235780715942, "rewards/rejected": 2.5512640476226807, "step": 35890 }, { "epoch": 1.6667440456845721, "grad_norm": 18.438600540161133, "learning_rate": 2.00048284507173e-07, "logits/chosen": -19.03531265258789, "logits/rejected": -17.5450439453125, "logps/chosen": -511.0044860839844, "logps/rejected": 
-294.71295166015625, "loss": 0.9769, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8829636573791504, "rewards/margins": 1.3225853443145752, "rewards/rejected": 2.5603785514831543, "step": 35900 }, { "epoch": 1.6672083197920053, "grad_norm": 1.3948025703430176, "learning_rate": 2.0002042806072705e-07, "logits/chosen": -18.794158935546875, "logits/rejected": -18.308452606201172, "logps/chosen": -412.00482177734375, "logps/rejected": -307.94696044921875, "loss": 0.6682, "rewards/accuracies": 0.5, "rewards/chosen": 3.8105311393737793, "rewards/margins": 0.9854987859725952, "rewards/rejected": 2.8250324726104736, "step": 35910 }, { "epoch": 1.6676725938994381, "grad_norm": 36.106285095214844, "learning_rate": 1.9999257161428107e-07, "logits/chosen": -19.535863876342773, "logits/rejected": -17.485254287719727, "logps/chosen": -363.17950439453125, "logps/rejected": -219.3507843017578, "loss": 0.5024, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7449002265930176, "rewards/margins": 1.6584535837173462, "rewards/rejected": 1.086446762084961, "step": 35920 }, { "epoch": 1.6681368680068713, "grad_norm": 36.5155029296875, "learning_rate": 1.999647151678351e-07, "logits/chosen": -18.578508377075195, "logits/rejected": -17.895784378051758, "logps/chosen": -365.49395751953125, "logps/rejected": -230.35989379882812, "loss": 0.5288, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0973987579345703, "rewards/margins": 1.1162340641021729, "rewards/rejected": 1.9811646938323975, "step": 35930 }, { "epoch": 1.6686011421143043, "grad_norm": 43.1768913269043, "learning_rate": 1.999368587213891e-07, "logits/chosen": -18.139490127563477, "logits/rejected": -17.07356071472168, "logps/chosen": -476.33843994140625, "logps/rejected": -247.2860870361328, "loss": 0.3932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.355180263519287, "rewards/margins": 1.7989383935928345, "rewards/rejected": 1.5562423467636108, "step": 35940 }, { "epoch": 
1.6690654162217373, "grad_norm": 30.867801666259766, "learning_rate": 1.999090022749431e-07, "logits/chosen": -18.89044189453125, "logits/rejected": -18.507070541381836, "logps/chosen": -403.21002197265625, "logps/rejected": -370.7601318359375, "loss": 1.0148, "rewards/accuracies": 0.5, "rewards/chosen": 2.960810661315918, "rewards/margins": -0.06096668168902397, "rewards/rejected": 3.0217771530151367, "step": 35950 }, { "epoch": 1.6695296903291703, "grad_norm": 5.171899318695068, "learning_rate": 1.9988114582849715e-07, "logits/chosen": -20.017810821533203, "logits/rejected": -18.896512985229492, "logps/chosen": -322.8949279785156, "logps/rejected": -243.53475952148438, "loss": 0.4653, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.213993787765503, "rewards/margins": 1.0906354188919067, "rewards/rejected": 2.123358726501465, "step": 35960 }, { "epoch": 1.6699939644366033, "grad_norm": 72.51431274414062, "learning_rate": 1.9985328938205114e-07, "logits/chosen": -18.422786712646484, "logits/rejected": -17.52161407470703, "logps/chosen": -406.34735107421875, "logps/rejected": -322.6549072265625, "loss": 0.6827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4782874584198, "rewards/margins": 0.6511651873588562, "rewards/rejected": 2.827122211456299, "step": 35970 }, { "epoch": 1.6704582385440365, "grad_norm": 126.30784606933594, "learning_rate": 1.9982543293560515e-07, "logits/chosen": -18.690752029418945, "logits/rejected": -18.628002166748047, "logps/chosen": -413.8785705566406, "logps/rejected": -436.19940185546875, "loss": 0.7457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.106724739074707, "rewards/margins": 0.15855123102664948, "rewards/rejected": 3.9481730461120605, "step": 35980 }, { "epoch": 1.6709225126514693, "grad_norm": 21.672056198120117, "learning_rate": 1.997975764891592e-07, "logits/chosen": -19.779691696166992, "logits/rejected": -18.15406608581543, "logps/chosen": -274.46429443359375, "logps/rejected": 
-190.35470581054688, "loss": 0.6553, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.260746717453003, "rewards/margins": 1.2444560527801514, "rewards/rejected": 1.0162907838821411, "step": 35990 }, { "epoch": 1.6713867867589025, "grad_norm": 23.714662551879883, "learning_rate": 1.997697200427132e-07, "logits/chosen": -19.08974838256836, "logits/rejected": -17.99017333984375, "logps/chosen": -419.4949645996094, "logps/rejected": -315.10797119140625, "loss": 0.6038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0555474758148193, "rewards/margins": 0.978315532207489, "rewards/rejected": 2.0772318840026855, "step": 36000 }, { "epoch": 1.6718510608663355, "grad_norm": 82.04595947265625, "learning_rate": 1.9974186359626722e-07, "logits/chosen": -18.764949798583984, "logits/rejected": -18.13778305053711, "logps/chosen": -320.3946838378906, "logps/rejected": -265.8667907714844, "loss": 0.6383, "rewards/accuracies": 0.5, "rewards/chosen": 2.528273105621338, "rewards/margins": 0.5500487089157104, "rewards/rejected": 1.9782243967056274, "step": 36010 }, { "epoch": 1.6723153349737685, "grad_norm": 93.23323059082031, "learning_rate": 1.9971400714982124e-07, "logits/chosen": -19.421201705932617, "logits/rejected": -18.270469665527344, "logps/chosen": -290.75042724609375, "logps/rejected": -266.68157958984375, "loss": 0.8288, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1838126182556152, "rewards/margins": 0.23585724830627441, "rewards/rejected": 1.9479553699493408, "step": 36020 }, { "epoch": 1.6727796090812015, "grad_norm": 226.23562622070312, "learning_rate": 1.9968615070337528e-07, "logits/chosen": -18.887859344482422, "logits/rejected": -19.049694061279297, "logps/chosen": -284.8559875488281, "logps/rejected": -329.10858154296875, "loss": 1.2043, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.413262128829956, "rewards/margins": -0.3673926889896393, "rewards/rejected": 2.7806549072265625, "step": 36030 }, { "epoch": 
1.6732438831886345, "grad_norm": 106.58890533447266, "learning_rate": 1.996582942569293e-07, "logits/chosen": -19.15468406677246, "logits/rejected": -17.794998168945312, "logps/chosen": -314.4808044433594, "logps/rejected": -266.545654296875, "loss": 0.5426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0008912086486816, "rewards/margins": 1.3751471042633057, "rewards/rejected": 1.625744104385376, "step": 36040 }, { "epoch": 1.6737081572960677, "grad_norm": 3.502358913421631, "learning_rate": 1.9963043781048328e-07, "logits/chosen": -18.35589027404785, "logits/rejected": -18.322368621826172, "logps/chosen": -329.60723876953125, "logps/rejected": -258.2671813964844, "loss": 0.5983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.428330659866333, "rewards/margins": 0.7665795683860779, "rewards/rejected": 2.6617510318756104, "step": 36050 }, { "epoch": 1.6741724314035005, "grad_norm": 2.8406262397766113, "learning_rate": 1.9960258136403732e-07, "logits/chosen": -18.84065818786621, "logits/rejected": -19.367727279663086, "logps/chosen": -399.67724609375, "logps/rejected": -424.4867248535156, "loss": 0.9659, "rewards/accuracies": 0.5, "rewards/chosen": 3.963043212890625, "rewards/margins": 0.1061619371175766, "rewards/rejected": 3.8568813800811768, "step": 36060 }, { "epoch": 1.6746367055109337, "grad_norm": 16.723268508911133, "learning_rate": 1.9957472491759134e-07, "logits/chosen": -17.99612045288086, "logits/rejected": -18.28916358947754, "logps/chosen": -385.8811340332031, "logps/rejected": -360.71014404296875, "loss": 0.9459, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.063291072845459, "rewards/margins": -0.015081515535712242, "rewards/rejected": 3.0783727169036865, "step": 36070 }, { "epoch": 1.6751009796183667, "grad_norm": 205.27987670898438, "learning_rate": 1.9954686847114538e-07, "logits/chosen": -18.77621078491211, "logits/rejected": -17.440013885498047, "logps/chosen": -420.875732421875, "logps/rejected": 
-257.0325927734375, "loss": 0.537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.597311496734619, "rewards/margins": 1.7924816608428955, "rewards/rejected": 1.8048295974731445, "step": 36080 }, { "epoch": 1.6755652537257997, "grad_norm": 4.455717086791992, "learning_rate": 1.9951901202469937e-07, "logits/chosen": -18.889612197875977, "logits/rejected": -17.18325424194336, "logps/chosen": -430.4457092285156, "logps/rejected": -274.95355224609375, "loss": 0.3037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.262722492218018, "rewards/margins": 2.0789122581481934, "rewards/rejected": 2.183809995651245, "step": 36090 }, { "epoch": 1.676029527833233, "grad_norm": 192.24827575683594, "learning_rate": 1.9949115557825338e-07, "logits/chosen": -18.481822967529297, "logits/rejected": -19.25775146484375, "logps/chosen": -383.0227355957031, "logps/rejected": -393.1094055175781, "loss": 1.161, "rewards/accuracies": 0.5, "rewards/chosen": 3.2696259021759033, "rewards/margins": -0.3178948760032654, "rewards/rejected": 3.5875205993652344, "step": 36100 }, { "epoch": 1.6764938019406657, "grad_norm": 32.75651550292969, "learning_rate": 1.9946329913180742e-07, "logits/chosen": -18.11223793029785, "logits/rejected": -18.091175079345703, "logps/chosen": -392.04840087890625, "logps/rejected": -366.18780517578125, "loss": 1.0724, "rewards/accuracies": 0.5, "rewards/chosen": 3.205324172973633, "rewards/margins": -0.1971026510000229, "rewards/rejected": 3.4024269580841064, "step": 36110 }, { "epoch": 1.6769580760480989, "grad_norm": 76.4565658569336, "learning_rate": 1.9943544268536144e-07, "logits/chosen": -18.449878692626953, "logits/rejected": -17.649667739868164, "logps/chosen": -371.7091369628906, "logps/rejected": -288.70025634765625, "loss": 0.9145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1156344413757324, "rewards/margins": 1.0483016967773438, "rewards/rejected": 2.0673327445983887, "step": 36120 }, { "epoch": 1.6774223501555319, 
"grad_norm": 64.67799377441406, "learning_rate": 1.9940758623891542e-07, "logits/chosen": -18.406171798706055, "logits/rejected": -17.91221809387207, "logps/chosen": -323.5105285644531, "logps/rejected": -276.75799560546875, "loss": 0.9151, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3628644943237305, "rewards/margins": 0.36401963233947754, "rewards/rejected": 1.998844861984253, "step": 36130 }, { "epoch": 1.6778866242629649, "grad_norm": 1.0919430255889893, "learning_rate": 1.9937972979246947e-07, "logits/chosen": -19.525083541870117, "logits/rejected": -18.387104034423828, "logps/chosen": -403.0536193847656, "logps/rejected": -206.236083984375, "loss": 0.1409, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.846459865570068, "rewards/margins": 3.0192153453826904, "rewards/rejected": 1.8272440433502197, "step": 36140 }, { "epoch": 1.6783508983703979, "grad_norm": 211.9741668701172, "learning_rate": 1.9935187334602348e-07, "logits/chosen": -18.901281356811523, "logits/rejected": -18.680889129638672, "logps/chosen": -315.55987548828125, "logps/rejected": -268.5801696777344, "loss": 0.9005, "rewards/accuracies": 0.5, "rewards/chosen": 2.279193639755249, "rewards/margins": 0.16112704575061798, "rewards/rejected": 2.1180663108825684, "step": 36150 }, { "epoch": 1.6788151724778309, "grad_norm": 162.68406677246094, "learning_rate": 1.993240168995775e-07, "logits/chosen": -18.67620849609375, "logits/rejected": -17.60115623474121, "logps/chosen": -313.97222900390625, "logps/rejected": -239.19619750976562, "loss": 0.879, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.818309783935547, "rewards/margins": 1.3911091089248657, "rewards/rejected": 1.4272009134292603, "step": 36160 }, { "epoch": 1.679279446585264, "grad_norm": 114.82035827636719, "learning_rate": 1.992961604531315e-07, "logits/chosen": -18.152624130249023, "logits/rejected": -17.45047950744629, "logps/chosen": -402.86383056640625, "logps/rejected": -355.987548828125, 
"loss": 0.8762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7515950202941895, "rewards/margins": 0.08240324258804321, "rewards/rejected": 2.669191837310791, "step": 36170 }, { "epoch": 1.6797437206926968, "grad_norm": 69.00993347167969, "learning_rate": 1.9926830400668555e-07, "logits/chosen": -19.272871017456055, "logits/rejected": -18.272228240966797, "logps/chosen": -361.8020935058594, "logps/rejected": -277.2344970703125, "loss": 0.6715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0976462364196777, "rewards/margins": 0.6708501577377319, "rewards/rejected": 2.426795721054077, "step": 36180 }, { "epoch": 1.68020799480013, "grad_norm": 57.83627700805664, "learning_rate": 1.9924044756023957e-07, "logits/chosen": -19.305004119873047, "logits/rejected": -18.61964988708496, "logps/chosen": -473.53729248046875, "logps/rejected": -345.3972473144531, "loss": 0.2989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.666628360748291, "rewards/margins": 1.5016789436340332, "rewards/rejected": 3.164949417114258, "step": 36190 }, { "epoch": 1.680672268907563, "grad_norm": 4.414321422576904, "learning_rate": 1.9921259111379355e-07, "logits/chosen": -19.016366958618164, "logits/rejected": -18.33770179748535, "logps/chosen": -423.7220764160156, "logps/rejected": -313.36553955078125, "loss": 0.413, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3302559852600098, "rewards/margins": 1.303585410118103, "rewards/rejected": 2.026670455932617, "step": 36200 }, { "epoch": 1.681136543014996, "grad_norm": 42.84124755859375, "learning_rate": 1.991847346673476e-07, "logits/chosen": -19.022541046142578, "logits/rejected": -17.965984344482422, "logps/chosen": -394.29022216796875, "logps/rejected": -256.251953125, "loss": 0.4545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.967137098312378, "rewards/margins": 1.4074026346206665, "rewards/rejected": 1.5597344636917114, "step": 36210 }, { "epoch": 1.681600817122429, 
"grad_norm": 58.13102722167969, "learning_rate": 1.991568782209016e-07, "logits/chosen": -18.418315887451172, "logits/rejected": -17.574615478515625, "logps/chosen": -237.56228637695312, "logps/rejected": -190.44680786132812, "loss": 0.6624, "rewards/accuracies": 0.5, "rewards/chosen": 2.1815996170043945, "rewards/margins": 0.7761985659599304, "rewards/rejected": 1.4054008722305298, "step": 36220 }, { "epoch": 1.682065091229862, "grad_norm": 111.23419952392578, "learning_rate": 1.9912902177445565e-07, "logits/chosen": -17.88053321838379, "logits/rejected": -16.970962524414062, "logps/chosen": -339.42706298828125, "logps/rejected": -206.41720581054688, "loss": 0.4521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2115330696105957, "rewards/margins": 1.2794359922409058, "rewards/rejected": 0.9320972561836243, "step": 36230 }, { "epoch": 1.6825293653372952, "grad_norm": 133.21188354492188, "learning_rate": 1.9910116532800964e-07, "logits/chosen": -17.9277286529541, "logits/rejected": -17.150421142578125, "logps/chosen": -344.70440673828125, "logps/rejected": -261.21331787109375, "loss": 0.629, "rewards/accuracies": 0.5, "rewards/chosen": 2.7389187812805176, "rewards/margins": 1.4368584156036377, "rewards/rejected": 1.3020604848861694, "step": 36240 }, { "epoch": 1.682993639444728, "grad_norm": 7.438652992248535, "learning_rate": 1.9907330888156365e-07, "logits/chosen": -19.2372989654541, "logits/rejected": -17.37443733215332, "logps/chosen": -597.9495239257812, "logps/rejected": -354.9342041015625, "loss": 0.3335, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.582236289978027, "rewards/margins": 2.2459964752197266, "rewards/rejected": 3.33624005317688, "step": 36250 }, { "epoch": 1.6834579135521612, "grad_norm": 86.1114730834961, "learning_rate": 1.990454524351177e-07, "logits/chosen": -18.042367935180664, "logits/rejected": -17.147686004638672, "logps/chosen": -456.5406799316406, "logps/rejected": -307.66351318359375, "loss": 0.5535, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4733662605285645, "rewards/margins": 0.8426202535629272, "rewards/rejected": 1.6307461261749268, "step": 36260 }, { "epoch": 1.6839221876595942, "grad_norm": 149.11505126953125, "learning_rate": 1.990175959886717e-07, "logits/chosen": -18.906543731689453, "logits/rejected": -17.65938377380371, "logps/chosen": -460.0205078125, "logps/rejected": -356.19622802734375, "loss": 0.5689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.244018077850342, "rewards/margins": 0.9203327298164368, "rewards/rejected": 2.3236851692199707, "step": 36270 }, { "epoch": 1.6843864617670272, "grad_norm": 41.4940185546875, "learning_rate": 1.9898973954222572e-07, "logits/chosen": -18.923099517822266, "logits/rejected": -18.07870101928711, "logps/chosen": -425.921142578125, "logps/rejected": -459.77081298828125, "loss": 0.7763, "rewards/accuracies": 0.5, "rewards/chosen": 3.0957396030426025, "rewards/margins": 0.3050946593284607, "rewards/rejected": 2.790644884109497, "step": 36280 }, { "epoch": 1.6848507358744604, "grad_norm": 45.417701721191406, "learning_rate": 1.9896188309577974e-07, "logits/chosen": -19.034793853759766, "logits/rejected": -18.25767707824707, "logps/chosen": -234.5105438232422, "logps/rejected": -218.9454345703125, "loss": 0.6237, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.806810975074768, "rewards/margins": 0.20315614342689514, "rewards/rejected": 1.6036548614501953, "step": 36290 }, { "epoch": 1.6853150099818932, "grad_norm": 25.389995574951172, "learning_rate": 1.9893402664933375e-07, "logits/chosen": -19.9679012298584, "logits/rejected": -18.946651458740234, "logps/chosen": -309.49542236328125, "logps/rejected": -239.112548828125, "loss": 0.3977, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2272560596466064, "rewards/margins": 1.3485031127929688, "rewards/rejected": 1.8787529468536377, "step": 36300 }, { "epoch": 1.6857792840893264, "grad_norm": 
21.86545181274414, "learning_rate": 1.989061702028878e-07, "logits/chosen": -19.738229751586914, "logits/rejected": -18.346729278564453, "logps/chosen": -453.84228515625, "logps/rejected": -262.003173828125, "loss": 0.2776, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9014594554901123, "rewards/margins": 2.208772659301758, "rewards/rejected": 1.6926876306533813, "step": 36310 }, { "epoch": 1.6862435581967594, "grad_norm": 70.93141174316406, "learning_rate": 1.9887831375644178e-07, "logits/chosen": -19.259714126586914, "logits/rejected": -18.855459213256836, "logps/chosen": -338.60687255859375, "logps/rejected": -347.08721923828125, "loss": 0.8767, "rewards/accuracies": 0.5, "rewards/chosen": 2.633255958557129, "rewards/margins": 0.02834146097302437, "rewards/rejected": 2.604914903640747, "step": 36320 }, { "epoch": 1.6867078323041924, "grad_norm": 77.55646514892578, "learning_rate": 1.9885045730999582e-07, "logits/chosen": -19.158483505249023, "logits/rejected": -18.788211822509766, "logps/chosen": -279.568115234375, "logps/rejected": -237.43057250976562, "loss": 0.5341, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.434208393096924, "rewards/margins": 0.6330970525741577, "rewards/rejected": 1.8011114597320557, "step": 36330 }, { "epoch": 1.6871721064116254, "grad_norm": 0.6538992524147034, "learning_rate": 1.9882260086354984e-07, "logits/chosen": -18.549278259277344, "logits/rejected": -17.680667877197266, "logps/chosen": -346.8995056152344, "logps/rejected": -258.5130615234375, "loss": 0.3322, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3452041149139404, "rewards/margins": 1.6513198614120483, "rewards/rejected": 1.6938844919204712, "step": 36340 }, { "epoch": 1.6876363805190584, "grad_norm": 67.01958465576172, "learning_rate": 1.9879474441710388e-07, "logits/chosen": -19.489580154418945, "logits/rejected": -18.596609115600586, "logps/chosen": -449.465087890625, "logps/rejected": -331.58544921875, "loss": 0.3966, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9826221466064453, "rewards/margins": 1.2780269384384155, "rewards/rejected": 2.7045953273773193, "step": 36350 }, { "epoch": 1.6881006546264916, "grad_norm": 62.6153678894043, "learning_rate": 1.9876688797065787e-07, "logits/chosen": -18.42671012878418, "logits/rejected": -17.684261322021484, "logps/chosen": -381.2870788574219, "logps/rejected": -321.87261962890625, "loss": 0.6387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.012413740158081, "rewards/margins": 0.9502664804458618, "rewards/rejected": 2.062147617340088, "step": 36360 }, { "epoch": 1.6885649287339244, "grad_norm": 105.0152359008789, "learning_rate": 1.9873903152421188e-07, "logits/chosen": -18.1485538482666, "logits/rejected": -17.770841598510742, "logps/chosen": -372.14080810546875, "logps/rejected": -308.61700439453125, "loss": 0.79, "rewards/accuracies": 0.5, "rewards/chosen": 2.5698490142822266, "rewards/margins": 0.4318002164363861, "rewards/rejected": 2.1380486488342285, "step": 36370 }, { "epoch": 1.6890292028413576, "grad_norm": 46.616302490234375, "learning_rate": 1.9871117507776592e-07, "logits/chosen": -18.630481719970703, "logits/rejected": -17.459474563598633, "logps/chosen": -514.2506713867188, "logps/rejected": -364.64984130859375, "loss": 0.6144, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8505091667175293, "rewards/margins": 0.8089064359664917, "rewards/rejected": 3.041602849960327, "step": 36380 }, { "epoch": 1.6894934769487906, "grad_norm": 114.90370178222656, "learning_rate": 1.986833186313199e-07, "logits/chosen": -18.90261459350586, "logits/rejected": -18.49554443359375, "logps/chosen": -397.43804931640625, "logps/rejected": -393.260986328125, "loss": 0.895, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.041256904602051, "rewards/margins": 0.7700886726379395, "rewards/rejected": 3.2711684703826904, "step": 36390 }, { "epoch": 1.6899577510562236, "grad_norm": 
0.48984333872795105, "learning_rate": 1.9865546218487392e-07, "logits/chosen": -19.30734634399414, "logits/rejected": -18.156570434570312, "logps/chosen": -519.382080078125, "logps/rejected": -338.3804016113281, "loss": 0.6844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.375322341918945, "rewards/margins": 1.2930127382278442, "rewards/rejected": 3.0823092460632324, "step": 36400 }, { "epoch": 1.6904220251636566, "grad_norm": 122.95182037353516, "learning_rate": 1.9862760573842796e-07, "logits/chosen": -18.755901336669922, "logits/rejected": -18.171627044677734, "logps/chosen": -397.33673095703125, "logps/rejected": -444.86138916015625, "loss": 1.2699, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3978774547576904, "rewards/margins": -0.09472785145044327, "rewards/rejected": 3.492605209350586, "step": 36410 }, { "epoch": 1.6908862992710896, "grad_norm": 47.07961654663086, "learning_rate": 1.9859974929198198e-07, "logits/chosen": -18.78236198425293, "logits/rejected": -18.858333587646484, "logps/chosen": -457.4764099121094, "logps/rejected": -367.73773193359375, "loss": 0.4139, "rewards/accuracies": 1.0, "rewards/chosen": 3.0957040786743164, "rewards/margins": 0.8402596712112427, "rewards/rejected": 2.2554447650909424, "step": 36420 }, { "epoch": 1.6913505733785228, "grad_norm": 80.43917083740234, "learning_rate": 1.98571892845536e-07, "logits/chosen": -19.89078140258789, "logits/rejected": -18.80740737915039, "logps/chosen": -481.8092346191406, "logps/rejected": -370.12274169921875, "loss": 0.489, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.895597457885742, "rewards/margins": 0.8758190274238586, "rewards/rejected": 3.0197787284851074, "step": 36430 }, { "epoch": 1.6918148474859556, "grad_norm": 25.070878982543945, "learning_rate": 1.9854403639909e-07, "logits/chosen": -18.70043182373047, "logits/rejected": -18.7058162689209, "logps/chosen": -477.7041015625, "logps/rejected": -408.27789306640625, "loss": 0.5214, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.551440715789795, "rewards/margins": 0.6753556132316589, "rewards/rejected": 2.876085042953491, "step": 36440 }, { "epoch": 1.6922791215933888, "grad_norm": 114.41699981689453, "learning_rate": 1.9851617995264405e-07, "logits/chosen": -19.957294464111328, "logits/rejected": -18.66826629638672, "logps/chosen": -430.6249084472656, "logps/rejected": -272.7649230957031, "loss": 0.6032, "rewards/accuracies": 0.5, "rewards/chosen": 3.791531801223755, "rewards/margins": 1.4780323505401611, "rewards/rejected": 2.313499689102173, "step": 36450 }, { "epoch": 1.6927433957008218, "grad_norm": 135.15377807617188, "learning_rate": 1.9848832350619806e-07, "logits/chosen": -19.947132110595703, "logits/rejected": -19.661882400512695, "logps/chosen": -519.2824096679688, "logps/rejected": -414.4588928222656, "loss": 0.5039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.449049472808838, "rewards/margins": 1.1289842128753662, "rewards/rejected": 2.3200652599334717, "step": 36460 }, { "epoch": 1.6932076698082548, "grad_norm": 12.440269470214844, "learning_rate": 1.9846046705975205e-07, "logits/chosen": -18.368974685668945, "logits/rejected": -16.813552856445312, "logps/chosen": -429.8075256347656, "logps/rejected": -341.0434875488281, "loss": 0.6368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.362128496170044, "rewards/margins": 1.5814214944839478, "rewards/rejected": 1.7807070016860962, "step": 36470 }, { "epoch": 1.693671943915688, "grad_norm": 60.83989715576172, "learning_rate": 1.984326106133061e-07, "logits/chosen": -18.489200592041016, "logits/rejected": -17.319042205810547, "logps/chosen": -351.6613464355469, "logps/rejected": -218.6893310546875, "loss": 0.4761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2635796070098877, "rewards/margins": 1.149446725845337, "rewards/rejected": 2.11413311958313, "step": 36480 }, { "epoch": 1.6941362180231208, "grad_norm": 
9.991253852844238, "learning_rate": 1.984047541668601e-07, "logits/chosen": -18.57135581970215, "logits/rejected": -16.685672760009766, "logps/chosen": -364.55841064453125, "logps/rejected": -219.5746307373047, "loss": 0.2916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2953104972839355, "rewards/margins": 1.88790762424469, "rewards/rejected": 1.4074032306671143, "step": 36490 }, { "epoch": 1.694600492130554, "grad_norm": 4.926429271697998, "learning_rate": 1.9837689772041415e-07, "logits/chosen": -18.818058013916016, "logits/rejected": -18.471267700195312, "logps/chosen": -344.12615966796875, "logps/rejected": -333.7179870605469, "loss": 1.037, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.464669704437256, "rewards/margins": 0.2130182981491089, "rewards/rejected": 3.2516517639160156, "step": 36500 }, { "epoch": 1.6950647662379867, "grad_norm": 61.75687789916992, "learning_rate": 1.9834904127396814e-07, "logits/chosen": -18.834747314453125, "logits/rejected": -18.430692672729492, "logps/chosen": -283.0039978027344, "logps/rejected": -223.36752319335938, "loss": 1.0887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0445942878723145, "rewards/margins": -0.09325455129146576, "rewards/rejected": 2.1378493309020996, "step": 36510 }, { "epoch": 1.69552904034542, "grad_norm": 15.762524604797363, "learning_rate": 1.9832118482752215e-07, "logits/chosen": -17.994590759277344, "logits/rejected": -17.93876075744629, "logps/chosen": -394.45880126953125, "logps/rejected": -320.2159729003906, "loss": 0.7097, "rewards/accuracies": 0.5, "rewards/chosen": 2.816837787628174, "rewards/margins": 0.6334999203681946, "rewards/rejected": 2.183337926864624, "step": 36520 }, { "epoch": 1.695993314452853, "grad_norm": 16.936716079711914, "learning_rate": 1.982933283810762e-07, "logits/chosen": -18.005033493041992, "logits/rejected": -17.697153091430664, "logps/chosen": -275.53704833984375, "logps/rejected": -272.39129638671875, "loss": 0.8561, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.848365068435669, "rewards/margins": 0.3789241313934326, "rewards/rejected": 1.4694409370422363, "step": 36530 }, { "epoch": 1.696457588560286, "grad_norm": 145.572021484375, "learning_rate": 1.982654719346302e-07, "logits/chosen": -19.590938568115234, "logits/rejected": -20.174875259399414, "logps/chosen": -382.52740478515625, "logps/rejected": -365.4790954589844, "loss": 0.6972, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.306800365447998, "rewards/margins": 0.5129875540733337, "rewards/rejected": 2.7938132286071777, "step": 36540 }, { "epoch": 1.6969218626677192, "grad_norm": 114.34873962402344, "learning_rate": 1.982376154881842e-07, "logits/chosen": -19.125225067138672, "logits/rejected": -19.243505477905273, "logps/chosen": -288.991455078125, "logps/rejected": -267.2884826660156, "loss": 1.0671, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.246567964553833, "rewards/margins": -0.23488791286945343, "rewards/rejected": 2.4814558029174805, "step": 36550 }, { "epoch": 1.697386136775152, "grad_norm": 124.56674194335938, "learning_rate": 1.9820975904173824e-07, "logits/chosen": -18.64740562438965, "logits/rejected": -18.360754013061523, "logps/chosen": -359.577880859375, "logps/rejected": -290.6345520019531, "loss": 0.7671, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4787681102752686, "rewards/margins": 0.3309401273727417, "rewards/rejected": 3.1478281021118164, "step": 36560 }, { "epoch": 1.6978504108825851, "grad_norm": 20.786550521850586, "learning_rate": 1.9818190259529225e-07, "logits/chosen": -19.04642677307129, "logits/rejected": -17.675395965576172, "logps/chosen": -429.7041931152344, "logps/rejected": -313.6207275390625, "loss": 0.3675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9062957763671875, "rewards/margins": 1.7173633575439453, "rewards/rejected": 2.188932180404663, "step": 36570 }, { "epoch": 1.6983146849900181, 
"grad_norm": 158.97349548339844, "learning_rate": 1.9815404614884626e-07, "logits/chosen": -18.637035369873047, "logits/rejected": -18.12228012084961, "logps/chosen": -361.8296203613281, "logps/rejected": -344.66412353515625, "loss": 0.8772, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.620413303375244, "rewards/margins": 0.20365536212921143, "rewards/rejected": 2.416757822036743, "step": 36580 }, { "epoch": 1.6987789590974511, "grad_norm": 30.980239868164062, "learning_rate": 1.9812618970240028e-07, "logits/chosen": -18.989032745361328, "logits/rejected": -18.80171775817871, "logps/chosen": -320.003173828125, "logps/rejected": -305.3207092285156, "loss": 0.8131, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.343540668487549, "rewards/margins": 0.27110734581947327, "rewards/rejected": 2.0724334716796875, "step": 36590 }, { "epoch": 1.6992432332048841, "grad_norm": 243.05621337890625, "learning_rate": 1.9809833325595432e-07, "logits/chosen": -18.088369369506836, "logits/rejected": -17.751554489135742, "logps/chosen": -474.22076416015625, "logps/rejected": -451.70672607421875, "loss": 0.8901, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5994083881378174, "rewards/margins": 0.16964906454086304, "rewards/rejected": 3.4297592639923096, "step": 36600 }, { "epoch": 1.6997075073123171, "grad_norm": 87.59825897216797, "learning_rate": 1.9807047680950833e-07, "logits/chosen": -18.999454498291016, "logits/rejected": -18.89236068725586, "logps/chosen": -475.6827087402344, "logps/rejected": -449.468994140625, "loss": 0.782, "rewards/accuracies": 0.5, "rewards/chosen": 3.945890426635742, "rewards/margins": 0.1252128779888153, "rewards/rejected": 3.8206779956817627, "step": 36610 }, { "epoch": 1.7001717814197503, "grad_norm": 9.85474967956543, "learning_rate": 1.9804262036306232e-07, "logits/chosen": -19.159883499145508, "logits/rejected": -18.3516845703125, "logps/chosen": -398.19378662109375, "logps/rejected": -357.77899169921875, 
"loss": 0.8082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7594268321990967, "rewards/margins": 0.8308375477790833, "rewards/rejected": 1.9285894632339478, "step": 36620 }, { "epoch": 1.700636055527183, "grad_norm": 69.18183135986328, "learning_rate": 1.9801476391661636e-07, "logits/chosen": -19.099950790405273, "logits/rejected": -18.405736923217773, "logps/chosen": -413.91534423828125, "logps/rejected": -350.01092529296875, "loss": 0.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.425208568572998, "rewards/margins": 0.9154454469680786, "rewards/rejected": 2.509763240814209, "step": 36630 }, { "epoch": 1.7011003296346163, "grad_norm": 7.290279865264893, "learning_rate": 1.9798690747017038e-07, "logits/chosen": -19.623870849609375, "logits/rejected": -17.85191535949707, "logps/chosen": -517.8394775390625, "logps/rejected": -340.61688232421875, "loss": 0.8335, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.672219276428223, "rewards/margins": 1.3681893348693848, "rewards/rejected": 3.304030179977417, "step": 36640 }, { "epoch": 1.7015646037420493, "grad_norm": 33.5177116394043, "learning_rate": 1.9795905102372442e-07, "logits/chosen": -18.316835403442383, "logits/rejected": -18.926651000976562, "logps/chosen": -314.2000732421875, "logps/rejected": -379.94281005859375, "loss": 0.8024, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5871214866638184, "rewards/margins": 0.09467373043298721, "rewards/rejected": 2.492447853088379, "step": 36650 }, { "epoch": 1.7020288778494823, "grad_norm": 4.042874813079834, "learning_rate": 1.979311945772784e-07, "logits/chosen": -19.570068359375, "logits/rejected": -19.11138916015625, "logps/chosen": -448.00128173828125, "logps/rejected": -401.11065673828125, "loss": 0.4614, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.794900417327881, "rewards/margins": 1.544285535812378, "rewards/rejected": 2.250614643096924, "step": 36660 }, { "epoch": 
1.7024931519569155, "grad_norm": 34.502655029296875, "learning_rate": 1.9790333813083242e-07, "logits/chosen": -17.433382034301758, "logits/rejected": -16.765995025634766, "logps/chosen": -398.05908203125, "logps/rejected": -282.7811279296875, "loss": 0.5697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9371628761291504, "rewards/margins": 1.3949499130249023, "rewards/rejected": 1.5422132015228271, "step": 36670 }, { "epoch": 1.7029574260643483, "grad_norm": 213.3957061767578, "learning_rate": 1.9787548168438646e-07, "logits/chosen": -18.88554573059082, "logits/rejected": -18.357160568237305, "logps/chosen": -431.52630615234375, "logps/rejected": -366.4247131347656, "loss": 0.8899, "rewards/accuracies": 0.5, "rewards/chosen": 3.6115615367889404, "rewards/margins": 0.7011285424232483, "rewards/rejected": 2.910433292388916, "step": 36680 }, { "epoch": 1.7034217001717815, "grad_norm": 10.86701774597168, "learning_rate": 1.9784762523794048e-07, "logits/chosen": -19.71468734741211, "logits/rejected": -18.710651397705078, "logps/chosen": -395.58056640625, "logps/rejected": -250.0682830810547, "loss": 0.413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4672577381134033, "rewards/margins": 1.1195214986801147, "rewards/rejected": 2.34773588180542, "step": 36690 }, { "epoch": 1.7038859742792143, "grad_norm": 10.967452049255371, "learning_rate": 1.978197687914945e-07, "logits/chosen": -18.26151466369629, "logits/rejected": -17.78676414489746, "logps/chosen": -401.07940673828125, "logps/rejected": -321.4605712890625, "loss": 0.6051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3715739250183105, "rewards/margins": 0.7406017184257507, "rewards/rejected": 2.630972385406494, "step": 36700 }, { "epoch": 1.7043502483866475, "grad_norm": 37.959800720214844, "learning_rate": 1.977919123450485e-07, "logits/chosen": -18.73480987548828, "logits/rejected": -17.860353469848633, "logps/chosen": -392.9737854003906, "logps/rejected": 
-365.56585693359375, "loss": 0.5397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9316582679748535, "rewards/margins": 1.070859432220459, "rewards/rejected": 2.8607988357543945, "step": 36710 }, { "epoch": 1.7048145224940805, "grad_norm": 25.528339385986328, "learning_rate": 1.9776405589860252e-07, "logits/chosen": -18.92868423461914, "logits/rejected": -18.6929931640625, "logps/chosen": -377.9021911621094, "logps/rejected": -351.86077880859375, "loss": 0.7991, "rewards/accuracies": 0.5, "rewards/chosen": 3.419995069503784, "rewards/margins": 0.4372220039367676, "rewards/rejected": 2.9827728271484375, "step": 36720 }, { "epoch": 1.7052787966015135, "grad_norm": 58.155696868896484, "learning_rate": 1.9773619945215656e-07, "logits/chosen": -17.79998207092285, "logits/rejected": -17.379013061523438, "logps/chosen": -241.7688751220703, "logps/rejected": -192.45895385742188, "loss": 1.1943, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.971698522567749, "rewards/margins": 0.021353578194975853, "rewards/rejected": 1.9503450393676758, "step": 36730 }, { "epoch": 1.7057430707089467, "grad_norm": 0.10443142056465149, "learning_rate": 1.9770834300571055e-07, "logits/chosen": -19.168521881103516, "logits/rejected": -18.28229522705078, "logps/chosen": -337.9930419921875, "logps/rejected": -258.7472229003906, "loss": 0.4582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.008449077606201, "rewards/margins": 1.8926331996917725, "rewards/rejected": 2.115816116333008, "step": 36740 }, { "epoch": 1.7062073448163795, "grad_norm": 1.880746841430664, "learning_rate": 1.976804865592646e-07, "logits/chosen": -18.97006607055664, "logits/rejected": -18.510662078857422, "logps/chosen": -498.8841857910156, "logps/rejected": -373.4717712402344, "loss": 0.6154, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9182662963867188, "rewards/margins": 0.8380886316299438, "rewards/rejected": 3.0801775455474854, "step": 36750 }, { "epoch": 
1.7066716189238127, "grad_norm": 101.17198181152344, "learning_rate": 1.976526301128186e-07, "logits/chosen": -20.397430419921875, "logits/rejected": -19.92401885986328, "logps/chosen": -360.4234313964844, "logps/rejected": -298.83685302734375, "loss": 0.6902, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0133512020111084, "rewards/margins": 0.4279727041721344, "rewards/rejected": 2.5853781700134277, "step": 36760 }, { "epoch": 1.7071358930312457, "grad_norm": 156.7458953857422, "learning_rate": 1.976247736663726e-07, "logits/chosen": -19.27269172668457, "logits/rejected": -18.752031326293945, "logps/chosen": -484.5147399902344, "logps/rejected": -418.2103576660156, "loss": 1.7159, "rewards/accuracies": 0.5, "rewards/chosen": 3.019279956817627, "rewards/margins": -0.6194857358932495, "rewards/rejected": 3.638765811920166, "step": 36770 }, { "epoch": 1.7076001671386787, "grad_norm": 77.5216293334961, "learning_rate": 1.9759691721992663e-07, "logits/chosen": -18.963682174682617, "logits/rejected": -18.385791778564453, "logps/chosen": -431.58465576171875, "logps/rejected": -355.284912109375, "loss": 0.5599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.155885934829712, "rewards/margins": 0.530780553817749, "rewards/rejected": 2.625105381011963, "step": 36780 }, { "epoch": 1.7080644412461117, "grad_norm": 38.9136962890625, "learning_rate": 1.9756906077348065e-07, "logits/chosen": -18.573490142822266, "logits/rejected": -17.298547744750977, "logps/chosen": -353.8598937988281, "logps/rejected": -264.57781982421875, "loss": 0.592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.911303758621216, "rewards/margins": 1.183754801750183, "rewards/rejected": 1.727548599243164, "step": 36790 }, { "epoch": 1.7085287153535447, "grad_norm": 239.81192016601562, "learning_rate": 1.975412043270347e-07, "logits/chosen": -18.794729232788086, "logits/rejected": -17.774341583251953, "logps/chosen": -358.81048583984375, "logps/rejected": 
-321.3286437988281, "loss": 0.4869, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.929884195327759, "rewards/margins": 1.081733226776123, "rewards/rejected": 1.8481509685516357, "step": 36800 }, { "epoch": 1.7089929894609779, "grad_norm": 34.88115692138672, "learning_rate": 1.9751334788058868e-07, "logits/chosen": -18.651552200317383, "logits/rejected": -17.79250144958496, "logps/chosen": -505.78155517578125, "logps/rejected": -416.03253173828125, "loss": 0.4141, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3382978439331055, "rewards/margins": 1.6319177150726318, "rewards/rejected": 2.7063803672790527, "step": 36810 }, { "epoch": 1.7094572635684107, "grad_norm": 36.48249435424805, "learning_rate": 1.974854914341427e-07, "logits/chosen": -19.517473220825195, "logits/rejected": -18.889026641845703, "logps/chosen": -397.7472229003906, "logps/rejected": -383.5065612792969, "loss": 0.7022, "rewards/accuracies": 0.5, "rewards/chosen": 4.2531938552856445, "rewards/margins": 0.8192826509475708, "rewards/rejected": 3.433910846710205, "step": 36820 }, { "epoch": 1.7099215376758439, "grad_norm": 4.216119766235352, "learning_rate": 1.9745763498769673e-07, "logits/chosen": -18.943267822265625, "logits/rejected": -17.553796768188477, "logps/chosen": -391.68292236328125, "logps/rejected": -278.43121337890625, "loss": 0.7957, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5575015544891357, "rewards/margins": 1.4907867908477783, "rewards/rejected": 2.0667147636413574, "step": 36830 }, { "epoch": 1.7103858117832769, "grad_norm": 78.42151641845703, "learning_rate": 1.9742977854125075e-07, "logits/chosen": -18.60236358642578, "logits/rejected": -18.765291213989258, "logps/chosen": -391.56695556640625, "logps/rejected": -383.45635986328125, "loss": 0.6696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.001537322998047, "rewards/margins": 0.632097065448761, "rewards/rejected": 2.3694403171539307, "step": 36840 }, { "epoch": 
1.7108500858907099, "grad_norm": 70.19440460205078, "learning_rate": 1.9740192209480476e-07, "logits/chosen": -17.942523956298828, "logits/rejected": -17.538923263549805, "logps/chosen": -443.9281311035156, "logps/rejected": -327.64678955078125, "loss": 0.7944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.648635149002075, "rewards/margins": 1.4072940349578857, "rewards/rejected": 2.2413413524627686, "step": 36850 }, { "epoch": 1.7113143599981429, "grad_norm": 34.35807418823242, "learning_rate": 1.9737406564835878e-07, "logits/chosen": -18.483610153198242, "logits/rejected": -18.339834213256836, "logps/chosen": -488.337158203125, "logps/rejected": -523.0125122070312, "loss": 0.9755, "rewards/accuracies": 0.5, "rewards/chosen": 3.9697418212890625, "rewards/margins": 0.26819539070129395, "rewards/rejected": 3.7015461921691895, "step": 36860 }, { "epoch": 1.7117786341055758, "grad_norm": 162.43856811523438, "learning_rate": 1.9734620920191282e-07, "logits/chosen": -18.196514129638672, "logits/rejected": -17.891948699951172, "logps/chosen": -326.45623779296875, "logps/rejected": -308.18939208984375, "loss": 1.381, "rewards/accuracies": 0.5, "rewards/chosen": 3.1131269931793213, "rewards/margins": -0.2956800162792206, "rewards/rejected": 3.4088070392608643, "step": 36870 }, { "epoch": 1.712242908213009, "grad_norm": 82.54393005371094, "learning_rate": 1.9731835275546683e-07, "logits/chosen": -19.327852249145508, "logits/rejected": -19.027019500732422, "logps/chosen": -368.1840515136719, "logps/rejected": -367.73492431640625, "loss": 0.9269, "rewards/accuracies": 0.5, "rewards/chosen": 3.240994930267334, "rewards/margins": 0.2272985279560089, "rewards/rejected": 3.0136969089508057, "step": 36880 }, { "epoch": 1.7127071823204418, "grad_norm": 63.48040008544922, "learning_rate": 1.9729049630902082e-07, "logits/chosen": -18.569713592529297, "logits/rejected": -17.14181137084961, "logps/chosen": -376.64080810546875, "logps/rejected": -245.0190887451172, 
"loss": 0.2919, "rewards/accuracies": 1.0, "rewards/chosen": 2.673529624938965, "rewards/margins": 1.3736788034439087, "rewards/rejected": 1.2998508214950562, "step": 36890 }, { "epoch": 1.713171456427875, "grad_norm": 113.11810302734375, "learning_rate": 1.9726263986257486e-07, "logits/chosen": -17.947303771972656, "logits/rejected": -18.1811466217041, "logps/chosen": -373.9893798828125, "logps/rejected": -388.76312255859375, "loss": 0.6651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.701969623565674, "rewards/margins": 0.26479285955429077, "rewards/rejected": 2.4371767044067383, "step": 36900 }, { "epoch": 1.713635730535308, "grad_norm": 25.44077491760254, "learning_rate": 1.9723478341612888e-07, "logits/chosen": -19.05439567565918, "logits/rejected": -18.864723205566406, "logps/chosen": -414.7915954589844, "logps/rejected": -437.7535095214844, "loss": 1.0686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7397568225860596, "rewards/margins": 0.21092280745506287, "rewards/rejected": 3.5288333892822266, "step": 36910 }, { "epoch": 1.714100004642741, "grad_norm": 56.2534294128418, "learning_rate": 1.9720692696968292e-07, "logits/chosen": -18.800724029541016, "logits/rejected": -17.61981773376465, "logps/chosen": -410.4120178222656, "logps/rejected": -248.30740356445312, "loss": 0.315, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.211087465286255, "rewards/margins": 1.5034923553466797, "rewards/rejected": 1.7075951099395752, "step": 36920 }, { "epoch": 1.7145642787501743, "grad_norm": 139.8109588623047, "learning_rate": 1.971790705232369e-07, "logits/chosen": -19.23067283630371, "logits/rejected": -17.884044647216797, "logps/chosen": -417.2705993652344, "logps/rejected": -337.84698486328125, "loss": 0.3187, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.210048675537109, "rewards/margins": 1.7464901208877563, "rewards/rejected": 2.4635586738586426, "step": 36930 }, { "epoch": 1.715028552857607, 
"grad_norm": 63.15601348876953, "learning_rate": 1.9715121407679092e-07, "logits/chosen": -19.016010284423828, "logits/rejected": -18.409557342529297, "logps/chosen": -585.8413696289062, "logps/rejected": -520.1075439453125, "loss": 1.0947, "rewards/accuracies": 0.5, "rewards/chosen": 4.222188472747803, "rewards/margins": 0.11940421909093857, "rewards/rejected": 4.102784156799316, "step": 36940 }, { "epoch": 1.7154928269650402, "grad_norm": 11.133013725280762, "learning_rate": 1.9712335763034496e-07, "logits/chosen": -18.98344612121582, "logits/rejected": -17.42814826965332, "logps/chosen": -491.6790466308594, "logps/rejected": -326.8245849609375, "loss": 0.3752, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.366230487823486, "rewards/margins": 1.8322312831878662, "rewards/rejected": 2.533998966217041, "step": 36950 }, { "epoch": 1.7159571010724732, "grad_norm": 13.659255981445312, "learning_rate": 1.9709550118389895e-07, "logits/chosen": -19.562053680419922, "logits/rejected": -19.147212982177734, "logps/chosen": -468.7779846191406, "logps/rejected": -393.7550354003906, "loss": 0.5862, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.302331447601318, "rewards/margins": 0.7717030048370361, "rewards/rejected": 3.5306286811828613, "step": 36960 }, { "epoch": 1.7164213751799062, "grad_norm": 8.385221481323242, "learning_rate": 1.9706764473745296e-07, "logits/chosen": -19.17900848388672, "logits/rejected": -18.36370849609375, "logps/chosen": -367.8114013671875, "logps/rejected": -273.8819580078125, "loss": 0.6987, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7674055099487305, "rewards/margins": 0.4032323956489563, "rewards/rejected": 2.364173412322998, "step": 36970 }, { "epoch": 1.7168856492873392, "grad_norm": 106.67179870605469, "learning_rate": 1.97039788291007e-07, "logits/chosen": -19.26238441467285, "logits/rejected": -17.96310043334961, "logps/chosen": -412.06048583984375, "logps/rejected": -241.4070281982422, "loss": 
0.3013, "rewards/accuracies": 1.0, "rewards/chosen": 3.745047092437744, "rewards/margins": 1.7790253162384033, "rewards/rejected": 1.9660217761993408, "step": 36980 }, { "epoch": 1.7173499233947722, "grad_norm": 183.61158752441406, "learning_rate": 1.9701193184456102e-07, "logits/chosen": -18.57772445678711, "logits/rejected": -17.805862426757812, "logps/chosen": -406.6957702636719, "logps/rejected": -371.90350341796875, "loss": 1.3823, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2496795654296875, "rewards/margins": 0.27051717042922974, "rewards/rejected": 2.9791626930236816, "step": 36990 }, { "epoch": 1.7178141975022054, "grad_norm": 31.585094451904297, "learning_rate": 1.9698407539811503e-07, "logits/chosen": -18.149599075317383, "logits/rejected": -17.595577239990234, "logps/chosen": -246.8679962158203, "logps/rejected": -190.4414825439453, "loss": 0.366, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.885835886001587, "rewards/margins": 1.1549153327941895, "rewards/rejected": 0.7309204936027527, "step": 37000 }, { "epoch": 1.7182784716096382, "grad_norm": 3.51768159866333, "learning_rate": 1.9695621895166905e-07, "logits/chosen": -18.12109375, "logits/rejected": -17.461627960205078, "logps/chosen": -435.21514892578125, "logps/rejected": -295.19512939453125, "loss": 0.9759, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5344276428222656, "rewards/margins": 0.8088173866271973, "rewards/rejected": 2.7256107330322266, "step": 37010 }, { "epoch": 1.7187427457170714, "grad_norm": 94.28350830078125, "learning_rate": 1.969283625052231e-07, "logits/chosen": -19.447256088256836, "logits/rejected": -17.468647003173828, "logps/chosen": -370.4831848144531, "logps/rejected": -263.5338439941406, "loss": 1.0275, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.821800947189331, "rewards/margins": 1.559307336807251, "rewards/rejected": 2.262493848800659, "step": 37020 }, { "epoch": 1.7192070198245044, "grad_norm": 
0.802703320980072, "learning_rate": 1.969005060587771e-07, "logits/chosen": -19.340110778808594, "logits/rejected": -18.70432472229004, "logps/chosen": -444.3965759277344, "logps/rejected": -347.72271728515625, "loss": 0.5318, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.137874126434326, "rewards/margins": 1.2067933082580566, "rewards/rejected": 1.9310810565948486, "step": 37030 }, { "epoch": 1.7196712939319374, "grad_norm": 52.48563003540039, "learning_rate": 1.968726496123311e-07, "logits/chosen": -18.689271926879883, "logits/rejected": -18.15427589416504, "logps/chosen": -487.1483459472656, "logps/rejected": -398.1675720214844, "loss": 0.5532, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.580473899841309, "rewards/margins": 1.0957412719726562, "rewards/rejected": 3.4847328662872314, "step": 37040 }, { "epoch": 1.7201355680393704, "grad_norm": 32.71637725830078, "learning_rate": 1.9684479316588513e-07, "logits/chosen": -20.398778915405273, "logits/rejected": -19.436054229736328, "logps/chosen": -419.79364013671875, "logps/rejected": -328.8356018066406, "loss": 0.7066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.153984069824219, "rewards/margins": 1.147882103919983, "rewards/rejected": 3.0061018466949463, "step": 37050 }, { "epoch": 1.7205998421468034, "grad_norm": 149.26795959472656, "learning_rate": 1.9681693671943915e-07, "logits/chosen": -19.719280242919922, "logits/rejected": -18.344663619995117, "logps/chosen": -466.1529235839844, "logps/rejected": -297.35919189453125, "loss": 0.6225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.441460609436035, "rewards/margins": 1.0271555185317993, "rewards/rejected": 3.4143054485321045, "step": 37060 }, { "epoch": 1.7210641162542366, "grad_norm": 94.29695892333984, "learning_rate": 1.967890802729932e-07, "logits/chosen": -18.86316680908203, "logits/rejected": -17.478271484375, "logps/chosen": -355.19073486328125, "logps/rejected": -231.7788848876953, 
"loss": 0.5317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.729274272918701, "rewards/margins": 1.3259145021438599, "rewards/rejected": 1.4033598899841309, "step": 37070 }, { "epoch": 1.7215283903616694, "grad_norm": 4.871761322021484, "learning_rate": 1.9676122382654718e-07, "logits/chosen": -19.07510757446289, "logits/rejected": -17.577865600585938, "logps/chosen": -474.82659912109375, "logps/rejected": -330.77374267578125, "loss": 0.6122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8888614177703857, "rewards/margins": 1.804064154624939, "rewards/rejected": 2.0847971439361572, "step": 37080 }, { "epoch": 1.7219926644691026, "grad_norm": 276.1712951660156, "learning_rate": 1.967333673801012e-07, "logits/chosen": -19.512767791748047, "logits/rejected": -18.13221549987793, "logps/chosen": -374.6867370605469, "logps/rejected": -378.30560302734375, "loss": 0.9381, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1836793422698975, "rewards/margins": 0.7959774136543274, "rewards/rejected": 2.3877017498016357, "step": 37090 }, { "epoch": 1.7224569385765356, "grad_norm": 95.32762145996094, "learning_rate": 1.9670551093365523e-07, "logits/chosen": -19.8586368560791, "logits/rejected": -19.197154998779297, "logps/chosen": -312.61175537109375, "logps/rejected": -241.36245727539062, "loss": 0.6372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.113708257675171, "rewards/margins": 1.054770588874817, "rewards/rejected": 2.0589377880096436, "step": 37100 }, { "epoch": 1.7229212126839686, "grad_norm": 24.42743682861328, "learning_rate": 1.9667765448720925e-07, "logits/chosen": -18.231151580810547, "logits/rejected": -17.476444244384766, "logps/chosen": -401.90252685546875, "logps/rejected": -324.7801208496094, "loss": 0.5001, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.307328701019287, "rewards/margins": 0.7644611597061157, "rewards/rejected": 2.542867422103882, "step": 37110 }, { "epoch": 
1.7233854867914018, "grad_norm": 0.6192916035652161, "learning_rate": 1.9664979804076326e-07, "logits/chosen": -18.54641342163086, "logits/rejected": -18.97694969177246, "logps/chosen": -388.62664794921875, "logps/rejected": -406.0033874511719, "loss": 1.4365, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.270205020904541, "rewards/margins": -0.21075353026390076, "rewards/rejected": 3.4809584617614746, "step": 37120 }, { "epoch": 1.7238497608988346, "grad_norm": 15.687779426574707, "learning_rate": 1.9662194159431728e-07, "logits/chosen": -18.237770080566406, "logits/rejected": -17.792478561401367, "logps/chosen": -328.60101318359375, "logps/rejected": -246.41796875, "loss": 0.6933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5297584533691406, "rewards/margins": 1.1176214218139648, "rewards/rejected": 2.4121367931365967, "step": 37130 }, { "epoch": 1.7243140350062678, "grad_norm": 125.46733856201172, "learning_rate": 1.965940851478713e-07, "logits/chosen": -18.690433502197266, "logits/rejected": -18.51926612854004, "logps/chosen": -403.0552062988281, "logps/rejected": -389.28814697265625, "loss": 0.9525, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.128422260284424, "rewards/margins": -0.08831752836704254, "rewards/rejected": 2.2167396545410156, "step": 37140 }, { "epoch": 1.7247783091137008, "grad_norm": 123.91357421875, "learning_rate": 1.965662287014253e-07, "logits/chosen": -18.80543327331543, "logits/rejected": -18.309749603271484, "logps/chosen": -395.5124816894531, "logps/rejected": -307.9629211425781, "loss": 0.9243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2717292308807373, "rewards/margins": 0.7767959833145142, "rewards/rejected": 2.4949331283569336, "step": 37150 }, { "epoch": 1.7252425832211338, "grad_norm": 14.935684204101562, "learning_rate": 1.9653837225497932e-07, "logits/chosen": -19.174327850341797, "logits/rejected": -18.720012664794922, "logps/chosen": -345.00811767578125, 
"logps/rejected": -250.97042846679688, "loss": 0.4673, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0921378135681152, "rewards/margins": 1.1300026178359985, "rewards/rejected": 1.9621349573135376, "step": 37160 }, { "epoch": 1.7257068573285668, "grad_norm": 153.7716522216797, "learning_rate": 1.9651051580853336e-07, "logits/chosen": -18.05463409423828, "logits/rejected": -17.932376861572266, "logps/chosen": -394.2137756347656, "logps/rejected": -331.1987609863281, "loss": 0.8199, "rewards/accuracies": 0.5, "rewards/chosen": 3.258761167526245, "rewards/margins": 0.7521774172782898, "rewards/rejected": 2.5065834522247314, "step": 37170 }, { "epoch": 1.7261711314359998, "grad_norm": 204.55636596679688, "learning_rate": 1.9648265936208737e-07, "logits/chosen": -18.811443328857422, "logits/rejected": -18.842571258544922, "logps/chosen": -449.53662109375, "logps/rejected": -424.77081298828125, "loss": 0.6963, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8216419219970703, "rewards/margins": 0.19465896487236023, "rewards/rejected": 3.6269829273223877, "step": 37180 }, { "epoch": 1.726635405543433, "grad_norm": 303.01861572265625, "learning_rate": 1.9645480291564136e-07, "logits/chosen": -18.878223419189453, "logits/rejected": -18.790985107421875, "logps/chosen": -424.4925842285156, "logps/rejected": -400.7754821777344, "loss": 0.995, "rewards/accuracies": 0.5, "rewards/chosen": 3.6911444664001465, "rewards/margins": 0.945630669593811, "rewards/rejected": 2.745513677597046, "step": 37190 }, { "epoch": 1.7270996796508657, "grad_norm": 80.56658172607422, "learning_rate": 1.964269464691954e-07, "logits/chosen": -18.9967041015625, "logits/rejected": -17.568513870239258, "logps/chosen": -492.046142578125, "logps/rejected": -293.3045959472656, "loss": 0.4799, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.850905179977417, "rewards/margins": 1.9940437078475952, "rewards/rejected": 1.8568614721298218, "step": 37200 }, { "epoch": 
1.727563953758299, "grad_norm": 156.9264678955078, "learning_rate": 1.9639909002274942e-07, "logits/chosen": -19.529460906982422, "logits/rejected": -18.70035171508789, "logps/chosen": -373.7864685058594, "logps/rejected": -359.5652770996094, "loss": 0.4756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7132887840270996, "rewards/margins": 1.0023778676986694, "rewards/rejected": 2.7109110355377197, "step": 37210 }, { "epoch": 1.728028227865732, "grad_norm": 43.434165954589844, "learning_rate": 1.9637123357630346e-07, "logits/chosen": -18.260513305664062, "logits/rejected": -16.79557228088379, "logps/chosen": -416.628662109375, "logps/rejected": -285.3365783691406, "loss": 0.4914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.024024724960327, "rewards/margins": 1.174769401550293, "rewards/rejected": 1.8492555618286133, "step": 37220 }, { "epoch": 1.728492501973165, "grad_norm": 74.98928833007812, "learning_rate": 1.9634337712985745e-07, "logits/chosen": -18.624469757080078, "logits/rejected": -18.258453369140625, "logps/chosen": -460.2391052246094, "logps/rejected": -405.88360595703125, "loss": 0.5396, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.671485424041748, "rewards/margins": 0.8628038167953491, "rewards/rejected": 2.8086814880371094, "step": 37230 }, { "epoch": 1.728956776080598, "grad_norm": 106.91241455078125, "learning_rate": 1.9631552068341146e-07, "logits/chosen": -18.94144058227539, "logits/rejected": -18.398265838623047, "logps/chosen": -341.76568603515625, "logps/rejected": -259.806640625, "loss": 0.5292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6603643894195557, "rewards/margins": 0.7698327302932739, "rewards/rejected": 1.8905317783355713, "step": 37240 }, { "epoch": 1.729421050188031, "grad_norm": 48.11105728149414, "learning_rate": 1.962876642369655e-07, "logits/chosen": -19.866336822509766, "logits/rejected": -17.733362197875977, "logps/chosen": -461.40435791015625, 
"logps/rejected": -297.8504943847656, "loss": 0.5053, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.143622398376465, "rewards/margins": 1.8624515533447266, "rewards/rejected": 2.281170606613159, "step": 37250 }, { "epoch": 1.7298853242954642, "grad_norm": 72.57353210449219, "learning_rate": 1.9625980779051952e-07, "logits/chosen": -19.23642349243164, "logits/rejected": -16.854373931884766, "logps/chosen": -456.09112548828125, "logps/rejected": -280.5332336425781, "loss": 0.4735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9484310150146484, "rewards/margins": 1.1935436725616455, "rewards/rejected": 1.7548869848251343, "step": 37260 }, { "epoch": 1.730349598402897, "grad_norm": 125.89700317382812, "learning_rate": 1.9623195134407353e-07, "logits/chosen": -19.41152572631836, "logits/rejected": -18.717803955078125, "logps/chosen": -409.899169921875, "logps/rejected": -368.07830810546875, "loss": 0.6205, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.594679594039917, "rewards/margins": 1.0738991498947144, "rewards/rejected": 2.520780324935913, "step": 37270 }, { "epoch": 1.7308138725103301, "grad_norm": 164.00241088867188, "learning_rate": 1.9620409489762755e-07, "logits/chosen": -17.823328018188477, "logits/rejected": -18.870882034301758, "logps/chosen": -278.21630859375, "logps/rejected": -359.13421630859375, "loss": 1.3808, "rewards/accuracies": 0.5, "rewards/chosen": 2.8157577514648438, "rewards/margins": 0.11528132110834122, "rewards/rejected": 2.70047664642334, "step": 37280 }, { "epoch": 1.7312781466177631, "grad_norm": 86.59464263916016, "learning_rate": 1.961762384511816e-07, "logits/chosen": -19.139080047607422, "logits/rejected": -19.6500244140625, "logps/chosen": -411.64404296875, "logps/rejected": -435.37127685546875, "loss": 0.9669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7724220752716064, "rewards/margins": 0.02095639705657959, "rewards/rejected": 3.7514655590057373, "step": 37290 }, { 
"epoch": 1.7317424207251961, "grad_norm": 9.596403121948242, "learning_rate": 1.961483820047356e-07, "logits/chosen": -18.393512725830078, "logits/rejected": -18.63394546508789, "logps/chosen": -406.1951599121094, "logps/rejected": -316.1932678222656, "loss": 0.7884, "rewards/accuracies": 0.5, "rewards/chosen": 3.2149696350097656, "rewards/margins": 0.7706254720687866, "rewards/rejected": 2.4443440437316895, "step": 37300 }, { "epoch": 1.7322066948326293, "grad_norm": 36.36980438232422, "learning_rate": 1.961205255582896e-07, "logits/chosen": -19.340280532836914, "logits/rejected": -17.268146514892578, "logps/chosen": -423.77899169921875, "logps/rejected": -208.3456268310547, "loss": 0.3758, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.581721067428589, "rewards/margins": 1.8938219547271729, "rewards/rejected": 1.6878989934921265, "step": 37310 }, { "epoch": 1.7326709689400621, "grad_norm": 80.33502197265625, "learning_rate": 1.9609266911184363e-07, "logits/chosen": -18.6589298248291, "logits/rejected": -18.380443572998047, "logps/chosen": -370.54046630859375, "logps/rejected": -367.4149475097656, "loss": 0.9615, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.524284601211548, "rewards/margins": -0.27063417434692383, "rewards/rejected": 2.7949185371398926, "step": 37320 }, { "epoch": 1.7331352430474953, "grad_norm": 50.22201919555664, "learning_rate": 1.9606481266539765e-07, "logits/chosen": -18.518619537353516, "logits/rejected": -17.3388729095459, "logps/chosen": -459.59429931640625, "logps/rejected": -316.11285400390625, "loss": 0.5034, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6881213188171387, "rewards/margins": 1.8121862411499023, "rewards/rejected": 1.8759353160858154, "step": 37330 }, { "epoch": 1.733599517154928, "grad_norm": 15.070844650268555, "learning_rate": 1.960369562189517e-07, "logits/chosen": -19.058658599853516, "logits/rejected": -17.57012367248535, "logps/chosen": -472.09112548828125, 
"logps/rejected": -337.13055419921875, "loss": 0.374, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.236094951629639, "rewards/margins": 2.0653340816497803, "rewards/rejected": 2.1707606315612793, "step": 37340 }, { "epoch": 1.7340637912623613, "grad_norm": 176.05950927734375, "learning_rate": 1.9600909977250567e-07, "logits/chosen": -18.758136749267578, "logits/rejected": -17.543405532836914, "logps/chosen": -386.42938232421875, "logps/rejected": -274.2077941894531, "loss": 0.6262, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.285414457321167, "rewards/margins": 0.7224792838096619, "rewards/rejected": 2.5629353523254395, "step": 37350 }, { "epoch": 1.7345280653697943, "grad_norm": 72.27081298828125, "learning_rate": 1.959812433260597e-07, "logits/chosen": -19.79192352294922, "logits/rejected": -17.776214599609375, "logps/chosen": -381.3955993652344, "logps/rejected": -251.6540985107422, "loss": 0.3556, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.580209732055664, "rewards/margins": 1.8248927593231201, "rewards/rejected": 1.755316972732544, "step": 37360 }, { "epoch": 1.7349923394772273, "grad_norm": 23.76724624633789, "learning_rate": 1.9595338687961373e-07, "logits/chosen": -19.345962524414062, "logits/rejected": -18.767040252685547, "logps/chosen": -326.6905212402344, "logps/rejected": -314.1654052734375, "loss": 1.0071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5094807147979736, "rewards/margins": 0.14126411080360413, "rewards/rejected": 2.3682162761688232, "step": 37370 }, { "epoch": 1.7354566135846605, "grad_norm": 4.142284393310547, "learning_rate": 1.9592553043316772e-07, "logits/chosen": -19.529644012451172, "logits/rejected": -18.250499725341797, "logps/chosen": -363.6216735839844, "logps/rejected": -292.9688720703125, "loss": 0.8559, "rewards/accuracies": 0.5, "rewards/chosen": 3.2483649253845215, "rewards/margins": 0.9615675210952759, "rewards/rejected": 2.2867977619171143, "step": 37380 
}, { "epoch": 1.7359208876920933, "grad_norm": 28.649797439575195, "learning_rate": 1.9589767398672173e-07, "logits/chosen": -19.14376449584961, "logits/rejected": -18.298765182495117, "logps/chosen": -350.177734375, "logps/rejected": -309.0785217285156, "loss": 0.8399, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8583388328552246, "rewards/margins": 0.37620365619659424, "rewards/rejected": 2.48213529586792, "step": 37390 }, { "epoch": 1.7363851617995265, "grad_norm": 32.414085388183594, "learning_rate": 1.9586981754027577e-07, "logits/chosen": -18.906843185424805, "logits/rejected": -17.808475494384766, "logps/chosen": -359.3990783691406, "logps/rejected": -239.2365264892578, "loss": 0.2667, "rewards/accuracies": 1.0, "rewards/chosen": 4.7842302322387695, "rewards/margins": 1.8910518884658813, "rewards/rejected": 2.8931784629821777, "step": 37400 }, { "epoch": 1.7368494359069595, "grad_norm": 0.18822833895683289, "learning_rate": 1.958419610938298e-07, "logits/chosen": -19.162456512451172, "logits/rejected": -17.513078689575195, "logps/chosen": -370.5318603515625, "logps/rejected": -254.884033203125, "loss": 0.3658, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8571369647979736, "rewards/margins": 1.9581356048583984, "rewards/rejected": 1.8990013599395752, "step": 37410 }, { "epoch": 1.7373137100143925, "grad_norm": 55.24588394165039, "learning_rate": 1.958141046473838e-07, "logits/chosen": -18.840709686279297, "logits/rejected": -18.529537200927734, "logps/chosen": -493.40673828125, "logps/rejected": -492.01959228515625, "loss": 0.7806, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.057680130004883, "rewards/margins": 0.3204684555530548, "rewards/rejected": 3.7372119426727295, "step": 37420 }, { "epoch": 1.7377779841218255, "grad_norm": 30.585575103759766, "learning_rate": 1.9578624820093782e-07, "logits/chosen": -18.336795806884766, "logits/rejected": -17.810117721557617, "logps/chosen": -319.4302673339844, 
"logps/rejected": -315.55914306640625, "loss": 0.7564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.535346746444702, "rewards/margins": 0.5978977084159851, "rewards/rejected": 1.9374488592147827, "step": 37430 }, { "epoch": 1.7382422582292585, "grad_norm": 40.90739822387695, "learning_rate": 1.9575839175449186e-07, "logits/chosen": -18.66774559020996, "logits/rejected": -18.219417572021484, "logps/chosen": -268.375732421875, "logps/rejected": -226.4786834716797, "loss": 0.6434, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.61796498298645, "rewards/margins": 0.8706377148628235, "rewards/rejected": 1.747327208518982, "step": 37440 }, { "epoch": 1.7387065323366917, "grad_norm": 133.8538818359375, "learning_rate": 1.9573053530804587e-07, "logits/chosen": -19.228116989135742, "logits/rejected": -18.051239013671875, "logps/chosen": -448.4107360839844, "logps/rejected": -382.098876953125, "loss": 0.648, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3531277179718018, "rewards/margins": 0.7313796877861023, "rewards/rejected": 1.6217479705810547, "step": 37450 }, { "epoch": 1.7391708064441245, "grad_norm": 24.151456832885742, "learning_rate": 1.9570267886159986e-07, "logits/chosen": -18.563446044921875, "logits/rejected": -17.658226013183594, "logps/chosen": -559.8716430664062, "logps/rejected": -469.4723205566406, "loss": 0.5413, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.033717632293701, "rewards/margins": 1.1893044710159302, "rewards/rejected": 2.8444132804870605, "step": 37460 }, { "epoch": 1.7396350805515577, "grad_norm": 95.01917266845703, "learning_rate": 1.956748224151539e-07, "logits/chosen": -18.357101440429688, "logits/rejected": -18.000410079956055, "logps/chosen": -419.8603515625, "logps/rejected": -459.00439453125, "loss": 1.029, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.897104501724243, "rewards/margins": 0.015003049746155739, "rewards/rejected": 2.882101535797119, "step": 
37470 }, { "epoch": 1.7400993546589907, "grad_norm": 277.1377258300781, "learning_rate": 1.9564696596870792e-07, "logits/chosen": -18.458118438720703, "logits/rejected": -18.634294509887695, "logps/chosen": -574.1852416992188, "logps/rejected": -525.0974731445312, "loss": 1.3596, "rewards/accuracies": 0.5, "rewards/chosen": 4.130324363708496, "rewards/margins": -0.04032454639673233, "rewards/rejected": 4.17064905166626, "step": 37480 }, { "epoch": 1.7405636287664237, "grad_norm": 141.7066192626953, "learning_rate": 1.9561910952226196e-07, "logits/chosen": -20.085058212280273, "logits/rejected": -18.97633934020996, "logps/chosen": -567.7481689453125, "logps/rejected": -422.9751892089844, "loss": 0.3796, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.639717102050781, "rewards/margins": 1.4072672128677368, "rewards/rejected": 3.232449769973755, "step": 37490 }, { "epoch": 1.7410279028738567, "grad_norm": 28.162498474121094, "learning_rate": 1.9559125307581595e-07, "logits/chosen": -18.713775634765625, "logits/rejected": -18.8999080657959, "logps/chosen": -258.5247802734375, "logps/rejected": -333.0980224609375, "loss": 1.0067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3433303833007812, "rewards/margins": 0.03580498695373535, "rewards/rejected": 2.307525157928467, "step": 37500 }, { "epoch": 1.7414921769812897, "grad_norm": 149.45892333984375, "learning_rate": 1.9556339662936996e-07, "logits/chosen": -17.98514175415039, "logits/rejected": -17.47949981689453, "logps/chosen": -462.2223205566406, "logps/rejected": -390.4307556152344, "loss": 1.3888, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.0063693523406982, "rewards/margins": -0.05410013347864151, "rewards/rejected": 3.060469388961792, "step": 37510 }, { "epoch": 1.7419564510887229, "grad_norm": 29.008594512939453, "learning_rate": 1.95535540182924e-07, "logits/chosen": -18.989009857177734, "logits/rejected": -18.603443145751953, "logps/chosen": -376.2374572753906, 
"logps/rejected": -404.8370056152344, "loss": 1.1211, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.944209337234497, "rewards/margins": -0.15001997351646423, "rewards/rejected": 3.094229221343994, "step": 37520 }, { "epoch": 1.7424207251961557, "grad_norm": 0.13456390798091888, "learning_rate": 1.9550768373647802e-07, "logits/chosen": -18.292850494384766, "logits/rejected": -17.330738067626953, "logps/chosen": -351.93817138671875, "logps/rejected": -235.5923309326172, "loss": 0.4691, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.763214349746704, "rewards/margins": 1.6220319271087646, "rewards/rejected": 1.14118230342865, "step": 37530 }, { "epoch": 1.7428849993035889, "grad_norm": 156.39987182617188, "learning_rate": 1.9547982729003203e-07, "logits/chosen": -18.4645938873291, "logits/rejected": -18.45688819885254, "logps/chosen": -334.27862548828125, "logps/rejected": -328.45513916015625, "loss": 0.8389, "rewards/accuracies": 0.5, "rewards/chosen": 2.8218319416046143, "rewards/margins": 0.18347296118736267, "rewards/rejected": 2.6383590698242188, "step": 37540 }, { "epoch": 1.7433492734110219, "grad_norm": 72.16607666015625, "learning_rate": 1.9545197084358605e-07, "logits/chosen": -18.6429443359375, "logits/rejected": -18.826839447021484, "logps/chosen": -373.85791015625, "logps/rejected": -488.09979248046875, "loss": 1.2947, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.791175603866577, "rewards/margins": -0.5532630681991577, "rewards/rejected": 3.3444385528564453, "step": 37550 }, { "epoch": 1.7438135475184549, "grad_norm": 1.135526418685913, "learning_rate": 1.9542411439714006e-07, "logits/chosen": -18.791614532470703, "logits/rejected": -17.654027938842773, "logps/chosen": -414.89385986328125, "logps/rejected": -330.0897521972656, "loss": 0.6141, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.090842247009277, "rewards/margins": 1.3219716548919678, "rewards/rejected": 2.7688705921173096, "step": 37560 
}, { "epoch": 1.744277821625888, "grad_norm": 90.35343170166016, "learning_rate": 1.9539625795069407e-07, "logits/chosen": -18.08460807800293, "logits/rejected": -16.962047576904297, "logps/chosen": -385.2142028808594, "logps/rejected": -227.57479858398438, "loss": 0.7572, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0396852493286133, "rewards/margins": 1.5093845129013062, "rewards/rejected": 1.530300498008728, "step": 37570 }, { "epoch": 1.7447420957333208, "grad_norm": 34.5277099609375, "learning_rate": 1.953684015042481e-07, "logits/chosen": -18.819236755371094, "logits/rejected": -17.434194564819336, "logps/chosen": -432.27032470703125, "logps/rejected": -280.22479248046875, "loss": 0.4539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.710397243499756, "rewards/margins": 1.3191425800323486, "rewards/rejected": 2.3912549018859863, "step": 37580 }, { "epoch": 1.745206369840754, "grad_norm": 104.11323547363281, "learning_rate": 1.9534054505780213e-07, "logits/chosen": -19.71520233154297, "logits/rejected": -19.418685913085938, "logps/chosen": -370.9605712890625, "logps/rejected": -354.4693603515625, "loss": 1.2516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.498317241668701, "rewards/margins": 0.03371887281537056, "rewards/rejected": 3.4645984172821045, "step": 37590 }, { "epoch": 1.745670643948187, "grad_norm": 72.96332550048828, "learning_rate": 1.9531268861135614e-07, "logits/chosen": -19.436195373535156, "logits/rejected": -18.288654327392578, "logps/chosen": -547.656494140625, "logps/rejected": -358.05828857421875, "loss": 0.376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.75339412689209, "rewards/margins": 2.214104175567627, "rewards/rejected": 2.539290189743042, "step": 37600 }, { "epoch": 1.74613491805562, "grad_norm": 50.75665283203125, "learning_rate": 1.9528483216491013e-07, "logits/chosen": -18.63028335571289, "logits/rejected": -17.70180320739746, "logps/chosen": -406.7229919433594, 
"logps/rejected": -353.33282470703125, "loss": 0.4093, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6863174438476562, "rewards/margins": 1.2935023307800293, "rewards/rejected": 2.392815351486206, "step": 37610 }, { "epoch": 1.746599192163053, "grad_norm": 190.8604736328125, "learning_rate": 1.9525697571846417e-07, "logits/chosen": -18.678569793701172, "logits/rejected": -18.461170196533203, "logps/chosen": -472.84619140625, "logps/rejected": -406.6510925292969, "loss": 0.9399, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1091721057891846, "rewards/margins": 0.059276483952999115, "rewards/rejected": 3.0498952865600586, "step": 37620 }, { "epoch": 1.747063466270486, "grad_norm": 8.086981773376465, "learning_rate": 1.952291192720182e-07, "logits/chosen": -19.053077697753906, "logits/rejected": -19.045963287353516, "logps/chosen": -405.4364318847656, "logps/rejected": -336.1963806152344, "loss": 0.7361, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0842678546905518, "rewards/margins": 0.4419547915458679, "rewards/rejected": 2.642313241958618, "step": 37630 }, { "epoch": 1.7475277403779192, "grad_norm": 48.88718795776367, "learning_rate": 1.9520126282557223e-07, "logits/chosen": -19.689273834228516, "logits/rejected": -18.67140007019043, "logps/chosen": -447.24688720703125, "logps/rejected": -363.3481140136719, "loss": 0.5421, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.980395555496216, "rewards/margins": 1.0796620845794678, "rewards/rejected": 2.900733470916748, "step": 37640 }, { "epoch": 1.747992014485352, "grad_norm": 15.716970443725586, "learning_rate": 1.9517340637912622e-07, "logits/chosen": -18.46597671508789, "logits/rejected": -17.61125946044922, "logps/chosen": -351.3761901855469, "logps/rejected": -277.07208251953125, "loss": 0.5331, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7051024436950684, "rewards/margins": 0.7875415086746216, "rewards/rejected": 1.9175611734390259, 
"step": 37650 }, { "epoch": 1.7484562885927852, "grad_norm": 80.4868392944336, "learning_rate": 1.9514554993268023e-07, "logits/chosen": -19.410648345947266, "logits/rejected": -19.48497772216797, "logps/chosen": -338.5629577636719, "logps/rejected": -346.2266540527344, "loss": 1.2255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7123827934265137, "rewards/margins": 0.06254291534423828, "rewards/rejected": 2.6498401165008545, "step": 37660 }, { "epoch": 1.7489205627002182, "grad_norm": 205.42550659179688, "learning_rate": 1.9511769348623427e-07, "logits/chosen": -19.807249069213867, "logits/rejected": -18.553556442260742, "logps/chosen": -337.47113037109375, "logps/rejected": -242.8393096923828, "loss": 0.3118, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.525719165802002, "rewards/margins": 1.5924198627471924, "rewards/rejected": 1.9332994222640991, "step": 37670 }, { "epoch": 1.7493848368076512, "grad_norm": 22.60948944091797, "learning_rate": 1.950898370397883e-07, "logits/chosen": -18.790990829467773, "logits/rejected": -18.754528045654297, "logps/chosen": -383.37445068359375, "logps/rejected": -389.3377380371094, "loss": 0.5411, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3149561882019043, "rewards/margins": 0.4882144033908844, "rewards/rejected": 1.8267418146133423, "step": 37680 }, { "epoch": 1.7498491109150842, "grad_norm": 70.10008239746094, "learning_rate": 1.950619805933423e-07, "logits/chosen": -19.539382934570312, "logits/rejected": -18.86025619506836, "logps/chosen": -442.3915100097656, "logps/rejected": -308.0229187011719, "loss": 0.4996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.766655445098877, "rewards/margins": 1.3653647899627686, "rewards/rejected": 2.4012911319732666, "step": 37690 }, { "epoch": 1.7503133850225172, "grad_norm": 26.733156204223633, "learning_rate": 1.9503412414689632e-07, "logits/chosen": -18.769285202026367, "logits/rejected": -17.847135543823242, 
"logps/chosen": -394.11993408203125, "logps/rejected": -287.63018798828125, "loss": 0.5404, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.527048110961914, "rewards/margins": 1.458412766456604, "rewards/rejected": 2.0686349868774414, "step": 37700 }, { "epoch": 1.7507776591299504, "grad_norm": 275.2757568359375, "learning_rate": 1.9500626770045036e-07, "logits/chosen": -18.589733123779297, "logits/rejected": -19.132671356201172, "logps/chosen": -445.26910400390625, "logps/rejected": -444.5138244628906, "loss": 1.1455, "rewards/accuracies": 0.5, "rewards/chosen": 3.651498317718506, "rewards/margins": -0.19578497111797333, "rewards/rejected": 3.847283124923706, "step": 37710 }, { "epoch": 1.7512419332373832, "grad_norm": 149.30050659179688, "learning_rate": 1.9497841125400437e-07, "logits/chosen": -19.015222549438477, "logits/rejected": -18.775793075561523, "logps/chosen": -258.80279541015625, "logps/rejected": -251.6638641357422, "loss": 0.7933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0328209400177, "rewards/margins": 0.05618641898036003, "rewards/rejected": 1.9766346216201782, "step": 37720 }, { "epoch": 1.7517062073448164, "grad_norm": 124.47467041015625, "learning_rate": 1.9495055480755836e-07, "logits/chosen": -18.146913528442383, "logits/rejected": -18.08013916015625, "logps/chosen": -288.111083984375, "logps/rejected": -261.66033935546875, "loss": 0.9901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4591195583343506, "rewards/margins": 0.30448949337005615, "rewards/rejected": 2.154630184173584, "step": 37730 }, { "epoch": 1.7521704814522494, "grad_norm": 155.20925903320312, "learning_rate": 1.949226983611124e-07, "logits/chosen": -18.777202606201172, "logits/rejected": -18.495824813842773, "logps/chosen": -392.46771240234375, "logps/rejected": -339.87274169921875, "loss": 1.6209, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.38193941116333, "rewards/margins": -0.10810141265392303, 
"rewards/rejected": 3.4900410175323486, "step": 37740 }, { "epoch": 1.7526347555596824, "grad_norm": 12.800064086914062, "learning_rate": 1.9489484191466642e-07, "logits/chosen": -19.471145629882812, "logits/rejected": -19.257076263427734, "logps/chosen": -430.57940673828125, "logps/rejected": -407.54791259765625, "loss": 0.7945, "rewards/accuracies": 0.5, "rewards/chosen": 3.2372639179229736, "rewards/margins": 0.14747220277786255, "rewards/rejected": 3.0897915363311768, "step": 37750 }, { "epoch": 1.7530990296671156, "grad_norm": 180.0038299560547, "learning_rate": 1.948669854682204e-07, "logits/chosen": -18.271591186523438, "logits/rejected": -17.761611938476562, "logps/chosen": -329.3645324707031, "logps/rejected": -287.130859375, "loss": 0.6781, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1488587856292725, "rewards/margins": 0.35731109976768494, "rewards/rejected": 1.7915477752685547, "step": 37760 }, { "epoch": 1.7535633037745484, "grad_norm": 157.5163116455078, "learning_rate": 1.9483912902177444e-07, "logits/chosen": -19.853313446044922, "logits/rejected": -19.07219886779785, "logps/chosen": -423.53277587890625, "logps/rejected": -345.3713073730469, "loss": 0.4207, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.820416212081909, "rewards/margins": 1.4803175926208496, "rewards/rejected": 2.3400986194610596, "step": 37770 }, { "epoch": 1.7540275778819816, "grad_norm": 39.052459716796875, "learning_rate": 1.9481127257532846e-07, "logits/chosen": -19.284618377685547, "logits/rejected": -18.17388916015625, "logps/chosen": -510.89324951171875, "logps/rejected": -398.78326416015625, "loss": 0.2786, "rewards/accuracies": 1.0, "rewards/chosen": 4.036438465118408, "rewards/margins": 1.6800994873046875, "rewards/rejected": 2.3563389778137207, "step": 37780 }, { "epoch": 1.7544918519894146, "grad_norm": 40.7346076965332, "learning_rate": 1.947834161288825e-07, "logits/chosen": -19.977195739746094, "logits/rejected": -19.606752395629883, 
"logps/chosen": -326.25946044921875, "logps/rejected": -303.11993408203125, "loss": 0.7715, "rewards/accuracies": 0.5, "rewards/chosen": 3.3979427814483643, "rewards/margins": 0.30378958582878113, "rewards/rejected": 3.0941531658172607, "step": 37790 }, { "epoch": 1.7549561260968476, "grad_norm": 88.1214599609375, "learning_rate": 1.947555596824365e-07, "logits/chosen": -19.144123077392578, "logits/rejected": -19.128122329711914, "logps/chosen": -379.82159423828125, "logps/rejected": -350.5323486328125, "loss": 0.7508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.363826036453247, "rewards/margins": 0.9188507199287415, "rewards/rejected": 2.4449751377105713, "step": 37800 }, { "epoch": 1.7554204002042806, "grad_norm": 73.74067687988281, "learning_rate": 1.947277032359905e-07, "logits/chosen": -19.47231101989746, "logits/rejected": -18.67347526550293, "logps/chosen": -520.5618896484375, "logps/rejected": -416.1328125, "loss": 0.7023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.734646797180176, "rewards/margins": 0.6227966547012329, "rewards/rejected": 4.111850261688232, "step": 37810 }, { "epoch": 1.7558846743117136, "grad_norm": 52.73173904418945, "learning_rate": 1.9469984678954454e-07, "logits/chosen": -18.190540313720703, "logits/rejected": -17.714397430419922, "logps/chosen": -274.3789367675781, "logps/rejected": -221.7777099609375, "loss": 1.1175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8114771842956543, "rewards/margins": 0.7334465980529785, "rewards/rejected": 2.078030586242676, "step": 37820 }, { "epoch": 1.7563489484191468, "grad_norm": 102.7091064453125, "learning_rate": 1.9467199034309856e-07, "logits/chosen": -17.733457565307617, "logits/rejected": -17.473243713378906, "logps/chosen": -322.30242919921875, "logps/rejected": -354.27996826171875, "loss": 1.2582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7440648078918457, "rewards/margins": 0.15398050844669342, "rewards/rejected": 
2.5900843143463135, "step": 37830 }, { "epoch": 1.7568132225265796, "grad_norm": 198.44703674316406, "learning_rate": 1.9464413389665257e-07, "logits/chosen": -18.15806770324707, "logits/rejected": -17.129966735839844, "logps/chosen": -311.52166748046875, "logps/rejected": -234.90576171875, "loss": 0.9698, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0463814735412598, "rewards/margins": 0.9147605895996094, "rewards/rejected": 2.1316208839416504, "step": 37840 }, { "epoch": 1.7572774966340128, "grad_norm": 84.70396423339844, "learning_rate": 1.946162774502066e-07, "logits/chosen": -18.826234817504883, "logits/rejected": -17.8344783782959, "logps/chosen": -440.7642517089844, "logps/rejected": -412.7015075683594, "loss": 0.6641, "rewards/accuracies": 0.5, "rewards/chosen": 3.9485886096954346, "rewards/margins": 1.275634527206421, "rewards/rejected": 2.6729540824890137, "step": 37850 }, { "epoch": 1.7577417707414458, "grad_norm": 25.816810607910156, "learning_rate": 1.9458842100376063e-07, "logits/chosen": -19.376333236694336, "logits/rejected": -17.237232208251953, "logps/chosen": -461.7640075683594, "logps/rejected": -235.9903564453125, "loss": 0.441, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7924964427948, "rewards/margins": 2.2078351974487305, "rewards/rejected": 1.5846611261367798, "step": 37860 }, { "epoch": 1.7582060448488788, "grad_norm": 73.89302825927734, "learning_rate": 1.9456056455731464e-07, "logits/chosen": -18.740543365478516, "logits/rejected": -18.533740997314453, "logps/chosen": -407.2622985839844, "logps/rejected": -316.83404541015625, "loss": 1.3436, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.302412509918213, "rewards/margins": 0.5425082445144653, "rewards/rejected": 2.759904146194458, "step": 37870 }, { "epoch": 1.7586703189563118, "grad_norm": 186.28465270996094, "learning_rate": 1.9453270811086863e-07, "logits/chosen": -18.98978614807129, "logits/rejected": -18.39266586303711, 
"logps/chosen": -264.30133056640625, "logps/rejected": -242.0774383544922, "loss": 0.9117, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4146459102630615, "rewards/margins": 0.4339785575866699, "rewards/rejected": 1.9806673526763916, "step": 37880 }, { "epoch": 1.7591345930637448, "grad_norm": 74.19254302978516, "learning_rate": 1.9450485166442267e-07, "logits/chosen": -18.106670379638672, "logits/rejected": -17.52637481689453, "logps/chosen": -431.74884033203125, "logps/rejected": -299.10699462890625, "loss": 0.2388, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.137358665466309, "rewards/margins": 2.154369354248047, "rewards/rejected": 1.982988953590393, "step": 37890 }, { "epoch": 1.759598867171178, "grad_norm": 232.26365661621094, "learning_rate": 1.9447699521797669e-07, "logits/chosen": -19.304607391357422, "logits/rejected": -18.699893951416016, "logps/chosen": -430.2952575683594, "logps/rejected": -367.652099609375, "loss": 0.8931, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.217573881149292, "rewards/margins": 0.23445940017700195, "rewards/rejected": 2.983114719390869, "step": 37900 }, { "epoch": 1.7600631412786107, "grad_norm": 29.224313735961914, "learning_rate": 1.9444913877153073e-07, "logits/chosen": -19.15298843383789, "logits/rejected": -19.889102935791016, "logps/chosen": -290.5698547363281, "logps/rejected": -335.1124572753906, "loss": 1.0447, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0492398738861084, "rewards/margins": -0.19995108246803284, "rewards/rejected": 3.2491908073425293, "step": 37910 }, { "epoch": 1.760527415386044, "grad_norm": 32.4079704284668, "learning_rate": 1.9442128232508472e-07, "logits/chosen": -18.82819366455078, "logits/rejected": -18.336917877197266, "logps/chosen": -348.14215087890625, "logps/rejected": -259.82391357421875, "loss": 0.549, "rewards/accuracies": 0.5, "rewards/chosen": 2.676638126373291, "rewards/margins": 1.0355784893035889, 
"rewards/rejected": 1.6410596370697021, "step": 37920 }, { "epoch": 1.760991689493477, "grad_norm": 213.46820068359375, "learning_rate": 1.9439342587863873e-07, "logits/chosen": -17.76691246032715, "logits/rejected": -18.10700225830078, "logps/chosen": -306.7622985839844, "logps/rejected": -334.9168395996094, "loss": 0.8882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.894059181213379, "rewards/margins": 0.3158472776412964, "rewards/rejected": 2.578212022781372, "step": 37930 }, { "epoch": 1.76145596360091, "grad_norm": 17.51768684387207, "learning_rate": 1.9436556943219277e-07, "logits/chosen": -18.934673309326172, "logits/rejected": -18.481908798217773, "logps/chosen": -435.6615295410156, "logps/rejected": -374.13140869140625, "loss": 0.2501, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7470641136169434, "rewards/margins": 1.5912635326385498, "rewards/rejected": 2.1558005809783936, "step": 37940 }, { "epoch": 1.7619202377083432, "grad_norm": 6.892862796783447, "learning_rate": 1.9433771298574676e-07, "logits/chosen": -19.050830841064453, "logits/rejected": -18.240589141845703, "logps/chosen": -362.16357421875, "logps/rejected": -291.68597412109375, "loss": 0.6537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6277318000793457, "rewards/margins": 1.2305179834365845, "rewards/rejected": 2.3972134590148926, "step": 37950 }, { "epoch": 1.762384511815776, "grad_norm": 93.03424835205078, "learning_rate": 1.943098565393008e-07, "logits/chosen": -20.354652404785156, "logits/rejected": -18.550745010375977, "logps/chosen": -346.03302001953125, "logps/rejected": -264.12701416015625, "loss": 0.3998, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.306736469268799, "rewards/margins": 1.2506283521652222, "rewards/rejected": 2.056107997894287, "step": 37960 }, { "epoch": 1.7628487859232091, "grad_norm": 8.744939804077148, "learning_rate": 1.9428200009285481e-07, "logits/chosen": -19.2130069732666, "logits/rejected": 
-18.60489273071289, "logps/chosen": -373.7327880859375, "logps/rejected": -315.84588623046875, "loss": 0.5106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6017627716064453, "rewards/margins": 1.8273178339004517, "rewards/rejected": 1.7744449377059937, "step": 37970 }, { "epoch": 1.7633130600306421, "grad_norm": 9.05945873260498, "learning_rate": 1.9425414364640883e-07, "logits/chosen": -18.249271392822266, "logits/rejected": -17.65348243713379, "logps/chosen": -379.3153381347656, "logps/rejected": -319.5335388183594, "loss": 0.5792, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0807623863220215, "rewards/margins": 0.8048675656318665, "rewards/rejected": 2.2758948802948, "step": 37980 }, { "epoch": 1.7637773341380751, "grad_norm": 106.69913482666016, "learning_rate": 1.9422628719996284e-07, "logits/chosen": -18.327842712402344, "logits/rejected": -19.168270111083984, "logps/chosen": -369.44696044921875, "logps/rejected": -458.77850341796875, "loss": 1.1352, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 3.2130789756774902, "rewards/margins": -0.6446883082389832, "rewards/rejected": 3.857767105102539, "step": 37990 }, { "epoch": 1.7642416082455081, "grad_norm": 22.570701599121094, "learning_rate": 1.9419843075351686e-07, "logits/chosen": -18.1944637298584, "logits/rejected": -18.012100219726562, "logps/chosen": -368.00262451171875, "logps/rejected": -340.3594055175781, "loss": 1.1044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.661159038543701, "rewards/margins": -0.09392130374908447, "rewards/rejected": 2.755080461502075, "step": 38000 }, { "epoch": 1.7647058823529411, "grad_norm": 190.13226318359375, "learning_rate": 1.941705743070709e-07, "logits/chosen": -18.63547706604004, "logits/rejected": -18.16058349609375, "logps/chosen": -331.66522216796875, "logps/rejected": -312.62139892578125, "loss": 0.7471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.329184055328369, "rewards/margins": 
0.8160846829414368, "rewards/rejected": 2.513099193572998, "step": 38010 }, { "epoch": 1.7651701564603743, "grad_norm": 21.10024642944336, "learning_rate": 1.9414271786062491e-07, "logits/chosen": -19.014503479003906, "logits/rejected": -17.917598724365234, "logps/chosen": -390.36712646484375, "logps/rejected": -239.3827667236328, "loss": 0.2973, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.199955463409424, "rewards/margins": 1.8526268005371094, "rewards/rejected": 1.347328543663025, "step": 38020 }, { "epoch": 1.7656344305678071, "grad_norm": 43.5926628112793, "learning_rate": 1.941148614141789e-07, "logits/chosen": -20.680849075317383, "logits/rejected": -20.032987594604492, "logps/chosen": -456.66064453125, "logps/rejected": -468.9141540527344, "loss": 0.7249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.383904933929443, "rewards/margins": 0.5150313377380371, "rewards/rejected": 3.8688735961914062, "step": 38030 }, { "epoch": 1.7660987046752403, "grad_norm": 82.70697784423828, "learning_rate": 1.9408700496773294e-07, "logits/chosen": -18.617298126220703, "logits/rejected": -18.11630630493164, "logps/chosen": -278.337646484375, "logps/rejected": -243.8698272705078, "loss": 0.657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4121272563934326, "rewards/margins": 0.8770323991775513, "rewards/rejected": 1.5350950956344604, "step": 38040 }, { "epoch": 1.7665629787826733, "grad_norm": 60.61085510253906, "learning_rate": 1.9405914852128696e-07, "logits/chosen": -18.797151565551758, "logits/rejected": -18.027706146240234, "logps/chosen": -446.3111267089844, "logps/rejected": -312.3559265136719, "loss": 0.5228, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6088333129882812, "rewards/margins": 0.8151979446411133, "rewards/rejected": 2.7936348915100098, "step": 38050 }, { "epoch": 1.7670272528901063, "grad_norm": 129.11135864257812, "learning_rate": 1.94031292074841e-07, "logits/chosen": -19.397865295410156, 
"logits/rejected": -18.29878807067871, "logps/chosen": -348.73101806640625, "logps/rejected": -306.0909423828125, "loss": 0.4904, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0065393447875977, "rewards/margins": 0.7352681159973145, "rewards/rejected": 2.271271228790283, "step": 38060 }, { "epoch": 1.7674915269975393, "grad_norm": 47.88182067871094, "learning_rate": 1.9400343562839499e-07, "logits/chosen": -18.78586196899414, "logits/rejected": -18.540515899658203, "logps/chosen": -398.80657958984375, "logps/rejected": -323.9982604980469, "loss": 0.4846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3363051414489746, "rewards/margins": 0.8067590594291687, "rewards/rejected": 2.5295464992523193, "step": 38070 }, { "epoch": 1.7679558011049723, "grad_norm": 203.15045166015625, "learning_rate": 1.93975579181949e-07, "logits/chosen": -18.919206619262695, "logits/rejected": -18.064525604248047, "logps/chosen": -398.83990478515625, "logps/rejected": -338.03448486328125, "loss": 0.7351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.156291961669922, "rewards/margins": 0.6927316188812256, "rewards/rejected": 3.4635605812072754, "step": 38080 }, { "epoch": 1.7684200752124055, "grad_norm": 168.322021484375, "learning_rate": 1.9394772273550304e-07, "logits/chosen": -19.199037551879883, "logits/rejected": -17.577211380004883, "logps/chosen": -443.66497802734375, "logps/rejected": -354.9036865234375, "loss": 0.7088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.259919166564941, "rewards/margins": 1.1531836986541748, "rewards/rejected": 3.1067349910736084, "step": 38090 }, { "epoch": 1.7688843493198383, "grad_norm": 138.7240753173828, "learning_rate": 1.9391986628905706e-07, "logits/chosen": -19.723743438720703, "logits/rejected": -18.271299362182617, "logps/chosen": -430.58294677734375, "logps/rejected": -300.1073303222656, "loss": 0.4742, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9832587242126465, 
"rewards/margins": 1.7464720010757446, "rewards/rejected": 2.2367868423461914, "step": 38100 }, { "epoch": 1.7693486234272715, "grad_norm": 9.219087600708008, "learning_rate": 1.9389200984261107e-07, "logits/chosen": -18.56909942626953, "logits/rejected": -17.65948486328125, "logps/chosen": -383.3324279785156, "logps/rejected": -291.18121337890625, "loss": 1.026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.945296287536621, "rewards/margins": 0.4229775071144104, "rewards/rejected": 2.5223186016082764, "step": 38110 }, { "epoch": 1.7698128975347045, "grad_norm": 154.66941833496094, "learning_rate": 1.9386693904080967e-07, "logits/chosen": -18.750030517578125, "logits/rejected": -18.866779327392578, "logps/chosen": -450.65484619140625, "logps/rejected": -445.13104248046875, "loss": 1.4837, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.2751076221466064, "rewards/margins": -0.8741401433944702, "rewards/rejected": 3.1492481231689453, "step": 38120 }, { "epoch": 1.7702771716421375, "grad_norm": 19.431793212890625, "learning_rate": 1.938390825943637e-07, "logits/chosen": -18.338531494140625, "logits/rejected": -18.389862060546875, "logps/chosen": -355.9494934082031, "logps/rejected": -335.2294921875, "loss": 1.2908, "rewards/accuracies": 0.5, "rewards/chosen": 2.5325257778167725, "rewards/margins": -0.23242220282554626, "rewards/rejected": 2.7649483680725098, "step": 38130 }, { "epoch": 1.7707414457495707, "grad_norm": 113.46411895751953, "learning_rate": 1.9381122614791772e-07, "logits/chosen": -20.141281127929688, "logits/rejected": -20.180423736572266, "logps/chosen": -444.20416259765625, "logps/rejected": -426.11102294921875, "loss": 0.63, "rewards/accuracies": 0.5, "rewards/chosen": 5.005331516265869, "rewards/margins": 0.5454269647598267, "rewards/rejected": 4.459904670715332, "step": 38140 }, { "epoch": 1.7712057198570035, "grad_norm": 25.08759117126465, "learning_rate": 1.9378336970147176e-07, "logits/chosen": -18.32175636291504, 
"logits/rejected": -17.593074798583984, "logps/chosen": -347.19573974609375, "logps/rejected": -226.7184295654297, "loss": 0.6655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6026082038879395, "rewards/margins": 1.1138731241226196, "rewards/rejected": 1.4887349605560303, "step": 38150 }, { "epoch": 1.7716699939644367, "grad_norm": 211.6425018310547, "learning_rate": 1.9375551325502575e-07, "logits/chosen": -19.34604263305664, "logits/rejected": -18.947776794433594, "logps/chosen": -564.6325073242188, "logps/rejected": -494.619140625, "loss": 0.5971, "rewards/accuracies": 0.5, "rewards/chosen": 4.107580184936523, "rewards/margins": 0.8075935244560242, "rewards/rejected": 3.2999866008758545, "step": 38160 }, { "epoch": 1.7721342680718695, "grad_norm": 12.647915840148926, "learning_rate": 1.9372765680857976e-07, "logits/chosen": -19.1932315826416, "logits/rejected": -18.702152252197266, "logps/chosen": -381.40869140625, "logps/rejected": -293.0386962890625, "loss": 0.5488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.75142240524292, "rewards/margins": 1.3756197690963745, "rewards/rejected": 2.375802516937256, "step": 38170 }, { "epoch": 1.7725985421793027, "grad_norm": 181.6539306640625, "learning_rate": 1.936998003621338e-07, "logits/chosen": -19.819541931152344, "logits/rejected": -19.343961715698242, "logps/chosen": -416.5135803222656, "logps/rejected": -328.8063049316406, "loss": 0.829, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9382991790771484, "rewards/margins": 0.8092120885848999, "rewards/rejected": 3.1290876865386963, "step": 38180 }, { "epoch": 1.7730628162867357, "grad_norm": 52.19609451293945, "learning_rate": 1.9367194391568782e-07, "logits/chosen": -18.05695152282715, "logits/rejected": -17.476268768310547, "logps/chosen": -359.7388610839844, "logps/rejected": -293.6431579589844, "loss": 0.3971, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.776851177215576, "rewards/margins": 
1.3070237636566162, "rewards/rejected": 2.469827175140381, "step": 38190 }, { "epoch": 1.7735270903941687, "grad_norm": 286.3271789550781, "learning_rate": 1.9364408746924183e-07, "logits/chosen": -19.265718460083008, "logits/rejected": -18.695453643798828, "logps/chosen": -543.0138549804688, "logps/rejected": -455.6588439941406, "loss": 1.0346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.287866115570068, "rewards/margins": -0.046332765370607376, "rewards/rejected": 4.334198951721191, "step": 38200 }, { "epoch": 1.7739913645016019, "grad_norm": 28.055341720581055, "learning_rate": 1.9361623102279585e-07, "logits/chosen": -18.481714248657227, "logits/rejected": -17.751605987548828, "logps/chosen": -370.1446838378906, "logps/rejected": -340.5248718261719, "loss": 0.6443, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7831015586853027, "rewards/margins": 0.8499034643173218, "rewards/rejected": 1.9331979751586914, "step": 38210 }, { "epoch": 1.7744556386090347, "grad_norm": 0.20575878024101257, "learning_rate": 1.935883745763499e-07, "logits/chosen": -18.456663131713867, "logits/rejected": -17.022205352783203, "logps/chosen": -376.6695861816406, "logps/rejected": -288.7840576171875, "loss": 1.0103, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6013171672821045, "rewards/margins": 0.8079662322998047, "rewards/rejected": 2.7933506965637207, "step": 38220 }, { "epoch": 1.7749199127164679, "grad_norm": 189.82229614257812, "learning_rate": 1.9356051812990388e-07, "logits/chosen": -19.126724243164062, "logits/rejected": -19.401546478271484, "logps/chosen": -281.84295654296875, "logps/rejected": -252.85488891601562, "loss": 1.1348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.335510492324829, "rewards/margins": -0.10466451942920685, "rewards/rejected": 2.4401748180389404, "step": 38230 }, { "epoch": 1.7753841868239009, "grad_norm": 45.50580596923828, "learning_rate": 1.935326616834579e-07, "logits/chosen": 
-18.582141876220703, "logits/rejected": -18.087688446044922, "logps/chosen": -332.2760009765625, "logps/rejected": -351.5771789550781, "loss": 0.6925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.986704111099243, "rewards/margins": 0.6858962178230286, "rewards/rejected": 2.3008077144622803, "step": 38240 }, { "epoch": 1.7758484609313339, "grad_norm": 17.342662811279297, "learning_rate": 1.9350480523701193e-07, "logits/chosen": -19.293781280517578, "logits/rejected": -17.934263229370117, "logps/chosen": -381.0953674316406, "logps/rejected": -258.0320129394531, "loss": 0.2488, "rewards/accuracies": 1.0, "rewards/chosen": 4.230137348175049, "rewards/margins": 1.7426278591156006, "rewards/rejected": 2.4875094890594482, "step": 38250 }, { "epoch": 1.7763127350387669, "grad_norm": 13.058279991149902, "learning_rate": 1.9347694879056595e-07, "logits/chosen": -18.432476043701172, "logits/rejected": -17.740718841552734, "logps/chosen": -323.78302001953125, "logps/rejected": -292.05194091796875, "loss": 0.9509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8889598846435547, "rewards/margins": 0.09320180118083954, "rewards/rejected": 1.7957582473754883, "step": 38260 }, { "epoch": 1.7767770091461998, "grad_norm": 99.18770599365234, "learning_rate": 1.9344909234411994e-07, "logits/chosen": -19.334278106689453, "logits/rejected": -18.02701759338379, "logps/chosen": -401.31365966796875, "logps/rejected": -314.79705810546875, "loss": 0.2649, "rewards/accuracies": 1.0, "rewards/chosen": 4.043759346008301, "rewards/margins": 1.5668270587921143, "rewards/rejected": 2.4769322872161865, "step": 38270 }, { "epoch": 1.777241283253633, "grad_norm": 2.585057497024536, "learning_rate": 1.9342123589767398e-07, "logits/chosen": -19.665071487426758, "logits/rejected": -17.646373748779297, "logps/chosen": -487.24249267578125, "logps/rejected": -317.35308837890625, "loss": 0.2177, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.462404251098633, 
"rewards/margins": 2.408573865890503, "rewards/rejected": 2.053830623626709, "step": 38280 }, { "epoch": 1.7777055573610658, "grad_norm": 35.26124572753906, "learning_rate": 1.93393379451228e-07, "logits/chosen": -19.054248809814453, "logits/rejected": -17.538631439208984, "logps/chosen": -408.84442138671875, "logps/rejected": -290.88201904296875, "loss": 0.3666, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4596595764160156, "rewards/margins": 1.8759740591049194, "rewards/rejected": 1.5836851596832275, "step": 38290 }, { "epoch": 1.778169831468499, "grad_norm": 294.67242431640625, "learning_rate": 1.9336552300478203e-07, "logits/chosen": -18.395687103271484, "logits/rejected": -18.479963302612305, "logps/chosen": -374.82232666015625, "logps/rejected": -394.6512145996094, "loss": 1.0395, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.557342290878296, "rewards/margins": 0.04492359235882759, "rewards/rejected": 2.512418508529663, "step": 38300 }, { "epoch": 1.778634105575932, "grad_norm": 61.073055267333984, "learning_rate": 1.9333766655833602e-07, "logits/chosen": -19.31698226928711, "logits/rejected": -18.46889877319336, "logps/chosen": -388.2439270019531, "logps/rejected": -280.3811950683594, "loss": 0.3362, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.3713178634643555, "rewards/margins": 1.7848374843597412, "rewards/rejected": 2.586480140686035, "step": 38310 }, { "epoch": 1.779098379683365, "grad_norm": 49.88948059082031, "learning_rate": 1.9330981011189004e-07, "logits/chosen": -17.814334869384766, "logits/rejected": -17.62849998474121, "logps/chosen": -368.11627197265625, "logps/rejected": -398.4300231933594, "loss": 1.0663, "rewards/accuracies": 0.5, "rewards/chosen": 2.7963662147521973, "rewards/margins": 0.061546873301267624, "rewards/rejected": 2.7348194122314453, "step": 38320 }, { "epoch": 1.779562653790798, "grad_norm": 26.71650505065918, "learning_rate": 1.9328195366544408e-07, "logits/chosen": 
-19.24519157409668, "logits/rejected": -17.474502563476562, "logps/chosen": -514.85546875, "logps/rejected": -329.7138366699219, "loss": 0.2359, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.353802680969238, "rewards/margins": 2.209144353866577, "rewards/rejected": 2.144658088684082, "step": 38330 }, { "epoch": 1.780026927898231, "grad_norm": 57.30868911743164, "learning_rate": 1.932540972189981e-07, "logits/chosen": -18.353384017944336, "logits/rejected": -17.424015045166016, "logps/chosen": -323.6494445800781, "logps/rejected": -207.2913818359375, "loss": 0.4864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.738999843597412, "rewards/margins": 1.5509716272354126, "rewards/rejected": 2.188028335571289, "step": 38340 }, { "epoch": 1.7804912020056642, "grad_norm": 146.16380310058594, "learning_rate": 1.932262407725521e-07, "logits/chosen": -19.91786003112793, "logits/rejected": -18.871484756469727, "logps/chosen": -504.4608459472656, "logps/rejected": -412.30767822265625, "loss": 0.5718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.683106422424316, "rewards/margins": 0.8671633005142212, "rewards/rejected": 3.8159422874450684, "step": 38350 }, { "epoch": 1.780955476113097, "grad_norm": 20.38640785217285, "learning_rate": 1.9319838432610612e-07, "logits/chosen": -19.42439842224121, "logits/rejected": -18.236059188842773, "logps/chosen": -380.8475646972656, "logps/rejected": -300.1927185058594, "loss": 0.5785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4908604621887207, "rewards/margins": 1.1374292373657227, "rewards/rejected": 2.353431224822998, "step": 38360 }, { "epoch": 1.7814197502205302, "grad_norm": 7.897902011871338, "learning_rate": 1.9317052787966016e-07, "logits/chosen": -18.844873428344727, "logits/rejected": -17.984619140625, "logps/chosen": -441.1725158691406, "logps/rejected": -403.68621826171875, "loss": 0.4378, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
3.7425007820129395, "rewards/margins": 1.2674716711044312, "rewards/rejected": 2.4750287532806396, "step": 38370 }, { "epoch": 1.7818840243279632, "grad_norm": 51.27432632446289, "learning_rate": 1.9314267143321418e-07, "logits/chosen": -18.205677032470703, "logits/rejected": -17.962398529052734, "logps/chosen": -346.1475830078125, "logps/rejected": -378.1070861816406, "loss": 1.1962, "rewards/accuracies": 0.5, "rewards/chosen": 2.3316550254821777, "rewards/margins": -0.10740339756011963, "rewards/rejected": 2.439058542251587, "step": 38380 }, { "epoch": 1.7823482984353962, "grad_norm": 129.4324188232422, "learning_rate": 1.9311481498676816e-07, "logits/chosen": -19.353017807006836, "logits/rejected": -18.754850387573242, "logps/chosen": -417.22412109375, "logps/rejected": -303.1707763671875, "loss": 0.7569, "rewards/accuracies": 0.5, "rewards/chosen": 3.149265766143799, "rewards/margins": 0.9386252164840698, "rewards/rejected": 2.2106404304504395, "step": 38390 }, { "epoch": 1.7828125725428294, "grad_norm": 46.666019439697266, "learning_rate": 1.930869585403222e-07, "logits/chosen": -20.46457290649414, "logits/rejected": -19.112987518310547, "logps/chosen": -361.9877014160156, "logps/rejected": -279.3669738769531, "loss": 0.4715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.090950965881348, "rewards/margins": 1.189988374710083, "rewards/rejected": 2.9009623527526855, "step": 38400 }, { "epoch": 1.7832768466502622, "grad_norm": 0.9559656381607056, "learning_rate": 1.9305910209387622e-07, "logits/chosen": -19.170085906982422, "logits/rejected": -17.65570068359375, "logps/chosen": -365.4111022949219, "logps/rejected": -217.7964630126953, "loss": 0.2438, "rewards/accuracies": 1.0, "rewards/chosen": 3.820288896560669, "rewards/margins": 1.7942559719085693, "rewards/rejected": 2.0260326862335205, "step": 38410 }, { "epoch": 1.7837411207576954, "grad_norm": 30.80681037902832, "learning_rate": 1.930312456474302e-07, "logits/chosen": -18.243696212768555, 
"logits/rejected": -18.42198371887207, "logps/chosen": -240.44143676757812, "logps/rejected": -320.929443359375, "loss": 1.4016, "rewards/accuracies": 0.5, "rewards/chosen": 2.2591183185577393, "rewards/margins": -0.28848570585250854, "rewards/rejected": 2.5476038455963135, "step": 38420 }, { "epoch": 1.7842053948651284, "grad_norm": 56.230472564697266, "learning_rate": 1.9300338920098425e-07, "logits/chosen": -19.18366050720215, "logits/rejected": -18.566574096679688, "logps/chosen": -423.1036682128906, "logps/rejected": -356.42755126953125, "loss": 0.8999, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.194606781005859, "rewards/margins": 0.45820480585098267, "rewards/rejected": 3.736401319503784, "step": 38430 }, { "epoch": 1.7846696689725614, "grad_norm": 169.31956481933594, "learning_rate": 1.9297553275453826e-07, "logits/chosen": -18.67643165588379, "logits/rejected": -18.52956771850586, "logps/chosen": -328.204833984375, "logps/rejected": -312.92498779296875, "loss": 0.9562, "rewards/accuracies": 0.5, "rewards/chosen": 2.7600865364074707, "rewards/margins": 0.11516524851322174, "rewards/rejected": 2.64492130279541, "step": 38440 }, { "epoch": 1.7851339430799944, "grad_norm": 193.3366241455078, "learning_rate": 1.929476763080923e-07, "logits/chosen": -18.252788543701172, "logits/rejected": -17.90427589416504, "logps/chosen": -359.2933654785156, "logps/rejected": -352.9576110839844, "loss": 0.9034, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.549837589263916, "rewards/margins": -0.027148067951202393, "rewards/rejected": 3.5769851207733154, "step": 38450 }, { "epoch": 1.7855982171874274, "grad_norm": 35.62508010864258, "learning_rate": 1.929198198616463e-07, "logits/chosen": -19.013019561767578, "logits/rejected": -18.743120193481445, "logps/chosen": -340.45556640625, "logps/rejected": -294.59747314453125, "loss": 1.0517, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.887923002243042, "rewards/margins": 
0.11574073135852814, "rewards/rejected": 2.7721822261810303, "step": 38460 }, { "epoch": 1.7860624912948606, "grad_norm": 98.5315933227539, "learning_rate": 1.9289196341520033e-07, "logits/chosen": -18.503604888916016, "logits/rejected": -18.43206787109375, "logps/chosen": -468.47149658203125, "logps/rejected": -424.365966796875, "loss": 0.7204, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.288620710372925, "rewards/margins": 0.32207760214805603, "rewards/rejected": 2.966543197631836, "step": 38470 }, { "epoch": 1.7865267654022934, "grad_norm": 125.34839630126953, "learning_rate": 1.9286410696875435e-07, "logits/chosen": -18.8203067779541, "logits/rejected": -18.304676055908203, "logps/chosen": -400.26336669921875, "logps/rejected": -321.12725830078125, "loss": 0.768, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.974045753479004, "rewards/margins": 0.15337388217449188, "rewards/rejected": 2.820671558380127, "step": 38480 }, { "epoch": 1.7869910395097266, "grad_norm": 4.146976470947266, "learning_rate": 1.9283625052230836e-07, "logits/chosen": -18.961078643798828, "logits/rejected": -17.680774688720703, "logps/chosen": -422.8744201660156, "logps/rejected": -281.97454833984375, "loss": 0.3701, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.583258867263794, "rewards/margins": 1.5599421262741089, "rewards/rejected": 2.0233166217803955, "step": 38490 }, { "epoch": 1.7874553136171596, "grad_norm": 85.17378234863281, "learning_rate": 1.9280839407586238e-07, "logits/chosen": -19.468975067138672, "logits/rejected": -19.1626033782959, "logps/chosen": -363.9127502441406, "logps/rejected": -274.54656982421875, "loss": 0.4306, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7490286827087402, "rewards/margins": 0.8904261589050293, "rewards/rejected": 2.858602523803711, "step": 38500 }, { "epoch": 1.7879195877245926, "grad_norm": 0.3820665776729584, "learning_rate": 1.927805376294164e-07, "logits/chosen": 
-18.538646697998047, "logits/rejected": -18.48470687866211, "logps/chosen": -397.48809814453125, "logps/rejected": -398.38507080078125, "loss": 1.0361, "rewards/accuracies": 0.5, "rewards/chosen": 3.481870651245117, "rewards/margins": 0.23999269306659698, "rewards/rejected": 3.2418785095214844, "step": 38510 }, { "epoch": 1.7883838618320256, "grad_norm": 141.0404815673828, "learning_rate": 1.9275268118297043e-07, "logits/chosen": -19.43117332458496, "logits/rejected": -19.31598663330078, "logps/chosen": -355.51666259765625, "logps/rejected": -264.39434814453125, "loss": 0.6696, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9409313201904297, "rewards/margins": 0.4761483073234558, "rewards/rejected": 2.464782953262329, "step": 38520 }, { "epoch": 1.7888481359394586, "grad_norm": 84.27919006347656, "learning_rate": 1.9272482473652445e-07, "logits/chosen": -18.94249725341797, "logits/rejected": -18.103578567504883, "logps/chosen": -353.9357604980469, "logps/rejected": -276.7274169921875, "loss": 0.8034, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.48632550239563, "rewards/margins": 0.736903727054596, "rewards/rejected": 2.749422073364258, "step": 38530 }, { "epoch": 1.7893124100468918, "grad_norm": 147.4630126953125, "learning_rate": 1.9269696829007843e-07, "logits/chosen": -20.32115936279297, "logits/rejected": -20.052501678466797, "logps/chosen": -368.88519287109375, "logps/rejected": -327.0338439941406, "loss": 0.9176, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.053067922592163, "rewards/margins": 0.2160399705171585, "rewards/rejected": 2.8370280265808105, "step": 38540 }, { "epoch": 1.7897766841543246, "grad_norm": 15.186117172241211, "learning_rate": 1.9266911184363248e-07, "logits/chosen": -18.113666534423828, "logits/rejected": -17.608230590820312, "logps/chosen": -365.3860168457031, "logps/rejected": -318.5384826660156, "loss": 0.6414, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.964172840118408, 
"rewards/margins": 1.2595899105072021, "rewards/rejected": 1.7045828104019165, "step": 38550 }, { "epoch": 1.7902409582617578, "grad_norm": 0.627265214920044, "learning_rate": 1.926412553971865e-07, "logits/chosen": -18.964664459228516, "logits/rejected": -18.227100372314453, "logps/chosen": -353.95050048828125, "logps/rejected": -292.3438415527344, "loss": 0.831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.069222927093506, "rewards/margins": 0.9580909609794617, "rewards/rejected": 2.1111321449279785, "step": 38560 }, { "epoch": 1.7907052323691908, "grad_norm": 62.9867057800293, "learning_rate": 1.9261339895074053e-07, "logits/chosen": -20.425203323364258, "logits/rejected": -19.3272647857666, "logps/chosen": -362.5250244140625, "logps/rejected": -299.08416748046875, "loss": 0.4866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.523165464401245, "rewards/margins": 1.0489816665649414, "rewards/rejected": 2.4741837978363037, "step": 38570 }, { "epoch": 1.7911695064766238, "grad_norm": 122.76624298095703, "learning_rate": 1.9258554250429452e-07, "logits/chosen": -19.143753051757812, "logits/rejected": -18.67689323425293, "logps/chosen": -485.9873962402344, "logps/rejected": -416.63031005859375, "loss": 0.6872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.303958892822266, "rewards/margins": 0.5757542252540588, "rewards/rejected": 3.7282042503356934, "step": 38580 }, { "epoch": 1.791633780584057, "grad_norm": 31.34386444091797, "learning_rate": 1.9255768605784853e-07, "logits/chosen": -19.05876350402832, "logits/rejected": -18.56670379638672, "logps/chosen": -412.7470703125, "logps/rejected": -361.2636413574219, "loss": 0.6788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3411426544189453, "rewards/margins": 0.5821893811225891, "rewards/rejected": 2.7589528560638428, "step": 38590 }, { "epoch": 1.7920980546914898, "grad_norm": 108.38471984863281, "learning_rate": 1.9252982961140257e-07, "logits/chosen": 
-18.767566680908203, "logits/rejected": -17.810848236083984, "logps/chosen": -413.744873046875, "logps/rejected": -279.8525390625, "loss": 0.6283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0947961807250977, "rewards/margins": 1.0410144329071045, "rewards/rejected": 2.053781747817993, "step": 38600 }, { "epoch": 1.792562328798923, "grad_norm": 76.78958129882812, "learning_rate": 1.9250197316495656e-07, "logits/chosen": -18.938386917114258, "logits/rejected": -18.94679069519043, "logps/chosen": -342.588623046875, "logps/rejected": -382.7710876464844, "loss": 0.7739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1021132469177246, "rewards/margins": 0.4298144280910492, "rewards/rejected": 2.6722989082336426, "step": 38610 }, { "epoch": 1.793026602906356, "grad_norm": 6.741198539733887, "learning_rate": 1.924741167185106e-07, "logits/chosen": -19.623964309692383, "logits/rejected": -18.245708465576172, "logps/chosen": -288.5769958496094, "logps/rejected": -259.5343322753906, "loss": 0.531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.298320770263672, "rewards/margins": 1.2787106037139893, "rewards/rejected": 2.0196099281311035, "step": 38620 }, { "epoch": 1.793490877013789, "grad_norm": 197.67526245117188, "learning_rate": 1.9244626027206462e-07, "logits/chosen": -17.642642974853516, "logits/rejected": -18.278202056884766, "logps/chosen": -391.30126953125, "logps/rejected": -411.5594787597656, "loss": 1.1711, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5834057331085205, "rewards/margins": -0.2768363058567047, "rewards/rejected": 3.860241651535034, "step": 38630 }, { "epoch": 1.793955151121222, "grad_norm": 26.545835494995117, "learning_rate": 1.9241840382561866e-07, "logits/chosen": -18.334213256835938, "logits/rejected": -17.197418212890625, "logps/chosen": -400.8849792480469, "logps/rejected": -280.67694091796875, "loss": 0.5524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.546989917755127, "rewards/margins": 0.982100784778595, "rewards/rejected": 2.5648887157440186, "step": 38640 }, { "epoch": 1.794419425228655, "grad_norm": 71.36769104003906, "learning_rate": 1.9239054737917265e-07, "logits/chosen": -19.365192413330078, "logits/rejected": -18.507219314575195, "logps/chosen": -265.4825744628906, "logps/rejected": -272.43499755859375, "loss": 0.537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8358101844787598, "rewards/margins": 1.1401374340057373, "rewards/rejected": 1.6956729888916016, "step": 38650 }, { "epoch": 1.7948836993360882, "grad_norm": 1.637835144996643, "learning_rate": 1.9236269093272666e-07, "logits/chosen": -19.245849609375, "logits/rejected": -18.326290130615234, "logps/chosen": -417.0848083496094, "logps/rejected": -287.36346435546875, "loss": 0.4356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5773277282714844, "rewards/margins": 1.42337965965271, "rewards/rejected": 2.153947591781616, "step": 38660 }, { "epoch": 1.795347973443521, "grad_norm": 0.64114910364151, "learning_rate": 1.923348344862807e-07, "logits/chosen": -18.615558624267578, "logits/rejected": -17.865814208984375, "logps/chosen": -397.97296142578125, "logps/rejected": -274.35040283203125, "loss": 0.7321, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.232241630554199, "rewards/margins": 2.1212801933288574, "rewards/rejected": 2.110961675643921, "step": 38670 }, { "epoch": 1.7958122475509541, "grad_norm": 16.087661743164062, "learning_rate": 1.9230697803983472e-07, "logits/chosen": -19.63397216796875, "logits/rejected": -18.5150146484375, "logps/chosen": -284.6239013671875, "logps/rejected": -274.66717529296875, "loss": 0.6545, "rewards/accuracies": 0.5, "rewards/chosen": 2.610623359680176, "rewards/margins": 0.4332825541496277, "rewards/rejected": 2.1773407459259033, "step": 38680 }, { "epoch": 1.7962765216583871, "grad_norm": 188.41110229492188, "learning_rate": 1.922791215933887e-07, "logits/chosen": 
-18.88938331604004, "logits/rejected": -17.84933853149414, "logps/chosen": -370.8651123046875, "logps/rejected": -233.9276580810547, "loss": 0.4367, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3022449016571045, "rewards/margins": 1.4935994148254395, "rewards/rejected": 1.808645248413086, "step": 38690 }, { "epoch": 1.7967407957658201, "grad_norm": 67.22924041748047, "learning_rate": 1.9225126514694275e-07, "logits/chosen": -18.627206802368164, "logits/rejected": -17.642311096191406, "logps/chosen": -352.75006103515625, "logps/rejected": -315.88739013671875, "loss": 0.7031, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.602191209793091, "rewards/margins": 1.2882705926895142, "rewards/rejected": 2.3139209747314453, "step": 38700 }, { "epoch": 1.7972050698732531, "grad_norm": 262.1325378417969, "learning_rate": 1.9222340870049676e-07, "logits/chosen": -18.761211395263672, "logits/rejected": -17.780925750732422, "logps/chosen": -374.88623046875, "logps/rejected": -307.8087463378906, "loss": 1.001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.367030382156372, "rewards/margins": 0.32583972811698914, "rewards/rejected": 3.0411906242370605, "step": 38710 }, { "epoch": 1.7976693439806861, "grad_norm": 22.095823287963867, "learning_rate": 1.921955522540508e-07, "logits/chosen": -19.333398818969727, "logits/rejected": -17.414535522460938, "logps/chosen": -352.81884765625, "logps/rejected": -192.9644317626953, "loss": 0.492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8137857913970947, "rewards/margins": 1.5992257595062256, "rewards/rejected": 1.2145600318908691, "step": 38720 }, { "epoch": 1.7981336180881193, "grad_norm": 41.64179229736328, "learning_rate": 1.921676958076048e-07, "logits/chosen": -19.162609100341797, "logits/rejected": -18.5257511138916, "logps/chosen": -316.225341796875, "logps/rejected": -324.5711975097656, "loss": 0.749, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
3.3712756633758545, "rewards/margins": 0.2643257975578308, "rewards/rejected": 3.106950283050537, "step": 38730 }, { "epoch": 1.798597892195552, "grad_norm": 95.07891845703125, "learning_rate": 1.921398393611588e-07, "logits/chosen": -19.280155181884766, "logits/rejected": -18.962299346923828, "logps/chosen": -364.633544921875, "logps/rejected": -307.9478454589844, "loss": 0.7759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7315850257873535, "rewards/margins": 0.31371089816093445, "rewards/rejected": 2.4178740978240967, "step": 38740 }, { "epoch": 1.7990621663029853, "grad_norm": 129.5514373779297, "learning_rate": 1.9211198291471285e-07, "logits/chosen": -18.589487075805664, "logits/rejected": -17.646472930908203, "logps/chosen": -507.87298583984375, "logps/rejected": -347.54864501953125, "loss": 0.5657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9844210147857666, "rewards/margins": 0.6565287709236145, "rewards/rejected": 2.3278920650482178, "step": 38750 }, { "epoch": 1.7995264404104183, "grad_norm": 243.0975799560547, "learning_rate": 1.9208412646826686e-07, "logits/chosen": -19.544591903686523, "logits/rejected": -17.97676658630371, "logps/chosen": -483.0352478027344, "logps/rejected": -337.97296142578125, "loss": 0.4807, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.490853309631348, "rewards/margins": 2.2839019298553467, "rewards/rejected": 2.2069506645202637, "step": 38760 }, { "epoch": 1.7999907145178513, "grad_norm": 89.92787170410156, "learning_rate": 1.9205627002182088e-07, "logits/chosen": -19.62274742126465, "logits/rejected": -17.4815616607666, "logps/chosen": -404.1842956542969, "logps/rejected": -220.64096069335938, "loss": 0.1832, "rewards/accuracies": 1.0, "rewards/chosen": 3.899625778198242, "rewards/margins": 2.097407341003418, "rewards/rejected": 1.8022184371948242, "step": 38770 }, { "epoch": 1.8004549886252845, "grad_norm": 184.55430603027344, "learning_rate": 1.920284135753749e-07, 
"logits/chosen": -19.218809127807617, "logits/rejected": -19.644947052001953, "logps/chosen": -404.1689453125, "logps/rejected": -369.3221130371094, "loss": 0.9665, "rewards/accuracies": 0.5, "rewards/chosen": 3.1992671489715576, "rewards/margins": 0.19288620352745056, "rewards/rejected": 3.006381034851074, "step": 38780 }, { "epoch": 1.8009192627327173, "grad_norm": 3.0208606719970703, "learning_rate": 1.9200055712892893e-07, "logits/chosen": -19.07223892211914, "logits/rejected": -19.096420288085938, "logps/chosen": -328.8465270996094, "logps/rejected": -317.7052001953125, "loss": 0.8349, "rewards/accuracies": 0.5, "rewards/chosen": 2.119736909866333, "rewards/margins": 0.1273731291294098, "rewards/rejected": 1.992363691329956, "step": 38790 }, { "epoch": 1.8013835368401505, "grad_norm": 61.55757141113281, "learning_rate": 1.9197270068248292e-07, "logits/chosen": -19.69008445739746, "logits/rejected": -18.687610626220703, "logps/chosen": -288.46710205078125, "logps/rejected": -233.6641845703125, "loss": 0.5334, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1073696613311768, "rewards/margins": 0.9091958999633789, "rewards/rejected": 1.1981736421585083, "step": 38800 }, { "epoch": 1.8018478109475835, "grad_norm": 30.46699333190918, "learning_rate": 1.9194484423603693e-07, "logits/chosen": -19.098379135131836, "logits/rejected": -18.544071197509766, "logps/chosen": -356.23455810546875, "logps/rejected": -294.55218505859375, "loss": 0.6579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9447803497314453, "rewards/margins": 0.5510791540145874, "rewards/rejected": 2.3937013149261475, "step": 38810 }, { "epoch": 1.8023120850550165, "grad_norm": 14.635238647460938, "learning_rate": 1.9191698778959097e-07, "logits/chosen": -19.246885299682617, "logits/rejected": -18.204511642456055, "logps/chosen": -438.01123046875, "logps/rejected": -323.54754638671875, "loss": 0.506, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
4.087053298950195, "rewards/margins": 1.4142684936523438, "rewards/rejected": 2.6727850437164307, "step": 38820 }, { "epoch": 1.8027763591624495, "grad_norm": 2.3085851669311523, "learning_rate": 1.91889131343145e-07, "logits/chosen": -18.310930252075195, "logits/rejected": -17.13266944885254, "logps/chosen": -346.46441650390625, "logps/rejected": -281.62982177734375, "loss": 0.7259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6835246086120605, "rewards/margins": 1.2692772150039673, "rewards/rejected": 2.4142470359802246, "step": 38830 }, { "epoch": 1.8032406332698825, "grad_norm": 31.50823974609375, "learning_rate": 1.9186127489669898e-07, "logits/chosen": -18.876174926757812, "logits/rejected": -18.529417037963867, "logps/chosen": -322.13580322265625, "logps/rejected": -300.84014892578125, "loss": 0.7176, "rewards/accuracies": 0.5, "rewards/chosen": 3.2994816303253174, "rewards/margins": 0.24647831916809082, "rewards/rejected": 3.0530033111572266, "step": 38840 }, { "epoch": 1.8037049073773157, "grad_norm": 162.82310485839844, "learning_rate": 1.9183341845025302e-07, "logits/chosen": -18.675310134887695, "logits/rejected": -19.340892791748047, "logps/chosen": -415.61090087890625, "logps/rejected": -536.3477783203125, "loss": 1.2197, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6157310009002686, "rewards/margins": -0.2812708020210266, "rewards/rejected": 3.8970019817352295, "step": 38850 }, { "epoch": 1.8041691814847485, "grad_norm": 23.973026275634766, "learning_rate": 1.9180556200380703e-07, "logits/chosen": -19.62704849243164, "logits/rejected": -17.86350440979004, "logps/chosen": -442.486572265625, "logps/rejected": -326.6485595703125, "loss": 0.3801, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2351768016815186, "rewards/margins": 1.625775694847107, "rewards/rejected": 1.6094013452529907, "step": 38860 }, { "epoch": 1.8046334555921817, "grad_norm": 171.64132690429688, "learning_rate": 1.9177770555736107e-07, 
"logits/chosen": -18.149293899536133, "logits/rejected": -17.621566772460938, "logps/chosen": -395.2170104980469, "logps/rejected": -305.7353515625, "loss": 0.7017, "rewards/accuracies": 0.5, "rewards/chosen": 3.014848232269287, "rewards/margins": 0.6093441843986511, "rewards/rejected": 2.4055042266845703, "step": 38870 }, { "epoch": 1.8050977296996147, "grad_norm": 62.19712829589844, "learning_rate": 1.9174984911091506e-07, "logits/chosen": -19.371183395385742, "logits/rejected": -18.318531036376953, "logps/chosen": -313.2359313964844, "logps/rejected": -287.62481689453125, "loss": 0.767, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0619559288024902, "rewards/margins": 0.5954152941703796, "rewards/rejected": 2.466540813446045, "step": 38880 }, { "epoch": 1.8055620038070477, "grad_norm": 36.06816101074219, "learning_rate": 1.917219926644691e-07, "logits/chosen": -20.190643310546875, "logits/rejected": -18.832401275634766, "logps/chosen": -348.1213684082031, "logps/rejected": -315.70892333984375, "loss": 0.7601, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.854342460632324, "rewards/margins": 0.541610598564148, "rewards/rejected": 2.3127317428588867, "step": 38890 }, { "epoch": 1.8060262779144807, "grad_norm": 13.022082328796387, "learning_rate": 1.9169413621802312e-07, "logits/chosen": -18.89932632446289, "logits/rejected": -17.202791213989258, "logps/chosen": -447.35595703125, "logps/rejected": -197.863525390625, "loss": 0.4143, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7653231620788574, "rewards/margins": 2.3948111534118652, "rewards/rejected": 1.3705120086669922, "step": 38900 }, { "epoch": 1.8064905520219137, "grad_norm": 2.0401451587677, "learning_rate": 1.9166627977157713e-07, "logits/chosen": -19.232376098632812, "logits/rejected": -18.10284996032715, "logps/chosen": -402.4275207519531, "logps/rejected": -281.514892578125, "loss": 0.4184, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.129384994506836, "rewards/margins": 1.2784937620162964, "rewards/rejected": 1.85089111328125, "step": 38910 }, { "epoch": 1.8069548261293469, "grad_norm": 23.59895896911621, "learning_rate": 1.9163842332513115e-07, "logits/chosen": -19.17362403869629, "logits/rejected": -18.840675354003906, "logps/chosen": -346.94183349609375, "logps/rejected": -325.1001892089844, "loss": 0.5943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.142228364944458, "rewards/margins": 0.954359233379364, "rewards/rejected": 2.1878693103790283, "step": 38920 }, { "epoch": 1.8074191002367797, "grad_norm": 73.59831237792969, "learning_rate": 1.9161056687868516e-07, "logits/chosen": -17.991540908813477, "logits/rejected": -16.800853729248047, "logps/chosen": -395.33575439453125, "logps/rejected": -244.4047088623047, "loss": 0.4029, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.372098445892334, "rewards/margins": 1.871862769126892, "rewards/rejected": 1.5002355575561523, "step": 38930 }, { "epoch": 1.8078833743442129, "grad_norm": 91.40245056152344, "learning_rate": 1.915827104322392e-07, "logits/chosen": -19.31634521484375, "logits/rejected": -17.999141693115234, "logps/chosen": -327.84674072265625, "logps/rejected": -268.13031005859375, "loss": 0.6432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3408138751983643, "rewards/margins": 0.9588829278945923, "rewards/rejected": 2.3819308280944824, "step": 38940 }, { "epoch": 1.8083476484516459, "grad_norm": 125.97937774658203, "learning_rate": 1.9155485398579322e-07, "logits/chosen": -20.490474700927734, "logits/rejected": -20.086212158203125, "logps/chosen": -444.09466552734375, "logps/rejected": -389.5284729003906, "loss": 0.6282, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2597992420196533, "rewards/margins": 0.6699100732803345, "rewards/rejected": 2.5898890495300293, "step": 38950 }, { "epoch": 1.8088119225590789, "grad_norm": 17.261611938476562, "learning_rate": 
1.915269975393472e-07, "logits/chosen": -18.86768341064453, "logits/rejected": -18.62038803100586, "logps/chosen": -362.47149658203125, "logps/rejected": -286.0572204589844, "loss": 1.1855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7243189811706543, "rewards/margins": 0.4868003726005554, "rewards/rejected": 2.237518787384033, "step": 38960 }, { "epoch": 1.809276196666512, "grad_norm": 0.40301084518432617, "learning_rate": 1.9149914109290125e-07, "logits/chosen": -19.34606170654297, "logits/rejected": -18.46063804626465, "logps/chosen": -408.27777099609375, "logps/rejected": -290.36712646484375, "loss": 0.4438, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9277751445770264, "rewards/margins": 1.704633116722107, "rewards/rejected": 2.223142385482788, "step": 38970 }, { "epoch": 1.8097404707739448, "grad_norm": 111.49134063720703, "learning_rate": 1.9147128464645526e-07, "logits/chosen": -19.98073387145996, "logits/rejected": -19.83902359008789, "logps/chosen": -500.11199951171875, "logps/rejected": -453.4822692871094, "loss": 0.4979, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.122247695922852, "rewards/margins": 0.7608476877212524, "rewards/rejected": 3.3613998889923096, "step": 38980 }, { "epoch": 1.810204744881378, "grad_norm": 42.926841735839844, "learning_rate": 1.9144342820000925e-07, "logits/chosen": -18.841569900512695, "logits/rejected": -18.251361846923828, "logps/chosen": -411.768310546875, "logps/rejected": -301.30120849609375, "loss": 0.6456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4411120414733887, "rewards/margins": 1.0293729305267334, "rewards/rejected": 2.4117395877838135, "step": 38990 }, { "epoch": 1.8106690189888108, "grad_norm": 256.6132507324219, "learning_rate": 1.914155717535633e-07, "logits/chosen": -19.24506187438965, "logits/rejected": -19.002763748168945, "logps/chosen": -373.6001892089844, "logps/rejected": -374.28851318359375, "loss": 0.6607, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.553703784942627, "rewards/margins": 0.5547767281532288, "rewards/rejected": 2.998926877975464, "step": 39000 }, { "epoch": 1.811133293096244, "grad_norm": 8.848709106445312, "learning_rate": 1.913877153071173e-07, "logits/chosen": -19.649253845214844, "logits/rejected": -18.962749481201172, "logps/chosen": -418.4449768066406, "logps/rejected": -368.84161376953125, "loss": 0.7753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.168123245239258, "rewards/margins": 1.0484580993652344, "rewards/rejected": 3.1196656227111816, "step": 39010 }, { "epoch": 1.811597567203677, "grad_norm": 182.6968994140625, "learning_rate": 1.9135985886067134e-07, "logits/chosen": -19.078237533569336, "logits/rejected": -18.85360336303711, "logps/chosen": -429.37811279296875, "logps/rejected": -396.96343994140625, "loss": 0.7771, "rewards/accuracies": 0.5, "rewards/chosen": 3.8382163047790527, "rewards/margins": 0.5236362218856812, "rewards/rejected": 3.314580202102661, "step": 39020 }, { "epoch": 1.81206184131111, "grad_norm": 22.932052612304688, "learning_rate": 1.9133200241422533e-07, "logits/chosen": -17.986703872680664, "logits/rejected": -17.30161476135254, "logps/chosen": -258.7533264160156, "logps/rejected": -213.40438842773438, "loss": 0.673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9433794021606445, "rewards/margins": 1.0252822637557983, "rewards/rejected": 0.9180973172187805, "step": 39030 }, { "epoch": 1.8125261154185432, "grad_norm": 34.52877426147461, "learning_rate": 1.9130414596777937e-07, "logits/chosen": -19.273983001708984, "logits/rejected": -17.786182403564453, "logps/chosen": -499.53033447265625, "logps/rejected": -317.1188049316406, "loss": 0.2605, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.247054100036621, "rewards/margins": 1.9635467529296875, "rewards/rejected": 2.2835073471069336, "step": 39040 }, { "epoch": 1.812990389525976, "grad_norm": 80.53899383544922, 
"learning_rate": 1.912762895213334e-07, "logits/chosen": -19.755001068115234, "logits/rejected": -18.904705047607422, "logps/chosen": -455.77520751953125, "logps/rejected": -443.8447265625, "loss": 0.6832, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8592185974121094, "rewards/margins": 0.9168974161148071, "rewards/rejected": 2.942321300506592, "step": 39050 }, { "epoch": 1.8134546636334092, "grad_norm": 6.43282413482666, "learning_rate": 1.9124843307488743e-07, "logits/chosen": -19.320358276367188, "logits/rejected": -19.10272789001465, "logps/chosen": -483.1936950683594, "logps/rejected": -425.781494140625, "loss": 1.0808, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.401066780090332, "rewards/margins": 0.46185049414634705, "rewards/rejected": 3.9392166137695312, "step": 39060 }, { "epoch": 1.8139189377408422, "grad_norm": 31.43887710571289, "learning_rate": 1.9122057662844142e-07, "logits/chosen": -18.491708755493164, "logits/rejected": -18.223331451416016, "logps/chosen": -463.47772216796875, "logps/rejected": -444.27606201171875, "loss": 0.8834, "rewards/accuracies": 0.5, "rewards/chosen": 4.679693698883057, "rewards/margins": 0.41532278060913086, "rewards/rejected": 4.264371395111084, "step": 39070 }, { "epoch": 1.8143832118482752, "grad_norm": 77.16094970703125, "learning_rate": 1.9119272018199543e-07, "logits/chosen": -17.96470069885254, "logits/rejected": -18.424070358276367, "logps/chosen": -357.8050231933594, "logps/rejected": -357.0470275878906, "loss": 0.8284, "rewards/accuracies": 0.5, "rewards/chosen": 3.2588400840759277, "rewards/margins": 0.45190829038619995, "rewards/rejected": 2.806931734085083, "step": 39080 }, { "epoch": 1.8148474859557082, "grad_norm": 178.31259155273438, "learning_rate": 1.9116486373554947e-07, "logits/chosen": -18.87393569946289, "logits/rejected": -18.00638771057129, "logps/chosen": -485.35400390625, "logps/rejected": -381.7657470703125, "loss": 0.4708, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 4.533401966094971, "rewards/margins": 1.724218726158142, "rewards/rejected": 2.8091835975646973, "step": 39090 }, { "epoch": 1.8153117600631412, "grad_norm": 45.011592864990234, "learning_rate": 1.911370072891035e-07, "logits/chosen": -18.087520599365234, "logits/rejected": -16.522146224975586, "logps/chosen": -339.6302795410156, "logps/rejected": -190.70123291015625, "loss": 0.4965, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.913958787918091, "rewards/margins": 2.0414011478424072, "rewards/rejected": 0.8725578188896179, "step": 39100 }, { "epoch": 1.8157760341705744, "grad_norm": 122.6463851928711, "learning_rate": 1.9110915084265748e-07, "logits/chosen": -18.707874298095703, "logits/rejected": -17.910409927368164, "logps/chosen": -441.70672607421875, "logps/rejected": -357.92950439453125, "loss": 0.561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.581333875656128, "rewards/margins": 1.3814756870269775, "rewards/rejected": 2.1998581886291504, "step": 39110 }, { "epoch": 1.8162403082780072, "grad_norm": 20.137052536010742, "learning_rate": 1.9108129439621152e-07, "logits/chosen": -18.386857986450195, "logits/rejected": -18.05870819091797, "logps/chosen": -413.90826416015625, "logps/rejected": -382.9277038574219, "loss": 0.4271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3876068592071533, "rewards/margins": 0.9504884481430054, "rewards/rejected": 2.4371180534362793, "step": 39120 }, { "epoch": 1.8167045823854404, "grad_norm": 83.9598388671875, "learning_rate": 1.9105343794976553e-07, "logits/chosen": -18.13669204711914, "logits/rejected": -18.335250854492188, "logps/chosen": -432.064697265625, "logps/rejected": -414.25848388671875, "loss": 0.9886, "rewards/accuracies": 0.5, "rewards/chosen": 3.188852548599243, "rewards/margins": 0.1240755170583725, "rewards/rejected": 3.06477689743042, "step": 39130 }, { "epoch": 1.8171688564928734, "grad_norm": 126.9395751953125, "learning_rate": 
1.9102558150331957e-07, "logits/chosen": -18.516063690185547, "logits/rejected": -18.207761764526367, "logps/chosen": -316.96075439453125, "logps/rejected": -332.4198913574219, "loss": 0.9907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.340494155883789, "rewards/margins": -0.058309078216552734, "rewards/rejected": 2.398803234100342, "step": 39140 }, { "epoch": 1.8176331306003064, "grad_norm": 148.44786071777344, "learning_rate": 1.9099772505687356e-07, "logits/chosen": -18.18144416809082, "logits/rejected": -18.316699981689453, "logps/chosen": -391.98431396484375, "logps/rejected": -445.19287109375, "loss": 1.1896, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.944652557373047, "rewards/margins": -0.032296013087034225, "rewards/rejected": 2.9769484996795654, "step": 39150 }, { "epoch": 1.8180974047077394, "grad_norm": 75.26374816894531, "learning_rate": 1.9096986861042757e-07, "logits/chosen": -18.32060432434082, "logits/rejected": -17.4620418548584, "logps/chosen": -319.662841796875, "logps/rejected": -239.5416259765625, "loss": 0.4399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.685596466064453, "rewards/margins": 1.371382474899292, "rewards/rejected": 1.3142138719558716, "step": 39160 }, { "epoch": 1.8185616788151724, "grad_norm": 0.29621389508247375, "learning_rate": 1.9094201216398162e-07, "logits/chosen": -19.392925262451172, "logits/rejected": -18.52240562438965, "logps/chosen": -364.51544189453125, "logps/rejected": -248.90853881835938, "loss": 0.6977, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7909412384033203, "rewards/margins": 0.8123273849487305, "rewards/rejected": 1.9786138534545898, "step": 39170 }, { "epoch": 1.8190259529226056, "grad_norm": 21.827289581298828, "learning_rate": 1.9091415571753563e-07, "logits/chosen": -18.69190216064453, "logits/rejected": -18.305450439453125, "logps/chosen": -401.8280334472656, "logps/rejected": -384.25244140625, "loss": 0.9731, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6971123218536377, "rewards/margins": 0.47830134630203247, "rewards/rejected": 3.21881103515625, "step": 39180 }, { "epoch": 1.8194902270300384, "grad_norm": 205.04714965820312, "learning_rate": 1.9088629927108964e-07, "logits/chosen": -19.482913970947266, "logits/rejected": -19.045825958251953, "logps/chosen": -373.7611083984375, "logps/rejected": -405.17572021484375, "loss": 0.8958, "rewards/accuracies": 0.5, "rewards/chosen": 2.9547338485717773, "rewards/margins": 0.5196612477302551, "rewards/rejected": 2.435072660446167, "step": 39190 }, { "epoch": 1.8199545011374716, "grad_norm": 80.48768615722656, "learning_rate": 1.9085844282464366e-07, "logits/chosen": -19.00295639038086, "logits/rejected": -18.140033721923828, "logps/chosen": -374.53759765625, "logps/rejected": -328.17694091796875, "loss": 0.6844, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.888589382171631, "rewards/margins": 0.9792803525924683, "rewards/rejected": 1.909308671951294, "step": 39200 }, { "epoch": 1.8204187752449046, "grad_norm": 201.322509765625, "learning_rate": 1.908305863781977e-07, "logits/chosen": -18.177690505981445, "logits/rejected": -18.00973892211914, "logps/chosen": -377.1564025878906, "logps/rejected": -395.8840026855469, "loss": 1.3777, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.924511432647705, "rewards/margins": -0.481759250164032, "rewards/rejected": 3.4062705039978027, "step": 39210 }, { "epoch": 1.8208830493523376, "grad_norm": 24.172834396362305, "learning_rate": 1.908027299317517e-07, "logits/chosen": -18.92955780029297, "logits/rejected": -18.2626895904541, "logps/chosen": -318.9612731933594, "logps/rejected": -254.1236114501953, "loss": 0.6286, "rewards/accuracies": 0.5, "rewards/chosen": 3.116150379180908, "rewards/margins": 1.1980514526367188, "rewards/rejected": 1.9180988073349, "step": 39220 }, { "epoch": 1.8213473234597708, "grad_norm": 23.670543670654297, 
"learning_rate": 1.907748734853057e-07, "logits/chosen": -18.76932144165039, "logits/rejected": -17.4069766998291, "logps/chosen": -412.27508544921875, "logps/rejected": -260.5580749511719, "loss": 0.4572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3959484100341797, "rewards/margins": 1.2818113565444946, "rewards/rejected": 2.1141369342803955, "step": 39230 }, { "epoch": 1.8218115975672036, "grad_norm": 24.969148635864258, "learning_rate": 1.9074701703885974e-07, "logits/chosen": -18.504335403442383, "logits/rejected": -17.49400520324707, "logps/chosen": -422.1844177246094, "logps/rejected": -295.24859619140625, "loss": 0.5399, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.1622090339660645, "rewards/margins": 1.7954366207122803, "rewards/rejected": 2.366772174835205, "step": 39240 }, { "epoch": 1.8222758716746368, "grad_norm": 97.03784942626953, "learning_rate": 1.9071916059241376e-07, "logits/chosen": -19.128826141357422, "logits/rejected": -18.984424591064453, "logps/chosen": -365.5671081542969, "logps/rejected": -349.1744079589844, "loss": 0.8516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.687354564666748, "rewards/margins": 0.3170117437839508, "rewards/rejected": 2.37034273147583, "step": 39250 }, { "epoch": 1.8227401457820698, "grad_norm": 87.39303588867188, "learning_rate": 1.9069130414596775e-07, "logits/chosen": -17.87640380859375, "logits/rejected": -17.693004608154297, "logps/chosen": -335.9776916503906, "logps/rejected": -327.350341796875, "loss": 0.9575, "rewards/accuracies": 0.5, "rewards/chosen": 2.2832376956939697, "rewards/margins": 0.06352822482585907, "rewards/rejected": 2.2197093963623047, "step": 39260 }, { "epoch": 1.8232044198895028, "grad_norm": 0.24760611355304718, "learning_rate": 1.906634476995218e-07, "logits/chosen": -19.503372192382812, "logits/rejected": -18.288644790649414, "logps/chosen": -387.3381652832031, "logps/rejected": -356.77642822265625, "loss": 0.4493, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8374276161193848, "rewards/margins": 1.6123218536376953, "rewards/rejected": 2.225105047225952, "step": 39270 }, { "epoch": 1.8236686939969358, "grad_norm": 48.220970153808594, "learning_rate": 1.906355912530758e-07, "logits/chosen": -18.993459701538086, "logits/rejected": -18.06732177734375, "logps/chosen": -439.6542053222656, "logps/rejected": -402.99578857421875, "loss": 0.5011, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.32305908203125, "rewards/margins": 0.6572567820549011, "rewards/rejected": 2.665802001953125, "step": 39280 }, { "epoch": 1.8241329681043688, "grad_norm": 71.954345703125, "learning_rate": 1.9060773480662984e-07, "logits/chosen": -18.516937255859375, "logits/rejected": -17.26136016845703, "logps/chosen": -390.61138916015625, "logps/rejected": -319.66192626953125, "loss": 0.4067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.702681303024292, "rewards/margins": 1.6624925136566162, "rewards/rejected": 2.0401885509490967, "step": 39290 }, { "epoch": 1.824597242211802, "grad_norm": 117.87741088867188, "learning_rate": 1.9057987836018383e-07, "logits/chosen": -19.71369743347168, "logits/rejected": -19.415525436401367, "logps/chosen": -451.21807861328125, "logps/rejected": -427.2049255371094, "loss": 1.2501, "rewards/accuracies": 0.5, "rewards/chosen": 3.250262498855591, "rewards/margins": -0.32855015993118286, "rewards/rejected": 3.578812837600708, "step": 39300 }, { "epoch": 1.8250615163192347, "grad_norm": 3.4198129177093506, "learning_rate": 1.9055202191373787e-07, "logits/chosen": -18.983003616333008, "logits/rejected": -17.051788330078125, "logps/chosen": -418.0347595214844, "logps/rejected": -285.85723876953125, "loss": 0.741, "rewards/accuracies": 0.5, "rewards/chosen": 3.2867379188537598, "rewards/margins": 1.46478271484375, "rewards/rejected": 1.8219553232192993, "step": 39310 }, { "epoch": 1.825525790426668, "grad_norm": 61.98788070678711, 
"learning_rate": 1.9052416546729189e-07, "logits/chosen": -19.17017364501953, "logits/rejected": -17.891437530517578, "logps/chosen": -305.29718017578125, "logps/rejected": -208.5202178955078, "loss": 0.3991, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.627199172973633, "rewards/margins": 1.426731824874878, "rewards/rejected": 1.200467586517334, "step": 39320 }, { "epoch": 1.825990064534101, "grad_norm": 87.73917388916016, "learning_rate": 1.904963090208459e-07, "logits/chosen": -19.714035034179688, "logits/rejected": -18.472436904907227, "logps/chosen": -456.8858337402344, "logps/rejected": -402.1859436035156, "loss": 0.4701, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.386476516723633, "rewards/margins": 1.1208031177520752, "rewards/rejected": 2.2656733989715576, "step": 39330 }, { "epoch": 1.826454338641534, "grad_norm": 139.37176513671875, "learning_rate": 1.9046845257439992e-07, "logits/chosen": -19.411540985107422, "logits/rejected": -19.058835983276367, "logps/chosen": -366.3807067871094, "logps/rejected": -346.21099853515625, "loss": 1.0572, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.241788864135742, "rewards/margins": 0.21501651406288147, "rewards/rejected": 3.0267724990844727, "step": 39340 }, { "epoch": 1.826918612748967, "grad_norm": 49.51234436035156, "learning_rate": 1.9044059612795393e-07, "logits/chosen": -18.378299713134766, "logits/rejected": -17.81859016418457, "logps/chosen": -481.46661376953125, "logps/rejected": -422.61810302734375, "loss": 0.5578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.958178997039795, "rewards/margins": 1.0214276313781738, "rewards/rejected": 2.9367518424987793, "step": 39350 }, { "epoch": 1.8273828868564, "grad_norm": 54.61125564575195, "learning_rate": 1.9041273968150797e-07, "logits/chosen": -17.974882125854492, "logits/rejected": -18.286434173583984, "logps/chosen": -304.02587890625, "logps/rejected": -303.28094482421875, "loss": 0.7428, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5742292404174805, "rewards/margins": 0.2800258994102478, "rewards/rejected": 2.294203281402588, "step": 39360 }, { "epoch": 1.8278471609638332, "grad_norm": 49.53981399536133, "learning_rate": 1.9038488323506199e-07, "logits/chosen": -19.472675323486328, "logits/rejected": -19.03504180908203, "logps/chosen": -356.7763671875, "logps/rejected": -333.2137145996094, "loss": 0.9091, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9372143745422363, "rewards/margins": 0.44832080602645874, "rewards/rejected": 2.488893508911133, "step": 39370 }, { "epoch": 1.828311435071266, "grad_norm": 247.8502655029297, "learning_rate": 1.9035702678861597e-07, "logits/chosen": -18.68877601623535, "logits/rejected": -18.53714942932129, "logps/chosen": -431.7005310058594, "logps/rejected": -411.76434326171875, "loss": 1.1162, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5604004859924316, "rewards/margins": 0.2976831793785095, "rewards/rejected": 3.2627174854278564, "step": 39380 }, { "epoch": 1.8287757091786991, "grad_norm": 40.98868179321289, "learning_rate": 1.9032917034217001e-07, "logits/chosen": -18.501163482666016, "logits/rejected": -18.594099044799805, "logps/chosen": -368.45013427734375, "logps/rejected": -327.72711181640625, "loss": 0.7321, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.83914852142334, "rewards/margins": 0.32635003328323364, "rewards/rejected": 2.512798547744751, "step": 39390 }, { "epoch": 1.8292399832861321, "grad_norm": 138.7548065185547, "learning_rate": 1.9030131389572403e-07, "logits/chosen": -18.78481101989746, "logits/rejected": -18.729042053222656, "logps/chosen": -462.50238037109375, "logps/rejected": -435.4442443847656, "loss": 1.007, "rewards/accuracies": 0.5, "rewards/chosen": 2.994506359100342, "rewards/margins": -0.08173210918903351, "rewards/rejected": 3.0762383937835693, "step": 39400 }, { "epoch": 1.8297042573935651, "grad_norm": 
37.20004653930664, "learning_rate": 1.9027345744927802e-07, "logits/chosen": -19.43799591064453, "logits/rejected": -18.225414276123047, "logps/chosen": -496.5616760253906, "logps/rejected": -358.2966003417969, "loss": 0.6546, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.076350212097168, "rewards/margins": 1.3490922451019287, "rewards/rejected": 2.72725772857666, "step": 39410 }, { "epoch": 1.8301685315009983, "grad_norm": 95.82170867919922, "learning_rate": 1.9024560100283206e-07, "logits/chosen": -18.699146270751953, "logits/rejected": -18.375011444091797, "logps/chosen": -462.6796875, "logps/rejected": -374.9416198730469, "loss": 0.8055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2519969940185547, "rewards/margins": 0.3468656837940216, "rewards/rejected": 2.9051313400268555, "step": 39420 }, { "epoch": 1.8306328056084311, "grad_norm": 116.68466186523438, "learning_rate": 1.9021774455638607e-07, "logits/chosen": -20.00275993347168, "logits/rejected": -18.99463653564453, "logps/chosen": -388.61920166015625, "logps/rejected": -329.7653503417969, "loss": 0.743, "rewards/accuracies": 0.5, "rewards/chosen": 3.876359462738037, "rewards/margins": 1.0868334770202637, "rewards/rejected": 2.7895257472991943, "step": 39430 }, { "epoch": 1.8310970797158643, "grad_norm": 47.65212631225586, "learning_rate": 1.9018988810994011e-07, "logits/chosen": -19.56263542175293, "logits/rejected": -18.711193084716797, "logps/chosen": -437.54998779296875, "logps/rejected": -349.6230773925781, "loss": 0.7357, "rewards/accuracies": 0.5, "rewards/chosen": 3.804666519165039, "rewards/margins": 0.33410486578941345, "rewards/rejected": 3.470561981201172, "step": 39440 }, { "epoch": 1.8315613538232973, "grad_norm": 59.33847427368164, "learning_rate": 1.901620316634941e-07, "logits/chosen": -18.0954532623291, "logits/rejected": -17.601490020751953, "logps/chosen": -335.02984619140625, "logps/rejected": -267.5149230957031, "loss": 0.958, "rewards/accuracies": 
0.5, "rewards/chosen": 2.6891534328460693, "rewards/margins": 0.4917258322238922, "rewards/rejected": 2.19742751121521, "step": 39450 }, { "epoch": 1.8320256279307303, "grad_norm": 49.92992401123047, "learning_rate": 1.9013417521704814e-07, "logits/chosen": -18.682743072509766, "logits/rejected": -17.48040008544922, "logps/chosen": -448.29730224609375, "logps/rejected": -303.3645324707031, "loss": 0.5789, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.551339626312256, "rewards/margins": 1.2912613153457642, "rewards/rejected": 2.260077953338623, "step": 39460 }, { "epoch": 1.8324899020381633, "grad_norm": 161.40794372558594, "learning_rate": 1.9010631877060216e-07, "logits/chosen": -17.889389038085938, "logits/rejected": -17.786775588989258, "logps/chosen": -327.60089111328125, "logps/rejected": -318.03997802734375, "loss": 1.1682, "rewards/accuracies": 0.5, "rewards/chosen": 2.762633800506592, "rewards/margins": 0.07072510570287704, "rewards/rejected": 2.691908359527588, "step": 39470 }, { "epoch": 1.8329541761455963, "grad_norm": 85.85592651367188, "learning_rate": 1.900784623241562e-07, "logits/chosen": -18.5839786529541, "logits/rejected": -18.155616760253906, "logps/chosen": -422.032470703125, "logps/rejected": -375.44390869140625, "loss": 0.9094, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1482527256011963, "rewards/margins": -0.03722777217626572, "rewards/rejected": 3.1854801177978516, "step": 39480 }, { "epoch": 1.8334184502530295, "grad_norm": 0.3989824652671814, "learning_rate": 1.9005060587771019e-07, "logits/chosen": -18.974164962768555, "logits/rejected": -18.1922607421875, "logps/chosen": -535.6861572265625, "logps/rejected": -366.54486083984375, "loss": 0.949, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8292877674102783, "rewards/margins": 1.0944905281066895, "rewards/rejected": 2.734797716140747, "step": 39490 }, { "epoch": 1.8338827243604623, "grad_norm": 61.2996826171875, "learning_rate": 
1.900227494312642e-07, "logits/chosen": -18.93817138671875, "logits/rejected": -19.362125396728516, "logps/chosen": -397.68438720703125, "logps/rejected": -351.30224609375, "loss": 0.6239, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.274204969406128, "rewards/margins": 0.46680861711502075, "rewards/rejected": 2.807396173477173, "step": 39500 }, { "epoch": 1.8343469984678955, "grad_norm": 0.6216461658477783, "learning_rate": 1.8999489298481824e-07, "logits/chosen": -18.942489624023438, "logits/rejected": -18.097518920898438, "logps/chosen": -377.49713134765625, "logps/rejected": -339.5108947753906, "loss": 0.5598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8815460205078125, "rewards/margins": 1.4634729623794556, "rewards/rejected": 2.4180731773376465, "step": 39510 }, { "epoch": 1.8348112725753285, "grad_norm": 59.904964447021484, "learning_rate": 1.8996703653837226e-07, "logits/chosen": -19.546367645263672, "logits/rejected": -18.065494537353516, "logps/chosen": -435.00201416015625, "logps/rejected": -307.50018310546875, "loss": 0.4426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.084442615509033, "rewards/margins": 1.7960401773452759, "rewards/rejected": 2.288402557373047, "step": 39520 }, { "epoch": 1.8352755466827615, "grad_norm": 238.41796875, "learning_rate": 1.8993918009192624e-07, "logits/chosen": -19.021718978881836, "logits/rejected": -18.271831512451172, "logps/chosen": -454.1153869628906, "logps/rejected": -327.8268737792969, "loss": 0.5104, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.489266872406006, "rewards/margins": 1.7549257278442383, "rewards/rejected": 2.734341621398926, "step": 39530 }, { "epoch": 1.8357398207901945, "grad_norm": 145.94651794433594, "learning_rate": 1.8991132364548029e-07, "logits/chosen": -18.809600830078125, "logits/rejected": -18.7446346282959, "logps/chosen": -466.41961669921875, "logps/rejected": -422.0404357910156, "loss": 1.3671, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 3.0189902782440186, "rewards/margins": -0.011922359466552734, "rewards/rejected": 3.0309126377105713, "step": 39540 }, { "epoch": 1.8362040948976275, "grad_norm": 51.20798873901367, "learning_rate": 1.898834671990343e-07, "logits/chosen": -18.684406280517578, "logits/rejected": -18.92970085144043, "logps/chosen": -427.6517639160156, "logps/rejected": -416.9615173339844, "loss": 0.8777, "rewards/accuracies": 0.5, "rewards/chosen": 3.3437094688415527, "rewards/margins": 0.10625112056732178, "rewards/rejected": 3.2374587059020996, "step": 39550 }, { "epoch": 1.8366683690050607, "grad_norm": 44.80201721191406, "learning_rate": 1.8985561075258834e-07, "logits/chosen": -19.03663444519043, "logits/rejected": -19.370481491088867, "logps/chosen": -322.01519775390625, "logps/rejected": -315.4110107421875, "loss": 0.8015, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3249073028564453, "rewards/margins": 1.0301014184951782, "rewards/rejected": 2.2948060035705566, "step": 39560 }, { "epoch": 1.8371326431124935, "grad_norm": 48.18111038208008, "learning_rate": 1.8982775430614233e-07, "logits/chosen": -18.842620849609375, "logits/rejected": -17.55364418029785, "logps/chosen": -418.4707946777344, "logps/rejected": -274.9587707519531, "loss": 0.2702, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9484825134277344, "rewards/margins": 1.9374094009399414, "rewards/rejected": 2.011073350906372, "step": 39570 }, { "epoch": 1.8375969172199267, "grad_norm": 28.928302764892578, "learning_rate": 1.8979989785969634e-07, "logits/chosen": -19.848600387573242, "logits/rejected": -19.40011215209961, "logps/chosen": -449.33966064453125, "logps/rejected": -386.328125, "loss": 0.4979, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4851818084716797, "rewards/margins": 0.6774402856826782, "rewards/rejected": 2.807741641998291, "step": 39580 }, { "epoch": 1.8380611913273597, "grad_norm": 120.58284759521484, 
"learning_rate": 1.8977204141325038e-07, "logits/chosen": -18.50712776184082, "logits/rejected": -18.16889762878418, "logps/chosen": -442.05572509765625, "logps/rejected": -363.02130126953125, "loss": 0.4687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3063015937805176, "rewards/margins": 1.0158888101577759, "rewards/rejected": 2.290412425994873, "step": 39590 }, { "epoch": 1.8385254654347927, "grad_norm": 43.528221130371094, "learning_rate": 1.8974418496680437e-07, "logits/chosen": -19.227153778076172, "logits/rejected": -18.835573196411133, "logps/chosen": -459.25048828125, "logps/rejected": -394.3724365234375, "loss": 0.5719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7739315032958984, "rewards/margins": 1.2087223529815674, "rewards/rejected": 2.5652098655700684, "step": 39600 }, { "epoch": 1.8389897395422259, "grad_norm": 303.18365478515625, "learning_rate": 1.8971632852035841e-07, "logits/chosen": -17.904691696166992, "logits/rejected": -17.790836334228516, "logps/chosen": -316.9906311035156, "logps/rejected": -334.9410705566406, "loss": 0.9351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2341830730438232, "rewards/margins": 0.20525391399860382, "rewards/rejected": 2.0289292335510254, "step": 39610 }, { "epoch": 1.8394540136496587, "grad_norm": 42.90260314941406, "learning_rate": 1.8968847207391243e-07, "logits/chosen": -19.403345108032227, "logits/rejected": -18.616859436035156, "logps/chosen": -336.67547607421875, "logps/rejected": -203.2880401611328, "loss": 0.5754, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.895664691925049, "rewards/margins": 0.9707425236701965, "rewards/rejected": 1.9249217510223389, "step": 39620 }, { "epoch": 1.8399182877570919, "grad_norm": 54.82060241699219, "learning_rate": 1.8966061562746647e-07, "logits/chosen": -19.296337127685547, "logits/rejected": -18.5468692779541, "logps/chosen": -491.2457580566406, "logps/rejected": -387.26055908203125, "loss": 0.5864, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4429588317871094, "rewards/margins": 0.6670106053352356, "rewards/rejected": 2.7759482860565186, "step": 39630 }, { "epoch": 1.8403825618645249, "grad_norm": 6.540585041046143, "learning_rate": 1.8963275918102046e-07, "logits/chosen": -19.182174682617188, "logits/rejected": -17.828693389892578, "logps/chosen": -503.44940185546875, "logps/rejected": -365.26788330078125, "loss": 0.4668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.13967227935791, "rewards/margins": 1.3103106021881104, "rewards/rejected": 2.829361915588379, "step": 39640 }, { "epoch": 1.8408468359719579, "grad_norm": 92.09905242919922, "learning_rate": 1.8960490273457447e-07, "logits/chosen": -19.567798614501953, "logits/rejected": -19.1060791015625, "logps/chosen": -448.12078857421875, "logps/rejected": -427.3038024902344, "loss": 0.9696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3605294227600098, "rewards/margins": -0.23419849574565887, "rewards/rejected": 3.5947277545928955, "step": 39650 }, { "epoch": 1.8413111100793909, "grad_norm": 193.18701171875, "learning_rate": 1.895770462881285e-07, "logits/chosen": -18.482868194580078, "logits/rejected": -17.853105545043945, "logps/chosen": -394.26458740234375, "logps/rejected": -383.1076354980469, "loss": 0.7252, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.153281211853027, "rewards/margins": 1.1886188983917236, "rewards/rejected": 2.9646620750427246, "step": 39660 }, { "epoch": 1.8417753841868238, "grad_norm": 41.83209991455078, "learning_rate": 1.8954918984168253e-07, "logits/chosen": -18.941875457763672, "logits/rejected": -18.94300079345703, "logps/chosen": -407.27056884765625, "logps/rejected": -466.33721923828125, "loss": 0.7021, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2558140754699707, "rewards/margins": 0.6601563692092896, "rewards/rejected": 2.5956578254699707, "step": 39670 }, { "epoch": 1.842239658294257, 
"grad_norm": 13.653319358825684, "learning_rate": 1.8952133339523652e-07, "logits/chosen": -18.487483978271484, "logits/rejected": -17.150346755981445, "logps/chosen": -386.22314453125, "logps/rejected": -281.62200927734375, "loss": 0.5185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9082305431365967, "rewards/margins": 2.2225241661071777, "rewards/rejected": 1.6857061386108398, "step": 39680 }, { "epoch": 1.8427039324016898, "grad_norm": 23.18818473815918, "learning_rate": 1.8949347694879056e-07, "logits/chosen": -19.05299949645996, "logits/rejected": -18.38248062133789, "logps/chosen": -449.2745056152344, "logps/rejected": -342.77288818359375, "loss": 0.5006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4001681804656982, "rewards/margins": 1.1557129621505737, "rewards/rejected": 2.244454860687256, "step": 39690 }, { "epoch": 1.843168206509123, "grad_norm": 12.922192573547363, "learning_rate": 1.8946562050234457e-07, "logits/chosen": -19.986631393432617, "logits/rejected": -18.942420959472656, "logps/chosen": -474.5802307128906, "logps/rejected": -434.17919921875, "loss": 0.5872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.549874305725098, "rewards/margins": 1.2057993412017822, "rewards/rejected": 3.3440749645233154, "step": 39700 }, { "epoch": 1.843632480616556, "grad_norm": 56.16853332519531, "learning_rate": 1.894377640558986e-07, "logits/chosen": -19.316089630126953, "logits/rejected": -18.339323043823242, "logps/chosen": -400.56658935546875, "logps/rejected": -269.45867919921875, "loss": 0.3571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.525472164154053, "rewards/margins": 1.7549972534179688, "rewards/rejected": 2.770474910736084, "step": 39710 }, { "epoch": 1.844096754723989, "grad_norm": 22.507579803466797, "learning_rate": 1.894099076094526e-07, "logits/chosen": -19.380313873291016, "logits/rejected": -18.100727081298828, "logps/chosen": -415.08111572265625, "logps/rejected": 
-399.9397888183594, "loss": 0.652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1815426349639893, "rewards/margins": 0.7579920887947083, "rewards/rejected": 2.4235503673553467, "step": 39720 }, { "epoch": 1.844561028831422, "grad_norm": 22.08123207092285, "learning_rate": 1.8938205116300664e-07, "logits/chosen": -19.14258575439453, "logits/rejected": -18.40575408935547, "logps/chosen": -484.53326416015625, "logps/rejected": -350.23834228515625, "loss": 0.7142, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.384364366531372, "rewards/margins": 0.9859983325004578, "rewards/rejected": 2.3983662128448486, "step": 39730 }, { "epoch": 1.845025302938855, "grad_norm": 172.81344604492188, "learning_rate": 1.8935419471656066e-07, "logits/chosen": -18.257564544677734, "logits/rejected": -17.972583770751953, "logps/chosen": -301.5707092285156, "logps/rejected": -306.9096984863281, "loss": 1.0398, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9252429008483887, "rewards/margins": 0.3428456485271454, "rewards/rejected": 2.582397699356079, "step": 39740 }, { "epoch": 1.8454895770462882, "grad_norm": 148.5281219482422, "learning_rate": 1.8932633827011467e-07, "logits/chosen": -19.497512817382812, "logits/rejected": -18.531574249267578, "logps/chosen": -410.0152282714844, "logps/rejected": -327.0021667480469, "loss": 0.5786, "rewards/accuracies": 0.5, "rewards/chosen": 4.100857734680176, "rewards/margins": 1.3153648376464844, "rewards/rejected": 2.7854933738708496, "step": 39750 }, { "epoch": 1.845953851153721, "grad_norm": 3.5285396575927734, "learning_rate": 1.8929848182366868e-07, "logits/chosen": -18.888927459716797, "logits/rejected": -17.25071144104004, "logps/chosen": -326.1830139160156, "logps/rejected": -221.0828857421875, "loss": 0.4632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.662789821624756, "rewards/margins": 2.057405948638916, "rewards/rejected": 1.605384111404419, "step": 39760 }, { "epoch": 
1.8464181252611542, "grad_norm": 31.536441802978516, "learning_rate": 1.892706253772227e-07, "logits/chosen": -19.571144104003906, "logits/rejected": -19.39748764038086, "logps/chosen": -494.71441650390625, "logps/rejected": -372.95172119140625, "loss": 0.536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.771843671798706, "rewards/margins": 0.7959343194961548, "rewards/rejected": 2.9759092330932617, "step": 39770 }, { "epoch": 1.8468823993685872, "grad_norm": 30.739145278930664, "learning_rate": 1.8924276893077674e-07, "logits/chosen": -17.953853607177734, "logits/rejected": -18.237703323364258, "logps/chosen": -340.2324523925781, "logps/rejected": -356.063232421875, "loss": 1.4994, "rewards/accuracies": 0.5, "rewards/chosen": 2.217360019683838, "rewards/margins": -0.556979775428772, "rewards/rejected": 2.7743396759033203, "step": 39780 }, { "epoch": 1.8473466734760202, "grad_norm": 1.3546903133392334, "learning_rate": 1.8921491248433073e-07, "logits/chosen": -18.824705123901367, "logits/rejected": -17.721263885498047, "logps/chosen": -350.46270751953125, "logps/rejected": -264.78045654296875, "loss": 0.5296, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3578522205352783, "rewards/margins": 1.4824635982513428, "rewards/rejected": 1.8753888607025146, "step": 39790 }, { "epoch": 1.8478109475834534, "grad_norm": 19.521804809570312, "learning_rate": 1.8918705603788474e-07, "logits/chosen": -19.28227996826172, "logits/rejected": -18.903888702392578, "logps/chosen": -328.19989013671875, "logps/rejected": -307.0797119140625, "loss": 0.557, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.787503719329834, "rewards/margins": 0.7924872636795044, "rewards/rejected": 1.9950164556503296, "step": 39800 }, { "epoch": 1.8482752216908862, "grad_norm": 38.67152786254883, "learning_rate": 1.8915919959143878e-07, "logits/chosen": -18.0776309967041, "logits/rejected": -17.32469940185547, "logps/chosen": -431.3047790527344, "logps/rejected": 
-330.2315979003906, "loss": 0.4509, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.942852735519409, "rewards/margins": 1.3513767719268799, "rewards/rejected": 2.5914759635925293, "step": 39810 }, { "epoch": 1.8487394957983194, "grad_norm": 17.80655860900879, "learning_rate": 1.891313431449928e-07, "logits/chosen": -19.717594146728516, "logits/rejected": -18.36867904663086, "logps/chosen": -465.40997314453125, "logps/rejected": -344.1470031738281, "loss": 0.5661, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.280271530151367, "rewards/margins": 1.6208441257476807, "rewards/rejected": 2.6594271659851074, "step": 39820 }, { "epoch": 1.8492037699057522, "grad_norm": 43.56868362426758, "learning_rate": 1.8910348669854679e-07, "logits/chosen": -18.37632179260254, "logits/rejected": -18.46773338317871, "logps/chosen": -412.24786376953125, "logps/rejected": -375.2185974121094, "loss": 0.9403, "rewards/accuracies": 0.5, "rewards/chosen": 3.263274669647217, "rewards/margins": 0.1825878918170929, "rewards/rejected": 3.0806868076324463, "step": 39830 }, { "epoch": 1.8496680440131854, "grad_norm": 3.8559863567352295, "learning_rate": 1.8907563025210083e-07, "logits/chosen": -19.27216148376465, "logits/rejected": -17.222274780273438, "logps/chosen": -479.36083984375, "logps/rejected": -231.8517608642578, "loss": 0.3771, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.463996410369873, "rewards/margins": 2.272611141204834, "rewards/rejected": 2.1913845539093018, "step": 39840 }, { "epoch": 1.8501323181206184, "grad_norm": 128.24853515625, "learning_rate": 1.8904777380565484e-07, "logits/chosen": -20.224275588989258, "logits/rejected": -19.600400924682617, "logps/chosen": -405.56640625, "logps/rejected": -328.3680114746094, "loss": 0.7673, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.546217679977417, "rewards/margins": 0.6129497289657593, "rewards/rejected": 2.933267831802368, "step": 39850 }, { "epoch": 1.8505965922280514, 
"grad_norm": 79.98927307128906, "learning_rate": 1.8901991735920888e-07, "logits/chosen": -18.42951774597168, "logits/rejected": -17.6140079498291, "logps/chosen": -435.5335388183594, "logps/rejected": -355.51214599609375, "loss": 0.6559, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8298161029815674, "rewards/margins": 1.3399807214736938, "rewards/rejected": 2.489834785461426, "step": 39860 }, { "epoch": 1.8510608663354846, "grad_norm": 18.288496017456055, "learning_rate": 1.8899206091276287e-07, "logits/chosen": -19.983402252197266, "logits/rejected": -19.69398307800293, "logps/chosen": -410.95660400390625, "logps/rejected": -415.96142578125, "loss": 1.0881, "rewards/accuracies": 0.5, "rewards/chosen": 3.0409562587738037, "rewards/margins": 0.08134309947490692, "rewards/rejected": 2.959613561630249, "step": 39870 }, { "epoch": 1.8515251404429174, "grad_norm": 125.14581298828125, "learning_rate": 1.889642044663169e-07, "logits/chosen": -18.6889705657959, "logits/rejected": -18.93790054321289, "logps/chosen": -387.07305908203125, "logps/rejected": -339.2403259277344, "loss": 0.6203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.350532054901123, "rewards/margins": 1.274123191833496, "rewards/rejected": 3.0764095783233643, "step": 39880 }, { "epoch": 1.8519894145503506, "grad_norm": 21.44031524658203, "learning_rate": 1.8893634801987093e-07, "logits/chosen": -19.09535026550293, "logits/rejected": -18.421085357666016, "logps/chosen": -422.516845703125, "logps/rejected": -337.2112121582031, "loss": 0.5516, "rewards/accuracies": 0.5, "rewards/chosen": 3.233165740966797, "rewards/margins": 0.7521246075630188, "rewards/rejected": 2.4810409545898438, "step": 39890 }, { "epoch": 1.8524536886577836, "grad_norm": 168.98135375976562, "learning_rate": 1.8890849157342497e-07, "logits/chosen": -17.4340763092041, "logits/rejected": -18.177837371826172, "logps/chosen": -243.76806640625, "logps/rejected": -278.9018249511719, "loss": 1.4863, 
"rewards/accuracies": 0.5, "rewards/chosen": 1.4822776317596436, "rewards/margins": -0.8223360776901245, "rewards/rejected": 2.3046135902404785, "step": 39900 }, { "epoch": 1.8529179627652166, "grad_norm": 67.69206237792969, "learning_rate": 1.8888063512697896e-07, "logits/chosen": -19.127912521362305, "logits/rejected": -18.389408111572266, "logps/chosen": -295.38848876953125, "logps/rejected": -244.52493286132812, "loss": 0.5647, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.350926160812378, "rewards/margins": 1.48739492893219, "rewards/rejected": 1.863531470298767, "step": 39910 }, { "epoch": 1.8533822368726496, "grad_norm": 235.8607940673828, "learning_rate": 1.8885277868053297e-07, "logits/chosen": -18.691364288330078, "logits/rejected": -18.39130210876465, "logps/chosen": -355.7251892089844, "logps/rejected": -360.1802673339844, "loss": 1.0753, "rewards/accuracies": 0.5, "rewards/chosen": 2.431068181991577, "rewards/margins": 0.14874248206615448, "rewards/rejected": 2.282325506210327, "step": 39920 }, { "epoch": 1.8538465109800826, "grad_norm": 153.33837890625, "learning_rate": 1.88824922234087e-07, "logits/chosen": -18.3131160736084, "logits/rejected": -18.217670440673828, "logps/chosen": -252.78848266601562, "logps/rejected": -236.6627960205078, "loss": 0.7615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7906862497329712, "rewards/margins": 0.4505457282066345, "rewards/rejected": 1.3401405811309814, "step": 39930 }, { "epoch": 1.8543107850875158, "grad_norm": 27.343076705932617, "learning_rate": 1.8879706578764103e-07, "logits/chosen": -18.409421920776367, "logits/rejected": -18.033994674682617, "logps/chosen": -353.92010498046875, "logps/rejected": -349.79052734375, "loss": 0.7672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8978075981140137, "rewards/margins": 0.5266870260238647, "rewards/rejected": 2.3711206912994385, "step": 39940 }, { "epoch": 1.8547750591949486, "grad_norm": 17.669879913330078, 
"learning_rate": 1.8876920934119501e-07, "logits/chosen": -17.842588424682617, "logits/rejected": -18.010717391967773, "logps/chosen": -335.86444091796875, "logps/rejected": -319.1579895019531, "loss": 1.1204, "rewards/accuracies": 0.5, "rewards/chosen": 2.3101541996002197, "rewards/margins": 0.08815126121044159, "rewards/rejected": 2.2220025062561035, "step": 39950 }, { "epoch": 1.8552393333023818, "grad_norm": 80.56781768798828, "learning_rate": 1.8874135289474905e-07, "logits/chosen": -18.888883590698242, "logits/rejected": -17.671916961669922, "logps/chosen": -370.78546142578125, "logps/rejected": -236.6035614013672, "loss": 0.4437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2662272453308105, "rewards/margins": 1.419668436050415, "rewards/rejected": 1.846558928489685, "step": 39960 }, { "epoch": 1.8557036074098148, "grad_norm": 6.475620746612549, "learning_rate": 1.8871349644830307e-07, "logits/chosen": -19.413372039794922, "logits/rejected": -19.43103790283203, "logps/chosen": -385.10675048828125, "logps/rejected": -415.40716552734375, "loss": 1.3894, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0174596309661865, "rewards/margins": -0.13134267926216125, "rewards/rejected": 3.1488025188446045, "step": 39970 }, { "epoch": 1.8561678815172478, "grad_norm": 68.2285385131836, "learning_rate": 1.8868564000185708e-07, "logits/chosen": -18.254728317260742, "logits/rejected": -18.549938201904297, "logps/chosen": -292.8375244140625, "logps/rejected": -366.92877197265625, "loss": 0.7117, "rewards/accuracies": 0.5, "rewards/chosen": 3.2352893352508545, "rewards/margins": 0.6072342395782471, "rewards/rejected": 2.6280550956726074, "step": 39980 }, { "epoch": 1.8566321556246808, "grad_norm": 67.37372589111328, "learning_rate": 1.886577835554111e-07, "logits/chosen": -18.80596923828125, "logits/rejected": -18.519149780273438, "logps/chosen": -392.4194030761719, "logps/rejected": -361.7798767089844, "loss": 0.8077, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.9043025970458984, "rewards/margins": 0.11415471881628036, "rewards/rejected": 3.790147304534912, "step": 39990 }, { "epoch": 1.8570964297321138, "grad_norm": 1.0168023109436035, "learning_rate": 1.886299271089651e-07, "logits/chosen": -18.497119903564453, "logits/rejected": -18.229305267333984, "logps/chosen": -356.07965087890625, "logps/rejected": -261.39306640625, "loss": 1.4134, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6211295127868652, "rewards/margins": -0.029340147972106934, "rewards/rejected": 2.6504695415496826, "step": 40000 }, { "epoch": 1.857560703839547, "grad_norm": 86.74239349365234, "learning_rate": 1.8860207066251915e-07, "logits/chosen": -18.864803314208984, "logits/rejected": -18.922080993652344, "logps/chosen": -435.79608154296875, "logps/rejected": -450.22845458984375, "loss": 0.9168, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.836721897125244, "rewards/margins": 0.2152681052684784, "rewards/rejected": 3.6214535236358643, "step": 40010 }, { "epoch": 1.8580249779469797, "grad_norm": 38.185142517089844, "learning_rate": 1.8857421421607314e-07, "logits/chosen": -19.76387596130371, "logits/rejected": -18.914228439331055, "logps/chosen": -359.13641357421875, "logps/rejected": -306.18682861328125, "loss": 0.4809, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.872103214263916, "rewards/margins": 0.8523938059806824, "rewards/rejected": 2.019709348678589, "step": 40020 }, { "epoch": 1.858489252054413, "grad_norm": 18.83700180053711, "learning_rate": 1.8854635776962718e-07, "logits/chosen": -19.287656784057617, "logits/rejected": -17.684417724609375, "logps/chosen": -467.70892333984375, "logps/rejected": -336.44110107421875, "loss": 0.3468, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5616183280944824, "rewards/margins": 1.2491909265518188, "rewards/rejected": 2.312427520751953, "step": 40030 }, { "epoch": 1.858953526161846, "grad_norm": 
39.29483413696289, "learning_rate": 1.885185013231812e-07, "logits/chosen": -19.42966079711914, "logits/rejected": -18.803333282470703, "logps/chosen": -376.82415771484375, "logps/rejected": -400.6826171875, "loss": 0.7713, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.089994430541992, "rewards/margins": 0.428006112575531, "rewards/rejected": 2.6619884967803955, "step": 40040 }, { "epoch": 1.859417800269279, "grad_norm": 77.1440658569336, "learning_rate": 1.8849064487673524e-07, "logits/chosen": -19.223520278930664, "logits/rejected": -17.90334129333496, "logps/chosen": -498.2267150878906, "logps/rejected": -314.32745361328125, "loss": 0.598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.609875202178955, "rewards/margins": 2.0880985260009766, "rewards/rejected": 2.5217766761779785, "step": 40050 }, { "epoch": 1.8598820743767122, "grad_norm": 87.59400939941406, "learning_rate": 1.8846278843028923e-07, "logits/chosen": -19.168628692626953, "logits/rejected": -18.886478424072266, "logps/chosen": -390.15191650390625, "logps/rejected": -380.42120361328125, "loss": 0.8981, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7879462242126465, "rewards/margins": 0.19088074564933777, "rewards/rejected": 3.5970661640167236, "step": 40060 }, { "epoch": 1.860346348484145, "grad_norm": 0.5365516543388367, "learning_rate": 1.8843493198384324e-07, "logits/chosen": -19.00019073486328, "logits/rejected": -17.835342407226562, "logps/chosen": -401.31427001953125, "logps/rejected": -271.68609619140625, "loss": 0.7278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5985920429229736, "rewards/margins": 1.4767708778381348, "rewards/rejected": 2.1218209266662598, "step": 40070 }, { "epoch": 1.8608106225915781, "grad_norm": 95.53364562988281, "learning_rate": 1.8840707553739728e-07, "logits/chosen": -19.03980827331543, "logits/rejected": -19.0245361328125, "logps/chosen": -403.0358581542969, "logps/rejected": -376.66448974609375, 
"loss": 0.4082, "rewards/accuracies": 1.0, "rewards/chosen": 2.7760567665100098, "rewards/margins": 0.8696099519729614, "rewards/rejected": 1.9064466953277588, "step": 40080 }, { "epoch": 1.8612748966990111, "grad_norm": 138.30245971679688, "learning_rate": 1.883792190909513e-07, "logits/chosen": -19.876453399658203, "logits/rejected": -19.019725799560547, "logps/chosen": -438.96270751953125, "logps/rejected": -393.97125244140625, "loss": 0.4892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.789903163909912, "rewards/margins": 0.9438154101371765, "rewards/rejected": 2.846087694168091, "step": 40090 }, { "epoch": 1.8617391708064441, "grad_norm": 66.598876953125, "learning_rate": 1.8835136264450528e-07, "logits/chosen": -18.62956428527832, "logits/rejected": -17.52822494506836, "logps/chosen": -361.8518981933594, "logps/rejected": -291.05303955078125, "loss": 0.3241, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9579765796661377, "rewards/margins": 1.9943149089813232, "rewards/rejected": 1.963661551475525, "step": 40100 }, { "epoch": 1.8622034449138771, "grad_norm": 235.7242431640625, "learning_rate": 1.8832350619805933e-07, "logits/chosen": -18.338211059570312, "logits/rejected": -17.985902786254883, "logps/chosen": -404.8496398925781, "logps/rejected": -401.12567138671875, "loss": 1.261, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0444998741149902, "rewards/margins": 0.026537656784057617, "rewards/rejected": 3.0179622173309326, "step": 40110 }, { "epoch": 1.8626677190213101, "grad_norm": 13.347275733947754, "learning_rate": 1.8829564975161334e-07, "logits/chosen": -18.832218170166016, "logits/rejected": -18.691905975341797, "logps/chosen": -321.72845458984375, "logps/rejected": -285.87933349609375, "loss": 0.6792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9198100566864014, "rewards/margins": 0.25605010986328125, "rewards/rejected": 2.663759708404541, "step": 40120 }, { "epoch": 1.8631319931287433, 
"grad_norm": 43.26535415649414, "learning_rate": 1.8826779330516738e-07, "logits/chosen": -19.32338523864746, "logits/rejected": -19.3516902923584, "logps/chosen": -305.8011779785156, "logps/rejected": -321.5707702636719, "loss": 0.6249, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0046255588531494, "rewards/margins": 0.6578091979026794, "rewards/rejected": 2.3468167781829834, "step": 40130 }, { "epoch": 1.863596267236176, "grad_norm": 226.6222686767578, "learning_rate": 1.8823993685872137e-07, "logits/chosen": -18.823606491088867, "logits/rejected": -18.573986053466797, "logps/chosen": -433.5782775878906, "logps/rejected": -424.8605041503906, "loss": 0.9813, "rewards/accuracies": 0.5, "rewards/chosen": 3.2831597328186035, "rewards/margins": 0.1579265594482422, "rewards/rejected": 3.1252331733703613, "step": 40140 }, { "epoch": 1.8640605413436093, "grad_norm": 40.15731430053711, "learning_rate": 1.882120804122754e-07, "logits/chosen": -18.672252655029297, "logits/rejected": -18.173076629638672, "logps/chosen": -412.35498046875, "logps/rejected": -351.4134216308594, "loss": 0.6787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6560471057891846, "rewards/margins": 1.1318689584732056, "rewards/rejected": 2.5241780281066895, "step": 40150 }, { "epoch": 1.8645248154510423, "grad_norm": 30.56926918029785, "learning_rate": 1.8818422396582943e-07, "logits/chosen": -18.241594314575195, "logits/rejected": -17.94004249572754, "logps/chosen": -259.5179138183594, "logps/rejected": -233.11587524414062, "loss": 0.8621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7687362432479858, "rewards/margins": 0.4668942987918854, "rewards/rejected": 1.3018418550491333, "step": 40160 }, { "epoch": 1.8649890895584753, "grad_norm": 28.52204704284668, "learning_rate": 1.8815636751938344e-07, "logits/chosen": -19.740192413330078, "logits/rejected": -19.500368118286133, "logps/chosen": -419.0057678222656, "logps/rejected": -409.0560607910156, 
"loss": 0.4778, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.0554728507995605, "rewards/margins": 0.7226462364196777, "rewards/rejected": 3.332826614379883, "step": 40170 }, { "epoch": 1.8654533636659083, "grad_norm": 140.91983032226562, "learning_rate": 1.8812851107293745e-07, "logits/chosen": -18.941442489624023, "logits/rejected": -17.98546600341797, "logps/chosen": -304.24066162109375, "logps/rejected": -231.1936492919922, "loss": 0.5818, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1213455200195312, "rewards/margins": 0.8011897802352905, "rewards/rejected": 2.320155620574951, "step": 40180 }, { "epoch": 1.8659176377733413, "grad_norm": 21.743839263916016, "learning_rate": 1.8810065462649147e-07, "logits/chosen": -18.211116790771484, "logits/rejected": -18.63542366027832, "logps/chosen": -333.0195007324219, "logps/rejected": -325.89849853515625, "loss": 0.9884, "rewards/accuracies": 0.5, "rewards/chosen": 3.1391119956970215, "rewards/margins": 0.09697167575359344, "rewards/rejected": 3.042140245437622, "step": 40190 }, { "epoch": 1.8663819118807745, "grad_norm": 168.9310760498047, "learning_rate": 1.880727981800455e-07, "logits/chosen": -18.92348289489746, "logits/rejected": -18.33795738220215, "logps/chosen": -313.99859619140625, "logps/rejected": -290.49365234375, "loss": 0.7653, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.801281213760376, "rewards/margins": 0.38004204630851746, "rewards/rejected": 2.4212393760681152, "step": 40200 }, { "epoch": 1.8668461859882073, "grad_norm": 23.150348663330078, "learning_rate": 1.880449417335995e-07, "logits/chosen": -18.465042114257812, "logits/rejected": -17.734365463256836, "logps/chosen": -299.6211853027344, "logps/rejected": -220.41958618164062, "loss": 0.6796, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9317076206207275, "rewards/margins": 1.280776023864746, "rewards/rejected": 1.6509315967559814, "step": 40210 }, { "epoch": 1.8673104600956405, 
"grad_norm": 65.61898040771484, "learning_rate": 1.880170852871535e-07, "logits/chosen": -19.60061264038086, "logits/rejected": -18.958181381225586, "logps/chosen": -304.28118896484375, "logps/rejected": -246.66860961914062, "loss": 0.4026, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7614078521728516, "rewards/margins": 1.4465925693511963, "rewards/rejected": 1.3148154020309448, "step": 40220 }, { "epoch": 1.8677747342030735, "grad_norm": 81.29822540283203, "learning_rate": 1.8798922884070755e-07, "logits/chosen": -18.133989334106445, "logits/rejected": -17.04634666442871, "logps/chosen": -409.88909912109375, "logps/rejected": -244.88818359375, "loss": 0.5517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.087944746017456, "rewards/margins": 1.3519494533538818, "rewards/rejected": 1.7359952926635742, "step": 40230 }, { "epoch": 1.8682390083105065, "grad_norm": 70.63143920898438, "learning_rate": 1.8796137239426157e-07, "logits/chosen": -19.592538833618164, "logits/rejected": -18.446565628051758, "logps/chosen": -469.24456787109375, "logps/rejected": -256.970703125, "loss": 0.3425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.200429916381836, "rewards/margins": 1.952012300491333, "rewards/rejected": 2.2484171390533447, "step": 40240 }, { "epoch": 1.8687032824179397, "grad_norm": 48.711429595947266, "learning_rate": 1.8793351594781556e-07, "logits/chosen": -18.541507720947266, "logits/rejected": -18.574569702148438, "logps/chosen": -308.473876953125, "logps/rejected": -334.1814880371094, "loss": 0.8574, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.143219470977783, "rewards/margins": -0.06866950541734695, "rewards/rejected": 2.2118890285491943, "step": 40250 }, { "epoch": 1.8691675565253725, "grad_norm": 236.48001098632812, "learning_rate": 1.879056595013696e-07, "logits/chosen": -18.118515014648438, "logits/rejected": -18.710830688476562, "logps/chosen": -305.45477294921875, "logps/rejected": 
-337.9212951660156, "loss": 1.728, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9309093952178955, "rewards/margins": -0.46704596281051636, "rewards/rejected": 3.3979554176330566, "step": 40260 }, { "epoch": 1.8696318306328057, "grad_norm": 56.50279235839844, "learning_rate": 1.878778030549236e-07, "logits/chosen": -19.7674503326416, "logits/rejected": -19.759078979492188, "logps/chosen": -521.4244384765625, "logps/rejected": -435.05712890625, "loss": 0.6931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9438891410827637, "rewards/margins": 0.19572050869464874, "rewards/rejected": 3.748168468475342, "step": 40270 }, { "epoch": 1.8700961047402387, "grad_norm": 102.9352798461914, "learning_rate": 1.8784994660847765e-07, "logits/chosen": -18.487276077270508, "logits/rejected": -19.404743194580078, "logps/chosen": -318.62908935546875, "logps/rejected": -388.47576904296875, "loss": 1.1945, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.033271312713623, "rewards/margins": -0.7122650146484375, "rewards/rejected": 2.7455363273620605, "step": 40280 }, { "epoch": 1.8705603788476717, "grad_norm": 9.488070487976074, "learning_rate": 1.8782209016203164e-07, "logits/chosen": -19.217233657836914, "logits/rejected": -18.458833694458008, "logps/chosen": -464.6656799316406, "logps/rejected": -413.6189880371094, "loss": 0.3793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3302154541015625, "rewards/margins": 1.646912932395935, "rewards/rejected": 2.683302640914917, "step": 40290 }, { "epoch": 1.8710246529551047, "grad_norm": 98.94593048095703, "learning_rate": 1.8779423371558568e-07, "logits/chosen": -17.932933807373047, "logits/rejected": -17.90094566345215, "logps/chosen": -390.6144714355469, "logps/rejected": -406.5397644042969, "loss": 1.3235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2255706787109375, "rewards/margins": -0.19531163573265076, "rewards/rejected": 3.4208824634552, "step": 40300 }, { 
"epoch": 1.8714889270625377, "grad_norm": 207.5841064453125, "learning_rate": 1.877663772691397e-07, "logits/chosen": -18.95509910583496, "logits/rejected": -18.299360275268555, "logps/chosen": -396.1485900878906, "logps/rejected": -321.52886962890625, "loss": 0.6646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.596574306488037, "rewards/margins": 0.5096815824508667, "rewards/rejected": 2.086892604827881, "step": 40310 }, { "epoch": 1.8719532011699709, "grad_norm": 37.221675872802734, "learning_rate": 1.8773852082269374e-07, "logits/chosen": -18.341691970825195, "logits/rejected": -17.641000747680664, "logps/chosen": -396.6451416015625, "logps/rejected": -317.1199645996094, "loss": 0.7848, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.210120439529419, "rewards/margins": 0.3548925220966339, "rewards/rejected": 2.8552279472351074, "step": 40320 }, { "epoch": 1.8724174752774037, "grad_norm": 93.7808837890625, "learning_rate": 1.8771066437624773e-07, "logits/chosen": -19.584579467773438, "logits/rejected": -19.343393325805664, "logps/chosen": -442.464111328125, "logps/rejected": -329.15814208984375, "loss": 0.5298, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.226738929748535, "rewards/margins": 1.269547939300537, "rewards/rejected": 2.957191228866577, "step": 40330 }, { "epoch": 1.8728817493848369, "grad_norm": 36.736812591552734, "learning_rate": 1.8768280792980174e-07, "logits/chosen": -18.849058151245117, "logits/rejected": -18.874176025390625, "logps/chosen": -383.05657958984375, "logps/rejected": -445.4535217285156, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": 3.173099994659424, "rewards/margins": 0.4493176341056824, "rewards/rejected": 2.723782539367676, "step": 40340 }, { "epoch": 1.8733460234922699, "grad_norm": 85.35953521728516, "learning_rate": 1.8765495148335578e-07, "logits/chosen": -18.789031982421875, "logits/rejected": -17.840044021606445, "logps/chosen": -343.39312744140625, 
"logps/rejected": -244.03634643554688, "loss": 0.4716, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4127681255340576, "rewards/margins": 1.893670678138733, "rewards/rejected": 1.5190975666046143, "step": 40350 }, { "epoch": 1.8738102975997029, "grad_norm": 0.19016996026039124, "learning_rate": 1.876270950369098e-07, "logits/chosen": -18.740102767944336, "logits/rejected": -17.902915954589844, "logps/chosen": -435.47088623046875, "logps/rejected": -262.69500732421875, "loss": 0.3852, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9618396759033203, "rewards/margins": 2.2215609550476074, "rewards/rejected": 1.7402784824371338, "step": 40360 }, { "epoch": 1.8742745717071359, "grad_norm": 2.5217621326446533, "learning_rate": 1.8759923859046378e-07, "logits/chosen": -18.006803512573242, "logits/rejected": -17.552675247192383, "logps/chosen": -363.00335693359375, "logps/rejected": -256.74224853515625, "loss": 0.7173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.420560359954834, "rewards/margins": 1.7621933221817017, "rewards/rejected": 1.6583671569824219, "step": 40370 }, { "epoch": 1.8747388458145688, "grad_norm": 180.40667724609375, "learning_rate": 1.8757138214401782e-07, "logits/chosen": -20.249492645263672, "logits/rejected": -18.55837631225586, "logps/chosen": -408.51739501953125, "logps/rejected": -288.7601623535156, "loss": 0.4868, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.2920241355896, "rewards/margins": 1.354357361793518, "rewards/rejected": 2.93766713142395, "step": 40380 }, { "epoch": 1.875203119922002, "grad_norm": 113.07003784179688, "learning_rate": 1.8754352569757184e-07, "logits/chosen": -19.418025970458984, "logits/rejected": -19.19831085205078, "logps/chosen": -438.48431396484375, "logps/rejected": -381.66180419921875, "loss": 0.7985, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.203354358673096, "rewards/margins": 0.33135607838630676, "rewards/rejected": 
3.8719983100891113, "step": 40390 }, { "epoch": 1.8756673940294348, "grad_norm": 139.29458618164062, "learning_rate": 1.8751566925112585e-07, "logits/chosen": -19.349376678466797, "logits/rejected": -18.893796920776367, "logps/chosen": -477.6669921875, "logps/rejected": -408.81005859375, "loss": 0.8846, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0973002910614014, "rewards/margins": -0.12882278859615326, "rewards/rejected": 3.226123094558716, "step": 40400 }, { "epoch": 1.876131668136868, "grad_norm": 92.59416961669922, "learning_rate": 1.8748781280467987e-07, "logits/chosen": -18.643526077270508, "logits/rejected": -18.612628936767578, "logps/chosen": -465.39935302734375, "logps/rejected": -427.6500549316406, "loss": 1.1106, "rewards/accuracies": 0.5, "rewards/chosen": 3.78723406791687, "rewards/margins": 0.33078983426094055, "rewards/rejected": 3.456444263458252, "step": 40410 }, { "epoch": 1.876595942244301, "grad_norm": 12.393000602722168, "learning_rate": 1.8745995635823388e-07, "logits/chosen": -18.425676345825195, "logits/rejected": -17.72481346130371, "logps/chosen": -332.8215026855469, "logps/rejected": -243.07522583007812, "loss": 0.8326, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.633373498916626, "rewards/margins": 0.755752444267273, "rewards/rejected": 1.8776209354400635, "step": 40420 }, { "epoch": 1.877060216351734, "grad_norm": 99.49317932128906, "learning_rate": 1.8743209991178792e-07, "logits/chosen": -19.599496841430664, "logits/rejected": -18.7889347076416, "logps/chosen": -455.06903076171875, "logps/rejected": -340.1395263671875, "loss": 0.4095, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.722491502761841, "rewards/margins": 1.6364009380340576, "rewards/rejected": 2.086090564727783, "step": 40430 }, { "epoch": 1.8775244904591673, "grad_norm": 141.32711791992188, "learning_rate": 1.874042434653419e-07, "logits/chosen": -18.649824142456055, "logits/rejected": -17.620080947875977, "logps/chosen": 
-476.7782287597656, "logps/rejected": -293.1888732910156, "loss": 0.4068, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2244186401367188, "rewards/margins": 1.2674773931503296, "rewards/rejected": 1.9569413661956787, "step": 40440 }, { "epoch": 1.8779887645666, "grad_norm": 64.56379699707031, "learning_rate": 1.8737638701889595e-07, "logits/chosen": -17.713233947753906, "logits/rejected": -16.846538543701172, "logps/chosen": -291.5140686035156, "logps/rejected": -182.3571319580078, "loss": 0.53, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.638960123062134, "rewards/margins": 1.0072909593582153, "rewards/rejected": 1.631669282913208, "step": 40450 }, { "epoch": 1.8784530386740332, "grad_norm": 4.378802299499512, "learning_rate": 1.8734853057244997e-07, "logits/chosen": -19.083032608032227, "logits/rejected": -18.418880462646484, "logps/chosen": -364.1754455566406, "logps/rejected": -300.48980712890625, "loss": 0.4399, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.239901304244995, "rewards/margins": 1.3657790422439575, "rewards/rejected": 1.8741222620010376, "step": 40460 }, { "epoch": 1.8789173127814662, "grad_norm": 2.629945755004883, "learning_rate": 1.87320674126004e-07, "logits/chosen": -18.765867233276367, "logits/rejected": -17.319866180419922, "logps/chosen": -412.51373291015625, "logps/rejected": -272.45684814453125, "loss": 0.4518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.603008985519409, "rewards/margins": 2.0242414474487305, "rewards/rejected": 1.5787674188613892, "step": 40470 }, { "epoch": 1.8793815868888992, "grad_norm": 22.966537475585938, "learning_rate": 1.87292817679558e-07, "logits/chosen": -19.300399780273438, "logits/rejected": -18.405147552490234, "logps/chosen": -354.8815002441406, "logps/rejected": -243.1743621826172, "loss": 0.5534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.695330858230591, "rewards/margins": 0.9090040326118469, "rewards/rejected": 
2.7863268852233887, "step": 40480 }, { "epoch": 1.8798458609963322, "grad_norm": 7.085368633270264, "learning_rate": 1.87264961233112e-07, "logits/chosen": -17.782672882080078, "logits/rejected": -18.496816635131836, "logps/chosen": -325.3575744628906, "logps/rejected": -355.082763671875, "loss": 1.4833, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6728274822235107, "rewards/margins": -0.26390063762664795, "rewards/rejected": 2.9367282390594482, "step": 40490 }, { "epoch": 1.8803101351037652, "grad_norm": 11.57714557647705, "learning_rate": 1.8723710478666605e-07, "logits/chosen": -18.03131866455078, "logits/rejected": -17.591398239135742, "logps/chosen": -315.8096008300781, "logps/rejected": -264.8865051269531, "loss": 0.8278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.345633029937744, "rewards/margins": 0.5473083257675171, "rewards/rejected": 1.7983249425888062, "step": 40500 }, { "epoch": 1.8807744092111984, "grad_norm": 33.35601043701172, "learning_rate": 1.8720924834022007e-07, "logits/chosen": -19.07962989807129, "logits/rejected": -17.71261978149414, "logps/chosen": -384.2841796875, "logps/rejected": -225.130126953125, "loss": 0.6552, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.961366653442383, "rewards/margins": 1.9141395092010498, "rewards/rejected": 2.047227382659912, "step": 40510 }, { "epoch": 1.8812386833186312, "grad_norm": 155.3069305419922, "learning_rate": 1.8718139189377405e-07, "logits/chosen": -19.394786834716797, "logits/rejected": -19.056270599365234, "logps/chosen": -474.262939453125, "logps/rejected": -483.6900939941406, "loss": 0.9204, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.153292417526245, "rewards/margins": 0.26717743277549744, "rewards/rejected": 2.8861145973205566, "step": 40520 }, { "epoch": 1.8817029574260644, "grad_norm": 172.90777587890625, "learning_rate": 1.871535354473281e-07, "logits/chosen": -17.529584884643555, "logits/rejected": -17.789363861083984, 
"logps/chosen": -236.2426300048828, "logps/rejected": -234.6934814453125, "loss": 0.945, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3049304485321045, "rewards/margins": -0.06736941635608673, "rewards/rejected": 1.3722997903823853, "step": 40530 }, { "epoch": 1.8821672315334974, "grad_norm": 44.10967254638672, "learning_rate": 1.871256790008821e-07, "logits/chosen": -18.696666717529297, "logits/rejected": -18.50215721130371, "logps/chosen": -474.29425048828125, "logps/rejected": -391.1548767089844, "loss": 0.508, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.852246046066284, "rewards/margins": 0.8445250391960144, "rewards/rejected": 3.007721185684204, "step": 40540 }, { "epoch": 1.8826315056409304, "grad_norm": 41.27034378051758, "learning_rate": 1.8709782255443615e-07, "logits/chosen": -18.51016616821289, "logits/rejected": -18.18825912475586, "logps/chosen": -436.11346435546875, "logps/rejected": -356.67657470703125, "loss": 0.7374, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3792641162872314, "rewards/margins": 1.022268533706665, "rewards/rejected": 2.3569953441619873, "step": 40550 }, { "epoch": 1.8830957797483634, "grad_norm": 94.05055236816406, "learning_rate": 1.8706996610799014e-07, "logits/chosen": -18.266490936279297, "logits/rejected": -18.037817001342773, "logps/chosen": -368.7998046875, "logps/rejected": -303.25531005859375, "loss": 0.9153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.673339366912842, "rewards/margins": 0.7792655229568481, "rewards/rejected": 1.8940738439559937, "step": 40560 }, { "epoch": 1.8835600538557964, "grad_norm": 139.97384643554688, "learning_rate": 1.8704210966154418e-07, "logits/chosen": -18.721057891845703, "logits/rejected": -17.859018325805664, "logps/chosen": -439.4737854003906, "logps/rejected": -341.81488037109375, "loss": 0.5271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.574392795562744, "rewards/margins": 1.4294335842132568, 
"rewards/rejected": 2.1449592113494873, "step": 40570 }, { "epoch": 1.8840243279632296, "grad_norm": 98.91812133789062, "learning_rate": 1.870142532150982e-07, "logits/chosen": -18.65497589111328, "logits/rejected": -18.05903434753418, "logps/chosen": -405.83331298828125, "logps/rejected": -329.00091552734375, "loss": 0.4896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.872288227081299, "rewards/margins": 1.0827229022979736, "rewards/rejected": 2.7895655632019043, "step": 40580 }, { "epoch": 1.8844886020706624, "grad_norm": 10.931561470031738, "learning_rate": 1.8698639676865218e-07, "logits/chosen": -18.833314895629883, "logits/rejected": -18.72426986694336, "logps/chosen": -446.35107421875, "logps/rejected": -412.9104919433594, "loss": 0.4876, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.479719161987305, "rewards/margins": 1.3779289722442627, "rewards/rejected": 3.101789951324463, "step": 40590 }, { "epoch": 1.8849528761780956, "grad_norm": 159.3498992919922, "learning_rate": 1.8695854032220622e-07, "logits/chosen": -18.866260528564453, "logits/rejected": -17.759517669677734, "logps/chosen": -433.34814453125, "logps/rejected": -262.64288330078125, "loss": 0.5734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.434958457946777, "rewards/margins": 2.562324047088623, "rewards/rejected": 1.8726348876953125, "step": 40600 }, { "epoch": 1.8854171502855286, "grad_norm": 69.06317138671875, "learning_rate": 1.8693068387576024e-07, "logits/chosen": -18.50106430053711, "logits/rejected": -18.547550201416016, "logps/chosen": -368.8794250488281, "logps/rejected": -418.4371032714844, "loss": 1.1164, "rewards/accuracies": 0.5, "rewards/chosen": 3.056208610534668, "rewards/margins": -0.3591840863227844, "rewards/rejected": 3.415393114089966, "step": 40610 }, { "epoch": 1.8858814243929616, "grad_norm": 53.12990188598633, "learning_rate": 1.8690282742931428e-07, "logits/chosen": -19.020000457763672, "logits/rejected": 
-17.624792098999023, "logps/chosen": -335.8659973144531, "logps/rejected": -183.97027587890625, "loss": 0.4288, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5228586196899414, "rewards/margins": 1.4233180284500122, "rewards/rejected": 1.0995405912399292, "step": 40620 }, { "epoch": 1.8863456985003948, "grad_norm": 4.600813388824463, "learning_rate": 1.8687497098286827e-07, "logits/chosen": -19.056835174560547, "logits/rejected": -18.154077529907227, "logps/chosen": -313.703369140625, "logps/rejected": -296.2295837402344, "loss": 0.6458, "rewards/accuracies": 0.5, "rewards/chosen": 3.4314589500427246, "rewards/margins": 1.3487175703048706, "rewards/rejected": 2.0827410221099854, "step": 40630 }, { "epoch": 1.8868099726078276, "grad_norm": 266.59942626953125, "learning_rate": 1.8684711453642228e-07, "logits/chosen": -20.01999855041504, "logits/rejected": -18.654760360717773, "logps/chosen": -352.0390319824219, "logps/rejected": -236.0051727294922, "loss": 0.7541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4955859184265137, "rewards/margins": 0.3830099105834961, "rewards/rejected": 2.1125760078430176, "step": 40640 }, { "epoch": 1.8872742467152608, "grad_norm": 68.78002166748047, "learning_rate": 1.8681925808997632e-07, "logits/chosen": -19.2470703125, "logits/rejected": -19.333683013916016, "logps/chosen": -389.8758239746094, "logps/rejected": -338.44989013671875, "loss": 1.0224, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8077287673950195, "rewards/margins": -0.17801223695278168, "rewards/rejected": 2.985741138458252, "step": 40650 }, { "epoch": 1.8877385208226936, "grad_norm": 181.37457275390625, "learning_rate": 1.8679140164353034e-07, "logits/chosen": -18.88553810119629, "logits/rejected": -19.24224281311035, "logps/chosen": -298.56683349609375, "logps/rejected": -281.6139221191406, "loss": 1.4453, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.8661991357803345, "rewards/margins": 
-0.6711040735244751, "rewards/rejected": 2.5373034477233887, "step": 40660 }, { "epoch": 1.8882027949301268, "grad_norm": 247.49815368652344, "learning_rate": 1.8676354519708433e-07, "logits/chosen": -17.78439712524414, "logits/rejected": -17.597009658813477, "logps/chosen": -382.15704345703125, "logps/rejected": -337.8309326171875, "loss": 1.0484, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.388047695159912, "rewards/margins": 0.32554835081100464, "rewards/rejected": 3.062499761581421, "step": 40670 }, { "epoch": 1.8886670690375598, "grad_norm": 12.179445266723633, "learning_rate": 1.8673568875063837e-07, "logits/chosen": -18.05992889404297, "logits/rejected": -17.87398910522461, "logps/chosen": -367.00445556640625, "logps/rejected": -311.24755859375, "loss": 1.1726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.794595241546631, "rewards/margins": 0.16647855937480927, "rewards/rejected": 2.6281168460845947, "step": 40680 }, { "epoch": 1.8891313431449928, "grad_norm": 11.527033805847168, "learning_rate": 1.8670783230419238e-07, "logits/chosen": -18.87279510498047, "logits/rejected": -18.18529510498047, "logps/chosen": -341.2701110839844, "logps/rejected": -301.07904052734375, "loss": 0.3616, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.131362199783325, "rewards/margins": 1.1115522384643555, "rewards/rejected": 2.019810199737549, "step": 40690 }, { "epoch": 1.889595617252426, "grad_norm": 18.463830947875977, "learning_rate": 1.8667997585774642e-07, "logits/chosen": -19.166484832763672, "logits/rejected": -18.952518463134766, "logps/chosen": -451.1136169433594, "logps/rejected": -323.119140625, "loss": 0.6204, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.19299578666687, "rewards/margins": 0.6415039300918579, "rewards/rejected": 2.551492214202881, "step": 40700 }, { "epoch": 1.8900598913598587, "grad_norm": 0.08916175365447998, "learning_rate": 1.866521194113004e-07, "logits/chosen": 
-18.287796020507812, "logits/rejected": -16.741531372070312, "logps/chosen": -365.96063232421875, "logps/rejected": -225.2230682373047, "loss": 0.3423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5510196685791016, "rewards/margins": 2.2597970962524414, "rewards/rejected": 1.2912224531173706, "step": 40710 }, { "epoch": 1.890524165467292, "grad_norm": 24.421537399291992, "learning_rate": 1.8662426296485445e-07, "logits/chosen": -18.521406173706055, "logits/rejected": -17.031892776489258, "logps/chosen": -442.32733154296875, "logps/rejected": -299.6457214355469, "loss": 0.4972, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.046602725982666, "rewards/margins": 1.1216557025909424, "rewards/rejected": 1.9249467849731445, "step": 40720 }, { "epoch": 1.890988439574725, "grad_norm": 2.588752508163452, "learning_rate": 1.8659640651840847e-07, "logits/chosen": -18.70937156677246, "logits/rejected": -17.956195831298828, "logps/chosen": -369.3977355957031, "logps/rejected": -262.2050476074219, "loss": 0.7404, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.420199155807495, "rewards/margins": 1.1513671875, "rewards/rejected": 2.268831729888916, "step": 40730 }, { "epoch": 1.891452713682158, "grad_norm": 92.1758804321289, "learning_rate": 1.865685500719625e-07, "logits/chosen": -17.971561431884766, "logits/rejected": -18.00019645690918, "logps/chosen": -309.6790771484375, "logps/rejected": -284.2537841796875, "loss": 0.7154, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6784093379974365, "rewards/margins": 0.6109021902084351, "rewards/rejected": 2.067507028579712, "step": 40740 }, { "epoch": 1.891916987789591, "grad_norm": 165.28382873535156, "learning_rate": 1.865406936255165e-07, "logits/chosen": -19.022403717041016, "logits/rejected": -18.47115707397461, "logps/chosen": -405.5716857910156, "logps/rejected": -340.8251647949219, "loss": 0.597, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
3.3944060802459717, "rewards/margins": 1.1087335348129272, "rewards/rejected": 2.285672903060913, "step": 40750 }, { "epoch": 1.892381261897024, "grad_norm": 59.157867431640625, "learning_rate": 1.865156228237151e-07, "logits/chosen": -19.210124969482422, "logits/rejected": -17.319791793823242, "logps/chosen": -431.3882751464844, "logps/rejected": -286.68658447265625, "loss": 0.3952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.4339704513549805, "rewards/margins": 1.7803630828857422, "rewards/rejected": 2.65360689163208, "step": 40760 }, { "epoch": 1.8928455360044572, "grad_norm": 5.475057601928711, "learning_rate": 1.8648776637726913e-07, "logits/chosen": -18.74466323852539, "logits/rejected": -17.880598068237305, "logps/chosen": -401.91400146484375, "logps/rejected": -312.12677001953125, "loss": 0.7666, "rewards/accuracies": 0.5, "rewards/chosen": 3.4132282733917236, "rewards/margins": 0.47685450315475464, "rewards/rejected": 2.9363739490509033, "step": 40770 }, { "epoch": 1.89330981011189, "grad_norm": 0.8052262663841248, "learning_rate": 1.8645990993082314e-07, "logits/chosen": -18.49080467224121, "logits/rejected": -17.51833152770996, "logps/chosen": -537.6145629882812, "logps/rejected": -369.5169982910156, "loss": 0.4367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.763805389404297, "rewards/margins": 1.9075229167938232, "rewards/rejected": 2.8562824726104736, "step": 40780 }, { "epoch": 1.8937740842193231, "grad_norm": 63.801856994628906, "learning_rate": 1.8643205348437719e-07, "logits/chosen": -17.947643280029297, "logits/rejected": -17.773542404174805, "logps/chosen": -325.6600036621094, "logps/rejected": -271.7471008300781, "loss": 0.4265, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.477886199951172, "rewards/margins": 0.9083648920059204, "rewards/rejected": 1.5695213079452515, "step": 40790 }, { "epoch": 1.8942383583267561, "grad_norm": 98.72058868408203, "learning_rate": 1.8640419703793117e-07, 
"logits/chosen": -18.56880760192871, "logits/rejected": -17.85489845275879, "logps/chosen": -413.426025390625, "logps/rejected": -372.72735595703125, "loss": 0.8454, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7031409740448, "rewards/margins": 0.6964200735092163, "rewards/rejected": 3.006721019744873, "step": 40800 }, { "epoch": 1.8947026324341891, "grad_norm": 0.5406479835510254, "learning_rate": 1.8637634059148521e-07, "logits/chosen": -18.78255271911621, "logits/rejected": -18.084354400634766, "logps/chosen": -374.7978820800781, "logps/rejected": -270.15948486328125, "loss": 0.7518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5691819190979004, "rewards/margins": 1.169656753540039, "rewards/rejected": 2.3995251655578613, "step": 40810 }, { "epoch": 1.8951669065416221, "grad_norm": 227.60523986816406, "learning_rate": 1.8634848414503923e-07, "logits/chosen": -18.693513870239258, "logits/rejected": -18.437511444091797, "logps/chosen": -394.66619873046875, "logps/rejected": -281.50787353515625, "loss": 0.4334, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.397603988647461, "rewards/margins": 1.8666470050811768, "rewards/rejected": 1.5309569835662842, "step": 40820 }, { "epoch": 1.8956311806490551, "grad_norm": 49.41781234741211, "learning_rate": 1.8632062769859322e-07, "logits/chosen": -19.574398040771484, "logits/rejected": -19.55624008178711, "logps/chosen": -365.55560302734375, "logps/rejected": -347.7469482421875, "loss": 0.717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8775360584259033, "rewards/margins": 0.9188645482063293, "rewards/rejected": 2.9586713314056396, "step": 40830 }, { "epoch": 1.8960954547564883, "grad_norm": 4.266972064971924, "learning_rate": 1.8629277125214726e-07, "logits/chosen": -18.591381072998047, "logits/rejected": -17.73036003112793, "logps/chosen": -420.36883544921875, "logps/rejected": -357.0899353027344, "loss": 0.614, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.7214770317077637, "rewards/margins": 0.6935855746269226, "rewards/rejected": 3.0278916358947754, "step": 40840 }, { "epoch": 1.896559728863921, "grad_norm": 24.245155334472656, "learning_rate": 1.8626491480570127e-07, "logits/chosen": -19.411884307861328, "logits/rejected": -18.511722564697266, "logps/chosen": -448.62872314453125, "logps/rejected": -391.48614501953125, "loss": 0.6942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.158804893493652, "rewards/margins": 1.5488536357879639, "rewards/rejected": 2.6099512577056885, "step": 40850 }, { "epoch": 1.8970240029713543, "grad_norm": 25.782833099365234, "learning_rate": 1.8623705835925531e-07, "logits/chosen": -19.535390853881836, "logits/rejected": -18.802749633789062, "logps/chosen": -376.92144775390625, "logps/rejected": -298.3708190917969, "loss": 0.4249, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.613055467605591, "rewards/margins": 1.352005124092102, "rewards/rejected": 2.2610507011413574, "step": 40860 }, { "epoch": 1.8974882770787873, "grad_norm": 0.04908491298556328, "learning_rate": 1.862092019128093e-07, "logits/chosen": -18.609111785888672, "logits/rejected": -17.646278381347656, "logps/chosen": -438.0281677246094, "logps/rejected": -332.49615478515625, "loss": 0.4971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9835205078125, "rewards/margins": 1.571784496307373, "rewards/rejected": 2.4117355346679688, "step": 40870 }, { "epoch": 1.8979525511862203, "grad_norm": 34.503990173339844, "learning_rate": 1.8618134546636332e-07, "logits/chosen": -19.054468154907227, "logits/rejected": -18.980051040649414, "logps/chosen": -274.587158203125, "logps/rejected": -307.0480651855469, "loss": 1.5484, "rewards/accuracies": 0.5, "rewards/chosen": 1.9879014492034912, "rewards/margins": -0.5980842113494873, "rewards/rejected": 2.5859856605529785, "step": 40880 }, { "epoch": 1.8984168252936535, "grad_norm": 55.54690933227539, "learning_rate": 
1.8615348901991736e-07, "logits/chosen": -18.687467575073242, "logits/rejected": -18.59494400024414, "logps/chosen": -412.51153564453125, "logps/rejected": -358.82696533203125, "loss": 0.9436, "rewards/accuracies": 0.5, "rewards/chosen": 3.3101649284362793, "rewards/margins": 0.4200301170349121, "rewards/rejected": 2.8901352882385254, "step": 40890 }, { "epoch": 1.8988810994010863, "grad_norm": 70.63304138183594, "learning_rate": 1.8612563257347137e-07, "logits/chosen": -18.963428497314453, "logits/rejected": -18.650548934936523, "logps/chosen": -400.74365234375, "logps/rejected": -371.78961181640625, "loss": 1.4248, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2344939708709717, "rewards/margins": -0.2743709087371826, "rewards/rejected": 3.5088648796081543, "step": 40900 }, { "epoch": 1.8993453735085195, "grad_norm": 63.4546012878418, "learning_rate": 1.8609777612702539e-07, "logits/chosen": -17.994159698486328, "logits/rejected": -17.612417221069336, "logps/chosen": -352.78369140625, "logps/rejected": -289.1748352050781, "loss": 1.0329, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6189026832580566, "rewards/margins": 0.35384294390678406, "rewards/rejected": 2.26505970954895, "step": 40910 }, { "epoch": 1.8998096476159525, "grad_norm": 74.73179626464844, "learning_rate": 1.860699196805794e-07, "logits/chosen": -19.68597984313965, "logits/rejected": -18.60130500793457, "logps/chosen": -405.26898193359375, "logps/rejected": -370.1377868652344, "loss": 0.832, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.993358850479126, "rewards/margins": 0.2566149830818176, "rewards/rejected": 2.7367441654205322, "step": 40920 }, { "epoch": 1.9002739217233855, "grad_norm": 45.98179244995117, "learning_rate": 1.8604206323413342e-07, "logits/chosen": -19.04554557800293, "logits/rejected": -18.11541175842285, "logps/chosen": -360.6940002441406, "logps/rejected": -335.6144714355469, "loss": 0.3798, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 2.9150168895721436, "rewards/margins": 1.3289321660995483, "rewards/rejected": 1.5860844850540161, "step": 40930 }, { "epoch": 1.9007381958308185, "grad_norm": 45.22389602661133, "learning_rate": 1.8601420678768746e-07, "logits/chosen": -18.922876358032227, "logits/rejected": -18.501712799072266, "logps/chosen": -468.7378845214844, "logps/rejected": -369.5123596191406, "loss": 0.3219, "rewards/accuracies": 1.0, "rewards/chosen": 4.21873140335083, "rewards/margins": 1.4166754484176636, "rewards/rejected": 2.802055835723877, "step": 40940 }, { "epoch": 1.9012024699382515, "grad_norm": 105.80769348144531, "learning_rate": 1.8598635034124144e-07, "logits/chosen": -17.999231338500977, "logits/rejected": -17.176013946533203, "logps/chosen": -476.35137939453125, "logps/rejected": -341.17376708984375, "loss": 0.4471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3697781562805176, "rewards/margins": 1.1559944152832031, "rewards/rejected": 2.2137835025787354, "step": 40950 }, { "epoch": 1.9016667440456847, "grad_norm": 87.13106536865234, "learning_rate": 1.8595849389479549e-07, "logits/chosen": -18.27078628540039, "logits/rejected": -17.445011138916016, "logps/chosen": -359.7108459472656, "logps/rejected": -269.1257019042969, "loss": 0.625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4470651149749756, "rewards/margins": 0.8458563089370728, "rewards/rejected": 1.6012089252471924, "step": 40960 }, { "epoch": 1.9021310181531175, "grad_norm": 121.34966278076172, "learning_rate": 1.859306374483495e-07, "logits/chosen": -19.667909622192383, "logits/rejected": -18.74332618713379, "logps/chosen": -436.227294921875, "logps/rejected": -386.518798828125, "loss": 0.6535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2317798137664795, "rewards/margins": 0.7828448414802551, "rewards/rejected": 2.448935031890869, "step": 40970 }, { "epoch": 1.9025952922605507, "grad_norm": 45.13667297363281, 
"learning_rate": 1.8590278100190354e-07, "logits/chosen": -19.323383331298828, "logits/rejected": -19.120967864990234, "logps/chosen": -394.38018798828125, "logps/rejected": -405.8741760253906, "loss": 1.0141, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.767249345779419, "rewards/margins": -0.4095117449760437, "rewards/rejected": 4.176761150360107, "step": 40980 }, { "epoch": 1.9030595663679837, "grad_norm": 45.224578857421875, "learning_rate": 1.8587492455545753e-07, "logits/chosen": -18.85517120361328, "logits/rejected": -18.626888275146484, "logps/chosen": -351.60980224609375, "logps/rejected": -252.71240234375, "loss": 0.758, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.854982376098633, "rewards/margins": 0.7952994108200073, "rewards/rejected": 2.059682846069336, "step": 40990 }, { "epoch": 1.9035238404754167, "grad_norm": 27.595937728881836, "learning_rate": 1.8584706810901154e-07, "logits/chosen": -18.274005889892578, "logits/rejected": -17.57792854309082, "logps/chosen": -411.5755310058594, "logps/rejected": -239.86001586914062, "loss": 0.8377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0921270847320557, "rewards/margins": 0.8601906895637512, "rewards/rejected": 2.231936454772949, "step": 41000 }, { "epoch": 1.9039881145828497, "grad_norm": 117.09530639648438, "learning_rate": 1.8581921166256558e-07, "logits/chosen": -18.3426513671875, "logits/rejected": -18.532325744628906, "logps/chosen": -404.445068359375, "logps/rejected": -401.0959167480469, "loss": 1.0328, "rewards/accuracies": 0.5, "rewards/chosen": 2.943396806716919, "rewards/margins": -0.23192426562309265, "rewards/rejected": 3.175321340560913, "step": 41010 }, { "epoch": 1.9044523886902827, "grad_norm": 29.66378402709961, "learning_rate": 1.857913552161196e-07, "logits/chosen": -19.16902732849121, "logits/rejected": -17.64405059814453, "logps/chosen": -373.80902099609375, "logps/rejected": -232.1385955810547, "loss": 0.2924, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 3.435572385787964, "rewards/margins": 2.144482374191284, "rewards/rejected": 1.2910900115966797, "step": 41020 }, { "epoch": 1.9049166627977159, "grad_norm": 170.67945861816406, "learning_rate": 1.857634987696736e-07, "logits/chosen": -18.238819122314453, "logits/rejected": -17.972400665283203, "logps/chosen": -384.9716796875, "logps/rejected": -308.56988525390625, "loss": 0.6365, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5288166999816895, "rewards/margins": 0.9960119128227234, "rewards/rejected": 2.532804489135742, "step": 41030 }, { "epoch": 1.9053809369051486, "grad_norm": 8.747537612915039, "learning_rate": 1.8573564232322763e-07, "logits/chosen": -19.773784637451172, "logits/rejected": -18.58216667175293, "logps/chosen": -465.322998046875, "logps/rejected": -330.08380126953125, "loss": 0.6592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.310117244720459, "rewards/margins": 1.2951024770736694, "rewards/rejected": 3.015015125274658, "step": 41040 }, { "epoch": 1.9058452110125819, "grad_norm": 238.39112854003906, "learning_rate": 1.8570778587678164e-07, "logits/chosen": -18.629940032958984, "logits/rejected": -18.725358963012695, "logps/chosen": -411.0575256347656, "logps/rejected": -480.364501953125, "loss": 0.8591, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7807624340057373, "rewards/margins": 0.0882808193564415, "rewards/rejected": 3.692481279373169, "step": 41050 }, { "epoch": 1.9063094851200149, "grad_norm": 11.412210464477539, "learning_rate": 1.8567992943033566e-07, "logits/chosen": -18.400875091552734, "logits/rejected": -18.059558868408203, "logps/chosen": -312.016357421875, "logps/rejected": -286.3831481933594, "loss": 0.7234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.590521812438965, "rewards/margins": 0.5327881574630737, "rewards/rejected": 2.0577335357666016, "step": 41060 }, { "epoch": 1.9067737592274479, "grad_norm": 57.8218879699707, 
"learning_rate": 1.8565207298388967e-07, "logits/chosen": -18.716629028320312, "logits/rejected": -18.080768585205078, "logps/chosen": -464.4345703125, "logps/rejected": -373.22216796875, "loss": 0.4269, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7678756713867188, "rewards/margins": 1.1933107376098633, "rewards/rejected": 2.5745649337768555, "step": 41070 }, { "epoch": 1.907238033334881, "grad_norm": 259.3511657714844, "learning_rate": 1.856242165374437e-07, "logits/chosen": -20.188459396362305, "logits/rejected": -19.618667602539062, "logps/chosen": -432.99395751953125, "logps/rejected": -369.09283447265625, "loss": 0.7471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.365450382232666, "rewards/margins": 0.636433482170105, "rewards/rejected": 2.7290167808532715, "step": 41080 }, { "epoch": 1.9077023074423138, "grad_norm": 56.955711364746094, "learning_rate": 1.8559636009099773e-07, "logits/chosen": -18.92917251586914, "logits/rejected": -18.408878326416016, "logps/chosen": -381.2734680175781, "logps/rejected": -274.2564697265625, "loss": 0.3784, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4105403423309326, "rewards/margins": 1.1490294933319092, "rewards/rejected": 2.2615108489990234, "step": 41090 }, { "epoch": 1.908166581549747, "grad_norm": 4.746201038360596, "learning_rate": 1.8556850364455172e-07, "logits/chosen": -19.071701049804688, "logits/rejected": -18.810884475708008, "logps/chosen": -318.0560607910156, "logps/rejected": -294.7336730957031, "loss": 0.7249, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.045738458633423, "rewards/margins": 0.48457878828048706, "rewards/rejected": 2.561159610748291, "step": 41100 }, { "epoch": 1.90863085565718, "grad_norm": 2.5517659187316895, "learning_rate": 1.8554064719810576e-07, "logits/chosen": -20.125137329101562, "logits/rejected": -18.072708129882812, "logps/chosen": -435.43572998046875, "logps/rejected": -258.44281005859375, "loss": 0.3475, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.953524351119995, "rewards/margins": 2.2491934299468994, "rewards/rejected": 1.7043306827545166, "step": 41110 }, { "epoch": 1.909095129764613, "grad_norm": 109.80680084228516, "learning_rate": 1.8551279075165977e-07, "logits/chosen": -20.055700302124023, "logits/rejected": -19.44917106628418, "logps/chosen": -489.97723388671875, "logps/rejected": -434.5221252441406, "loss": 0.9377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.730478763580322, "rewards/margins": 0.3250119090080261, "rewards/rejected": 4.4054670333862305, "step": 41120 }, { "epoch": 1.909559403872046, "grad_norm": 13.123698234558105, "learning_rate": 1.854849343052138e-07, "logits/chosen": -18.986133575439453, "logits/rejected": -18.312772750854492, "logps/chosen": -302.38885498046875, "logps/rejected": -236.6441650390625, "loss": 0.5715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4405317306518555, "rewards/margins": 0.8665315508842468, "rewards/rejected": 1.5740001201629639, "step": 41130 }, { "epoch": 1.910023677979479, "grad_norm": 7.062644004821777, "learning_rate": 1.854570778587678e-07, "logits/chosen": -18.013286590576172, "logits/rejected": -17.269893646240234, "logps/chosen": -428.011962890625, "logps/rejected": -311.8301696777344, "loss": 0.7896, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9222512245178223, "rewards/margins": 0.7471925616264343, "rewards/rejected": 2.175058603286743, "step": 41140 }, { "epoch": 1.9104879520869122, "grad_norm": 28.93465805053711, "learning_rate": 1.8542922141232181e-07, "logits/chosen": -19.3480167388916, "logits/rejected": -17.735376358032227, "logps/chosen": -352.2393493652344, "logps/rejected": -263.4934387207031, "loss": 0.2765, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.187497138977051, "rewards/margins": 1.7433849573135376, "rewards/rejected": 1.4441121816635132, "step": 41150 }, { "epoch": 1.910952226194345, "grad_norm": 
21.453922271728516, "learning_rate": 1.8540136496587586e-07, "logits/chosen": -18.612548828125, "logits/rejected": -17.39694595336914, "logps/chosen": -399.94830322265625, "logps/rejected": -262.12176513671875, "loss": 0.4031, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.984649419784546, "rewards/margins": 1.337950348854065, "rewards/rejected": 1.64669930934906, "step": 41160 }, { "epoch": 1.9114165003017782, "grad_norm": 0.8073902726173401, "learning_rate": 1.8537350851942987e-07, "logits/chosen": -19.24392318725586, "logits/rejected": -18.858192443847656, "logps/chosen": -334.9772644042969, "logps/rejected": -269.09405517578125, "loss": 0.6023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.056382179260254, "rewards/margins": 1.642572045326233, "rewards/rejected": 2.4138102531433105, "step": 41170 }, { "epoch": 1.9118807744092112, "grad_norm": 18.992477416992188, "learning_rate": 1.8534565207298386e-07, "logits/chosen": -19.707294464111328, "logits/rejected": -18.34296417236328, "logps/chosen": -392.2535095214844, "logps/rejected": -298.7084655761719, "loss": 0.8475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7719616889953613, "rewards/margins": 1.1737780570983887, "rewards/rejected": 2.5981833934783936, "step": 41180 }, { "epoch": 1.9123450485166442, "grad_norm": 5.464127540588379, "learning_rate": 1.853177956265379e-07, "logits/chosen": -18.811304092407227, "logits/rejected": -17.69838523864746, "logps/chosen": -527.3128662109375, "logps/rejected": -343.18206787109375, "loss": 0.7287, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.922092437744141, "rewards/margins": 1.4208624362945557, "rewards/rejected": 3.5012295246124268, "step": 41190 }, { "epoch": 1.9128093226240772, "grad_norm": 50.47257614135742, "learning_rate": 1.8528993918009191e-07, "logits/chosen": -19.050777435302734, "logits/rejected": -18.89802360534668, "logps/chosen": -339.6777038574219, "logps/rejected": -278.29461669921875, "loss": 
0.5953, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.193350076675415, "rewards/margins": 0.9103476405143738, "rewards/rejected": 2.2830023765563965, "step": 41200 }, { "epoch": 1.9132735967315102, "grad_norm": 11.608179092407227, "learning_rate": 1.8526208273364595e-07, "logits/chosen": -19.504554748535156, "logits/rejected": -18.77403450012207, "logps/chosen": -368.91436767578125, "logps/rejected": -282.3963928222656, "loss": 0.7724, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6451473236083984, "rewards/margins": 0.43839845061302185, "rewards/rejected": 2.2067489624023438, "step": 41210 }, { "epoch": 1.9137378708389434, "grad_norm": 62.988975524902344, "learning_rate": 1.8523422628719994e-07, "logits/chosen": -17.646160125732422, "logits/rejected": -17.22173309326172, "logps/chosen": -393.67144775390625, "logps/rejected": -295.4976501464844, "loss": 0.6896, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.940276622772217, "rewards/margins": 0.8435994386672974, "rewards/rejected": 2.096677303314209, "step": 41220 }, { "epoch": 1.9142021449463762, "grad_norm": 32.56352615356445, "learning_rate": 1.8520636984075398e-07, "logits/chosen": -17.976533889770508, "logits/rejected": -16.69073486328125, "logps/chosen": -487.55517578125, "logps/rejected": -261.20867919921875, "loss": 0.2611, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.869448184967041, "rewards/margins": 1.940646767616272, "rewards/rejected": 1.9288012981414795, "step": 41230 }, { "epoch": 1.9146664190538094, "grad_norm": 6.577829837799072, "learning_rate": 1.85178513394308e-07, "logits/chosen": -19.367685317993164, "logits/rejected": -17.607351303100586, "logps/chosen": -390.1231689453125, "logps/rejected": -258.09381103515625, "loss": 0.5937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.505995273590088, "rewards/margins": 1.9838218688964844, "rewards/rejected": 1.5221734046936035, "step": 41240 }, { "epoch": 1.9151306931612424, 
"grad_norm": 39.269535064697266, "learning_rate": 1.8515065694786199e-07, "logits/chosen": -17.637866973876953, "logits/rejected": -17.955326080322266, "logps/chosen": -296.36004638671875, "logps/rejected": -314.956298828125, "loss": 1.2064, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9342246055603027, "rewards/margins": 0.016127396374940872, "rewards/rejected": 2.9180972576141357, "step": 41250 }, { "epoch": 1.9155949672686754, "grad_norm": 3.7544517517089844, "learning_rate": 1.8512280050141603e-07, "logits/chosen": -18.993221282958984, "logits/rejected": -17.373889923095703, "logps/chosen": -459.294921875, "logps/rejected": -250.7324676513672, "loss": 0.1801, "rewards/accuracies": 1.0, "rewards/chosen": 4.283555030822754, "rewards/margins": 2.5392467975616455, "rewards/rejected": 1.7443077564239502, "step": 41260 }, { "epoch": 1.9160592413761086, "grad_norm": 39.182647705078125, "learning_rate": 1.8509494405497004e-07, "logits/chosen": -19.300758361816406, "logits/rejected": -18.76430892944336, "logps/chosen": -330.7048034667969, "logps/rejected": -325.94989013671875, "loss": 0.951, "rewards/accuracies": 0.5, "rewards/chosen": 2.9764840602874756, "rewards/margins": 0.14523909986019135, "rewards/rejected": 2.831244945526123, "step": 41270 }, { "epoch": 1.9165235154835414, "grad_norm": 51.42443084716797, "learning_rate": 1.8506708760852408e-07, "logits/chosen": -18.887374877929688, "logits/rejected": -18.876331329345703, "logps/chosen": -357.3070068359375, "logps/rejected": -371.484619140625, "loss": 0.7892, "rewards/accuracies": 0.5, "rewards/chosen": 2.559584379196167, "rewards/margins": 0.023176515474915504, "rewards/rejected": 2.5364081859588623, "step": 41280 }, { "epoch": 1.9169877895909746, "grad_norm": 179.2188262939453, "learning_rate": 1.8503923116207807e-07, "logits/chosen": -19.6293888092041, "logits/rejected": -19.755168914794922, "logps/chosen": -405.316650390625, "logps/rejected": -427.27197265625, "loss": 0.9028, 
"rewards/accuracies": 0.5, "rewards/chosen": 2.694035530090332, "rewards/margins": -0.19957441091537476, "rewards/rejected": 2.8936097621917725, "step": 41290 }, { "epoch": 1.9174520636984076, "grad_norm": 62.74951934814453, "learning_rate": 1.8501137471563209e-07, "logits/chosen": -19.768577575683594, "logits/rejected": -19.365427017211914, "logps/chosen": -481.90704345703125, "logps/rejected": -391.2183837890625, "loss": 0.7342, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.4556884765625, "rewards/margins": 0.9285072088241577, "rewards/rejected": 3.5271811485290527, "step": 41300 }, { "epoch": 1.9179163378058406, "grad_norm": 124.49736785888672, "learning_rate": 1.8498351826918613e-07, "logits/chosen": -19.19136619567871, "logits/rejected": -18.20917320251465, "logps/chosen": -456.63616943359375, "logps/rejected": -345.2817687988281, "loss": 0.3312, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.051181793212891, "rewards/margins": 1.287500023841858, "rewards/rejected": 2.7636818885803223, "step": 41310 }, { "epoch": 1.9183806119132736, "grad_norm": 199.9556884765625, "learning_rate": 1.8495566182274014e-07, "logits/chosen": -19.63688850402832, "logits/rejected": -19.23027992248535, "logps/chosen": -460.5345764160156, "logps/rejected": -443.63995361328125, "loss": 0.5465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.521477222442627, "rewards/margins": 1.0820837020874023, "rewards/rejected": 3.4393932819366455, "step": 41320 }, { "epoch": 1.9188448860207066, "grad_norm": 49.42595672607422, "learning_rate": 1.8492780537629416e-07, "logits/chosen": -17.946788787841797, "logits/rejected": -17.527599334716797, "logps/chosen": -227.42538452148438, "logps/rejected": -255.44107055664062, "loss": 0.9621, "rewards/accuracies": 0.5, "rewards/chosen": 2.2738261222839355, "rewards/margins": 0.5079613924026489, "rewards/rejected": 1.7658647298812866, "step": 41330 }, { "epoch": 1.9193091601281398, "grad_norm": 159.13674926757812, 
"learning_rate": 1.8489994892984817e-07, "logits/chosen": -18.73251724243164, "logits/rejected": -18.865257263183594, "logps/chosen": -259.15496826171875, "logps/rejected": -291.11163330078125, "loss": 1.2733, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.0388054847717285, "rewards/margins": -0.614907443523407, "rewards/rejected": 2.653712749481201, "step": 41340 }, { "epoch": 1.9197734342355726, "grad_norm": 146.3784942626953, "learning_rate": 1.8487209248340218e-07, "logits/chosen": -19.117263793945312, "logits/rejected": -19.114627838134766, "logps/chosen": -486.7455139160156, "logps/rejected": -462.1236267089844, "loss": 1.1606, "rewards/accuracies": 0.5, "rewards/chosen": 3.656644344329834, "rewards/margins": -0.3696928918361664, "rewards/rejected": 4.026337623596191, "step": 41350 }, { "epoch": 1.9202377083430058, "grad_norm": 10.82578182220459, "learning_rate": 1.8484423603695623e-07, "logits/chosen": -19.26947021484375, "logits/rejected": -19.37743377685547, "logps/chosen": -328.9779968261719, "logps/rejected": -316.86273193359375, "loss": 1.1847, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7706449031829834, "rewards/margins": -0.39860400557518005, "rewards/rejected": 3.1692492961883545, "step": 41360 }, { "epoch": 1.9207019824504388, "grad_norm": 89.69259643554688, "learning_rate": 1.8481637959051021e-07, "logits/chosen": -18.96444320678711, "logits/rejected": -18.345230102539062, "logps/chosen": -336.3772888183594, "logps/rejected": -289.67620849609375, "loss": 0.3444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.442531108856201, "rewards/margins": 1.343530297279358, "rewards/rejected": 2.099000930786133, "step": 41370 }, { "epoch": 1.9211662565578718, "grad_norm": 31.40192413330078, "learning_rate": 1.8478852314406426e-07, "logits/chosen": -18.67408561706543, "logits/rejected": -18.681198120117188, "logps/chosen": -347.17449951171875, "logps/rejected": -305.88983154296875, "loss": 1.4189, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.734633207321167, "rewards/margins": -0.02895965613424778, "rewards/rejected": 3.7635929584503174, "step": 41380 }, { "epoch": 1.9216305306653048, "grad_norm": 24.249378204345703, "learning_rate": 1.8476066669761827e-07, "logits/chosen": -18.34292221069336, "logits/rejected": -17.875986099243164, "logps/chosen": -297.36004638671875, "logps/rejected": -241.6453094482422, "loss": 0.5761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.64633846282959, "rewards/margins": 1.074292540550232, "rewards/rejected": 1.5720458030700684, "step": 41390 }, { "epoch": 1.9220948047727378, "grad_norm": 69.56976318359375, "learning_rate": 1.847328102511723e-07, "logits/chosen": -18.042470932006836, "logits/rejected": -17.9981632232666, "logps/chosen": -191.3422088623047, "logps/rejected": -196.8690643310547, "loss": 0.9626, "rewards/accuracies": 0.5, "rewards/chosen": 0.83543461561203, "rewards/margins": -0.15816812217235565, "rewards/rejected": 0.9936027526855469, "step": 41400 }, { "epoch": 1.922559078880171, "grad_norm": 26.08734703063965, "learning_rate": 1.847049538047263e-07, "logits/chosen": -19.088136672973633, "logits/rejected": -17.896923065185547, "logps/chosen": -322.100830078125, "logps/rejected": -233.34481811523438, "loss": 0.5727, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.931175708770752, "rewards/margins": 1.2793638706207275, "rewards/rejected": 1.651812195777893, "step": 41410 }, { "epoch": 1.9230233529876037, "grad_norm": 61.0495719909668, "learning_rate": 1.846770973582803e-07, "logits/chosen": -18.554529190063477, "logits/rejected": -17.491235733032227, "logps/chosen": -382.0994567871094, "logps/rejected": -231.034423828125, "loss": 0.3026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.58727765083313, "rewards/margins": 2.146026611328125, "rewards/rejected": 1.441251277923584, "step": 41420 }, { "epoch": 1.923487627095037, "grad_norm": 98.8576431274414, "learning_rate": 
1.8464924091183435e-07, "logits/chosen": -20.574710845947266, "logits/rejected": -19.095054626464844, "logps/chosen": -465.0738220214844, "logps/rejected": -399.4601135253906, "loss": 0.5672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.235479354858398, "rewards/margins": 1.0873554944992065, "rewards/rejected": 3.1481239795684814, "step": 41430 }, { "epoch": 1.92395190120247, "grad_norm": 42.066463470458984, "learning_rate": 1.8462138446538834e-07, "logits/chosen": -19.44961166381836, "logits/rejected": -19.04207992553711, "logps/chosen": -464.02166748046875, "logps/rejected": -446.54473876953125, "loss": 0.7469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.197427749633789, "rewards/margins": 0.16451677680015564, "rewards/rejected": 3.0329110622406006, "step": 41440 }, { "epoch": 1.924416175309903, "grad_norm": 5.110694885253906, "learning_rate": 1.8459352801894236e-07, "logits/chosen": -18.132774353027344, "logits/rejected": -17.479736328125, "logps/chosen": -341.56060791015625, "logps/rejected": -252.5069580078125, "loss": 0.7113, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.660109758377075, "rewards/margins": 0.7873221635818481, "rewards/rejected": 1.8727874755859375, "step": 41450 }, { "epoch": 1.9248804494173362, "grad_norm": 100.33574676513672, "learning_rate": 1.845656715724964e-07, "logits/chosen": -19.02505874633789, "logits/rejected": -17.472248077392578, "logps/chosen": -368.24163818359375, "logps/rejected": -244.96505737304688, "loss": 0.5077, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.404771327972412, "rewards/margins": 1.7373018264770508, "rewards/rejected": 1.6674697399139404, "step": 41460 }, { "epoch": 1.925344723524769, "grad_norm": 139.15194702148438, "learning_rate": 1.845378151260504e-07, "logits/chosen": -19.99850845336914, "logits/rejected": -18.482196807861328, "logps/chosen": -395.99884033203125, "logps/rejected": -278.89404296875, "loss": 0.6501, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.896392822265625, "rewards/margins": 1.424875259399414, "rewards/rejected": 2.471518039703369, "step": 41470 }, { "epoch": 1.9258089976322021, "grad_norm": 51.68813705444336, "learning_rate": 1.8450995867960443e-07, "logits/chosen": -19.21065902709961, "logits/rejected": -19.23036766052246, "logps/chosen": -410.15435791015625, "logps/rejected": -504.2774963378906, "loss": 0.9082, "rewards/accuracies": 0.5, "rewards/chosen": 3.4581856727600098, "rewards/margins": 0.19891896843910217, "rewards/rejected": 3.2592663764953613, "step": 41480 }, { "epoch": 1.926273271739635, "grad_norm": 33.11211013793945, "learning_rate": 1.8448210223315844e-07, "logits/chosen": -19.224231719970703, "logits/rejected": -17.671606063842773, "logps/chosen": -474.61212158203125, "logps/rejected": -299.7234802246094, "loss": 0.4474, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.76060152053833, "rewards/margins": 0.980734646320343, "rewards/rejected": 1.7798669338226318, "step": 41490 }, { "epoch": 1.9267375458470681, "grad_norm": 26.8786678314209, "learning_rate": 1.8445424578671248e-07, "logits/chosen": -19.27815055847168, "logits/rejected": -18.443294525146484, "logps/chosen": -468.01300048828125, "logps/rejected": -332.40972900390625, "loss": 0.3719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9221065044403076, "rewards/margins": 1.2189643383026123, "rewards/rejected": 2.703141689300537, "step": 41500 }, { "epoch": 1.9272018199545011, "grad_norm": 83.95874786376953, "learning_rate": 1.844263893402665e-07, "logits/chosen": -20.03672981262207, "logits/rejected": -17.773365020751953, "logps/chosen": -450.97735595703125, "logps/rejected": -276.9977111816406, "loss": 0.3858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1735758781433105, "rewards/margins": 1.2955127954483032, "rewards/rejected": 2.878063201904297, "step": 41510 }, { "epoch": 1.9276660940619341, "grad_norm": 71.34612274169922, "learning_rate": 
1.8439853289382049e-07, "logits/chosen": -19.050933837890625, "logits/rejected": -18.743165969848633, "logps/chosen": -299.31732177734375, "logps/rejected": -325.906982421875, "loss": 1.2958, "rewards/accuracies": 0.5, "rewards/chosen": 1.725450873374939, "rewards/margins": -0.6072050929069519, "rewards/rejected": 2.332656145095825, "step": 41520 }, { "epoch": 1.9281303681693673, "grad_norm": 69.22773742675781, "learning_rate": 1.8437067644737453e-07, "logits/chosen": -18.999011993408203, "logits/rejected": -18.457744598388672, "logps/chosen": -413.15386962890625, "logps/rejected": -294.6551208496094, "loss": 0.4669, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.180316925048828, "rewards/margins": 1.0222628116607666, "rewards/rejected": 2.1580541133880615, "step": 41530 }, { "epoch": 1.9285946422768, "grad_norm": 281.8600769042969, "learning_rate": 1.8434282000092854e-07, "logits/chosen": -17.491748809814453, "logits/rejected": -18.19234848022461, "logps/chosen": -269.15899658203125, "logps/rejected": -364.6307373046875, "loss": 1.2408, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8325324058532715, "rewards/margins": -0.47160616517066956, "rewards/rejected": 3.304138660430908, "step": 41540 }, { "epoch": 1.9290589163842333, "grad_norm": 23.642101287841797, "learning_rate": 1.8431496355448258e-07, "logits/chosen": -18.87942886352539, "logits/rejected": -17.52941131591797, "logps/chosen": -386.2799377441406, "logps/rejected": -255.2851104736328, "loss": 0.3811, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.702643871307373, "rewards/margins": 1.6941158771514893, "rewards/rejected": 1.0085281133651733, "step": 41550 }, { "epoch": 1.9295231904916663, "grad_norm": 48.213321685791016, "learning_rate": 1.8428710710803657e-07, "logits/chosen": -17.836849212646484, "logits/rejected": -17.336946487426758, "logps/chosen": -444.073486328125, "logps/rejected": -374.70758056640625, "loss": 0.3369, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 4.087384223937988, "rewards/margins": 1.6438531875610352, "rewards/rejected": 2.4435312747955322, "step": 41560 }, { "epoch": 1.9299874645990993, "grad_norm": 162.22357177734375, "learning_rate": 1.8425925066159058e-07, "logits/chosen": -18.903783798217773, "logits/rejected": -17.999025344848633, "logps/chosen": -539.6152954101562, "logps/rejected": -424.72515869140625, "loss": 1.0819, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.363678932189941, "rewards/margins": 1.2000738382339478, "rewards/rejected": 3.163605213165283, "step": 41570 }, { "epoch": 1.9304517387065323, "grad_norm": 2.244023323059082, "learning_rate": 1.8423139421514463e-07, "logits/chosen": -19.407630920410156, "logits/rejected": -18.521238327026367, "logps/chosen": -336.4683532714844, "logps/rejected": -354.1468505859375, "loss": 0.8334, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.16286301612854, "rewards/margins": 0.6233576536178589, "rewards/rejected": 2.5395054817199707, "step": 41580 }, { "epoch": 1.9309160128139653, "grad_norm": 1.7795778512954712, "learning_rate": 1.8420353776869864e-07, "logits/chosen": -20.064104080200195, "logits/rejected": -17.736970901489258, "logps/chosen": -382.9394836425781, "logps/rejected": -211.1780548095703, "loss": 0.3627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5665104389190674, "rewards/margins": 2.070871114730835, "rewards/rejected": 1.4956386089324951, "step": 41590 }, { "epoch": 1.9313802869213985, "grad_norm": 14.934429168701172, "learning_rate": 1.8417568132225263e-07, "logits/chosen": -18.74966049194336, "logits/rejected": -17.558483123779297, "logps/chosen": -338.91412353515625, "logps/rejected": -269.8341064453125, "loss": 0.3319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1799347400665283, "rewards/margins": 1.5112766027450562, "rewards/rejected": 1.6686580181121826, "step": 41600 }, { "epoch": 1.9318445610288313, "grad_norm": 
43.297298431396484, "learning_rate": 1.8414782487580667e-07, "logits/chosen": -19.514476776123047, "logits/rejected": -18.45147132873535, "logps/chosen": -414.0938415527344, "logps/rejected": -254.7029266357422, "loss": 0.3062, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8614094257354736, "rewards/margins": 1.9110333919525146, "rewards/rejected": 1.9503759145736694, "step": 41610 }, { "epoch": 1.9323088351362645, "grad_norm": 95.13485717773438, "learning_rate": 1.8411996842936068e-07, "logits/chosen": -19.389589309692383, "logits/rejected": -18.980491638183594, "logps/chosen": -430.80438232421875, "logps/rejected": -345.5055847167969, "loss": 0.531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.888906478881836, "rewards/margins": 0.7709609866142273, "rewards/rejected": 3.1179451942443848, "step": 41620 }, { "epoch": 1.9327731092436975, "grad_norm": 56.004878997802734, "learning_rate": 1.840921119829147e-07, "logits/chosen": -17.842302322387695, "logits/rejected": -17.79550552368164, "logps/chosen": -335.1297302246094, "logps/rejected": -385.54693603515625, "loss": 0.9384, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4220974445343018, "rewards/margins": 0.07101666927337646, "rewards/rejected": 3.3510806560516357, "step": 41630 }, { "epoch": 1.9332373833511305, "grad_norm": 5.142828464508057, "learning_rate": 1.840642555364687e-07, "logits/chosen": -18.529193878173828, "logits/rejected": -18.02753257751465, "logps/chosen": -378.16241455078125, "logps/rejected": -296.06982421875, "loss": 0.4918, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2220993041992188, "rewards/margins": 1.0097808837890625, "rewards/rejected": 2.2123184204101562, "step": 41640 }, { "epoch": 1.9337016574585635, "grad_norm": 28.121660232543945, "learning_rate": 1.8403639909002275e-07, "logits/chosen": -18.60654640197754, "logits/rejected": -18.018339157104492, "logps/chosen": -368.4810485839844, "logps/rejected": -373.3233947753906, 
"loss": 1.2046, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.54441499710083, "rewards/margins": 0.24531328678131104, "rewards/rejected": 3.2991013526916504, "step": 41650 }, { "epoch": 1.9341659315659965, "grad_norm": 2.30077862739563, "learning_rate": 1.8400854264357677e-07, "logits/chosen": -17.87497329711914, "logits/rejected": -17.133989334106445, "logps/chosen": -390.7536315917969, "logps/rejected": -345.68243408203125, "loss": 0.3676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.938955783843994, "rewards/margins": 1.544015884399414, "rewards/rejected": 1.394939661026001, "step": 41660 }, { "epoch": 1.9346302056734297, "grad_norm": 69.90760803222656, "learning_rate": 1.8398068619713076e-07, "logits/chosen": -18.679216384887695, "logits/rejected": -19.21149253845215, "logps/chosen": -434.576171875, "logps/rejected": -382.64141845703125, "loss": 1.1223, "rewards/accuracies": 0.5, "rewards/chosen": 2.9526312351226807, "rewards/margins": -0.0662999302148819, "rewards/rejected": 3.0189309120178223, "step": 41670 }, { "epoch": 1.9350944797808625, "grad_norm": 438.7917785644531, "learning_rate": 1.839528297506848e-07, "logits/chosen": -19.46184539794922, "logits/rejected": -18.973735809326172, "logps/chosen": -453.62286376953125, "logps/rejected": -333.88140869140625, "loss": 0.6389, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4491934776306152, "rewards/margins": 1.074103593826294, "rewards/rejected": 2.3750905990600586, "step": 41680 }, { "epoch": 1.9355587538882957, "grad_norm": 32.525108337402344, "learning_rate": 1.839249733042388e-07, "logits/chosen": -19.365163803100586, "logits/rejected": -17.517105102539062, "logps/chosen": -490.04168701171875, "logps/rejected": -261.6195373535156, "loss": 0.1817, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.52520227432251, "rewards/margins": 2.968684196472168, "rewards/rejected": 1.5565177202224731, "step": 41690 }, { "epoch": 1.9360230279957287, 
"grad_norm": 17.346599578857422, "learning_rate": 1.8389711685779285e-07, "logits/chosen": -18.950420379638672, "logits/rejected": -18.364017486572266, "logps/chosen": -395.9889221191406, "logps/rejected": -357.6095275878906, "loss": 0.6405, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6971001625061035, "rewards/margins": 1.3793054819107056, "rewards/rejected": 2.3177945613861084, "step": 41700 }, { "epoch": 1.9364873021031617, "grad_norm": 0.28771933913230896, "learning_rate": 1.8386926041134684e-07, "logits/chosen": -19.268878936767578, "logits/rejected": -18.38210678100586, "logps/chosen": -342.33209228515625, "logps/rejected": -279.0469665527344, "loss": 0.7421, "rewards/accuracies": 0.5, "rewards/chosen": 3.36578631401062, "rewards/margins": 0.8171610832214355, "rewards/rejected": 2.5486254692077637, "step": 41710 }, { "epoch": 1.9369515762105949, "grad_norm": 46.453887939453125, "learning_rate": 1.8384140396490086e-07, "logits/chosen": -19.12319564819336, "logits/rejected": -16.87495994567871, "logps/chosen": -410.79241943359375, "logps/rejected": -260.398193359375, "loss": 0.3042, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.759697675704956, "rewards/margins": 2.0301873683929443, "rewards/rejected": 1.7295100688934326, "step": 41720 }, { "epoch": 1.9374158503180277, "grad_norm": 6.223354339599609, "learning_rate": 1.838135475184549e-07, "logits/chosen": -19.221149444580078, "logits/rejected": -18.231510162353516, "logps/chosen": -448.2403259277344, "logps/rejected": -399.920654296875, "loss": 1.1009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.905489444732666, "rewards/margins": 0.31599438190460205, "rewards/rejected": 3.5894947052001953, "step": 41730 }, { "epoch": 1.9378801244254609, "grad_norm": 66.85488891601562, "learning_rate": 1.837856910720089e-07, "logits/chosen": -18.708236694335938, "logits/rejected": -18.781646728515625, "logps/chosen": -406.91265869140625, "logps/rejected": -383.857666015625, 
"loss": 1.0021, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.786745548248291, "rewards/margins": -0.2510862350463867, "rewards/rejected": 3.0378317832946777, "step": 41740 }, { "epoch": 1.9383443985328939, "grad_norm": 6.6107988357543945, "learning_rate": 1.8375783462556293e-07, "logits/chosen": -19.2168025970459, "logits/rejected": -19.2158260345459, "logps/chosen": -329.0293273925781, "logps/rejected": -342.11553955078125, "loss": 1.213, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.235882043838501, "rewards/margins": -0.25682947039604187, "rewards/rejected": 2.492711067199707, "step": 41750 }, { "epoch": 1.9388086726403269, "grad_norm": 55.363250732421875, "learning_rate": 1.8372997817911694e-07, "logits/chosen": -18.740758895874023, "logits/rejected": -17.511625289916992, "logps/chosen": -511.97833251953125, "logps/rejected": -409.2647705078125, "loss": 0.5997, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.200395584106445, "rewards/margins": 1.0507255792617798, "rewards/rejected": 3.149670124053955, "step": 41760 }, { "epoch": 1.9392729467477599, "grad_norm": 212.2345733642578, "learning_rate": 1.8370212173267095e-07, "logits/chosen": -18.911285400390625, "logits/rejected": -18.697343826293945, "logps/chosen": -400.56488037109375, "logps/rejected": -358.71893310546875, "loss": 0.6365, "rewards/accuracies": 0.5, "rewards/chosen": 3.521265745162964, "rewards/margins": 0.5916725993156433, "rewards/rejected": 2.9295926094055176, "step": 41770 }, { "epoch": 1.9397372208551928, "grad_norm": 4.408566474914551, "learning_rate": 1.83674265286225e-07, "logits/chosen": -18.629322052001953, "logits/rejected": -16.86216163635254, "logps/chosen": -349.80340576171875, "logps/rejected": -182.80699157714844, "loss": 0.5644, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.048438549041748, "rewards/margins": 1.6798677444458008, "rewards/rejected": 1.3685710430145264, "step": 41780 }, { "epoch": 1.940201494962626, 
"grad_norm": 54.85363006591797, "learning_rate": 1.8364640883977898e-07, "logits/chosen": -18.513378143310547, "logits/rejected": -18.271060943603516, "logps/chosen": -270.96917724609375, "logps/rejected": -283.75384521484375, "loss": 1.2392, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0914416313171387, "rewards/margins": -0.4850069582462311, "rewards/rejected": 2.576448440551758, "step": 41790 }, { "epoch": 1.9406657690700588, "grad_norm": 110.93463897705078, "learning_rate": 1.8361855239333302e-07, "logits/chosen": -17.97243881225586, "logits/rejected": -18.561927795410156, "logps/chosen": -299.14031982421875, "logps/rejected": -387.22662353515625, "loss": 1.4805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0588207244873047, "rewards/margins": -0.7410097122192383, "rewards/rejected": 2.799830675125122, "step": 41800 }, { "epoch": 1.941130043177492, "grad_norm": 0.12570200860500336, "learning_rate": 1.8359069594688704e-07, "logits/chosen": -17.919252395629883, "logits/rejected": -16.940174102783203, "logps/chosen": -354.8143005371094, "logps/rejected": -255.1714324951172, "loss": 0.3497, "rewards/accuracies": 1.0, "rewards/chosen": 2.791928291320801, "rewards/margins": 1.579970121383667, "rewards/rejected": 1.2119581699371338, "step": 41810 }, { "epoch": 1.941594317284925, "grad_norm": 47.883460998535156, "learning_rate": 1.8356283950044103e-07, "logits/chosen": -18.571134567260742, "logits/rejected": -17.773916244506836, "logps/chosen": -416.28009033203125, "logps/rejected": -319.1462707519531, "loss": 0.6469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2490391731262207, "rewards/margins": 0.933955192565918, "rewards/rejected": 2.3150839805603027, "step": 41820 }, { "epoch": 1.942058591392358, "grad_norm": 0.2132963091135025, "learning_rate": 1.8353498305399507e-07, "logits/chosen": -19.664188385009766, "logits/rejected": -18.39155387878418, "logps/chosen": -489.67938232421875, "logps/rejected": 
-273.16241455078125, "loss": 0.4992, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.559660911560059, "rewards/margins": 2.0977888107299805, "rewards/rejected": 2.461872100830078, "step": 41830 }, { "epoch": 1.942522865499791, "grad_norm": 115.18456268310547, "learning_rate": 1.8350712660754908e-07, "logits/chosen": -19.69101905822754, "logits/rejected": -19.457515716552734, "logps/chosen": -423.2911071777344, "logps/rejected": -414.35736083984375, "loss": 0.8514, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.703540802001953, "rewards/margins": 0.4322008192539215, "rewards/rejected": 3.2713398933410645, "step": 41840 }, { "epoch": 1.942987139607224, "grad_norm": 2.6092071533203125, "learning_rate": 1.8347927016110312e-07, "logits/chosen": -18.69756507873535, "logits/rejected": -18.219249725341797, "logps/chosen": -340.8614807128906, "logps/rejected": -253.4734344482422, "loss": 0.4916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.456510066986084, "rewards/margins": 1.0344574451446533, "rewards/rejected": 1.4220526218414307, "step": 41850 }, { "epoch": 1.9434514137146572, "grad_norm": 24.75153923034668, "learning_rate": 1.834514137146571e-07, "logits/chosen": -18.814102172851562, "logits/rejected": -18.050830841064453, "logps/chosen": -291.40191650390625, "logps/rejected": -184.46768188476562, "loss": 0.6039, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5267205238342285, "rewards/margins": 0.9325703382492065, "rewards/rejected": 1.594150185585022, "step": 41860 }, { "epoch": 1.94391568782209, "grad_norm": 57.18269729614258, "learning_rate": 1.8342355726821113e-07, "logits/chosen": -18.05654525756836, "logits/rejected": -17.833419799804688, "logps/chosen": -244.1204376220703, "logps/rejected": -180.19119262695312, "loss": 0.5599, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5567820072174072, "rewards/margins": 0.7352250218391418, "rewards/rejected": 0.8215569257736206, "step": 41870 }, { 
"epoch": 1.9443799619295232, "grad_norm": 0.8556182980537415, "learning_rate": 1.8339570082176517e-07, "logits/chosen": -18.68861198425293, "logits/rejected": -17.729639053344727, "logps/chosen": -499.64892578125, "logps/rejected": -412.58453369140625, "loss": 0.3983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.7460198402404785, "rewards/margins": 2.3300604820251465, "rewards/rejected": 2.415959358215332, "step": 41880 }, { "epoch": 1.9448442360369562, "grad_norm": 0.4256819784641266, "learning_rate": 1.8336784437531918e-07, "logits/chosen": -19.412776947021484, "logits/rejected": -17.607227325439453, "logps/chosen": -411.98492431640625, "logps/rejected": -252.56802368164062, "loss": 0.2112, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.4286394119262695, "rewards/margins": 2.890352725982666, "rewards/rejected": 2.5382871627807617, "step": 41890 }, { "epoch": 1.9453085101443892, "grad_norm": 14.771827697753906, "learning_rate": 1.833399879288732e-07, "logits/chosen": -20.22540283203125, "logits/rejected": -18.964305877685547, "logps/chosen": -373.54632568359375, "logps/rejected": -309.9497985839844, "loss": 0.438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.688662528991699, "rewards/margins": 1.4659583568572998, "rewards/rejected": 3.2227044105529785, "step": 41900 }, { "epoch": 1.9457727842518224, "grad_norm": 29.482736587524414, "learning_rate": 1.833121314824272e-07, "logits/chosen": -18.609481811523438, "logits/rejected": -18.048383712768555, "logps/chosen": -441.3829040527344, "logps/rejected": -405.701904296875, "loss": 0.9611, "rewards/accuracies": 0.5, "rewards/chosen": 3.5537734031677246, "rewards/margins": 0.5629864931106567, "rewards/rejected": 2.9907870292663574, "step": 41910 }, { "epoch": 1.9462370583592552, "grad_norm": 77.0234375, "learning_rate": 1.8328427503598125e-07, "logits/chosen": -19.381227493286133, "logits/rejected": -18.144289016723633, "logps/chosen": -426.9766540527344, "logps/rejected": 
-404.49127197265625, "loss": 1.3419, "rewards/accuracies": 0.5, "rewards/chosen": 3.4896559715270996, "rewards/margins": 0.053191959857940674, "rewards/rejected": 3.436464309692383, "step": 41920 }, { "epoch": 1.9467013324666884, "grad_norm": 23.004587173461914, "learning_rate": 1.8325641858953527e-07, "logits/chosen": -18.73362159729004, "logits/rejected": -17.626340866088867, "logps/chosen": -367.7168884277344, "logps/rejected": -318.6136779785156, "loss": 0.9531, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.508009195327759, "rewards/margins": 0.2629871964454651, "rewards/rejected": 2.2450218200683594, "step": 41930 }, { "epoch": 1.9471656065741214, "grad_norm": 135.01365661621094, "learning_rate": 1.8322856214308925e-07, "logits/chosen": -20.08306884765625, "logits/rejected": -20.115341186523438, "logps/chosen": -394.69757080078125, "logps/rejected": -397.8022155761719, "loss": 0.6969, "rewards/accuracies": 0.5, "rewards/chosen": 4.006091117858887, "rewards/margins": 0.450996458530426, "rewards/rejected": 3.5550944805145264, "step": 41940 }, { "epoch": 1.9476298806815544, "grad_norm": 35.55949020385742, "learning_rate": 1.832007056966433e-07, "logits/chosen": -19.385150909423828, "logits/rejected": -19.67009735107422, "logps/chosen": -321.965576171875, "logps/rejected": -379.86248779296875, "loss": 1.1983, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.656116008758545, "rewards/margins": -0.08997342735528946, "rewards/rejected": 2.746089458465576, "step": 41950 }, { "epoch": 1.9480941547889874, "grad_norm": 80.81243896484375, "learning_rate": 1.831728492501973e-07, "logits/chosen": -20.290990829467773, "logits/rejected": -19.12700080871582, "logps/chosen": -406.50872802734375, "logps/rejected": -329.5775451660156, "loss": 0.8043, "rewards/accuracies": 0.5, "rewards/chosen": 2.972874164581299, "rewards/margins": 0.42473259568214417, "rewards/rejected": 2.5481414794921875, "step": 41960 }, { "epoch": 1.9485584288964204, 
"grad_norm": 9.358138084411621, "learning_rate": 1.8314499280375135e-07, "logits/chosen": -18.65727996826172, "logits/rejected": -18.217609405517578, "logps/chosen": -362.095703125, "logps/rejected": -310.3220520019531, "loss": 1.4329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.919412612915039, "rewards/margins": 0.18764153122901917, "rewards/rejected": 2.7317709922790527, "step": 41970 }, { "epoch": 1.9490227030038536, "grad_norm": 23.35626220703125, "learning_rate": 1.8311713635730534e-07, "logits/chosen": -19.307924270629883, "logits/rejected": -17.739925384521484, "logps/chosen": -384.35284423828125, "logps/rejected": -251.2982635498047, "loss": 0.1967, "rewards/accuracies": 1.0, "rewards/chosen": 3.3355324268341064, "rewards/margins": 1.9336646795272827, "rewards/rejected": 1.4018681049346924, "step": 41980 }, { "epoch": 1.9494869771112864, "grad_norm": 82.99292755126953, "learning_rate": 1.8308927991085935e-07, "logits/chosen": -19.53525161743164, "logits/rejected": -18.922788619995117, "logps/chosen": -404.40850830078125, "logps/rejected": -297.6424560546875, "loss": 0.5479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.955317974090576, "rewards/margins": 1.1258270740509033, "rewards/rejected": 1.8294906616210938, "step": 41990 }, { "epoch": 1.9499512512187196, "grad_norm": 217.51434326171875, "learning_rate": 1.830614234644134e-07, "logits/chosen": -18.884572982788086, "logits/rejected": -18.473217010498047, "logps/chosen": -449.45587158203125, "logps/rejected": -421.3658752441406, "loss": 1.2352, "rewards/accuracies": 0.5, "rewards/chosen": 3.928938388824463, "rewards/margins": 0.2330102026462555, "rewards/rejected": 3.6959285736083984, "step": 42000 }, { "epoch": 1.9504155253261526, "grad_norm": 6.072988510131836, "learning_rate": 1.830335670179674e-07, "logits/chosen": -18.88982582092285, "logits/rejected": -17.20259666442871, "logps/chosen": -505.2213439941406, "logps/rejected": -313.4752502441406, "loss": 0.1971, 
"rewards/accuracies": 1.0, "rewards/chosen": 4.426274299621582, "rewards/margins": 2.1871066093444824, "rewards/rejected": 2.2391676902770996, "step": 42010 }, { "epoch": 1.9508797994335856, "grad_norm": 135.91683959960938, "learning_rate": 1.830057105715214e-07, "logits/chosen": -19.035627365112305, "logits/rejected": -19.179853439331055, "logps/chosen": -414.69647216796875, "logps/rejected": -387.3297424316406, "loss": 1.0449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.37575101852417, "rewards/margins": 0.1633656620979309, "rewards/rejected": 3.2123851776123047, "step": 42020 }, { "epoch": 1.9513440735410186, "grad_norm": 121.00059509277344, "learning_rate": 1.8297785412507544e-07, "logits/chosen": -18.819082260131836, "logits/rejected": -18.19683074951172, "logps/chosen": -312.2177429199219, "logps/rejected": -288.45684814453125, "loss": 0.738, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.041703701019287, "rewards/margins": 0.9575206637382507, "rewards/rejected": 2.0841829776763916, "step": 42030 }, { "epoch": 1.9518083476484516, "grad_norm": 0.6156349778175354, "learning_rate": 1.8294999767862945e-07, "logits/chosen": -17.951709747314453, "logits/rejected": -16.84910011291504, "logps/chosen": -368.9200439453125, "logps/rejected": -249.4049530029297, "loss": 0.3858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0704188346862793, "rewards/margins": 1.474268913269043, "rewards/rejected": 1.5961499214172363, "step": 42040 }, { "epoch": 1.9522726217558848, "grad_norm": 7.339773178100586, "learning_rate": 1.8292214123218347e-07, "logits/chosen": -19.34042739868164, "logits/rejected": -17.792844772338867, "logps/chosen": -350.85394287109375, "logps/rejected": -249.87442016601562, "loss": 0.5202, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4571971893310547, "rewards/margins": 1.6763954162597656, "rewards/rejected": 1.780801773071289, "step": 42050 }, { "epoch": 1.9527368958633176, "grad_norm": 
114.4449691772461, "learning_rate": 1.8289428478573748e-07, "logits/chosen": -19.474971771240234, "logits/rejected": -18.222673416137695, "logps/chosen": -416.89495849609375, "logps/rejected": -279.37811279296875, "loss": 0.7281, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9582059383392334, "rewards/margins": 0.7003111839294434, "rewards/rejected": 2.25789475440979, "step": 42060 }, { "epoch": 1.9532011699707508, "grad_norm": 20.18172836303711, "learning_rate": 1.8286642833929152e-07, "logits/chosen": -17.42231559753418, "logits/rejected": -16.764806747436523, "logps/chosen": -245.6612548828125, "logps/rejected": -162.27862548828125, "loss": 0.4176, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5590506792068481, "rewards/margins": 1.202806830406189, "rewards/rejected": 0.3562437891960144, "step": 42070 }, { "epoch": 1.9536654440781838, "grad_norm": 108.99385070800781, "learning_rate": 1.8283857189284554e-07, "logits/chosen": -19.51233673095703, "logits/rejected": -18.32763671875, "logps/chosen": -408.8276062011719, "logps/rejected": -324.8299255371094, "loss": 0.7172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.522688627243042, "rewards/margins": 0.6753100156784058, "rewards/rejected": 2.847378730773926, "step": 42080 }, { "epoch": 1.9541297181856168, "grad_norm": 107.57772064208984, "learning_rate": 1.8281071544639953e-07, "logits/chosen": -19.127609252929688, "logits/rejected": -18.814855575561523, "logps/chosen": -381.2159118652344, "logps/rejected": -322.4191589355469, "loss": 1.1267, "rewards/accuracies": 0.5, "rewards/chosen": 2.071613311767578, "rewards/margins": -0.09466905891895294, "rewards/rejected": 2.1662821769714355, "step": 42090 }, { "epoch": 1.95459399229305, "grad_norm": 60.151878356933594, "learning_rate": 1.8278285899995357e-07, "logits/chosen": -19.524559020996094, "logits/rejected": -19.59250259399414, "logps/chosen": -470.86920166015625, "logps/rejected": -420.274658203125, "loss": 0.6389, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4367222785949707, "rewards/margins": 0.24835658073425293, "rewards/rejected": 3.1883656978607178, "step": 42100 }, { "epoch": 1.9550582664004827, "grad_norm": 37.65486526489258, "learning_rate": 1.8275500255350758e-07, "logits/chosen": -19.65171241760254, "logits/rejected": -20.109516143798828, "logps/chosen": -478.82733154296875, "logps/rejected": -413.111572265625, "loss": 0.7986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.904576063156128, "rewards/margins": 0.12471888214349747, "rewards/rejected": 3.7798571586608887, "step": 42110 }, { "epoch": 1.955522540507916, "grad_norm": 36.31463623046875, "learning_rate": 1.8272714610706162e-07, "logits/chosen": -19.102588653564453, "logits/rejected": -18.804922103881836, "logps/chosen": -353.33209228515625, "logps/rejected": -378.267578125, "loss": 1.0767, "rewards/accuracies": 0.5, "rewards/chosen": 2.765634059906006, "rewards/margins": 0.03342914581298828, "rewards/rejected": 2.7322049140930176, "step": 42120 }, { "epoch": 1.955986814615349, "grad_norm": 88.82666778564453, "learning_rate": 1.826992896606156e-07, "logits/chosen": -18.692996978759766, "logits/rejected": -18.874805450439453, "logps/chosen": -450.456787109375, "logps/rejected": -456.5306091308594, "loss": 0.6822, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.243874549865723, "rewards/margins": 0.4286120533943176, "rewards/rejected": 3.8152618408203125, "step": 42130 }, { "epoch": 1.956451088722782, "grad_norm": 17.932228088378906, "learning_rate": 1.8267143321416962e-07, "logits/chosen": -19.903888702392578, "logits/rejected": -19.191211700439453, "logps/chosen": -380.23028564453125, "logps/rejected": -318.7236022949219, "loss": 0.6108, "rewards/accuracies": 0.5, "rewards/chosen": 3.6169819831848145, "rewards/margins": 1.1079379320144653, "rewards/rejected": 2.5090441703796387, "step": 42140 }, { "epoch": 1.956915362830215, "grad_norm": 3.161585569381714, 
"learning_rate": 1.8264357676772367e-07, "logits/chosen": -18.95144271850586, "logits/rejected": -18.422771453857422, "logps/chosen": -495.47625732421875, "logps/rejected": -373.99273681640625, "loss": 0.6472, "rewards/accuracies": 0.5, "rewards/chosen": 3.479999542236328, "rewards/margins": 1.2103862762451172, "rewards/rejected": 2.269613265991211, "step": 42150 }, { "epoch": 1.957379636937648, "grad_norm": 61.16600799560547, "learning_rate": 1.8261572032127768e-07, "logits/chosen": -19.246158599853516, "logits/rejected": -18.695886611938477, "logps/chosen": -363.2383728027344, "logps/rejected": -319.244140625, "loss": 0.6917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.129669666290283, "rewards/margins": 1.1527378559112549, "rewards/rejected": 1.9769315719604492, "step": 42160 }, { "epoch": 1.9578439110450812, "grad_norm": 60.96245574951172, "learning_rate": 1.825878638748317e-07, "logits/chosen": -18.410926818847656, "logits/rejected": -18.120609283447266, "logps/chosen": -401.01373291015625, "logps/rejected": -371.77520751953125, "loss": 0.7055, "rewards/accuracies": 0.5, "rewards/chosen": 3.02439022064209, "rewards/margins": 0.3314162492752075, "rewards/rejected": 2.6929736137390137, "step": 42170 }, { "epoch": 1.958308185152514, "grad_norm": 132.69717407226562, "learning_rate": 1.825600074283857e-07, "logits/chosen": -18.868553161621094, "logits/rejected": -18.478206634521484, "logps/chosen": -459.90179443359375, "logps/rejected": -377.55364990234375, "loss": 0.4154, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.103249549865723, "rewards/margins": 1.6385557651519775, "rewards/rejected": 2.464693546295166, "step": 42180 }, { "epoch": 1.9587724592599471, "grad_norm": 31.349519729614258, "learning_rate": 1.8253215098193972e-07, "logits/chosen": -18.94439697265625, "logits/rejected": -18.640056610107422, "logps/chosen": -369.81512451171875, "logps/rejected": -343.04278564453125, "loss": 0.556, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.3573012351989746, "rewards/margins": 0.717919647693634, "rewards/rejected": 2.6393816471099854, "step": 42190 }, { "epoch": 1.9592367333673801, "grad_norm": 184.85870361328125, "learning_rate": 1.8250429453549376e-07, "logits/chosen": -18.186012268066406, "logits/rejected": -18.19025230407715, "logps/chosen": -406.47027587890625, "logps/rejected": -338.2411193847656, "loss": 0.9237, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6858603954315186, "rewards/margins": -0.19958464801311493, "rewards/rejected": 2.8854451179504395, "step": 42200 }, { "epoch": 1.9597010074748131, "grad_norm": 0.12757405638694763, "learning_rate": 1.8247643808904775e-07, "logits/chosen": -18.534189224243164, "logits/rejected": -17.141557693481445, "logps/chosen": -365.25830078125, "logps/rejected": -271.64715576171875, "loss": 1.1141, "rewards/accuracies": 0.5, "rewards/chosen": 3.140641927719116, "rewards/margins": 0.9619655609130859, "rewards/rejected": 2.1786763668060303, "step": 42210 }, { "epoch": 1.9601652815822461, "grad_norm": 107.27288818359375, "learning_rate": 1.824485816426018e-07, "logits/chosen": -18.708221435546875, "logits/rejected": -18.567161560058594, "logps/chosen": -376.75311279296875, "logps/rejected": -351.90692138671875, "loss": 0.81, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.367006301879883, "rewards/margins": 0.45641833543777466, "rewards/rejected": 2.9105875492095947, "step": 42220 }, { "epoch": 1.9606295556896791, "grad_norm": 61.18074035644531, "learning_rate": 1.824207251961558e-07, "logits/chosen": -17.401308059692383, "logits/rejected": -17.387086868286133, "logps/chosen": -406.5782165527344, "logps/rejected": -339.4120788574219, "loss": 1.1366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5398929119110107, "rewards/margins": 0.2075139731168747, "rewards/rejected": 3.332379102706909, "step": 42230 }, { "epoch": 1.9610938297971123, "grad_norm": 19.62616539001465, 
"learning_rate": 1.823928687497098e-07, "logits/chosen": -18.363567352294922, "logits/rejected": -17.26764678955078, "logps/chosen": -478.88555908203125, "logps/rejected": -293.0727844238281, "loss": 0.2878, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.388232707977295, "rewards/margins": 1.441516637802124, "rewards/rejected": 1.9467159509658813, "step": 42240 }, { "epoch": 1.961558103904545, "grad_norm": 105.87979888916016, "learning_rate": 1.8236501230326384e-07, "logits/chosen": -18.932147979736328, "logits/rejected": -18.662092208862305, "logps/chosen": -356.5981750488281, "logps/rejected": -314.81890869140625, "loss": 0.8919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5143890380859375, "rewards/margins": -0.09764274209737778, "rewards/rejected": 2.6120314598083496, "step": 42250 }, { "epoch": 1.9620223780119783, "grad_norm": 103.80611419677734, "learning_rate": 1.8233715585681785e-07, "logits/chosen": -18.869670867919922, "logits/rejected": -17.534223556518555, "logps/chosen": -278.4515075683594, "logps/rejected": -217.84561157226562, "loss": 0.5489, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5900449752807617, "rewards/margins": 1.0232231616973877, "rewards/rejected": 1.5668216943740845, "step": 42260 }, { "epoch": 1.9624866521194113, "grad_norm": 82.51806640625, "learning_rate": 1.8231208505501647e-07, "logits/chosen": -18.214317321777344, "logits/rejected": -18.42003631591797, "logps/chosen": -426.639404296875, "logps/rejected": -414.16937255859375, "loss": 1.2039, "rewards/accuracies": 0.5, "rewards/chosen": 2.9695844650268555, "rewards/margins": -0.4613662660121918, "rewards/rejected": 3.430950880050659, "step": 42270 }, { "epoch": 1.9629509262268443, "grad_norm": 85.1544418334961, "learning_rate": 1.822842286085705e-07, "logits/chosen": -19.220252990722656, "logits/rejected": -18.175209045410156, "logps/chosen": -508.74822998046875, "logps/rejected": -401.9361877441406, "loss": 0.9968, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.225892543792725, "rewards/margins": 1.0689284801483154, "rewards/rejected": 3.156964063644409, "step": 42280 }, { "epoch": 1.9634152003342775, "grad_norm": 51.931121826171875, "learning_rate": 1.822563721621245e-07, "logits/chosen": -18.208112716674805, "logits/rejected": -17.981876373291016, "logps/chosen": -314.14825439453125, "logps/rejected": -340.2015686035156, "loss": 0.8069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9563100337982178, "rewards/margins": 0.5420713424682617, "rewards/rejected": 2.414238452911377, "step": 42290 }, { "epoch": 1.9638794744417103, "grad_norm": 27.614587783813477, "learning_rate": 1.8222851571567852e-07, "logits/chosen": -19.436695098876953, "logits/rejected": -19.240779876708984, "logps/chosen": -509.669189453125, "logps/rejected": -427.8014221191406, "loss": 0.5314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.737734794616699, "rewards/margins": 1.3201541900634766, "rewards/rejected": 3.4175808429718018, "step": 42300 }, { "epoch": 1.9643437485491435, "grad_norm": 221.33355712890625, "learning_rate": 1.8220065926923256e-07, "logits/chosen": -19.18813705444336, "logits/rejected": -17.805002212524414, "logps/chosen": -499.22137451171875, "logps/rejected": -331.9200744628906, "loss": 0.5799, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4009647369384766, "rewards/margins": 1.306395173072815, "rewards/rejected": 2.0945699214935303, "step": 42310 }, { "epoch": 1.9648080226565763, "grad_norm": 51.611656188964844, "learning_rate": 1.8217280282278657e-07, "logits/chosen": -19.736560821533203, "logits/rejected": -17.859060287475586, "logps/chosen": -387.0459899902344, "logps/rejected": -264.27996826171875, "loss": 0.3304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.720378875732422, "rewards/margins": 1.8981527090072632, "rewards/rejected": 1.8222262859344482, "step": 42320 }, { "epoch": 1.9652722967640095, 
"grad_norm": 6.478548526763916, "learning_rate": 1.8214494637634056e-07, "logits/chosen": -18.859155654907227, "logits/rejected": -18.255645751953125, "logps/chosen": -345.0323181152344, "logps/rejected": -291.00048828125, "loss": 0.7422, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.942396879196167, "rewards/margins": 0.8382382392883301, "rewards/rejected": 2.104158639907837, "step": 42330 }, { "epoch": 1.9657365708714425, "grad_norm": 0.99751216173172, "learning_rate": 1.821170899298946e-07, "logits/chosen": -19.036800384521484, "logits/rejected": -17.520992279052734, "logps/chosen": -381.8861999511719, "logps/rejected": -252.8522186279297, "loss": 0.2818, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8405990600585938, "rewards/margins": 2.1138415336608887, "rewards/rejected": 1.7267580032348633, "step": 42340 }, { "epoch": 1.9662008449788755, "grad_norm": 87.79495239257812, "learning_rate": 1.8208923348344862e-07, "logits/chosen": -17.913036346435547, "logits/rejected": -17.449264526367188, "logps/chosen": -268.7281188964844, "logps/rejected": -220.0655517578125, "loss": 0.6212, "rewards/accuracies": 0.5, "rewards/chosen": 1.6550085544586182, "rewards/margins": 0.409907728433609, "rewards/rejected": 1.245100736618042, "step": 42350 }, { "epoch": 1.9666651190863087, "grad_norm": 0.5474165678024292, "learning_rate": 1.8206137703700266e-07, "logits/chosen": -20.251367568969727, "logits/rejected": -18.35207748413086, "logps/chosen": -494.66827392578125, "logps/rejected": -301.6368713378906, "loss": 0.4215, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3105244636535645, "rewards/margins": 2.4609599113464355, "rewards/rejected": 1.8495643138885498, "step": 42360 }, { "epoch": 1.9671293931937415, "grad_norm": 103.86624145507812, "learning_rate": 1.8203352059055664e-07, "logits/chosen": -19.874300003051758, "logits/rejected": -18.872386932373047, "logps/chosen": -438.09637451171875, "logps/rejected": -371.27215576171875, 
"loss": 0.357, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.008772850036621, "rewards/margins": 1.1454509496688843, "rewards/rejected": 2.863321542739868, "step": 42370 }, { "epoch": 1.9675936673011747, "grad_norm": 1.3919302225112915, "learning_rate": 1.8200566414411066e-07, "logits/chosen": -20.89860725402832, "logits/rejected": -19.567123413085938, "logps/chosen": -369.59820556640625, "logps/rejected": -297.2861633300781, "loss": 0.6551, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.40429949760437, "rewards/margins": 1.0732451677322388, "rewards/rejected": 2.3310546875, "step": 42380 }, { "epoch": 1.9680579414086077, "grad_norm": 0.5785614252090454, "learning_rate": 1.819778076976647e-07, "logits/chosen": -18.946836471557617, "logits/rejected": -18.454402923583984, "logps/chosen": -310.94903564453125, "logps/rejected": -262.36639404296875, "loss": 0.7671, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.080361366271973, "rewards/margins": 1.969313383102417, "rewards/rejected": 2.1110479831695557, "step": 42390 }, { "epoch": 1.9685222155160407, "grad_norm": 31.776668548583984, "learning_rate": 1.8194995125121871e-07, "logits/chosen": -18.605907440185547, "logits/rejected": -17.422996520996094, "logps/chosen": -446.21563720703125, "logps/rejected": -278.052978515625, "loss": 0.326, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.540545463562012, "rewards/margins": 2.099250078201294, "rewards/rejected": 2.441295862197876, "step": 42400 }, { "epoch": 1.9689864896234737, "grad_norm": 156.91339111328125, "learning_rate": 1.8192209480477273e-07, "logits/chosen": -18.70024871826172, "logits/rejected": -18.458759307861328, "logps/chosen": -285.9315490722656, "logps/rejected": -298.7745361328125, "loss": 0.772, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4525744915008545, "rewards/margins": 0.26445597410202026, "rewards/rejected": 2.1881184577941895, "step": 42410 }, { "epoch": 1.9694507637309067, 
"grad_norm": 40.043704986572266, "learning_rate": 1.8189423835832674e-07, "logits/chosen": -18.262935638427734, "logits/rejected": -18.426973342895508, "logps/chosen": -406.92034912109375, "logps/rejected": -390.42498779296875, "loss": 0.7997, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3620445728302, "rewards/margins": 0.7089788913726807, "rewards/rejected": 2.6530654430389404, "step": 42420 }, { "epoch": 1.9699150378383399, "grad_norm": 121.82951354980469, "learning_rate": 1.8186638191188078e-07, "logits/chosen": -19.346141815185547, "logits/rejected": -18.60301971435547, "logps/chosen": -356.01141357421875, "logps/rejected": -299.12603759765625, "loss": 0.6031, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.201653003692627, "rewards/margins": 0.715100109577179, "rewards/rejected": 2.4865527153015137, "step": 42430 }, { "epoch": 1.9703793119457726, "grad_norm": 6.9052205085754395, "learning_rate": 1.818385254654348e-07, "logits/chosen": -19.282230377197266, "logits/rejected": -17.897937774658203, "logps/chosen": -349.1459045410156, "logps/rejected": -327.542236328125, "loss": 1.1056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6365866661071777, "rewards/margins": 0.7900747060775757, "rewards/rejected": 2.8465120792388916, "step": 42440 }, { "epoch": 1.9708435860532059, "grad_norm": 184.4447784423828, "learning_rate": 1.818106690189888e-07, "logits/chosen": -18.648056030273438, "logits/rejected": -18.146465301513672, "logps/chosen": -322.6502685546875, "logps/rejected": -281.4465637207031, "loss": 0.8959, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3202059268951416, "rewards/margins": 0.07336696982383728, "rewards/rejected": 2.2468390464782715, "step": 42450 }, { "epoch": 1.9713078601606389, "grad_norm": 121.4513931274414, "learning_rate": 1.8178281257254283e-07, "logits/chosen": -18.666040420532227, "logits/rejected": -17.298173904418945, "logps/chosen": -495.00555419921875, "logps/rejected": 
-350.8936462402344, "loss": 1.1374, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8783106803894043, "rewards/margins": 1.1594347953796387, "rewards/rejected": 2.718876361846924, "step": 42460 }, { "epoch": 1.9717721342680719, "grad_norm": 96.1029281616211, "learning_rate": 1.8175495612609684e-07, "logits/chosen": -18.84464454650879, "logits/rejected": -18.501598358154297, "logps/chosen": -379.2105407714844, "logps/rejected": -265.3074035644531, "loss": 0.4992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2617580890655518, "rewards/margins": 1.2334336042404175, "rewards/rejected": 2.0283243656158447, "step": 42470 }, { "epoch": 1.9722364083755048, "grad_norm": 124.66107177734375, "learning_rate": 1.8172709967965083e-07, "logits/chosen": -18.95945167541504, "logits/rejected": -17.163576126098633, "logps/chosen": -398.07733154296875, "logps/rejected": -251.0264892578125, "loss": 0.4364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1616337299346924, "rewards/margins": 1.397409200668335, "rewards/rejected": 1.7642244100570679, "step": 42480 }, { "epoch": 1.9727006824829378, "grad_norm": 11.331705093383789, "learning_rate": 1.8169924323320487e-07, "logits/chosen": -20.01219940185547, "logits/rejected": -19.565885543823242, "logps/chosen": -578.950927734375, "logps/rejected": -432.554443359375, "loss": 0.5713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7846405506134033, "rewards/margins": 0.7576920390129089, "rewards/rejected": 3.0269486904144287, "step": 42490 }, { "epoch": 1.973164956590371, "grad_norm": 27.487171173095703, "learning_rate": 1.8167138678675889e-07, "logits/chosen": -18.104780197143555, "logits/rejected": -17.447437286376953, "logps/chosen": -293.82366943359375, "logps/rejected": -242.53262329101562, "loss": 0.43, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6608102321624756, "rewards/margins": 0.8178327679634094, "rewards/rejected": 1.8429772853851318, "step": 42500 }, { 
"epoch": 1.9736292306978038, "grad_norm": 27.053224563598633, "learning_rate": 1.8164353034031293e-07, "logits/chosen": -18.33245849609375, "logits/rejected": -17.349414825439453, "logps/chosen": -507.66412353515625, "logps/rejected": -364.27642822265625, "loss": 0.7544, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8567397594451904, "rewards/margins": 1.0200868844985962, "rewards/rejected": 2.836653232574463, "step": 42510 }, { "epoch": 1.974093504805237, "grad_norm": 194.9723663330078, "learning_rate": 1.8161567389386692e-07, "logits/chosen": -19.3186092376709, "logits/rejected": -19.180419921875, "logps/chosen": -386.8966979980469, "logps/rejected": -355.5835876464844, "loss": 0.7277, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.227191925048828, "rewards/margins": 0.6356468200683594, "rewards/rejected": 2.5915448665618896, "step": 42520 }, { "epoch": 1.97455777891267, "grad_norm": 37.710227966308594, "learning_rate": 1.8158781744742093e-07, "logits/chosen": -19.71684455871582, "logits/rejected": -19.094797134399414, "logps/chosen": -377.44647216796875, "logps/rejected": -397.9712829589844, "loss": 0.7226, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1749062538146973, "rewards/margins": 0.3130609095096588, "rewards/rejected": 2.8618452548980713, "step": 42530 }, { "epoch": 1.975022053020103, "grad_norm": 29.700653076171875, "learning_rate": 1.8155996100097497e-07, "logits/chosen": -20.337085723876953, "logits/rejected": -17.362829208374023, "logps/chosen": -564.4881591796875, "logps/rejected": -272.5950012207031, "loss": 0.2006, "rewards/accuracies": 1.0, "rewards/chosen": 4.281317710876465, "rewards/margins": 3.055260419845581, "rewards/rejected": 1.2260578870773315, "step": 42540 }, { "epoch": 1.9754863271275362, "grad_norm": 108.11743927001953, "learning_rate": 1.8153210455452899e-07, "logits/chosen": -18.92232322692871, "logits/rejected": -18.450672149658203, "logps/chosen": -362.45458984375, "logps/rejected": 
-282.24072265625, "loss": 0.7454, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.748894453048706, "rewards/margins": 0.7367315292358398, "rewards/rejected": 2.012162923812866, "step": 42550 }, { "epoch": 1.975950601234969, "grad_norm": 189.5052490234375, "learning_rate": 1.81504248108083e-07, "logits/chosen": -19.484878540039062, "logits/rejected": -20.162006378173828, "logps/chosen": -418.81622314453125, "logps/rejected": -505.09881591796875, "loss": 1.1994, "rewards/accuracies": 0.5, "rewards/chosen": 4.105198860168457, "rewards/margins": -0.26087644696235657, "rewards/rejected": 4.366075038909912, "step": 42560 }, { "epoch": 1.9764148753424022, "grad_norm": 11.086915016174316, "learning_rate": 1.8147639166163701e-07, "logits/chosen": -18.207611083984375, "logits/rejected": -17.475330352783203, "logps/chosen": -351.04217529296875, "logps/rejected": -268.38616943359375, "loss": 0.7927, "rewards/accuracies": 0.5, "rewards/chosen": 2.9848647117614746, "rewards/margins": 0.4901299476623535, "rewards/rejected": 2.494734764099121, "step": 42570 }, { "epoch": 1.9768791494498352, "grad_norm": 220.3790283203125, "learning_rate": 1.8144853521519106e-07, "logits/chosen": -18.386356353759766, "logits/rejected": -17.344593048095703, "logps/chosen": -477.48040771484375, "logps/rejected": -320.7225646972656, "loss": 0.8805, "rewards/accuracies": 0.5, "rewards/chosen": 3.8938286304473877, "rewards/margins": 1.2158666849136353, "rewards/rejected": 2.677961826324463, "step": 42580 }, { "epoch": 1.9773434235572682, "grad_norm": 49.26434326171875, "learning_rate": 1.8142067876874507e-07, "logits/chosen": -18.99570083618164, "logits/rejected": -17.944561004638672, "logps/chosen": -296.260986328125, "logps/rejected": -179.8692626953125, "loss": 0.5494, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7738051414489746, "rewards/margins": 1.2423522472381592, "rewards/rejected": 1.5314526557922363, "step": 42590 }, { "epoch": 1.9778076976647012, "grad_norm": 
40.7407112121582, "learning_rate": 1.8139282232229906e-07, "logits/chosen": -19.227840423583984, "logits/rejected": -19.174528121948242, "logps/chosen": -358.45068359375, "logps/rejected": -387.56329345703125, "loss": 1.1381, "rewards/accuracies": 0.5, "rewards/chosen": 3.9175033569335938, "rewards/margins": 0.24460859596729279, "rewards/rejected": 3.6728949546813965, "step": 42600 }, { "epoch": 1.9782719717721342, "grad_norm": 41.96134567260742, "learning_rate": 1.813649658758531e-07, "logits/chosen": -18.034122467041016, "logits/rejected": -17.6800537109375, "logps/chosen": -313.94000244140625, "logps/rejected": -278.5326232910156, "loss": 0.5351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0003669261932373, "rewards/margins": 0.9853755831718445, "rewards/rejected": 2.014991283416748, "step": 42610 }, { "epoch": 1.9787362458795674, "grad_norm": 4.641422271728516, "learning_rate": 1.8133710942940711e-07, "logits/chosen": -19.229175567626953, "logits/rejected": -18.265487670898438, "logps/chosen": -516.5594482421875, "logps/rejected": -329.27734375, "loss": 0.4597, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9311935901641846, "rewards/margins": 1.7504148483276367, "rewards/rejected": 2.1807782649993896, "step": 42620 }, { "epoch": 1.9792005199870002, "grad_norm": 38.671791076660156, "learning_rate": 1.8130925298296116e-07, "logits/chosen": -18.220151901245117, "logits/rejected": -17.49997329711914, "logps/chosen": -331.63360595703125, "logps/rejected": -260.3877868652344, "loss": 0.6475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.629427671432495, "rewards/margins": 1.7048208713531494, "rewards/rejected": 1.9246069192886353, "step": 42630 }, { "epoch": 1.9796647940944334, "grad_norm": 170.2883758544922, "learning_rate": 1.8128139653651514e-07, "logits/chosen": -19.17383575439453, "logits/rejected": -18.025466918945312, "logps/chosen": -397.86761474609375, "logps/rejected": -368.5692138671875, "loss": 0.6334, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8244621753692627, "rewards/margins": 1.1325511932373047, "rewards/rejected": 2.691910743713379, "step": 42640 }, { "epoch": 1.9801290682018664, "grad_norm": 2.2450616359710693, "learning_rate": 1.8125354009006916e-07, "logits/chosen": -19.647262573242188, "logits/rejected": -18.932626724243164, "logps/chosen": -278.677490234375, "logps/rejected": -255.51181030273438, "loss": 0.688, "rewards/accuracies": 0.5, "rewards/chosen": 2.7300922870635986, "rewards/margins": 0.7289517521858215, "rewards/rejected": 2.0011401176452637, "step": 42650 }, { "epoch": 1.9805933423092994, "grad_norm": 31.76222801208496, "learning_rate": 1.812256836436232e-07, "logits/chosen": -19.28017807006836, "logits/rejected": -18.709999084472656, "logps/chosen": -259.54498291015625, "logps/rejected": -220.18179321289062, "loss": 1.0202, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6293351650238037, "rewards/margins": 0.538042426109314, "rewards/rejected": 2.0912926197052, "step": 42660 }, { "epoch": 1.9810576164167324, "grad_norm": 73.49739837646484, "learning_rate": 1.811978271971772e-07, "logits/chosen": -18.364028930664062, "logits/rejected": -17.592195510864258, "logps/chosen": -265.6153869628906, "logps/rejected": -200.7653350830078, "loss": 0.6123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.454296112060547, "rewards/margins": 0.8881757855415344, "rewards/rejected": 1.5661202669143677, "step": 42670 }, { "epoch": 1.9815218905241654, "grad_norm": 2.8718433380126953, "learning_rate": 1.8116997075073123e-07, "logits/chosen": -18.514217376708984, "logits/rejected": -18.815488815307617, "logps/chosen": -396.60491943359375, "logps/rejected": -363.2648010253906, "loss": 0.5245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.759997844696045, "rewards/margins": 1.5244619846343994, "rewards/rejected": 2.2355360984802246, "step": 42680 }, { "epoch": 1.9819861646315986, "grad_norm": 
41.167171478271484, "learning_rate": 1.8114211430428524e-07, "logits/chosen": -19.4045352935791, "logits/rejected": -18.205303192138672, "logps/chosen": -316.84454345703125, "logps/rejected": -264.5675354003906, "loss": 0.4171, "rewards/accuracies": 1.0, "rewards/chosen": 3.0817155838012695, "rewards/margins": 0.7721154093742371, "rewards/rejected": 2.309600353240967, "step": 42690 }, { "epoch": 1.9824504387390314, "grad_norm": 106.89360809326172, "learning_rate": 1.8111425785783926e-07, "logits/chosen": -18.40285301208496, "logits/rejected": -17.937641143798828, "logps/chosen": -411.30377197265625, "logps/rejected": -340.743408203125, "loss": 0.6836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.066134452819824, "rewards/margins": 1.1616698503494263, "rewards/rejected": 2.9044644832611084, "step": 42700 }, { "epoch": 1.9829147128464646, "grad_norm": 72.5691909790039, "learning_rate": 1.8108640141139327e-07, "logits/chosen": -17.7968807220459, "logits/rejected": -17.284976959228516, "logps/chosen": -393.58343505859375, "logps/rejected": -323.2862548828125, "loss": 0.4646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8275630474090576, "rewards/margins": 0.8368999361991882, "rewards/rejected": 1.9906628131866455, "step": 42710 }, { "epoch": 1.9833789869538976, "grad_norm": 84.56959533691406, "learning_rate": 1.8105854496494729e-07, "logits/chosen": -19.074581146240234, "logits/rejected": -18.850872039794922, "logps/chosen": -363.368408203125, "logps/rejected": -343.347412109375, "loss": 0.7119, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8962554931640625, "rewards/margins": 0.5039781332015991, "rewards/rejected": 3.392277479171753, "step": 42720 }, { "epoch": 1.9838432610613306, "grad_norm": 62.10097122192383, "learning_rate": 1.8103068851850133e-07, "logits/chosen": -19.477556228637695, "logits/rejected": -18.11239242553711, "logps/chosen": -377.6654357910156, "logps/rejected": -282.42156982421875, "loss": 0.3514, 
"rewards/accuracies": 1.0, "rewards/chosen": 3.1669881343841553, "rewards/margins": 1.2502434253692627, "rewards/rejected": 1.9167448282241821, "step": 42730 }, { "epoch": 1.9843075351687638, "grad_norm": 203.4615936279297, "learning_rate": 1.8100283207205534e-07, "logits/chosen": -17.599483489990234, "logits/rejected": -17.714014053344727, "logps/chosen": -336.5934143066406, "logps/rejected": -375.7364807128906, "loss": 1.5052, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9007993936538696, "rewards/margins": -0.6773046255111694, "rewards/rejected": 2.578104019165039, "step": 42740 }, { "epoch": 1.9847718092761966, "grad_norm": 293.5203552246094, "learning_rate": 1.8097497562560933e-07, "logits/chosen": -19.278427124023438, "logits/rejected": -18.16835594177246, "logps/chosen": -400.9444274902344, "logps/rejected": -298.1169738769531, "loss": 0.7335, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2311553955078125, "rewards/margins": 0.9542596936225891, "rewards/rejected": 2.2768962383270264, "step": 42750 }, { "epoch": 1.9852360833836298, "grad_norm": 91.18539428710938, "learning_rate": 1.8094711917916337e-07, "logits/chosen": -19.52165412902832, "logits/rejected": -18.575658798217773, "logps/chosen": -384.9106750488281, "logps/rejected": -358.5903015136719, "loss": 0.3586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6053805351257324, "rewards/margins": 1.1408838033676147, "rewards/rejected": 2.4644968509674072, "step": 42760 }, { "epoch": 1.9857003574910628, "grad_norm": 1.4574534893035889, "learning_rate": 1.8091926273271739e-07, "logits/chosen": -18.824546813964844, "logits/rejected": -18.2510929107666, "logps/chosen": -325.99896240234375, "logps/rejected": -292.1728515625, "loss": 0.6262, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.619255781173706, "rewards/margins": 1.2111538648605347, "rewards/rejected": 2.408101797103882, "step": 42770 }, { "epoch": 1.9861646315984958, "grad_norm": 
228.95741271972656, "learning_rate": 1.8089140628627143e-07, "logits/chosen": -18.830949783325195, "logits/rejected": -17.935176849365234, "logps/chosen": -521.1243896484375, "logps/rejected": -353.03521728515625, "loss": 0.6248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8584110736846924, "rewards/margins": 1.9125102758407593, "rewards/rejected": 1.9459006786346436, "step": 42780 }, { "epoch": 1.9866289057059288, "grad_norm": 279.3818054199219, "learning_rate": 1.8086354983982541e-07, "logits/chosen": -19.663837432861328, "logits/rejected": -18.837871551513672, "logps/chosen": -472.8114318847656, "logps/rejected": -377.6531982421875, "loss": 0.5639, "rewards/accuracies": 0.5, "rewards/chosen": 2.738002300262451, "rewards/margins": 0.6702307462692261, "rewards/rejected": 2.0677714347839355, "step": 42790 }, { "epoch": 1.9870931798133618, "grad_norm": 183.0138702392578, "learning_rate": 1.8083569339337943e-07, "logits/chosen": -19.147445678710938, "logits/rejected": -18.56405258178711, "logps/chosen": -385.73944091796875, "logps/rejected": -321.9242248535156, "loss": 0.7972, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3183531761169434, "rewards/margins": 0.4042627215385437, "rewards/rejected": 2.914091110229492, "step": 42800 }, { "epoch": 1.987557453920795, "grad_norm": 93.032958984375, "learning_rate": 1.8080783694693347e-07, "logits/chosen": -18.99798583984375, "logits/rejected": -18.423036575317383, "logps/chosen": -311.8733215332031, "logps/rejected": -265.14068603515625, "loss": 0.6855, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2853503227233887, "rewards/margins": 0.7840378880500793, "rewards/rejected": 1.5013126134872437, "step": 42810 }, { "epoch": 1.9880217280282277, "grad_norm": 11.306546211242676, "learning_rate": 1.8077998050048748e-07, "logits/chosen": -18.78314971923828, "logits/rejected": -18.009502410888672, "logps/chosen": -287.0216064453125, "logps/rejected": -268.91510009765625, "loss": 
0.6941, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7985635995864868, "rewards/margins": 0.6654070615768433, "rewards/rejected": 1.1331565380096436, "step": 42820 }, { "epoch": 1.988486002135661, "grad_norm": 164.275390625, "learning_rate": 1.807521240540415e-07, "logits/chosen": -19.56230926513672, "logits/rejected": -17.95060920715332, "logps/chosen": -490.64422607421875, "logps/rejected": -430.25982666015625, "loss": 0.6506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.306225776672363, "rewards/margins": 1.285437822341919, "rewards/rejected": 3.0207877159118652, "step": 42830 }, { "epoch": 1.988950276243094, "grad_norm": 42.40623092651367, "learning_rate": 1.8072426760759551e-07, "logits/chosen": -19.280303955078125, "logits/rejected": -18.660324096679688, "logps/chosen": -527.5831909179688, "logps/rejected": -419.079345703125, "loss": 0.8413, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9170830249786377, "rewards/margins": 0.6695173382759094, "rewards/rejected": 3.247565746307373, "step": 42840 }, { "epoch": 1.989414550350527, "grad_norm": 38.95508575439453, "learning_rate": 1.8069641116114955e-07, "logits/chosen": -17.722192764282227, "logits/rejected": -17.940929412841797, "logps/chosen": -379.3086853027344, "logps/rejected": -401.7926330566406, "loss": 1.1921, "rewards/accuracies": 0.5, "rewards/chosen": 2.427433490753174, "rewards/margins": -0.34094899892807007, "rewards/rejected": 2.7683827877044678, "step": 42850 }, { "epoch": 1.98987882445796, "grad_norm": 38.72517013549805, "learning_rate": 1.8066855471470357e-07, "logits/chosen": -19.097370147705078, "logits/rejected": -18.6097412109375, "logps/chosen": -356.69525146484375, "logps/rejected": -305.7466735839844, "loss": 0.6517, "rewards/accuracies": 0.5, "rewards/chosen": 2.7459750175476074, "rewards/margins": 0.576809287071228, "rewards/rejected": 2.169165849685669, "step": 42860 }, { "epoch": 1.990343098565393, "grad_norm": 205.99066162109375, 
"learning_rate": 1.8064069826825756e-07, "logits/chosen": -18.18876838684082, "logits/rejected": -17.9501953125, "logps/chosen": -294.62420654296875, "logps/rejected": -252.62002563476562, "loss": 0.7885, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6491856575012207, "rewards/margins": 0.6762650012969971, "rewards/rejected": 1.9729210138320923, "step": 42870 }, { "epoch": 1.9908073726728261, "grad_norm": 207.0225067138672, "learning_rate": 1.806128418218116e-07, "logits/chosen": -18.620601654052734, "logits/rejected": -18.072566986083984, "logps/chosen": -431.07598876953125, "logps/rejected": -313.3578186035156, "loss": 0.8807, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.800708055496216, "rewards/margins": 0.49484318494796753, "rewards/rejected": 3.3058650493621826, "step": 42880 }, { "epoch": 1.991271646780259, "grad_norm": 24.03898048400879, "learning_rate": 1.805849853753656e-07, "logits/chosen": -20.151996612548828, "logits/rejected": -18.67233657836914, "logps/chosen": -347.52239990234375, "logps/rejected": -275.4693908691406, "loss": 0.8105, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5186946392059326, "rewards/margins": 0.8422960042953491, "rewards/rejected": 1.6763986349105835, "step": 42890 }, { "epoch": 1.9917359208876921, "grad_norm": 179.37832641601562, "learning_rate": 1.805571289289196e-07, "logits/chosen": -18.87381362915039, "logits/rejected": -18.638141632080078, "logps/chosen": -380.4137878417969, "logps/rejected": -382.278564453125, "loss": 0.8873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1544597148895264, "rewards/margins": -0.04423251003026962, "rewards/rejected": 3.1986923217773438, "step": 42900 }, { "epoch": 1.9922001949951251, "grad_norm": 64.21070861816406, "learning_rate": 1.8052927248247364e-07, "logits/chosen": -19.038267135620117, "logits/rejected": -19.087947845458984, "logps/chosen": -411.2718200683594, "logps/rejected": -384.7752990722656, "loss": 1.0638, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5378596782684326, "rewards/margins": -0.02378857135772705, "rewards/rejected": 3.56164813041687, "step": 42910 }, { "epoch": 1.9926644691025581, "grad_norm": 42.86430358886719, "learning_rate": 1.8050141603602766e-07, "logits/chosen": -19.636215209960938, "logits/rejected": -19.139148712158203, "logps/chosen": -395.00433349609375, "logps/rejected": -445.46051025390625, "loss": 1.1786, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.276853084564209, "rewards/margins": -0.5147190093994141, "rewards/rejected": 3.7915725708007812, "step": 42920 }, { "epoch": 1.9931287432099913, "grad_norm": 71.3222885131836, "learning_rate": 1.804735595895817e-07, "logits/chosen": -18.802621841430664, "logits/rejected": -18.230480194091797, "logps/chosen": -305.3500061035156, "logps/rejected": -239.53085327148438, "loss": 0.4467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3089118003845215, "rewards/margins": 0.9387578964233398, "rewards/rejected": 1.3701540231704712, "step": 42930 }, { "epoch": 1.993593017317424, "grad_norm": 2.349644899368286, "learning_rate": 1.8044570314313569e-07, "logits/chosen": -18.34107780456543, "logits/rejected": -18.124292373657227, "logps/chosen": -352.3793640136719, "logps/rejected": -277.00250244140625, "loss": 0.4268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2147746086120605, "rewards/margins": 1.4576902389526367, "rewards/rejected": 1.7570844888687134, "step": 42940 }, { "epoch": 1.9940572914248573, "grad_norm": 158.7109375, "learning_rate": 1.804178466966897e-07, "logits/chosen": -18.641704559326172, "logits/rejected": -18.2716007232666, "logps/chosen": -337.9026794433594, "logps/rejected": -243.6305694580078, "loss": 0.5288, "rewards/accuracies": 0.5, "rewards/chosen": 2.8695945739746094, "rewards/margins": 1.0994665622711182, "rewards/rejected": 1.7701280117034912, "step": 42950 }, { "epoch": 1.9945215655322903, "grad_norm": 
1.816926121711731, "learning_rate": 1.8038999025024374e-07, "logits/chosen": -19.050235748291016, "logits/rejected": -18.329971313476562, "logps/chosen": -412.00836181640625, "logps/rejected": -290.16351318359375, "loss": 0.3755, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.037295341491699, "rewards/margins": 1.5875519514083862, "rewards/rejected": 2.4497432708740234, "step": 42960 }, { "epoch": 1.9949858396397233, "grad_norm": 41.35200881958008, "learning_rate": 1.8036213380379776e-07, "logits/chosen": -19.0234432220459, "logits/rejected": -18.918533325195312, "logps/chosen": -351.9905700683594, "logps/rejected": -349.97967529296875, "loss": 0.6127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9691641330718994, "rewards/margins": 0.7137571573257446, "rewards/rejected": 2.2554070949554443, "step": 42970 }, { "epoch": 1.9954501137471563, "grad_norm": 129.84165954589844, "learning_rate": 1.8033427735735177e-07, "logits/chosen": -18.82355308532715, "logits/rejected": -17.83591651916504, "logps/chosen": -355.92437744140625, "logps/rejected": -263.14910888671875, "loss": 0.9219, "rewards/accuracies": 0.5, "rewards/chosen": 2.3359713554382324, "rewards/margins": 0.0007840991020202637, "rewards/rejected": 2.3351874351501465, "step": 42980 }, { "epoch": 1.9959143878545893, "grad_norm": 12.26635456085205, "learning_rate": 1.8030642091090578e-07, "logits/chosen": -19.72203254699707, "logits/rejected": -19.172897338867188, "logps/chosen": -423.7127380371094, "logps/rejected": -354.9239196777344, "loss": 0.5438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.425365924835205, "rewards/margins": 0.9603629112243652, "rewards/rejected": 2.4650025367736816, "step": 42990 }, { "epoch": 1.9963786619620225, "grad_norm": 191.6250762939453, "learning_rate": 1.8027856446445983e-07, "logits/chosen": -18.809642791748047, "logits/rejected": -18.245159149169922, "logps/chosen": -335.8052062988281, "logps/rejected": -268.24884033203125, "loss": 
0.5576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.780907154083252, "rewards/margins": 1.1826794147491455, "rewards/rejected": 1.5982275009155273, "step": 43000 }, { "epoch": 1.9968429360694553, "grad_norm": 41.823543548583984, "learning_rate": 1.8025070801801384e-07, "logits/chosen": -18.708017349243164, "logits/rejected": -18.83115005493164, "logps/chosen": -367.3856506347656, "logps/rejected": -347.5245056152344, "loss": 1.1654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7666335105895996, "rewards/margins": 0.5389829874038696, "rewards/rejected": 3.2276508808135986, "step": 43010 }, { "epoch": 1.9973072101768885, "grad_norm": 200.6897735595703, "learning_rate": 1.8022285157156783e-07, "logits/chosen": -20.15322494506836, "logits/rejected": -18.748239517211914, "logps/chosen": -437.7386169433594, "logps/rejected": -287.549072265625, "loss": 0.7122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3519370555877686, "rewards/margins": 0.8308296203613281, "rewards/rejected": 2.5211071968078613, "step": 43020 }, { "epoch": 1.9977714842843215, "grad_norm": 305.3041687011719, "learning_rate": 1.8019499512512187e-07, "logits/chosen": -19.59585189819336, "logits/rejected": -17.86004066467285, "logps/chosen": -444.285400390625, "logps/rejected": -374.1369323730469, "loss": 0.4737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.11525821685791, "rewards/margins": 1.8538916110992432, "rewards/rejected": 2.261366844177246, "step": 43030 }, { "epoch": 1.9982357583917545, "grad_norm": 47.968379974365234, "learning_rate": 1.8016713867867588e-07, "logits/chosen": -19.106956481933594, "logits/rejected": -18.333593368530273, "logps/chosen": -378.00372314453125, "logps/rejected": -245.93826293945312, "loss": 0.3927, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4591827392578125, "rewards/margins": 1.8305232524871826, "rewards/rejected": 1.6286592483520508, "step": 43040 }, { "epoch": 1.9987000324991875, 
"grad_norm": 166.72528076171875, "learning_rate": 1.8013928223222992e-07, "logits/chosen": -18.41118621826172, "logits/rejected": -18.14236831665039, "logps/chosen": -326.3479309082031, "logps/rejected": -277.20782470703125, "loss": 1.1059, "rewards/accuracies": 0.5, "rewards/chosen": 2.5836007595062256, "rewards/margins": 0.31817150115966797, "rewards/rejected": 2.2654290199279785, "step": 43050 }, { "epoch": 1.9991643066066205, "grad_norm": 17.118526458740234, "learning_rate": 1.801114257857839e-07, "logits/chosen": -19.36874771118164, "logits/rejected": -18.831993103027344, "logps/chosen": -391.72161865234375, "logps/rejected": -308.28460693359375, "loss": 0.3763, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.837066650390625, "rewards/margins": 1.5382635593414307, "rewards/rejected": 2.2988035678863525, "step": 43060 }, { "epoch": 1.9996285807140537, "grad_norm": 0.24908098578453064, "learning_rate": 1.8008356933933793e-07, "logits/chosen": -18.746301651000977, "logits/rejected": -17.94179916381836, "logps/chosen": -426.33648681640625, "logps/rejected": -326.3373107910156, "loss": 0.6262, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.117153644561768, "rewards/margins": 1.731398582458496, "rewards/rejected": 2.3857550621032715, "step": 43070 }, { "epoch": 2.0000928548214865, "grad_norm": 194.31260681152344, "learning_rate": 1.8005571289289197e-07, "logits/chosen": -18.030033111572266, "logits/rejected": -17.469688415527344, "logps/chosen": -372.5929260253906, "logps/rejected": -266.1416015625, "loss": 0.961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.723098039627075, "rewards/margins": 0.5176913142204285, "rewards/rejected": 2.205406665802002, "step": 43080 }, { "epoch": 2.0005571289289197, "grad_norm": 9.08421802520752, "learning_rate": 1.8002785644644596e-07, "logits/chosen": -17.889251708984375, "logits/rejected": -17.82131576538086, "logps/chosen": -279.26239013671875, "logps/rejected": -261.59490966796875, 
"loss": 0.9688, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8786815404891968, "rewards/margins": -0.08003872632980347, "rewards/rejected": 1.9587202072143555, "step": 43090 }, { "epoch": 2.0010214030363525, "grad_norm": 150.63070678710938, "learning_rate": 1.8e-07, "logits/chosen": -19.378726959228516, "logits/rejected": -18.838268280029297, "logps/chosen": -364.1375427246094, "logps/rejected": -314.870849609375, "loss": 0.7132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.059782981872559, "rewards/margins": 0.41151851415634155, "rewards/rejected": 3.6482644081115723, "step": 43100 }, { "epoch": 2.0014856771437857, "grad_norm": 54.56712341308594, "learning_rate": 1.79972143553554e-07, "logits/chosen": -18.620079040527344, "logits/rejected": -18.575904846191406, "logps/chosen": -278.0565490722656, "logps/rejected": -289.9712219238281, "loss": 0.885, "rewards/accuracies": 0.5, "rewards/chosen": 2.4700980186462402, "rewards/margins": 0.30523252487182617, "rewards/rejected": 2.164865732192993, "step": 43110 }, { "epoch": 2.001949951251219, "grad_norm": 120.10720825195312, "learning_rate": 1.7994428710710803e-07, "logits/chosen": -18.99234390258789, "logits/rejected": -18.222991943359375, "logps/chosen": -472.5804138183594, "logps/rejected": -440.873046875, "loss": 0.6502, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8629150390625, "rewards/margins": 0.9713441729545593, "rewards/rejected": 2.891570806503296, "step": 43120 }, { "epoch": 2.0024142253586517, "grad_norm": 1.5194998979568481, "learning_rate": 1.7991643066066204e-07, "logits/chosen": -19.723297119140625, "logits/rejected": -18.409175872802734, "logps/chosen": -460.8097229003906, "logps/rejected": -301.5444641113281, "loss": 0.421, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.618660926818848, "rewards/margins": 2.0207695960998535, "rewards/rejected": 2.597891330718994, "step": 43130 }, { "epoch": 2.002878499466085, "grad_norm": 96.91455078125, 
"learning_rate": 1.7988857421421606e-07, "logits/chosen": -18.92562484741211, "logits/rejected": -17.84929656982422, "logps/chosen": -395.01763916015625, "logps/rejected": -335.81622314453125, "loss": 0.6677, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.270151615142822, "rewards/margins": 1.4573750495910645, "rewards/rejected": 2.8127760887145996, "step": 43140 }, { "epoch": 2.0033427735735176, "grad_norm": 292.5623779296875, "learning_rate": 1.798607177677701e-07, "logits/chosen": -18.28243637084961, "logits/rejected": -18.255573272705078, "logps/chosen": -396.8036804199219, "logps/rejected": -409.7127380371094, "loss": 0.8385, "rewards/accuracies": 0.5, "rewards/chosen": 3.8560149669647217, "rewards/margins": 0.9031203389167786, "rewards/rejected": 2.952894449234009, "step": 43150 }, { "epoch": 2.003807047680951, "grad_norm": 23.597396850585938, "learning_rate": 1.798328613213241e-07, "logits/chosen": -19.55625343322754, "logits/rejected": -19.099815368652344, "logps/chosen": -385.26190185546875, "logps/rejected": -304.32281494140625, "loss": 0.8371, "rewards/accuracies": 0.5, "rewards/chosen": 3.1607351303100586, "rewards/margins": 0.8368943333625793, "rewards/rejected": 2.323840618133545, "step": 43160 }, { "epoch": 2.004271321788384, "grad_norm": 24.46442222595215, "learning_rate": 1.798050048748781e-07, "logits/chosen": -19.18989372253418, "logits/rejected": -17.761051177978516, "logps/chosen": -340.98553466796875, "logps/rejected": -158.05470275878906, "loss": 0.4217, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.17348051071167, "rewards/margins": 1.8407049179077148, "rewards/rejected": 1.3327758312225342, "step": 43170 }, { "epoch": 2.004735595895817, "grad_norm": 2.118476390838623, "learning_rate": 1.7977714842843214e-07, "logits/chosen": -19.45284652709961, "logits/rejected": -18.32381820678711, "logps/chosen": -437.0039978027344, "logps/rejected": -310.6437072753906, "loss": 0.1983, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 4.3118438720703125, "rewards/margins": 2.4357833862304688, "rewards/rejected": 1.8760607242584229, "step": 43180 }, { "epoch": 2.00519987000325, "grad_norm": 27.3399658203125, "learning_rate": 1.7974929198198615e-07, "logits/chosen": -18.59480857849121, "logits/rejected": -18.937772750854492, "logps/chosen": -345.4969177246094, "logps/rejected": -368.85369873046875, "loss": 0.8575, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1544675827026367, "rewards/margins": 0.36839431524276733, "rewards/rejected": 2.786073684692383, "step": 43190 }, { "epoch": 2.005664144110683, "grad_norm": 64.89081573486328, "learning_rate": 1.797214355355402e-07, "logits/chosen": -19.40268898010254, "logits/rejected": -17.877809524536133, "logps/chosen": -464.72039794921875, "logps/rejected": -364.6218566894531, "loss": 0.3629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.299437046051025, "rewards/margins": 1.487798810005188, "rewards/rejected": 2.8116374015808105, "step": 43200 }, { "epoch": 2.006128418218116, "grad_norm": 1.1184940338134766, "learning_rate": 1.7969357908909418e-07, "logits/chosen": -19.146991729736328, "logits/rejected": -18.635499954223633, "logps/chosen": -350.8233337402344, "logps/rejected": -299.2852783203125, "loss": 0.5377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.570927858352661, "rewards/margins": 1.31070876121521, "rewards/rejected": 2.260219097137451, "step": 43210 }, { "epoch": 2.006592692325549, "grad_norm": 83.57025146484375, "learning_rate": 1.796657226426482e-07, "logits/chosen": -19.148372650146484, "logits/rejected": -17.97930908203125, "logps/chosen": -333.67230224609375, "logps/rejected": -242.12564086914062, "loss": 0.2874, "rewards/accuracies": 1.0, "rewards/chosen": 3.0089306831359863, "rewards/margins": 1.6276426315307617, "rewards/rejected": 1.3812880516052246, "step": 43220 }, { "epoch": 2.007056966432982, "grad_norm": 87.5181655883789, "learning_rate": 
1.7963786619620224e-07, "logits/chosen": -19.576784133911133, "logits/rejected": -18.138996124267578, "logps/chosen": -373.52337646484375, "logps/rejected": -266.8445739746094, "loss": 0.291, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.798215866088867, "rewards/margins": 2.1321582794189453, "rewards/rejected": 1.6660579442977905, "step": 43230 }, { "epoch": 2.0075212405404153, "grad_norm": 167.26370239257812, "learning_rate": 1.7961000974975625e-07, "logits/chosen": -18.148670196533203, "logits/rejected": -17.750226974487305, "logps/chosen": -375.64239501953125, "logps/rejected": -307.8279113769531, "loss": 0.641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.25585675239563, "rewards/margins": 1.0811288356781006, "rewards/rejected": 2.1747279167175293, "step": 43240 }, { "epoch": 2.007985514647848, "grad_norm": 139.61553955078125, "learning_rate": 1.7958215330331027e-07, "logits/chosen": -19.017427444458008, "logits/rejected": -18.388141632080078, "logps/chosen": -473.84124755859375, "logps/rejected": -358.9829406738281, "loss": 0.8618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.123570442199707, "rewards/margins": 0.8249955177307129, "rewards/rejected": 3.2985751628875732, "step": 43250 }, { "epoch": 2.0084497887552812, "grad_norm": 3.7419657707214355, "learning_rate": 1.7955429685686428e-07, "logits/chosen": -19.021717071533203, "logits/rejected": -18.034893035888672, "logps/chosen": -372.2939453125, "logps/rejected": -245.4025421142578, "loss": 0.6897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9349894523620605, "rewards/margins": 1.1747245788574219, "rewards/rejected": 1.7602649927139282, "step": 43260 }, { "epoch": 2.008914062862714, "grad_norm": 167.03379821777344, "learning_rate": 1.7952644041041832e-07, "logits/chosen": -18.832033157348633, "logits/rejected": -19.05031967163086, "logps/chosen": -352.8998718261719, "logps/rejected": -321.03631591796875, "loss": 1.2617, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 2.6093125343322754, "rewards/margins": -0.33619457483291626, "rewards/rejected": 2.9455068111419678, "step": 43270 }, { "epoch": 2.0093783369701472, "grad_norm": 93.35286712646484, "learning_rate": 1.794985839639723e-07, "logits/chosen": -18.974159240722656, "logits/rejected": -18.053096771240234, "logps/chosen": -273.02960205078125, "logps/rejected": -219.8070831298828, "loss": 0.5178, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9775972366333008, "rewards/margins": 0.8616237640380859, "rewards/rejected": 1.1159733533859253, "step": 43280 }, { "epoch": 2.00984261107758, "grad_norm": 49.19605255126953, "learning_rate": 1.7947072751752633e-07, "logits/chosen": -19.02791976928711, "logits/rejected": -18.439342498779297, "logps/chosen": -395.2756652832031, "logps/rejected": -312.15081787109375, "loss": 0.4644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.324462413787842, "rewards/margins": 1.4110691547393799, "rewards/rejected": 1.9133933782577515, "step": 43290 }, { "epoch": 2.010306885185013, "grad_norm": 6.1750617027282715, "learning_rate": 1.7944287107108037e-07, "logits/chosen": -19.38312530517578, "logits/rejected": -18.174434661865234, "logps/chosen": -452.7962341308594, "logps/rejected": -329.4296569824219, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 5.3359055519104, "rewards/margins": 2.6145567893981934, "rewards/rejected": 2.7213492393493652, "step": 43300 }, { "epoch": 2.0107711592924464, "grad_norm": 59.22254943847656, "learning_rate": 1.7941501462463438e-07, "logits/chosen": -18.548480987548828, "logits/rejected": -18.813547134399414, "logps/chosen": -438.6534118652344, "logps/rejected": -444.31903076171875, "loss": 1.7956, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.344754695892334, "rewards/margins": -0.47832876443862915, "rewards/rejected": 3.8230834007263184, "step": 43310 }, { "epoch": 2.011235433399879, "grad_norm": 59.48331832885742, 
"learning_rate": 1.7938715817818837e-07, "logits/chosen": -19.441816329956055, "logits/rejected": -18.633411407470703, "logps/chosen": -424.92437744140625, "logps/rejected": -297.2718811035156, "loss": 0.4661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.583237886428833, "rewards/margins": 1.4862316846847534, "rewards/rejected": 2.097006320953369, "step": 43320 }, { "epoch": 2.0116997075073124, "grad_norm": 47.40639877319336, "learning_rate": 1.793593017317424e-07, "logits/chosen": -18.890979766845703, "logits/rejected": -18.01860237121582, "logps/chosen": -493.6333923339844, "logps/rejected": -345.7381286621094, "loss": 0.3639, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.164098739624023, "rewards/margins": 1.6531736850738525, "rewards/rejected": 2.51092529296875, "step": 43330 }, { "epoch": 2.012163981614745, "grad_norm": 121.92152404785156, "learning_rate": 1.7933144528529643e-07, "logits/chosen": -18.74139976501465, "logits/rejected": -18.443309783935547, "logps/chosen": -409.2865295410156, "logps/rejected": -306.4384765625, "loss": 0.8931, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.401360034942627, "rewards/margins": 0.26549941301345825, "rewards/rejected": 3.1358606815338135, "step": 43340 }, { "epoch": 2.0126282557221784, "grad_norm": 202.75474548339844, "learning_rate": 1.7930358883885047e-07, "logits/chosen": -19.023263931274414, "logits/rejected": -17.88411521911621, "logps/chosen": -429.65484619140625, "logps/rejected": -317.4249572753906, "loss": 0.5978, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.663074016571045, "rewards/margins": 1.2498358488082886, "rewards/rejected": 3.4132378101348877, "step": 43350 }, { "epoch": 2.0130925298296116, "grad_norm": 99.42668151855469, "learning_rate": 1.7927573239240445e-07, "logits/chosen": -19.835155487060547, "logits/rejected": -19.486652374267578, "logps/chosen": -473.8863220214844, "logps/rejected": -504.975830078125, "loss": 0.8347, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.12489128112793, "rewards/margins": 0.012280428782105446, "rewards/rejected": 4.112610816955566, "step": 43360 }, { "epoch": 2.0135568039370444, "grad_norm": 0.37330594658851624, "learning_rate": 1.7924787594595847e-07, "logits/chosen": -18.142932891845703, "logits/rejected": -18.369556427001953, "logps/chosen": -375.28643798828125, "logps/rejected": -383.48114013671875, "loss": 1.6153, "rewards/accuracies": 0.5, "rewards/chosen": 2.6990270614624023, "rewards/margins": -0.3258458077907562, "rewards/rejected": 3.0248730182647705, "step": 43370 }, { "epoch": 2.0140210780444776, "grad_norm": 18.98337173461914, "learning_rate": 1.792200194995125e-07, "logits/chosen": -19.46518325805664, "logits/rejected": -19.054195404052734, "logps/chosen": -553.4002685546875, "logps/rejected": -410.1143493652344, "loss": 0.3684, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.524713516235352, "rewards/margins": 1.3740098476409912, "rewards/rejected": 3.150703191757202, "step": 43380 }, { "epoch": 2.0144853521519104, "grad_norm": 235.17723083496094, "learning_rate": 1.7919216305306652e-07, "logits/chosen": -18.243295669555664, "logits/rejected": -18.06179428100586, "logps/chosen": -336.80804443359375, "logps/rejected": -315.6391296386719, "loss": 1.1722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.707441806793213, "rewards/margins": 0.6968801617622375, "rewards/rejected": 2.010561466217041, "step": 43390 }, { "epoch": 2.0149496262593436, "grad_norm": 23.94161033630371, "learning_rate": 1.7916430660662054e-07, "logits/chosen": -19.117130279541016, "logits/rejected": -18.143299102783203, "logps/chosen": -366.7433166503906, "logps/rejected": -274.1612548828125, "loss": 0.4231, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8210272789001465, "rewards/margins": 0.9963180422782898, "rewards/rejected": 1.824709177017212, "step": 43400 }, { "epoch": 2.0154139003667764, "grad_norm": 
124.209716796875, "learning_rate": 1.7913645016017455e-07, "logits/chosen": -19.10177230834961, "logits/rejected": -18.270244598388672, "logps/chosen": -340.8135986328125, "logps/rejected": -314.9864501953125, "loss": 1.0814, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9299094676971436, "rewards/margins": 1.2280267477035522, "rewards/rejected": 2.7018826007843018, "step": 43410 }, { "epoch": 2.0158781744742096, "grad_norm": 32.946231842041016, "learning_rate": 1.791085937137286e-07, "logits/chosen": -19.73240089416504, "logits/rejected": -18.713455200195312, "logps/chosen": -461.60223388671875, "logps/rejected": -327.4007263183594, "loss": 0.7659, "rewards/accuracies": 0.5, "rewards/chosen": 3.5051143169403076, "rewards/margins": 0.9715865254402161, "rewards/rejected": 2.5335280895233154, "step": 43420 }, { "epoch": 2.016342448581643, "grad_norm": 198.52658081054688, "learning_rate": 1.790807372672826e-07, "logits/chosen": -18.28278160095215, "logits/rejected": -17.516551971435547, "logps/chosen": -509.6839904785156, "logps/rejected": -343.763671875, "loss": 0.4257, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7725157737731934, "rewards/margins": 1.3368819952011108, "rewards/rejected": 2.435633897781372, "step": 43430 }, { "epoch": 2.0168067226890756, "grad_norm": 2.646310329437256, "learning_rate": 1.790528808208366e-07, "logits/chosen": -18.107707977294922, "logits/rejected": -17.649242401123047, "logps/chosen": -416.2701110839844, "logps/rejected": -375.960205078125, "loss": 0.4583, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.474656581878662, "rewards/margins": 1.3262765407562256, "rewards/rejected": 3.1483798027038574, "step": 43440 }, { "epoch": 2.017270996796509, "grad_norm": 198.55413818359375, "learning_rate": 1.7902502437439064e-07, "logits/chosen": -18.974319458007812, "logits/rejected": -18.56013298034668, "logps/chosen": -492.747314453125, "logps/rejected": -478.7890625, "loss": 1.3217, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9474544525146484, "rewards/margins": -0.38744139671325684, "rewards/rejected": 4.334895610809326, "step": 43450 }, { "epoch": 2.0177352709039416, "grad_norm": 102.98956298828125, "learning_rate": 1.7899716792794465e-07, "logits/chosen": -19.037086486816406, "logits/rejected": -18.319774627685547, "logps/chosen": -460.2120056152344, "logps/rejected": -376.8802185058594, "loss": 0.6891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2682743072509766, "rewards/margins": 0.7195330858230591, "rewards/rejected": 2.548741340637207, "step": 43460 }, { "epoch": 2.0181995450113748, "grad_norm": 129.02517700195312, "learning_rate": 1.7896931148149864e-07, "logits/chosen": -18.84256362915039, "logits/rejected": -18.719926834106445, "logps/chosen": -332.97821044921875, "logps/rejected": -314.707275390625, "loss": 0.9238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.056507110595703, "rewards/margins": -0.05968743562698364, "rewards/rejected": 3.116194248199463, "step": 43470 }, { "epoch": 2.0186638191188075, "grad_norm": 121.70724487304688, "learning_rate": 1.7894145503505268e-07, "logits/chosen": -19.899425506591797, "logits/rejected": -18.905887603759766, "logps/chosen": -409.71533203125, "logps/rejected": -361.98583984375, "loss": 0.7125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0958738327026367, "rewards/margins": 0.7598022222518921, "rewards/rejected": 2.336071252822876, "step": 43480 }, { "epoch": 2.0191280932262408, "grad_norm": 99.63977813720703, "learning_rate": 1.789135985886067e-07, "logits/chosen": -18.908926010131836, "logits/rejected": -18.116933822631836, "logps/chosen": -446.7721252441406, "logps/rejected": -351.8707580566406, "loss": 0.665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.447788715362549, "rewards/margins": 0.8730856776237488, "rewards/rejected": 2.574702739715576, "step": 43490 }, { "epoch": 2.019592367333674, 
"grad_norm": 88.67095184326172, "learning_rate": 1.7888574214216074e-07, "logits/chosen": -20.31436538696289, "logits/rejected": -19.087276458740234, "logps/chosen": -507.94830322265625, "logps/rejected": -397.64520263671875, "loss": 0.312, "rewards/accuracies": 1.0, "rewards/chosen": 3.902855634689331, "rewards/margins": 1.269113302230835, "rewards/rejected": 2.633742570877075, "step": 43500 }, { "epoch": 2.0200566414411067, "grad_norm": 73.3568344116211, "learning_rate": 1.7885788569571473e-07, "logits/chosen": -19.904888153076172, "logits/rejected": -18.592926025390625, "logps/chosen": -412.46044921875, "logps/rejected": -321.5540771484375, "loss": 0.6041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.435706615447998, "rewards/margins": 0.7935789227485657, "rewards/rejected": 2.6421279907226562, "step": 43510 }, { "epoch": 2.02052091554854, "grad_norm": 17.11574935913086, "learning_rate": 1.7883002924926877e-07, "logits/chosen": -19.090707778930664, "logits/rejected": -18.440771102905273, "logps/chosen": -272.4278869628906, "logps/rejected": -225.6407470703125, "loss": 0.6015, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.738440990447998, "rewards/margins": 0.9378961324691772, "rewards/rejected": 1.8005447387695312, "step": 43520 }, { "epoch": 2.0209851896559727, "grad_norm": 51.764400482177734, "learning_rate": 1.7880217280282278e-07, "logits/chosen": -19.691509246826172, "logits/rejected": -19.140766143798828, "logps/chosen": -351.6806335449219, "logps/rejected": -277.19366455078125, "loss": 0.6256, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.963465690612793, "rewards/margins": 0.5918282866477966, "rewards/rejected": 2.3716373443603516, "step": 43530 }, { "epoch": 2.021449463763406, "grad_norm": 17.960859298706055, "learning_rate": 1.787743163563768e-07, "logits/chosen": -18.581132888793945, "logits/rejected": -18.83544921875, "logps/chosen": -388.1068115234375, "logps/rejected": -332.5296325683594, "loss": 
0.4075, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.489442825317383, "rewards/margins": 1.1575398445129395, "rewards/rejected": 2.3319029808044434, "step": 43540 }, { "epoch": 2.0219137378708387, "grad_norm": 22.826826095581055, "learning_rate": 1.787464599099308e-07, "logits/chosen": -18.67564582824707, "logits/rejected": -17.408008575439453, "logps/chosen": -328.6595458984375, "logps/rejected": -209.1310577392578, "loss": 0.7878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.021747589111328, "rewards/margins": 1.3375561237335205, "rewards/rejected": 1.684191346168518, "step": 43550 }, { "epoch": 2.022378011978272, "grad_norm": 165.3306121826172, "learning_rate": 1.7871860346348482e-07, "logits/chosen": -17.221195220947266, "logits/rejected": -17.350828170776367, "logps/chosen": -360.64532470703125, "logps/rejected": -379.93963623046875, "loss": 0.821, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5125226974487305, "rewards/margins": 0.1992439329624176, "rewards/rejected": 2.313278913497925, "step": 43560 }, { "epoch": 2.022842286085705, "grad_norm": 136.42037963867188, "learning_rate": 1.7869074701703887e-07, "logits/chosen": -18.419654846191406, "logits/rejected": -19.307640075683594, "logps/chosen": -340.9231262207031, "logps/rejected": -416.86376953125, "loss": 1.1645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2482526302337646, "rewards/margins": -0.3071799874305725, "rewards/rejected": 2.5554325580596924, "step": 43570 }, { "epoch": 2.023306560193138, "grad_norm": 5.7952423095703125, "learning_rate": 1.7866289057059288e-07, "logits/chosen": -18.507705688476562, "logits/rejected": -17.94434928894043, "logps/chosen": -389.9203186035156, "logps/rejected": -299.9176330566406, "loss": 0.7069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7937207221984863, "rewards/margins": 1.382057785987854, "rewards/rejected": 2.4116625785827637, "step": 43580 }, { "epoch": 2.023770834300571, 
"grad_norm": 70.64342498779297, "learning_rate": 1.7863503412414687e-07, "logits/chosen": -19.47718048095703, "logits/rejected": -18.785919189453125, "logps/chosen": -396.38812255859375, "logps/rejected": -371.9624328613281, "loss": 0.9591, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.576866626739502, "rewards/margins": 0.37261125445365906, "rewards/rejected": 3.2042553424835205, "step": 43590 }, { "epoch": 2.024235108408004, "grad_norm": 50.25292205810547, "learning_rate": 1.786071776777009e-07, "logits/chosen": -18.882081985473633, "logits/rejected": -18.786340713500977, "logps/chosen": -449.64678955078125, "logps/rejected": -384.870849609375, "loss": 0.4885, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.882154941558838, "rewards/margins": 0.7710471153259277, "rewards/rejected": 3.11110782623291, "step": 43600 }, { "epoch": 2.024699382515437, "grad_norm": 73.71324920654297, "learning_rate": 1.7857932123125492e-07, "logits/chosen": -19.03546714782715, "logits/rejected": -18.003856658935547, "logps/chosen": -273.31768798828125, "logps/rejected": -194.1354522705078, "loss": 0.5054, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9321740865707397, "rewards/margins": 0.8410047292709351, "rewards/rejected": 1.0911691188812256, "step": 43610 }, { "epoch": 2.0251636566228703, "grad_norm": 215.94532775878906, "learning_rate": 1.7855146478480896e-07, "logits/chosen": -18.780668258666992, "logits/rejected": -18.497514724731445, "logps/chosen": -460.016845703125, "logps/rejected": -467.8817443847656, "loss": 1.0602, "rewards/accuracies": 0.5, "rewards/chosen": 2.8430442810058594, "rewards/margins": -0.0570368766784668, "rewards/rejected": 2.900081157684326, "step": 43620 }, { "epoch": 2.025627930730303, "grad_norm": 73.14177703857422, "learning_rate": 1.7852360833836295e-07, "logits/chosen": -19.677640914916992, "logits/rejected": -17.871570587158203, "logps/chosen": -436.0594177246094, "logps/rejected": -310.06182861328125, 
"loss": 0.4222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.341549873352051, "rewards/margins": 1.9201946258544922, "rewards/rejected": 2.4213552474975586, "step": 43630 }, { "epoch": 2.0260922048377363, "grad_norm": 59.872947692871094, "learning_rate": 1.7849575189191697e-07, "logits/chosen": -19.0844669342041, "logits/rejected": -17.912588119506836, "logps/chosen": -434.533935546875, "logps/rejected": -386.9230041503906, "loss": 0.5593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.432912349700928, "rewards/margins": 1.2429897785186768, "rewards/rejected": 3.18992280960083, "step": 43640 }, { "epoch": 2.026556478945169, "grad_norm": 4.382894515991211, "learning_rate": 1.78467895445471e-07, "logits/chosen": -18.44780158996582, "logits/rejected": -17.879419326782227, "logps/chosen": -368.770751953125, "logps/rejected": -220.37661743164062, "loss": 0.4466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5845634937286377, "rewards/margins": 1.8633015155792236, "rewards/rejected": 1.7212613821029663, "step": 43650 }, { "epoch": 2.0270207530526023, "grad_norm": 43.58016586303711, "learning_rate": 1.78440038999025e-07, "logits/chosen": -19.40609359741211, "logits/rejected": -18.579622268676758, "logps/chosen": -397.3168029785156, "logps/rejected": -350.0293884277344, "loss": 0.4663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7064709663391113, "rewards/margins": 1.2774279117584229, "rewards/rejected": 2.4290428161621094, "step": 43660 }, { "epoch": 2.027485027160035, "grad_norm": 137.2604217529297, "learning_rate": 1.7841218255257904e-07, "logits/chosen": -18.903356552124023, "logits/rejected": -18.018606185913086, "logps/chosen": -339.19244384765625, "logps/rejected": -277.39727783203125, "loss": 0.601, "rewards/accuracies": 0.5, "rewards/chosen": 3.177125930786133, "rewards/margins": 1.0169665813446045, "rewards/rejected": 2.1601595878601074, "step": 43670 }, { "epoch": 2.0279493012674683, "grad_norm": 
59.85152053833008, "learning_rate": 1.7838432610613305e-07, "logits/chosen": -17.972187042236328, "logits/rejected": -18.56369972229004, "logps/chosen": -266.53564453125, "logps/rejected": -320.4942932128906, "loss": 1.7616, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4934899806976318, "rewards/margins": -0.9601094126701355, "rewards/rejected": 2.453599452972412, "step": 43680 }, { "epoch": 2.0284135753749015, "grad_norm": 123.37354278564453, "learning_rate": 1.783564696596871e-07, "logits/chosen": -18.55666160583496, "logits/rejected": -18.843120574951172, "logps/chosen": -312.804443359375, "logps/rejected": -363.26312255859375, "loss": 1.3754, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.370755672454834, "rewards/margins": -0.49318695068359375, "rewards/rejected": 3.8639426231384277, "step": 43690 }, { "epoch": 2.0288778494823343, "grad_norm": 3.6157729625701904, "learning_rate": 1.7832861321324108e-07, "logits/chosen": -18.46494483947754, "logits/rejected": -17.711074829101562, "logps/chosen": -382.1849670410156, "logps/rejected": -289.45794677734375, "loss": 0.5124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.940887928009033, "rewards/margins": 1.2245628833770752, "rewards/rejected": 1.7163254022598267, "step": 43700 }, { "epoch": 2.0293421235897675, "grad_norm": 13.130533218383789, "learning_rate": 1.783007567667951e-07, "logits/chosen": -18.118083953857422, "logits/rejected": -17.158723831176758, "logps/chosen": -364.4992980957031, "logps/rejected": -311.72119140625, "loss": 1.0068, "rewards/accuracies": 0.5, "rewards/chosen": 2.758340835571289, "rewards/margins": 0.27180641889572144, "rewards/rejected": 2.486534595489502, "step": 43710 }, { "epoch": 2.0298063976972003, "grad_norm": 28.188135147094727, "learning_rate": 1.7827290032034914e-07, "logits/chosen": -18.444570541381836, "logits/rejected": -18.45873260498047, "logps/chosen": -270.62158203125, "logps/rejected": -301.9234619140625, "loss": 0.8901, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.562227249145508, "rewards/margins": 0.4378301501274109, "rewards/rejected": 2.124397039413452, "step": 43720 }, { "epoch": 2.0302706718046335, "grad_norm": 29.742752075195312, "learning_rate": 1.7824504387390315e-07, "logits/chosen": -18.767431259155273, "logits/rejected": -18.038097381591797, "logps/chosen": -349.1748962402344, "logps/rejected": -249.189697265625, "loss": 0.5397, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1945395469665527, "rewards/margins": 1.38384211063385, "rewards/rejected": 1.810697317123413, "step": 43730 }, { "epoch": 2.0307349459120663, "grad_norm": 6.692319869995117, "learning_rate": 1.7821718742745714e-07, "logits/chosen": -19.883014678955078, "logits/rejected": -18.931955337524414, "logps/chosen": -421.07635498046875, "logps/rejected": -316.11407470703125, "loss": 0.7084, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.750225782394409, "rewards/margins": 0.7541383504867554, "rewards/rejected": 2.9960880279541016, "step": 43740 }, { "epoch": 2.0311992200194995, "grad_norm": 54.466434478759766, "learning_rate": 1.7818933098101118e-07, "logits/chosen": -18.95920181274414, "logits/rejected": -18.060510635375977, "logps/chosen": -356.6431579589844, "logps/rejected": -253.8818817138672, "loss": 0.7806, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.426863193511963, "rewards/margins": 0.5725086331367493, "rewards/rejected": 1.8543542623519897, "step": 43750 }, { "epoch": 2.0316634941269327, "grad_norm": 18.54463005065918, "learning_rate": 1.781614745345652e-07, "logits/chosen": -18.97498893737793, "logits/rejected": -18.414339065551758, "logps/chosen": -498.669189453125, "logps/rejected": -413.80816650390625, "loss": 0.4993, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.373886585235596, "rewards/margins": 1.5935554504394531, "rewards/rejected": 2.7803308963775635, "step": 43760 }, { "epoch": 2.0321277682343655, "grad_norm": 
177.39385986328125, "learning_rate": 1.7813361808811924e-07, "logits/chosen": -19.033527374267578, "logits/rejected": -18.944175720214844, "logps/chosen": -423.3824157714844, "logps/rejected": -305.4627685546875, "loss": 0.6848, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.722851276397705, "rewards/margins": 1.2031972408294678, "rewards/rejected": 2.519653797149658, "step": 43770 }, { "epoch": 2.0325920423417987, "grad_norm": 17.02783966064453, "learning_rate": 1.7810576164167322e-07, "logits/chosen": -19.055912017822266, "logits/rejected": -18.166013717651367, "logps/chosen": -406.2466735839844, "logps/rejected": -274.0460205078125, "loss": 0.7304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1945414543151855, "rewards/margins": 0.8241820335388184, "rewards/rejected": 2.3703596591949463, "step": 43780 }, { "epoch": 2.0330563164492315, "grad_norm": 15.640522956848145, "learning_rate": 1.7807790519522724e-07, "logits/chosen": -19.100543975830078, "logits/rejected": -18.11124038696289, "logps/chosen": -455.4017639160156, "logps/rejected": -316.3169250488281, "loss": 0.3139, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.149327754974365, "rewards/margins": 1.6187368631362915, "rewards/rejected": 2.5305910110473633, "step": 43790 }, { "epoch": 2.0335205905566647, "grad_norm": 125.66110229492188, "learning_rate": 1.7805004874878128e-07, "logits/chosen": -19.976848602294922, "logits/rejected": -19.245908737182617, "logps/chosen": -467.572021484375, "logps/rejected": -337.8858947753906, "loss": 0.4248, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.338069915771484, "rewards/margins": 1.8201649188995361, "rewards/rejected": 2.5179052352905273, "step": 43800 }, { "epoch": 2.033984864664098, "grad_norm": 236.57456970214844, "learning_rate": 1.780221923023353e-07, "logits/chosen": -18.492700576782227, "logits/rejected": -19.196178436279297, "logps/chosen": -371.95556640625, "logps/rejected": -424.58526611328125, 
"loss": 1.4224, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.127683639526367, "rewards/margins": -0.6339174509048462, "rewards/rejected": 3.761601209640503, "step": 43810 }, { "epoch": 2.0344491387715307, "grad_norm": 0.5231056213378906, "learning_rate": 1.779943358558893e-07, "logits/chosen": -18.081966400146484, "logits/rejected": -16.91713523864746, "logps/chosen": -319.7079162597656, "logps/rejected": -177.00985717773438, "loss": 0.3896, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3513972759246826, "rewards/margins": 1.9954379796981812, "rewards/rejected": 1.355959415435791, "step": 43820 }, { "epoch": 2.034913412878964, "grad_norm": 13.556921005249023, "learning_rate": 1.7796647940944332e-07, "logits/chosen": -18.761669158935547, "logits/rejected": -17.11911392211914, "logps/chosen": -393.77069091796875, "logps/rejected": -220.21139526367188, "loss": 0.3775, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4371941089630127, "rewards/margins": 2.225450038909912, "rewards/rejected": 1.2117438316345215, "step": 43830 }, { "epoch": 2.0353776869863967, "grad_norm": 17.747846603393555, "learning_rate": 1.7793862296299736e-07, "logits/chosen": -18.547435760498047, "logits/rejected": -17.91876983642578, "logps/chosen": -299.18316650390625, "logps/rejected": -229.3479461669922, "loss": 0.4705, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.377509593963623, "rewards/margins": 1.1853644847869873, "rewards/rejected": 1.1921451091766357, "step": 43840 }, { "epoch": 2.03584196109383, "grad_norm": 32.092872619628906, "learning_rate": 1.7791076651655138e-07, "logits/chosen": -19.48483657836914, "logits/rejected": -18.649316787719727, "logps/chosen": -442.9412536621094, "logps/rejected": -321.0958557128906, "loss": 0.5404, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.912064790725708, "rewards/margins": 1.2971271276474, "rewards/rejected": 2.6149373054504395, "step": 43850 }, { "epoch": 
2.0363062352012626, "grad_norm": 121.90591430664062, "learning_rate": 1.7788291007010537e-07, "logits/chosen": -18.29004669189453, "logits/rejected": -17.7521915435791, "logps/chosen": -407.30511474609375, "logps/rejected": -412.93719482421875, "loss": 0.8399, "rewards/accuracies": 0.5, "rewards/chosen": 3.7426044940948486, "rewards/margins": 0.25223976373672485, "rewards/rejected": 3.4903647899627686, "step": 43860 }, { "epoch": 2.036770509308696, "grad_norm": 5.071764945983887, "learning_rate": 1.778550536236594e-07, "logits/chosen": -19.80367660522461, "logits/rejected": -18.486635208129883, "logps/chosen": -513.9159545898438, "logps/rejected": -329.8891906738281, "loss": 0.4823, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7800440788269043, "rewards/margins": 0.9760228395462036, "rewards/rejected": 2.8040213584899902, "step": 43870 }, { "epoch": 2.037234783416129, "grad_norm": 12.54208755493164, "learning_rate": 1.7782719717721342e-07, "logits/chosen": -19.121723175048828, "logits/rejected": -17.770870208740234, "logps/chosen": -338.0203857421875, "logps/rejected": -254.2772216796875, "loss": 0.275, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.065509796142578, "rewards/margins": 1.8387115001678467, "rewards/rejected": 1.2267982959747314, "step": 43880 }, { "epoch": 2.037699057523562, "grad_norm": 28.170785903930664, "learning_rate": 1.777993407307674e-07, "logits/chosen": -19.283550262451172, "logits/rejected": -18.343524932861328, "logps/chosen": -393.8006286621094, "logps/rejected": -304.4679260253906, "loss": 0.4657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3375182151794434, "rewards/margins": 1.1075961589813232, "rewards/rejected": 1.2299221754074097, "step": 43890 }, { "epoch": 2.038163331630995, "grad_norm": 116.10594940185547, "learning_rate": 1.7777148428432145e-07, "logits/chosen": -18.544185638427734, "logits/rejected": -17.716949462890625, "logps/chosen": -416.7728576660156, "logps/rejected": 
-334.76544189453125, "loss": 0.3706, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5071041584014893, "rewards/margins": 1.1780586242675781, "rewards/rejected": 2.329045534133911, "step": 43900 }, { "epoch": 2.038627605738428, "grad_norm": 6.1722564697265625, "learning_rate": 1.7774362783787547e-07, "logits/chosen": -18.79752540588379, "logits/rejected": -18.611454010009766, "logps/chosen": -291.6481018066406, "logps/rejected": -308.15655517578125, "loss": 1.1732, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5639684200286865, "rewards/margins": 0.2296290397644043, "rewards/rejected": 2.3343393802642822, "step": 43910 }, { "epoch": 2.039091879845861, "grad_norm": 67.33482360839844, "learning_rate": 1.777157713914295e-07, "logits/chosen": -18.30381202697754, "logits/rejected": -17.603775024414062, "logps/chosen": -403.56427001953125, "logps/rejected": -319.4493103027344, "loss": 0.6944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8188157081604004, "rewards/margins": 0.3956015110015869, "rewards/rejected": 2.4232144355773926, "step": 43920 }, { "epoch": 2.039556153953294, "grad_norm": 83.62885284423828, "learning_rate": 1.776879149449835e-07, "logits/chosen": -19.234661102294922, "logits/rejected": -17.917123794555664, "logps/chosen": -503.5660705566406, "logps/rejected": -280.9695129394531, "loss": 0.2462, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.7889862060546875, "rewards/margins": 2.927903652191162, "rewards/rejected": 1.8610824346542358, "step": 43930 }, { "epoch": 2.040020428060727, "grad_norm": 32.66767501831055, "learning_rate": 1.7766005849853754e-07, "logits/chosen": -19.125268936157227, "logits/rejected": -17.955493927001953, "logps/chosen": -331.1544189453125, "logps/rejected": -226.3856964111328, "loss": 0.2484, "rewards/accuracies": 1.0, "rewards/chosen": 3.426905870437622, "rewards/margins": 2.0462589263916016, "rewards/rejected": 1.3806469440460205, "step": 43940 }, { "epoch": 
2.0404847021681602, "grad_norm": 6.00644588470459, "learning_rate": 1.7763220205209155e-07, "logits/chosen": -20.29767417907715, "logits/rejected": -19.563133239746094, "logps/chosen": -389.103515625, "logps/rejected": -329.61773681640625, "loss": 0.6915, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6595749855041504, "rewards/margins": 0.701621413230896, "rewards/rejected": 2.9579532146453857, "step": 43950 }, { "epoch": 2.040948976275593, "grad_norm": 139.45355224609375, "learning_rate": 1.7760434560564556e-07, "logits/chosen": -20.18639373779297, "logits/rejected": -18.415359497070312, "logps/chosen": -530.0626220703125, "logps/rejected": -336.7016906738281, "loss": 0.7114, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.148201942443848, "rewards/margins": 1.265207052230835, "rewards/rejected": 2.8829944133758545, "step": 43960 }, { "epoch": 2.0414132503830262, "grad_norm": 13.798914909362793, "learning_rate": 1.7757648915919958e-07, "logits/chosen": -18.91840362548828, "logits/rejected": -17.633359909057617, "logps/chosen": -443.439208984375, "logps/rejected": -276.0947265625, "loss": 0.2071, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.289976596832275, "rewards/margins": 1.986509084701538, "rewards/rejected": 2.303467035293579, "step": 43970 }, { "epoch": 2.041877524490459, "grad_norm": 38.600799560546875, "learning_rate": 1.775486327127536e-07, "logits/chosen": -18.979774475097656, "logits/rejected": -18.887760162353516, "logps/chosen": -288.5944519042969, "logps/rejected": -343.71942138671875, "loss": 1.0123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.077518939971924, "rewards/margins": -0.2859283983707428, "rewards/rejected": 3.3634471893310547, "step": 43980 }, { "epoch": 2.042341798597892, "grad_norm": 115.55076599121094, "learning_rate": 1.7752077626630764e-07, "logits/chosen": -18.64094352722168, "logits/rejected": -18.915515899658203, "logps/chosen": -295.7648620605469, "logps/rejected": 
-308.421142578125, "loss": 0.6875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.565025806427002, "rewards/margins": 0.32024964690208435, "rewards/rejected": 2.2447762489318848, "step": 43990 }, { "epoch": 2.0428060727053254, "grad_norm": 46.44025421142578, "learning_rate": 1.7749291981986165e-07, "logits/chosen": -19.159618377685547, "logits/rejected": -18.374927520751953, "logps/chosen": -433.4742736816406, "logps/rejected": -369.60491943359375, "loss": 0.4433, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.190524578094482, "rewards/margins": 1.545632004737854, "rewards/rejected": 2.644892930984497, "step": 44000 }, { "epoch": 2.043270346812758, "grad_norm": 61.853782653808594, "learning_rate": 1.7746506337341564e-07, "logits/chosen": -19.24052619934082, "logits/rejected": -18.914295196533203, "logps/chosen": -343.90069580078125, "logps/rejected": -340.2979736328125, "loss": 0.9458, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1347756385803223, "rewards/margins": 0.7809724807739258, "rewards/rejected": 2.3538031578063965, "step": 44010 }, { "epoch": 2.0437346209201914, "grad_norm": 2.3348915576934814, "learning_rate": 1.7743720692696968e-07, "logits/chosen": -19.218122482299805, "logits/rejected": -17.54961395263672, "logps/chosen": -362.0381164550781, "logps/rejected": -189.6263427734375, "loss": 0.3321, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.275325059890747, "rewards/margins": 2.3697569370269775, "rewards/rejected": 0.9055678248405457, "step": 44020 }, { "epoch": 2.044198895027624, "grad_norm": 2.765796184539795, "learning_rate": 1.774093504805237e-07, "logits/chosen": -19.242828369140625, "logits/rejected": -18.05540657043457, "logps/chosen": -503.0008850097656, "logps/rejected": -398.00982666015625, "loss": 0.5396, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.225448131561279, "rewards/margins": 1.4945507049560547, "rewards/rejected": 2.7308974266052246, "step": 44030 }, { 
"epoch": 2.0446631691350574, "grad_norm": 58.986385345458984, "learning_rate": 1.7738149403407773e-07, "logits/chosen": -19.342159271240234, "logits/rejected": -18.068212509155273, "logps/chosen": -358.99951171875, "logps/rejected": -325.41827392578125, "loss": 0.4669, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1315999031066895, "rewards/margins": 0.6569495797157288, "rewards/rejected": 2.4746501445770264, "step": 44040 }, { "epoch": 2.04512744324249, "grad_norm": 29.0989990234375, "learning_rate": 1.7735363758763172e-07, "logits/chosen": -19.80099868774414, "logits/rejected": -19.688495635986328, "logps/chosen": -375.6352233886719, "logps/rejected": -391.2604064941406, "loss": 0.9507, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.628307819366455, "rewards/margins": 0.00025621653185226023, "rewards/rejected": 2.6280517578125, "step": 44050 }, { "epoch": 2.0455917173499234, "grad_norm": 133.20155334472656, "learning_rate": 1.7732578114118574e-07, "logits/chosen": -18.741207122802734, "logits/rejected": -18.4014835357666, "logps/chosen": -353.95452880859375, "logps/rejected": -372.2186584472656, "loss": 0.9385, "rewards/accuracies": 0.5, "rewards/chosen": 3.173625946044922, "rewards/margins": 0.41920098662376404, "rewards/rejected": 2.754424810409546, "step": 44060 }, { "epoch": 2.0460559914573566, "grad_norm": 30.55502700805664, "learning_rate": 1.7729792469473978e-07, "logits/chosen": -19.092876434326172, "logits/rejected": -18.631704330444336, "logps/chosen": -382.48321533203125, "logps/rejected": -334.43121337890625, "loss": 0.4303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.894681453704834, "rewards/margins": 1.5657163858413696, "rewards/rejected": 2.3289647102355957, "step": 44070 }, { "epoch": 2.0465202655647894, "grad_norm": 25.472501754760742, "learning_rate": 1.7727006824829377e-07, "logits/chosen": -19.06020736694336, "logits/rejected": -18.33021354675293, "logps/chosen": -352.72576904296875, 
"logps/rejected": -277.04620361328125, "loss": 0.8119, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.300530195236206, "rewards/margins": 1.2007718086242676, "rewards/rejected": 2.0997583866119385, "step": 44080 }, { "epoch": 2.0469845396722226, "grad_norm": 290.6937561035156, "learning_rate": 1.772422118018478e-07, "logits/chosen": -18.694433212280273, "logits/rejected": -18.191852569580078, "logps/chosen": -543.933837890625, "logps/rejected": -484.50592041015625, "loss": 1.0681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.240742206573486, "rewards/margins": 0.777591347694397, "rewards/rejected": 3.4631505012512207, "step": 44090 }, { "epoch": 2.0474488137796554, "grad_norm": 15.248483657836914, "learning_rate": 1.7721435535540182e-07, "logits/chosen": -18.503944396972656, "logits/rejected": -18.432401657104492, "logps/chosen": -478.26129150390625, "logps/rejected": -405.0166015625, "loss": 0.5418, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.11983585357666, "rewards/margins": 1.5404348373413086, "rewards/rejected": 2.5794010162353516, "step": 44100 }, { "epoch": 2.0479130878870886, "grad_norm": 87.69205474853516, "learning_rate": 1.7718649890895586e-07, "logits/chosen": -19.511127471923828, "logits/rejected": -19.49468231201172, "logps/chosen": -458.611328125, "logps/rejected": -453.89031982421875, "loss": 0.809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.146149158477783, "rewards/margins": 0.03748949617147446, "rewards/rejected": 3.108659505844116, "step": 44110 }, { "epoch": 2.0483773619945214, "grad_norm": 34.51210403442383, "learning_rate": 1.7715864246250985e-07, "logits/chosen": -19.194250106811523, "logits/rejected": -17.801555633544922, "logps/chosen": -422.7245178222656, "logps/rejected": -278.65447998046875, "loss": 0.5539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.483976364135742, "rewards/margins": 1.4028441905975342, "rewards/rejected": 2.081132173538208, "step": 
44120 }, { "epoch": 2.0488416361019546, "grad_norm": 49.763526916503906, "learning_rate": 1.7713078601606387e-07, "logits/chosen": -19.10894775390625, "logits/rejected": -18.366092681884766, "logps/chosen": -460.3030700683594, "logps/rejected": -330.3846740722656, "loss": 0.4754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.316641330718994, "rewards/margins": 2.03153920173645, "rewards/rejected": 2.285102367401123, "step": 44130 }, { "epoch": 2.049305910209388, "grad_norm": 17.123268127441406, "learning_rate": 1.771029295696179e-07, "logits/chosen": -19.482988357543945, "logits/rejected": -19.1458683013916, "logps/chosen": -372.5714416503906, "logps/rejected": -368.53289794921875, "loss": 1.1556, "rewards/accuracies": 0.5, "rewards/chosen": 2.3260419368743896, "rewards/margins": -0.4203014373779297, "rewards/rejected": 2.7463440895080566, "step": 44140 }, { "epoch": 2.0497701843168206, "grad_norm": 50.46728515625, "learning_rate": 1.7707507312317192e-07, "logits/chosen": -18.941884994506836, "logits/rejected": -18.56502914428711, "logps/chosen": -378.4423522949219, "logps/rejected": -312.3900451660156, "loss": 0.709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.154062271118164, "rewards/margins": 0.5864024758338928, "rewards/rejected": 2.567659854888916, "step": 44150 }, { "epoch": 2.050234458424254, "grad_norm": 2.8947877883911133, "learning_rate": 1.770472166767259e-07, "logits/chosen": -18.80910873413086, "logits/rejected": -17.99715232849121, "logps/chosen": -278.9645690917969, "logps/rejected": -230.4888916015625, "loss": 0.3781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.822150468826294, "rewards/margins": 1.5429461002349854, "rewards/rejected": 1.2792041301727295, "step": 44160 }, { "epoch": 2.0506987325316866, "grad_norm": 0.10919442027807236, "learning_rate": 1.7701936023027995e-07, "logits/chosen": -18.6484317779541, "logits/rejected": -18.168903350830078, "logps/chosen": -459.75445556640625, 
"logps/rejected": -369.74835205078125, "loss": 1.097, "rewards/accuracies": 0.5, "rewards/chosen": 4.593295097351074, "rewards/margins": 0.5386146903038025, "rewards/rejected": 4.054680347442627, "step": 44170 }, { "epoch": 2.0511630066391198, "grad_norm": 30.213829040527344, "learning_rate": 1.7699150378383396e-07, "logits/chosen": -18.702198028564453, "logits/rejected": -18.65406608581543, "logps/chosen": -372.24859619140625, "logps/rejected": -408.5937805175781, "loss": 0.972, "rewards/accuracies": 0.5, "rewards/chosen": 2.7286157608032227, "rewards/margins": 0.01434173621237278, "rewards/rejected": 2.7142739295959473, "step": 44180 }, { "epoch": 2.051627280746553, "grad_norm": 73.72528076171875, "learning_rate": 1.76963647337388e-07, "logits/chosen": -18.053207397460938, "logits/rejected": -18.0518856048584, "logps/chosen": -385.8443908691406, "logps/rejected": -342.31439208984375, "loss": 0.61, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.16237211227417, "rewards/margins": 0.43455275893211365, "rewards/rejected": 2.7278192043304443, "step": 44190 }, { "epoch": 2.0520915548539858, "grad_norm": 17.8443660736084, "learning_rate": 1.76935790890942e-07, "logits/chosen": -19.540281295776367, "logits/rejected": -18.13119888305664, "logps/chosen": -479.0873107910156, "logps/rejected": -317.09698486328125, "loss": 0.164, "rewards/accuracies": 1.0, "rewards/chosen": 4.396670341491699, "rewards/margins": 2.1786909103393555, "rewards/rejected": 2.2179794311523438, "step": 44200 }, { "epoch": 2.052555828961419, "grad_norm": 148.7273406982422, "learning_rate": 1.76907934444496e-07, "logits/chosen": -19.160961151123047, "logits/rejected": -18.930410385131836, "logps/chosen": -333.217041015625, "logps/rejected": -291.207275390625, "loss": 1.3948, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3167340755462646, "rewards/margins": -0.5473524928092957, "rewards/rejected": 2.864086389541626, "step": 44210 }, { "epoch": 2.0530201030688517, 
"grad_norm": 69.24742889404297, "learning_rate": 1.7688007799805005e-07, "logits/chosen": -18.06066131591797, "logits/rejected": -17.233360290527344, "logps/chosen": -395.58392333984375, "logps/rejected": -279.47076416015625, "loss": 0.3181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.159813642501831, "rewards/margins": 1.5346074104309082, "rewards/rejected": 1.6252063512802124, "step": 44220 }, { "epoch": 2.053484377176285, "grad_norm": 28.581056594848633, "learning_rate": 1.7685222155160406e-07, "logits/chosen": -20.12285041809082, "logits/rejected": -18.525611877441406, "logps/chosen": -424.4703063964844, "logps/rejected": -296.3514099121094, "loss": 0.3426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7212226390838623, "rewards/margins": 1.7214250564575195, "rewards/rejected": 1.9997975826263428, "step": 44230 }, { "epoch": 2.0539486512837177, "grad_norm": 85.38878631591797, "learning_rate": 1.7682436510515808e-07, "logits/chosen": -18.699176788330078, "logits/rejected": -18.40956687927246, "logps/chosen": -402.24310302734375, "logps/rejected": -405.5538635253906, "loss": 1.1764, "rewards/accuracies": 0.5, "rewards/chosen": 3.069304943084717, "rewards/margins": 0.2309083640575409, "rewards/rejected": 2.8383965492248535, "step": 44240 }, { "epoch": 2.054412925391151, "grad_norm": 54.316917419433594, "learning_rate": 1.767965086587121e-07, "logits/chosen": -18.368545532226562, "logits/rejected": -17.78226661682129, "logps/chosen": -363.10089111328125, "logps/rejected": -311.8710632324219, "loss": 0.6406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2163586616516113, "rewards/margins": 1.2203058004379272, "rewards/rejected": 1.9960529804229736, "step": 44250 }, { "epoch": 2.054877199498584, "grad_norm": 112.29779052734375, "learning_rate": 1.7676865221226613e-07, "logits/chosen": -19.574920654296875, "logits/rejected": -19.330373764038086, "logps/chosen": -408.36376953125, "logps/rejected": -351.66864013671875, 
"loss": 0.8205, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.282166481018066, "rewards/margins": 1.183663249015808, "rewards/rejected": 3.0985028743743896, "step": 44260 }, { "epoch": 2.055341473606017, "grad_norm": 10.386098861694336, "learning_rate": 1.7674079576582012e-07, "logits/chosen": -18.984220504760742, "logits/rejected": -18.359580993652344, "logps/chosen": -329.2601623535156, "logps/rejected": -311.6821594238281, "loss": 0.5065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.560802936553955, "rewards/margins": 0.8028669357299805, "rewards/rejected": 2.7579362392425537, "step": 44270 }, { "epoch": 2.05580574771345, "grad_norm": 38.338497161865234, "learning_rate": 1.7671293931937414e-07, "logits/chosen": -18.88446807861328, "logits/rejected": -17.98405647277832, "logps/chosen": -589.9525146484375, "logps/rejected": -417.319580078125, "loss": 0.3158, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.284684181213379, "rewards/margins": 1.4834227561950684, "rewards/rejected": 2.8012614250183105, "step": 44280 }, { "epoch": 2.056270021820883, "grad_norm": 150.294189453125, "learning_rate": 1.7668508287292818e-07, "logits/chosen": -18.890125274658203, "logits/rejected": -18.477529525756836, "logps/chosen": -352.70196533203125, "logps/rejected": -280.3914794921875, "loss": 0.693, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6960222721099854, "rewards/margins": 0.7968570590019226, "rewards/rejected": 1.899165391921997, "step": 44290 }, { "epoch": 2.056734295928316, "grad_norm": 55.09441375732422, "learning_rate": 1.766572264264822e-07, "logits/chosen": -19.497716903686523, "logits/rejected": -18.804826736450195, "logps/chosen": -347.74664306640625, "logps/rejected": -273.78497314453125, "loss": 0.5676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7856338024139404, "rewards/margins": 0.9322710037231445, "rewards/rejected": 1.853363037109375, "step": 44300 }, { "epoch": 2.057198570035749, 
"grad_norm": 102.12259674072266, "learning_rate": 1.7662936998003618e-07, "logits/chosen": -19.188232421875, "logits/rejected": -18.63518714904785, "logps/chosen": -390.3486633300781, "logps/rejected": -310.7537536621094, "loss": 0.3228, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9405226707458496, "rewards/margins": 1.5171658992767334, "rewards/rejected": 2.4233570098876953, "step": 44310 }, { "epoch": 2.057662844143182, "grad_norm": 12.343462944030762, "learning_rate": 1.7660151353359022e-07, "logits/chosen": -19.051198959350586, "logits/rejected": -18.143953323364258, "logps/chosen": -329.3746032714844, "logps/rejected": -253.723388671875, "loss": 0.6497, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.182218074798584, "rewards/margins": 0.8440597653388977, "rewards/rejected": 2.33815860748291, "step": 44320 }, { "epoch": 2.0581271182506153, "grad_norm": 59.548763275146484, "learning_rate": 1.7657365708714424e-07, "logits/chosen": -19.635526657104492, "logits/rejected": -18.789602279663086, "logps/chosen": -383.9624328613281, "logps/rejected": -338.463623046875, "loss": 0.6732, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0830795764923096, "rewards/margins": 0.877893328666687, "rewards/rejected": 2.205185651779175, "step": 44330 }, { "epoch": 2.058591392358048, "grad_norm": 10.155021667480469, "learning_rate": 1.7654580064069828e-07, "logits/chosen": -18.20547103881836, "logits/rejected": -16.901721954345703, "logps/chosen": -465.3779296875, "logps/rejected": -390.0563049316406, "loss": 0.4503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6586265563964844, "rewards/margins": 1.760806679725647, "rewards/rejected": 1.8978201150894165, "step": 44340 }, { "epoch": 2.0590556664654813, "grad_norm": 110.070556640625, "learning_rate": 1.7651794419425226e-07, "logits/chosen": -19.046541213989258, "logits/rejected": -18.255569458007812, "logps/chosen": -349.7042236328125, "logps/rejected": -277.0201110839844, 
"loss": 0.501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4675402641296387, "rewards/margins": 1.117104172706604, "rewards/rejected": 2.3504364490509033, "step": 44350 }, { "epoch": 2.059519940572914, "grad_norm": 149.00973510742188, "learning_rate": 1.764900877478063e-07, "logits/chosen": -18.716053009033203, "logits/rejected": -18.457225799560547, "logps/chosen": -334.1767883300781, "logps/rejected": -286.2228698730469, "loss": 1.2695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7480173110961914, "rewards/margins": 0.4707377552986145, "rewards/rejected": 2.2772793769836426, "step": 44360 }, { "epoch": 2.0599842146803473, "grad_norm": 10.064934730529785, "learning_rate": 1.7646223130136032e-07, "logits/chosen": -18.403779983520508, "logits/rejected": -17.743419647216797, "logps/chosen": -270.2855529785156, "logps/rejected": -237.425048828125, "loss": 0.8251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.290419816970825, "rewards/margins": 0.8096733093261719, "rewards/rejected": 1.4807465076446533, "step": 44370 }, { "epoch": 2.06044848878778, "grad_norm": 85.46217346191406, "learning_rate": 1.7643437485491433e-07, "logits/chosen": -18.80266571044922, "logits/rejected": -17.978073120117188, "logps/chosen": -454.453125, "logps/rejected": -326.87408447265625, "loss": 0.6095, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.816654920578003, "rewards/margins": 0.6781646609306335, "rewards/rejected": 3.1384902000427246, "step": 44380 }, { "epoch": 2.0609127628952133, "grad_norm": 72.19678497314453, "learning_rate": 1.7640651840846835e-07, "logits/chosen": -18.6451416015625, "logits/rejected": -17.444969177246094, "logps/chosen": -343.0048522949219, "logps/rejected": -244.32711791992188, "loss": 0.4462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.87158203125, "rewards/margins": 1.26358163356781, "rewards/rejected": 1.60800039768219, "step": 44390 }, { "epoch": 2.0613770370026465, 
"grad_norm": 77.07286071777344, "learning_rate": 1.7637866196202236e-07, "logits/chosen": -19.24474334716797, "logits/rejected": -18.675777435302734, "logps/chosen": -391.60235595703125, "logps/rejected": -337.2955627441406, "loss": 0.5538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7225089073181152, "rewards/margins": 0.890518069267273, "rewards/rejected": 2.8319907188415527, "step": 44400 }, { "epoch": 2.0618413111100793, "grad_norm": 70.16068267822266, "learning_rate": 1.763508055155764e-07, "logits/chosen": -19.406105041503906, "logits/rejected": -19.22308349609375, "logps/chosen": -485.1729431152344, "logps/rejected": -410.2286682128906, "loss": 0.7969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5988869667053223, "rewards/margins": 0.9625660181045532, "rewards/rejected": 2.6363205909729004, "step": 44410 }, { "epoch": 2.0623055852175125, "grad_norm": 91.17349243164062, "learning_rate": 1.7632294906913042e-07, "logits/chosen": -19.596590042114258, "logits/rejected": -18.072847366333008, "logps/chosen": -453.392578125, "logps/rejected": -393.3162841796875, "loss": 0.469, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.245375156402588, "rewards/margins": 1.2881920337677002, "rewards/rejected": 2.9571831226348877, "step": 44420 }, { "epoch": 2.0627698593249453, "grad_norm": 58.701969146728516, "learning_rate": 1.762950926226844e-07, "logits/chosen": -19.005481719970703, "logits/rejected": -18.51729965209961, "logps/chosen": -398.04693603515625, "logps/rejected": -387.2527160644531, "loss": 0.9812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.100597858428955, "rewards/margins": 0.1572643518447876, "rewards/rejected": 2.943333864212036, "step": 44430 }, { "epoch": 2.0632341334323785, "grad_norm": 204.86480712890625, "learning_rate": 1.7626723617623845e-07, "logits/chosen": -18.8342342376709, "logits/rejected": -18.271900177001953, "logps/chosen": -407.31365966796875, "logps/rejected": 
-353.62847900390625, "loss": 0.9231, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0488367080688477, "rewards/margins": 0.18646761775016785, "rewards/rejected": 2.8623690605163574, "step": 44440 }, { "epoch": 2.0636984075398117, "grad_norm": 82.67509460449219, "learning_rate": 1.7623937972979246e-07, "logits/chosen": -18.746755599975586, "logits/rejected": -17.936288833618164, "logps/chosen": -485.91534423828125, "logps/rejected": -438.93218994140625, "loss": 0.5109, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.455756664276123, "rewards/margins": 1.160573959350586, "rewards/rejected": 3.295182704925537, "step": 44450 }, { "epoch": 2.0641626816472445, "grad_norm": 1.2571768760681152, "learning_rate": 1.7621152328334645e-07, "logits/chosen": -18.531368255615234, "logits/rejected": -17.88233184814453, "logps/chosen": -367.92291259765625, "logps/rejected": -294.9130859375, "loss": 0.8401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5976080894470215, "rewards/margins": 1.0203946828842163, "rewards/rejected": 2.5772135257720947, "step": 44460 }, { "epoch": 2.0646269557546777, "grad_norm": 69.12825012207031, "learning_rate": 1.761836668369005e-07, "logits/chosen": -18.93404197692871, "logits/rejected": -18.89703941345215, "logps/chosen": -349.4306640625, "logps/rejected": -325.2982482910156, "loss": 1.0124, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4459006786346436, "rewards/margins": 0.29213398694992065, "rewards/rejected": 3.153766393661499, "step": 44470 }, { "epoch": 2.0650912298621105, "grad_norm": 112.42425537109375, "learning_rate": 1.761558103904545e-07, "logits/chosen": -19.412220001220703, "logits/rejected": -18.566707611083984, "logps/chosen": -369.8920593261719, "logps/rejected": -229.23941040039062, "loss": 0.3806, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1700851917266846, "rewards/margins": 2.0918996334075928, "rewards/rejected": 1.0781855583190918, "step": 44480 }, { 
"epoch": 2.0655555039695437, "grad_norm": 169.51853942871094, "learning_rate": 1.7612795394400855e-07, "logits/chosen": -19.01791763305664, "logits/rejected": -18.542213439941406, "logps/chosen": -373.77691650390625, "logps/rejected": -386.4590759277344, "loss": 0.8245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7663414478302, "rewards/margins": 0.265206903219223, "rewards/rejected": 3.5011343955993652, "step": 44490 }, { "epoch": 2.0660197780769765, "grad_norm": 0.6342881917953491, "learning_rate": 1.7610009749756254e-07, "logits/chosen": -19.490392684936523, "logits/rejected": -18.607919692993164, "logps/chosen": -357.1017150878906, "logps/rejected": -326.627685546875, "loss": 0.3875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.954385757446289, "rewards/margins": 2.4150948524475098, "rewards/rejected": 1.5392907857894897, "step": 44500 }, { "epoch": 2.0664840521844097, "grad_norm": 125.2986831665039, "learning_rate": 1.7607224105111658e-07, "logits/chosen": -19.020427703857422, "logits/rejected": -18.39706802368164, "logps/chosen": -441.9017639160156, "logps/rejected": -233.71371459960938, "loss": 0.2539, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.149840354919434, "rewards/margins": 2.613252878189087, "rewards/rejected": 1.536587119102478, "step": 44510 }, { "epoch": 2.066948326291843, "grad_norm": 67.7188491821289, "learning_rate": 1.760443846046706e-07, "logits/chosen": -19.247854232788086, "logits/rejected": -19.12099838256836, "logps/chosen": -511.782470703125, "logps/rejected": -549.42529296875, "loss": 1.1527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.185738563537598, "rewards/margins": -0.10606183856725693, "rewards/rejected": 4.291800022125244, "step": 44520 }, { "epoch": 2.0674126003992757, "grad_norm": 172.84593200683594, "learning_rate": 1.7601652815822463e-07, "logits/chosen": -18.723529815673828, "logits/rejected": -17.933542251586914, "logps/chosen": -424.7344665527344, 
"logps/rejected": -334.8087158203125, "loss": 1.7206, "rewards/accuracies": 0.5, "rewards/chosen": 2.235487222671509, "rewards/margins": -0.6362762451171875, "rewards/rejected": 2.8717637062072754, "step": 44530 }, { "epoch": 2.067876874506709, "grad_norm": 8.636551856994629, "learning_rate": 1.7598867171177862e-07, "logits/chosen": -19.477014541625977, "logits/rejected": -18.783681869506836, "logps/chosen": -468.47955322265625, "logps/rejected": -386.98724365234375, "loss": 0.6748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.680428981781006, "rewards/margins": 1.0951136350631714, "rewards/rejected": 2.585314989089966, "step": 44540 }, { "epoch": 2.0683411486141416, "grad_norm": 147.58848571777344, "learning_rate": 1.7596081526533263e-07, "logits/chosen": -19.648693084716797, "logits/rejected": -18.366378784179688, "logps/chosen": -393.5489196777344, "logps/rejected": -281.8590393066406, "loss": 0.5146, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.467219114303589, "rewards/margins": 1.511455774307251, "rewards/rejected": 1.955763578414917, "step": 44550 }, { "epoch": 2.068805422721575, "grad_norm": 169.3972930908203, "learning_rate": 1.7593295881888668e-07, "logits/chosen": -18.840120315551758, "logits/rejected": -18.754175186157227, "logps/chosen": -297.1436462402344, "logps/rejected": -247.41702270507812, "loss": 1.0932, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8326544761657715, "rewards/margins": -0.04342663288116455, "rewards/rejected": 2.8760812282562256, "step": 44560 }, { "epoch": 2.0692696968290076, "grad_norm": 261.27569580078125, "learning_rate": 1.759051023724407e-07, "logits/chosen": -19.310455322265625, "logits/rejected": -18.46975326538086, "logps/chosen": -517.1856689453125, "logps/rejected": -347.1719970703125, "loss": 0.4705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0084123611450195, "rewards/margins": 1.328107237815857, "rewards/rejected": 1.6803048849105835, "step": 44570 
}, { "epoch": 2.069733970936441, "grad_norm": 16.293197631835938, "learning_rate": 1.7587724592599468e-07, "logits/chosen": -19.733945846557617, "logits/rejected": -18.28040313720703, "logps/chosen": -417.2108459472656, "logps/rejected": -265.2725830078125, "loss": 0.4098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7888424396514893, "rewards/margins": 1.7602672576904297, "rewards/rejected": 2.0285751819610596, "step": 44580 }, { "epoch": 2.070198245043874, "grad_norm": 44.42853546142578, "learning_rate": 1.7584938947954872e-07, "logits/chosen": -19.595991134643555, "logits/rejected": -19.177730560302734, "logps/chosen": -486.16326904296875, "logps/rejected": -304.70648193359375, "loss": 0.7784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9343128204345703, "rewards/margins": 0.8082244992256165, "rewards/rejected": 2.1260883808135986, "step": 44590 }, { "epoch": 2.070662519151307, "grad_norm": 26.652912139892578, "learning_rate": 1.7582153303310273e-07, "logits/chosen": -18.41135025024414, "logits/rejected": -18.06036376953125, "logps/chosen": -347.896728515625, "logps/rejected": -387.7835998535156, "loss": 0.7948, "rewards/accuracies": 0.5, "rewards/chosen": 3.217651844024658, "rewards/margins": 0.20938155055046082, "rewards/rejected": 3.008270025253296, "step": 44600 }, { "epoch": 2.07112679325874, "grad_norm": 2.0852251052856445, "learning_rate": 1.7579367658665677e-07, "logits/chosen": -19.363893508911133, "logits/rejected": -18.072673797607422, "logps/chosen": -470.140869140625, "logps/rejected": -336.05010986328125, "loss": 0.4284, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.78839373588562, "rewards/margins": 1.3318297863006592, "rewards/rejected": 2.456564426422119, "step": 44610 }, { "epoch": 2.071591067366173, "grad_norm": 9.711477279663086, "learning_rate": 1.7576582014021076e-07, "logits/chosen": -18.644031524658203, "logits/rejected": -17.640256881713867, "logps/chosen": -379.9989318847656, 
"logps/rejected": -334.5809631347656, "loss": 0.532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.349229097366333, "rewards/margins": 1.4236116409301758, "rewards/rejected": 1.9256172180175781, "step": 44620 }, { "epoch": 2.072055341473606, "grad_norm": 137.0191650390625, "learning_rate": 1.7573796369376478e-07, "logits/chosen": -19.369415283203125, "logits/rejected": -18.23708152770996, "logps/chosen": -485.412353515625, "logps/rejected": -430.96307373046875, "loss": 0.6284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7604777812957764, "rewards/margins": 0.9762662649154663, "rewards/rejected": 2.7842118740081787, "step": 44630 }, { "epoch": 2.0725196155810393, "grad_norm": 55.39387512207031, "learning_rate": 1.7571010724731882e-07, "logits/chosen": -19.69603157043457, "logits/rejected": -19.433137893676758, "logps/chosen": -477.5741271972656, "logps/rejected": -436.3043518066406, "loss": 0.7449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6329219341278076, "rewards/margins": 0.43751344084739685, "rewards/rejected": 3.195408821105957, "step": 44640 }, { "epoch": 2.072983889688472, "grad_norm": 124.00715637207031, "learning_rate": 1.756822508008728e-07, "logits/chosen": -20.12136459350586, "logits/rejected": -18.761667251586914, "logps/chosen": -492.81097412109375, "logps/rejected": -392.8799133300781, "loss": 0.3821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9087109565734863, "rewards/margins": 1.6318395137786865, "rewards/rejected": 2.276871919631958, "step": 44650 }, { "epoch": 2.0734481637959052, "grad_norm": 67.90758514404297, "learning_rate": 1.7565439435442685e-07, "logits/chosen": -19.978328704833984, "logits/rejected": -18.4287166595459, "logps/chosen": -442.40625, "logps/rejected": -306.25933837890625, "loss": 0.4825, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4551684856414795, "rewards/margins": 1.0436527729034424, "rewards/rejected": 2.411515951156616, "step": 44660 
}, { "epoch": 2.073912437903338, "grad_norm": 232.2259979248047, "learning_rate": 1.7562653790798086e-07, "logits/chosen": -19.225555419921875, "logits/rejected": -18.727535247802734, "logps/chosen": -432.62542724609375, "logps/rejected": -312.0566101074219, "loss": 0.7509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3122475147247314, "rewards/margins": 1.0609501600265503, "rewards/rejected": 2.2512974739074707, "step": 44670 }, { "epoch": 2.0743767120107712, "grad_norm": 55.64809036254883, "learning_rate": 1.755986814615349e-07, "logits/chosen": -18.8411808013916, "logits/rejected": -18.41034507751465, "logps/chosen": -383.4222717285156, "logps/rejected": -329.9846496582031, "loss": 0.9754, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0896987915039062, "rewards/margins": 0.29293161630630493, "rewards/rejected": 2.796767234802246, "step": 44680 }, { "epoch": 2.074840986118204, "grad_norm": 2.417912244796753, "learning_rate": 1.755708250150889e-07, "logits/chosen": -18.722652435302734, "logits/rejected": -18.005809783935547, "logps/chosen": -345.97735595703125, "logps/rejected": -242.33480834960938, "loss": 0.7572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.886904001235962, "rewards/margins": 0.9736120104789734, "rewards/rejected": 1.9132919311523438, "step": 44690 }, { "epoch": 2.075305260225637, "grad_norm": 105.98374938964844, "learning_rate": 1.755429685686429e-07, "logits/chosen": -19.40703010559082, "logits/rejected": -19.791149139404297, "logps/chosen": -355.08123779296875, "logps/rejected": -392.16436767578125, "loss": 1.4359, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0062320232391357, "rewards/margins": -0.725684642791748, "rewards/rejected": 3.7319164276123047, "step": 44700 }, { "epoch": 2.0757695343330704, "grad_norm": 5.728963375091553, "learning_rate": 1.7551511212219695e-07, "logits/chosen": -19.844928741455078, "logits/rejected": -18.890043258666992, "logps/chosen": 
-445.54949951171875, "logps/rejected": -375.0138854980469, "loss": 0.6856, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9857287406921387, "rewards/margins": 1.152517557144165, "rewards/rejected": 1.8332111835479736, "step": 44710 }, { "epoch": 2.076233808440503, "grad_norm": 42.900230407714844, "learning_rate": 1.7548725567575096e-07, "logits/chosen": -19.25946044921875, "logits/rejected": -18.652814865112305, "logps/chosen": -440.08648681640625, "logps/rejected": -336.4974670410156, "loss": 0.4137, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.80619740486145, "rewards/margins": 1.2599503993988037, "rewards/rejected": 2.5462467670440674, "step": 44720 }, { "epoch": 2.0766980825479364, "grad_norm": 50.21938705444336, "learning_rate": 1.7545939922930495e-07, "logits/chosen": -18.00238800048828, "logits/rejected": -17.361125946044922, "logps/chosen": -259.5851135253906, "logps/rejected": -196.58799743652344, "loss": 0.6423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5850751399993896, "rewards/margins": 1.0881513357162476, "rewards/rejected": 1.4969236850738525, "step": 44730 }, { "epoch": 2.077162356655369, "grad_norm": 95.96946716308594, "learning_rate": 1.75431542782859e-07, "logits/chosen": -19.47859764099121, "logits/rejected": -19.206205368041992, "logps/chosen": -456.20867919921875, "logps/rejected": -365.99810791015625, "loss": 0.6637, "rewards/accuracies": 0.5, "rewards/chosen": 3.2944164276123047, "rewards/margins": 0.7802046537399292, "rewards/rejected": 2.514211893081665, "step": 44740 }, { "epoch": 2.0776266307628024, "grad_norm": 136.41552734375, "learning_rate": 1.75403686336413e-07, "logits/chosen": -19.116857528686523, "logits/rejected": -18.119487762451172, "logps/chosen": -355.6654357910156, "logps/rejected": -228.84597778320312, "loss": 0.515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6505208015441895, "rewards/margins": 1.304560899734497, "rewards/rejected": 1.345960021018982, 
"step": 44750 }, { "epoch": 2.078090904870235, "grad_norm": 26.895662307739258, "learning_rate": 1.7537582988996705e-07, "logits/chosen": -18.891613006591797, "logits/rejected": -18.692760467529297, "logps/chosen": -375.5610046386719, "logps/rejected": -390.90924072265625, "loss": 0.7503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.12009596824646, "rewards/margins": 0.7250837087631226, "rewards/rejected": 2.395012378692627, "step": 44760 }, { "epoch": 2.0785551789776684, "grad_norm": 131.4099884033203, "learning_rate": 1.7534797344352103e-07, "logits/chosen": -18.484676361083984, "logits/rejected": -17.51633644104004, "logps/chosen": -494.3428649902344, "logps/rejected": -286.5079650878906, "loss": 0.7815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8911499977111816, "rewards/margins": 1.8518129587173462, "rewards/rejected": 2.039336919784546, "step": 44770 }, { "epoch": 2.0790194530851016, "grad_norm": 16.15654182434082, "learning_rate": 1.7532011699707507e-07, "logits/chosen": -18.67760467529297, "logits/rejected": -18.952899932861328, "logps/chosen": -305.9217224121094, "logps/rejected": -365.4601135253906, "loss": 0.948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7581052780151367, "rewards/margins": -0.15051111578941345, "rewards/rejected": 2.908616304397583, "step": 44780 }, { "epoch": 2.0794837271925344, "grad_norm": 18.99495506286621, "learning_rate": 1.752922605506291e-07, "logits/chosen": -19.357990264892578, "logits/rejected": -18.37828826904297, "logps/chosen": -488.5211486816406, "logps/rejected": -334.16094970703125, "loss": 0.441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4849486351013184, "rewards/margins": 1.585922122001648, "rewards/rejected": 1.8990265130996704, "step": 44790 }, { "epoch": 2.0799480012999676, "grad_norm": 9.316580772399902, "learning_rate": 1.752644041041831e-07, "logits/chosen": -18.24942398071289, "logits/rejected": -18.57292938232422, "logps/chosen": 
-260.1778869628906, "logps/rejected": -282.9967041015625, "loss": 0.5282, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.746279716491699, "rewards/margins": 0.7278445959091187, "rewards/rejected": 2.018435001373291, "step": 44800 }, { "epoch": 2.0804122754074004, "grad_norm": 49.61796569824219, "learning_rate": 1.7523654765773712e-07, "logits/chosen": -18.90252685546875, "logits/rejected": -18.600723266601562, "logps/chosen": -388.788818359375, "logps/rejected": -348.4111022949219, "loss": 0.3806, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6959757804870605, "rewards/margins": 1.2105686664581299, "rewards/rejected": 2.4854073524475098, "step": 44810 }, { "epoch": 2.0808765495148336, "grad_norm": 43.050437927246094, "learning_rate": 1.7520869121129113e-07, "logits/chosen": -18.873743057250977, "logits/rejected": -17.355154037475586, "logps/chosen": -397.21307373046875, "logps/rejected": -277.22576904296875, "loss": 0.4283, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.681992292404175, "rewards/margins": 1.8421051502227783, "rewards/rejected": 1.839887261390686, "step": 44820 }, { "epoch": 2.081340823622267, "grad_norm": 86.26041412353516, "learning_rate": 1.7518083476484517e-07, "logits/chosen": -19.784093856811523, "logits/rejected": -18.089096069335938, "logps/chosen": -382.78167724609375, "logps/rejected": -301.7738342285156, "loss": 0.6355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6513493061065674, "rewards/margins": 1.3244340419769287, "rewards/rejected": 2.3269155025482178, "step": 44830 }, { "epoch": 2.0818050977296996, "grad_norm": 255.88941955566406, "learning_rate": 1.751529783183992e-07, "logits/chosen": -19.532222747802734, "logits/rejected": -18.242219924926758, "logps/chosen": -542.1939697265625, "logps/rejected": -448.64215087890625, "loss": 0.5703, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.275721549987793, "rewards/margins": 1.3533591032028198, "rewards/rejected": 
2.9223620891571045, "step": 44840 }, { "epoch": 2.082269371837133, "grad_norm": 88.35637664794922, "learning_rate": 1.7512512187195318e-07, "logits/chosen": -18.974336624145508, "logits/rejected": -18.30167007446289, "logps/chosen": -354.035400390625, "logps/rejected": -289.24920654296875, "loss": 0.3538, "rewards/accuracies": 1.0, "rewards/chosen": 2.941277027130127, "rewards/margins": 1.0670057535171509, "rewards/rejected": 1.8742711544036865, "step": 44850 }, { "epoch": 2.0827336459445656, "grad_norm": 173.15701293945312, "learning_rate": 1.7509726542550722e-07, "logits/chosen": -18.026037216186523, "logits/rejected": -17.518672943115234, "logps/chosen": -410.51043701171875, "logps/rejected": -335.9383850097656, "loss": 0.9807, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5145657062530518, "rewards/margins": 0.6367160677909851, "rewards/rejected": 2.8778491020202637, "step": 44860 }, { "epoch": 2.0831979200519988, "grad_norm": 92.30205535888672, "learning_rate": 1.7506940897906123e-07, "logits/chosen": -18.466949462890625, "logits/rejected": -18.565792083740234, "logps/chosen": -345.2665100097656, "logps/rejected": -309.40972900390625, "loss": 0.6308, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7649130821228027, "rewards/margins": 0.28838473558425903, "rewards/rejected": 2.4765284061431885, "step": 44870 }, { "epoch": 2.0836621941594315, "grad_norm": 14.354708671569824, "learning_rate": 1.7504155253261522e-07, "logits/chosen": -18.86454200744629, "logits/rejected": -18.93781089782715, "logps/chosen": -356.05194091796875, "logps/rejected": -296.5145568847656, "loss": 0.5569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5905256271362305, "rewards/margins": 0.5711835026741028, "rewards/rejected": 2.0193419456481934, "step": 44880 }, { "epoch": 2.0841264682668648, "grad_norm": 3.7465579509735107, "learning_rate": 1.7501369608616926e-07, "logits/chosen": -18.32306480407715, "logits/rejected": -17.616308212280273, 
"logps/chosen": -364.08404541015625, "logps/rejected": -235.5713653564453, "loss": 0.5321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.553135633468628, "rewards/margins": 1.1798886060714722, "rewards/rejected": 1.3732473850250244, "step": 44890 }, { "epoch": 2.084590742374298, "grad_norm": 10.309385299682617, "learning_rate": 1.7498583963972328e-07, "logits/chosen": -19.481367111206055, "logits/rejected": -17.969364166259766, "logps/chosen": -361.46697998046875, "logps/rejected": -282.885498046875, "loss": 0.786, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1579010486602783, "rewards/margins": 0.995769202709198, "rewards/rejected": 2.1621317863464355, "step": 44900 }, { "epoch": 2.0850550164817307, "grad_norm": 0.1810777485370636, "learning_rate": 1.7495798319327732e-07, "logits/chosen": -19.37551498413086, "logits/rejected": -18.57275390625, "logps/chosen": -511.6029357910156, "logps/rejected": -349.6181335449219, "loss": 0.3434, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.330192565917969, "rewards/margins": 1.9212894439697266, "rewards/rejected": 2.4089033603668213, "step": 44910 }, { "epoch": 2.085519290589164, "grad_norm": 135.6461181640625, "learning_rate": 1.749301267468313e-07, "logits/chosen": -19.589847564697266, "logits/rejected": -19.172073364257812, "logps/chosen": -333.0442810058594, "logps/rejected": -437.7225036621094, "loss": 0.992, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.688629150390625, "rewards/margins": 0.5623828172683716, "rewards/rejected": 3.1262459754943848, "step": 44920 }, { "epoch": 2.0859835646965967, "grad_norm": 6.927517414093018, "learning_rate": 1.7490227030038535e-07, "logits/chosen": -19.06864356994629, "logits/rejected": -18.11064338684082, "logps/chosen": -334.7294006347656, "logps/rejected": -217.57034301757812, "loss": 0.542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0122177600860596, "rewards/margins": 1.237630844116211, 
"rewards/rejected": 1.7745869159698486, "step": 44930 }, { "epoch": 2.08644783880403, "grad_norm": 32.8173828125, "learning_rate": 1.7487441385393936e-07, "logits/chosen": -19.413368225097656, "logits/rejected": -18.367389678955078, "logps/chosen": -419.8949279785156, "logps/rejected": -271.95611572265625, "loss": 0.4217, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.018387794494629, "rewards/margins": 1.874881386756897, "rewards/rejected": 2.1435065269470215, "step": 44940 }, { "epoch": 2.0869121129114627, "grad_norm": 5.5310468673706055, "learning_rate": 1.748465574074934e-07, "logits/chosen": -19.422618865966797, "logits/rejected": -18.20261573791504, "logps/chosen": -380.19830322265625, "logps/rejected": -256.9037780761719, "loss": 0.3578, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.449202299118042, "rewards/margins": 2.3019967079162598, "rewards/rejected": 1.1472053527832031, "step": 44950 }, { "epoch": 2.087376387018896, "grad_norm": 180.86221313476562, "learning_rate": 1.748187009610474e-07, "logits/chosen": -19.02720069885254, "logits/rejected": -18.823776245117188, "logps/chosen": -401.9709167480469, "logps/rejected": -326.5409240722656, "loss": 0.2885, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.115393161773682, "rewards/margins": 1.6000277996063232, "rewards/rejected": 2.5153648853302, "step": 44960 }, { "epoch": 2.087840661126329, "grad_norm": 49.074039459228516, "learning_rate": 1.747908445146014e-07, "logits/chosen": -19.165386199951172, "logits/rejected": -18.87176513671875, "logps/chosen": -383.01849365234375, "logps/rejected": -374.971435546875, "loss": 1.2402, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5399489402770996, "rewards/margins": 0.02189772203564644, "rewards/rejected": 3.518051862716675, "step": 44970 }, { "epoch": 2.088304935233762, "grad_norm": 16.963712692260742, "learning_rate": 1.7476298806815544e-07, "logits/chosen": -19.82183265686035, "logits/rejected": 
-18.865406036376953, "logps/chosen": -282.94427490234375, "logps/rejected": -258.9875183105469, "loss": 0.7849, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1256027221679688, "rewards/margins": 0.8174245953559875, "rewards/rejected": 2.308178186416626, "step": 44980 }, { "epoch": 2.088769209341195, "grad_norm": 2.451597213745117, "learning_rate": 1.7473513162170946e-07, "logits/chosen": -19.137975692749023, "logits/rejected": -17.98704719543457, "logps/chosen": -495.1014099121094, "logps/rejected": -339.9453125, "loss": 0.7436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.040797233581543, "rewards/margins": 1.5189576148986816, "rewards/rejected": 2.5218396186828613, "step": 44990 }, { "epoch": 2.089233483448628, "grad_norm": 95.8475570678711, "learning_rate": 1.7470727517526345e-07, "logits/chosen": -17.949758529663086, "logits/rejected": -18.547672271728516, "logps/chosen": -313.19378662109375, "logps/rejected": -321.04583740234375, "loss": 1.0809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.549393892288208, "rewards/margins": -0.14918816089630127, "rewards/rejected": 2.6985819339752197, "step": 45000 }, { "epoch": 2.089697757556061, "grad_norm": 138.4458770751953, "learning_rate": 1.746794187288175e-07, "logits/chosen": -19.520061492919922, "logits/rejected": -18.457401275634766, "logps/chosen": -491.78546142578125, "logps/rejected": -387.216552734375, "loss": 0.4021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.660305976867676, "rewards/margins": 1.7370083332061768, "rewards/rejected": 2.923297882080078, "step": 45010 }, { "epoch": 2.0901620316634943, "grad_norm": 128.75851440429688, "learning_rate": 1.746515622823715e-07, "logits/chosen": -18.987873077392578, "logits/rejected": -19.789899826049805, "logps/chosen": -318.4842224121094, "logps/rejected": -315.0902404785156, "loss": 0.9013, "rewards/accuracies": 0.5, "rewards/chosen": 2.0345842838287354, "rewards/margins": -0.024183785542845726, 
"rewards/rejected": 2.0587680339813232, "step": 45020 }, { "epoch": 2.090626305770927, "grad_norm": 101.9997329711914, "learning_rate": 1.7462370583592554e-07, "logits/chosen": -19.058731079101562, "logits/rejected": -19.244667053222656, "logps/chosen": -451.913330078125, "logps/rejected": -411.9163513183594, "loss": 0.5052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.406680107116699, "rewards/margins": 1.1978719234466553, "rewards/rejected": 3.2088077068328857, "step": 45030 }, { "epoch": 2.0910905798783603, "grad_norm": 0.7860685586929321, "learning_rate": 1.7459584938947953e-07, "logits/chosen": -20.078981399536133, "logits/rejected": -19.018678665161133, "logps/chosen": -384.63824462890625, "logps/rejected": -303.71527099609375, "loss": 0.5109, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9444453716278076, "rewards/margins": 1.2063599824905396, "rewards/rejected": 2.738085985183716, "step": 45040 }, { "epoch": 2.091554853985793, "grad_norm": 85.88626098632812, "learning_rate": 1.7456799294303355e-07, "logits/chosen": -18.70471954345703, "logits/rejected": -18.645917892456055, "logps/chosen": -383.76470947265625, "logps/rejected": -360.8020324707031, "loss": 0.9985, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.537511348724365, "rewards/margins": 0.7123605012893677, "rewards/rejected": 3.825150966644287, "step": 45050 }, { "epoch": 2.0920191280932263, "grad_norm": 271.5236511230469, "learning_rate": 1.745401364965876e-07, "logits/chosen": -17.857505798339844, "logits/rejected": -17.871511459350586, "logps/chosen": -327.5447082519531, "logps/rejected": -351.62799072265625, "loss": 0.584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.306683301925659, "rewards/margins": 1.4588332176208496, "rewards/rejected": 1.8478502035140991, "step": 45060 }, { "epoch": 2.092483402200659, "grad_norm": 92.13377380371094, "learning_rate": 1.7451228005014158e-07, "logits/chosen": -19.38056182861328, "logits/rejected": 
-19.247562408447266, "logps/chosen": -361.73468017578125, "logps/rejected": -351.6829528808594, "loss": 0.7767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7660820484161377, "rewards/margins": 0.9210667610168457, "rewards/rejected": 2.845015048980713, "step": 45070 }, { "epoch": 2.0929476763080923, "grad_norm": 11.845900535583496, "learning_rate": 1.7448442360369562e-07, "logits/chosen": -19.284732818603516, "logits/rejected": -18.02849006652832, "logps/chosen": -402.28631591796875, "logps/rejected": -295.60601806640625, "loss": 0.2932, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.250746726989746, "rewards/margins": 2.5216355323791504, "rewards/rejected": 1.7291114330291748, "step": 45080 }, { "epoch": 2.0934119504155255, "grad_norm": 17.670997619628906, "learning_rate": 1.7445656715724963e-07, "logits/chosen": -19.431598663330078, "logits/rejected": -18.387205123901367, "logps/chosen": -416.55316162109375, "logps/rejected": -314.4508972167969, "loss": 0.37, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2160122394561768, "rewards/margins": 1.0823078155517578, "rewards/rejected": 2.13370418548584, "step": 45090 }, { "epoch": 2.0938762245229583, "grad_norm": 52.18212127685547, "learning_rate": 1.7442871071080367e-07, "logits/chosen": -18.568693161010742, "logits/rejected": -18.544506072998047, "logps/chosen": -413.8140563964844, "logps/rejected": -432.1348571777344, "loss": 1.0612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.323864459991455, "rewards/margins": 0.27044421434402466, "rewards/rejected": 3.053420066833496, "step": 45100 }, { "epoch": 2.0943404986303915, "grad_norm": 226.84617614746094, "learning_rate": 1.7440085426435766e-07, "logits/chosen": -20.260498046875, "logits/rejected": -18.830162048339844, "logps/chosen": -495.0201721191406, "logps/rejected": -358.76580810546875, "loss": 0.5688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3373212814331055, "rewards/margins": 
1.4243004322052002, "rewards/rejected": 2.913020372390747, "step": 45110 }, { "epoch": 2.0948047727378243, "grad_norm": 3.079834222793579, "learning_rate": 1.7437299781791167e-07, "logits/chosen": -20.607364654541016, "logits/rejected": -19.504587173461914, "logps/chosen": -367.55047607421875, "logps/rejected": -296.76641845703125, "loss": 0.5743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7709546089172363, "rewards/margins": 1.547282099723816, "rewards/rejected": 2.2236721515655518, "step": 45120 }, { "epoch": 2.0952690468452575, "grad_norm": 87.18830108642578, "learning_rate": 1.7434514137146572e-07, "logits/chosen": -18.11733627319336, "logits/rejected": -18.104772567749023, "logps/chosen": -396.51678466796875, "logps/rejected": -385.5491027832031, "loss": 1.2168, "rewards/accuracies": 0.5, "rewards/chosen": 2.955420970916748, "rewards/margins": 0.17174692451953888, "rewards/rejected": 2.7836740016937256, "step": 45130 }, { "epoch": 2.0957333209526903, "grad_norm": 56.72617721557617, "learning_rate": 1.7431728492501973e-07, "logits/chosen": -19.068477630615234, "logits/rejected": -18.057079315185547, "logps/chosen": -375.6964111328125, "logps/rejected": -239.65658569335938, "loss": 0.2998, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7546322345733643, "rewards/margins": 1.4418561458587646, "rewards/rejected": 1.3127758502960205, "step": 45140 }, { "epoch": 2.0961975950601235, "grad_norm": 49.9874267578125, "learning_rate": 1.7428942847857372e-07, "logits/chosen": -20.018644332885742, "logits/rejected": -18.355566024780273, "logps/chosen": -442.41455078125, "logps/rejected": -283.7337951660156, "loss": 0.4868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.9746198654174805, "rewards/margins": 2.0884957313537598, "rewards/rejected": 2.8861243724823, "step": 45150 }, { "epoch": 2.0966618691675567, "grad_norm": 52.71002960205078, "learning_rate": 1.7426157203212776e-07, "logits/chosen": -18.775341033935547, 
"logits/rejected": -18.53182029724121, "logps/chosen": -415.207275390625, "logps/rejected": -362.0982360839844, "loss": 1.3734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.767824649810791, "rewards/margins": 0.12024097144603729, "rewards/rejected": 3.6475837230682373, "step": 45160 }, { "epoch": 2.0971261432749895, "grad_norm": 95.01985168457031, "learning_rate": 1.7423371558568177e-07, "logits/chosen": -20.245044708251953, "logits/rejected": -20.229772567749023, "logps/chosen": -422.2225646972656, "logps/rejected": -353.78759765625, "loss": 0.3951, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.772515058517456, "rewards/margins": 1.0426172018051147, "rewards/rejected": 2.729897975921631, "step": 45170 }, { "epoch": 2.0975904173824227, "grad_norm": 107.74561309814453, "learning_rate": 1.7420585913923581e-07, "logits/chosen": -18.250608444213867, "logits/rejected": -17.013917922973633, "logps/chosen": -444.57879638671875, "logps/rejected": -358.5045471191406, "loss": 0.2593, "rewards/accuracies": 1.0, "rewards/chosen": 3.4150421619415283, "rewards/margins": 1.7997581958770752, "rewards/rejected": 1.6152842044830322, "step": 45180 }, { "epoch": 2.0980546914898555, "grad_norm": 81.46308135986328, "learning_rate": 1.741780026927898e-07, "logits/chosen": -18.81185531616211, "logits/rejected": -18.315135955810547, "logps/chosen": -367.7162170410156, "logps/rejected": -332.38934326171875, "loss": 0.6917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.098590612411499, "rewards/margins": 0.32720527052879333, "rewards/rejected": 2.771385431289673, "step": 45190 }, { "epoch": 2.0985189655972887, "grad_norm": 58.41725540161133, "learning_rate": 1.7415014624634384e-07, "logits/chosen": -18.937484741210938, "logits/rejected": -17.529247283935547, "logps/chosen": -301.9396057128906, "logps/rejected": -199.60606384277344, "loss": 0.3814, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.566450834274292, "rewards/margins": 
1.3177027702331543, "rewards/rejected": 1.2487483024597168, "step": 45200 }, { "epoch": 2.0989832397047214, "grad_norm": 15.920256614685059, "learning_rate": 1.7412228979989786e-07, "logits/chosen": -18.64996337890625, "logits/rejected": -18.15752601623535, "logps/chosen": -355.6669921875, "logps/rejected": -300.91656494140625, "loss": 1.2841, "rewards/accuracies": 0.5, "rewards/chosen": 3.171839475631714, "rewards/margins": 0.010754585266113281, "rewards/rejected": 3.1610846519470215, "step": 45210 }, { "epoch": 2.0994475138121547, "grad_norm": 50.22486114501953, "learning_rate": 1.7409443335345187e-07, "logits/chosen": -19.136661529541016, "logits/rejected": -18.611936569213867, "logps/chosen": -347.35931396484375, "logps/rejected": -333.0314636230469, "loss": 1.1653, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2439002990722656, "rewards/margins": 0.27194148302078247, "rewards/rejected": 2.971959352493286, "step": 45220 }, { "epoch": 2.099911787919588, "grad_norm": 81.32830810546875, "learning_rate": 1.740665769070059e-07, "logits/chosen": -18.69002914428711, "logits/rejected": -18.09292221069336, "logps/chosen": -462.31610107421875, "logps/rejected": -354.3712158203125, "loss": 0.667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.702648639678955, "rewards/margins": 1.0771634578704834, "rewards/rejected": 2.6254849433898926, "step": 45230 }, { "epoch": 2.1003760620270207, "grad_norm": 78.34564208984375, "learning_rate": 1.740387204605599e-07, "logits/chosen": -18.07422637939453, "logits/rejected": -17.644092559814453, "logps/chosen": -340.297607421875, "logps/rejected": -301.4516906738281, "loss": 0.8907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.969837188720703, "rewards/margins": 0.5728023648262024, "rewards/rejected": 2.3970344066619873, "step": 45240 }, { "epoch": 2.100840336134454, "grad_norm": 71.44621276855469, "learning_rate": 1.7401086401411394e-07, "logits/chosen": -19.256027221679688, 
"logits/rejected": -18.208213806152344, "logps/chosen": -415.3662109375, "logps/rejected": -301.51507568359375, "loss": 0.5672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9249234199523926, "rewards/margins": 0.8286183476448059, "rewards/rejected": 2.0963053703308105, "step": 45250 }, { "epoch": 2.1013046102418866, "grad_norm": 242.17694091796875, "learning_rate": 1.7398300756766793e-07, "logits/chosen": -19.30593490600586, "logits/rejected": -18.272830963134766, "logps/chosen": -357.6803894042969, "logps/rejected": -374.7660827636719, "loss": 1.0148, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.871051549911499, "rewards/margins": 0.17752626538276672, "rewards/rejected": 2.6935253143310547, "step": 45260 }, { "epoch": 2.10176888434932, "grad_norm": 1.112783670425415, "learning_rate": 1.7395515112122195e-07, "logits/chosen": -20.418855667114258, "logits/rejected": -19.829145431518555, "logps/chosen": -395.27081298828125, "logps/rejected": -324.59375, "loss": 0.6162, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.633040189743042, "rewards/margins": 0.8670123815536499, "rewards/rejected": 2.7660276889801025, "step": 45270 }, { "epoch": 2.102233158456753, "grad_norm": 46.89817428588867, "learning_rate": 1.7392729467477599e-07, "logits/chosen": -18.845439910888672, "logits/rejected": -18.018878936767578, "logps/chosen": -486.83294677734375, "logps/rejected": -386.32855224609375, "loss": 0.577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.615327835083008, "rewards/margins": 1.1661850214004517, "rewards/rejected": 3.4491424560546875, "step": 45280 }, { "epoch": 2.102697432564186, "grad_norm": 32.002593994140625, "learning_rate": 1.7389943822833e-07, "logits/chosen": -19.852550506591797, "logits/rejected": -18.135658264160156, "logps/chosen": -396.5814208984375, "logps/rejected": -249.7318878173828, "loss": 0.3232, "rewards/accuracies": 1.0, "rewards/chosen": 3.9891421794891357, "rewards/margins": 
1.8735675811767578, "rewards/rejected": 2.115574836730957, "step": 45290 }, { "epoch": 2.103161706671619, "grad_norm": 266.0830078125, "learning_rate": 1.73871581781884e-07, "logits/chosen": -18.80896759033203, "logits/rejected": -18.413576126098633, "logps/chosen": -353.1339111328125, "logps/rejected": -322.14862060546875, "loss": 0.5878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4575889110565186, "rewards/margins": 0.5567654371261597, "rewards/rejected": 1.9008235931396484, "step": 45300 }, { "epoch": 2.103625980779052, "grad_norm": 62.98097229003906, "learning_rate": 1.7384372533543803e-07, "logits/chosen": -20.025665283203125, "logits/rejected": -19.08807373046875, "logps/chosen": -417.40380859375, "logps/rejected": -337.65191650390625, "loss": 0.2878, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3842780590057373, "rewards/margins": 1.9707019329071045, "rewards/rejected": 1.413576364517212, "step": 45310 }, { "epoch": 2.104090254886485, "grad_norm": 7.520932197570801, "learning_rate": 1.7381586888899204e-07, "logits/chosen": -19.34940528869629, "logits/rejected": -18.54994773864746, "logps/chosen": -371.01373291015625, "logps/rejected": -329.2674255371094, "loss": 0.6667, "rewards/accuracies": 0.5, "rewards/chosen": 3.2381389141082764, "rewards/margins": 0.6865535974502563, "rewards/rejected": 2.5515856742858887, "step": 45320 }, { "epoch": 2.104554528993918, "grad_norm": 12.936332702636719, "learning_rate": 1.7378801244254609e-07, "logits/chosen": -20.5094051361084, "logits/rejected": -20.213897705078125, "logps/chosen": -655.787109375, "logps/rejected": -350.7814636230469, "loss": 0.3752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.5232415199279785, "rewards/margins": 1.3692258596420288, "rewards/rejected": 3.15401554107666, "step": 45330 }, { "epoch": 2.105018803101351, "grad_norm": 220.5701904296875, "learning_rate": 1.7376015599610007e-07, "logits/chosen": -19.510953903198242, "logits/rejected": 
-19.30730438232422, "logps/chosen": -401.8267822265625, "logps/rejected": -396.67999267578125, "loss": 0.9264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9834675788879395, "rewards/margins": 0.39807844161987305, "rewards/rejected": 2.5853888988494873, "step": 45340 }, { "epoch": 2.1054830772087842, "grad_norm": 9.447869300842285, "learning_rate": 1.7373229954965411e-07, "logits/chosen": -18.299694061279297, "logits/rejected": -17.45475196838379, "logps/chosen": -430.6934509277344, "logps/rejected": -312.00128173828125, "loss": 0.452, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7993931770324707, "rewards/margins": 1.715158224105835, "rewards/rejected": 2.0842347145080566, "step": 45350 }, { "epoch": 2.105947351316217, "grad_norm": 221.4174346923828, "learning_rate": 1.7370444310320813e-07, "logits/chosen": -18.427982330322266, "logits/rejected": -17.518367767333984, "logps/chosen": -379.97430419921875, "logps/rejected": -318.9420471191406, "loss": 0.6904, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0422914028167725, "rewards/margins": 1.079638123512268, "rewards/rejected": 1.9626535177230835, "step": 45360 }, { "epoch": 2.1064116254236502, "grad_norm": 12.166695594787598, "learning_rate": 1.7367658665676217e-07, "logits/chosen": -18.40171241760254, "logits/rejected": -17.4559268951416, "logps/chosen": -485.9525451660156, "logps/rejected": -342.0303955078125, "loss": 0.7352, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7110595703125, "rewards/margins": 1.4320790767669678, "rewards/rejected": 2.2789809703826904, "step": 45370 }, { "epoch": 2.106875899531083, "grad_norm": 2.497629404067993, "learning_rate": 1.7364873021031616e-07, "logits/chosen": -19.162992477416992, "logits/rejected": -17.684280395507812, "logps/chosen": -428.18994140625, "logps/rejected": -275.0762939453125, "loss": 0.5903, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.94635272026062, "rewards/margins": 
1.260811686515808, "rewards/rejected": 2.6855411529541016, "step": 45380 }, { "epoch": 2.1073401736385162, "grad_norm": 158.58067321777344, "learning_rate": 1.7362087376387017e-07, "logits/chosen": -18.458494186401367, "logits/rejected": -18.924510955810547, "logps/chosen": -354.5393371582031, "logps/rejected": -362.27667236328125, "loss": 1.1894, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.799330949783325, "rewards/margins": 0.026651550084352493, "rewards/rejected": 3.772679567337036, "step": 45390 }, { "epoch": 2.107804447745949, "grad_norm": 13.166871070861816, "learning_rate": 1.7359301731742421e-07, "logits/chosen": -19.25302505493164, "logits/rejected": -18.459863662719727, "logps/chosen": -318.33636474609375, "logps/rejected": -275.3372802734375, "loss": 0.4034, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4758810997009277, "rewards/margins": 0.9545413851737976, "rewards/rejected": 1.521339774131775, "step": 45400 }, { "epoch": 2.108268721853382, "grad_norm": 105.39386749267578, "learning_rate": 1.7356516087097823e-07, "logits/chosen": -19.27933692932129, "logits/rejected": -18.381023406982422, "logps/chosen": -440.25634765625, "logps/rejected": -334.8729248046875, "loss": 0.3269, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3523166179656982, "rewards/margins": 2.1305580139160156, "rewards/rejected": 1.2217589616775513, "step": 45410 }, { "epoch": 2.1087329959608154, "grad_norm": 34.743831634521484, "learning_rate": 1.7353730442453222e-07, "logits/chosen": -19.81722068786621, "logits/rejected": -19.124584197998047, "logps/chosen": -494.3219299316406, "logps/rejected": -406.2067565917969, "loss": 0.8051, "rewards/accuracies": 0.5, "rewards/chosen": 4.089535713195801, "rewards/margins": 0.5730977654457092, "rewards/rejected": 3.5164387226104736, "step": 45420 }, { "epoch": 2.109197270068248, "grad_norm": 241.57325744628906, "learning_rate": 1.7350944797808626e-07, "logits/chosen": -18.899816513061523, 
"logits/rejected": -18.388916015625, "logps/chosen": -450.2947692871094, "logps/rejected": -474.38201904296875, "loss": 0.9453, "rewards/accuracies": 0.5, "rewards/chosen": 3.4255752563476562, "rewards/margins": 0.40719231963157654, "rewards/rejected": 3.0183827877044678, "step": 45430 }, { "epoch": 2.1096615441756814, "grad_norm": 94.55707550048828, "learning_rate": 1.7348159153164027e-07, "logits/chosen": -19.14834213256836, "logits/rejected": -18.55785369873047, "logps/chosen": -423.7290954589844, "logps/rejected": -365.80474853515625, "loss": 0.5304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.313174247741699, "rewards/margins": 1.3427890539169312, "rewards/rejected": 2.9703848361968994, "step": 45440 }, { "epoch": 2.110125818283114, "grad_norm": 27.9514217376709, "learning_rate": 1.734537350851943e-07, "logits/chosen": -19.450946807861328, "logits/rejected": -18.354660034179688, "logps/chosen": -380.1578674316406, "logps/rejected": -219.4210968017578, "loss": 0.4431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6651763916015625, "rewards/margins": 1.6740745306015015, "rewards/rejected": 1.991101861000061, "step": 45450 }, { "epoch": 2.1105900923905474, "grad_norm": 299.7472229003906, "learning_rate": 1.734258786387483e-07, "logits/chosen": -19.117511749267578, "logits/rejected": -18.768184661865234, "logps/chosen": -423.35650634765625, "logps/rejected": -327.5719299316406, "loss": 0.8044, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.877671480178833, "rewards/margins": 0.7473981380462646, "rewards/rejected": 2.1302735805511475, "step": 45460 }, { "epoch": 2.1110543664979806, "grad_norm": 35.9371337890625, "learning_rate": 1.7339802219230232e-07, "logits/chosen": -18.057968139648438, "logits/rejected": -17.456024169921875, "logps/chosen": -427.1298828125, "logps/rejected": -330.8240051269531, "loss": 0.7996, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.00144100189209, "rewards/margins": 
1.3461687564849854, "rewards/rejected": 2.6552722454071045, "step": 45470 }, { "epoch": 2.1115186406054134, "grad_norm": 39.171478271484375, "learning_rate": 1.7337016574585636e-07, "logits/chosen": -18.997203826904297, "logits/rejected": -18.05928611755371, "logps/chosen": -352.83648681640625, "logps/rejected": -289.77752685546875, "loss": 0.5065, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9717471599578857, "rewards/margins": 0.8550189733505249, "rewards/rejected": 2.1167283058166504, "step": 45480 }, { "epoch": 2.1119829147128466, "grad_norm": 0.46810102462768555, "learning_rate": 1.7334230929941034e-07, "logits/chosen": -18.84842872619629, "logits/rejected": -17.545032501220703, "logps/chosen": -487.47705078125, "logps/rejected": -352.52923583984375, "loss": 0.7523, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.977691650390625, "rewards/margins": 1.4955189228057861, "rewards/rejected": 2.4821724891662598, "step": 45490 }, { "epoch": 2.1124471888202794, "grad_norm": 0.31340664625167847, "learning_rate": 1.7331445285296439e-07, "logits/chosen": -19.449481964111328, "logits/rejected": -18.41464614868164, "logps/chosen": -352.7217102050781, "logps/rejected": -283.2828674316406, "loss": 1.2562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7301132678985596, "rewards/margins": 0.6192938089370728, "rewards/rejected": 2.1108193397521973, "step": 45500 }, { "epoch": 2.1129114629277126, "grad_norm": 105.11309051513672, "learning_rate": 1.732865964065184e-07, "logits/chosen": -18.633853912353516, "logits/rejected": -18.31147575378418, "logps/chosen": -428.18194580078125, "logps/rejected": -387.82073974609375, "loss": 0.7885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8702666759490967, "rewards/margins": 0.9448922872543335, "rewards/rejected": 2.9253745079040527, "step": 45510 }, { "epoch": 2.1133757370351454, "grad_norm": 207.0276641845703, "learning_rate": 1.7325873996007244e-07, "logits/chosen": 
-19.246362686157227, "logits/rejected": -18.609722137451172, "logps/chosen": -523.9429931640625, "logps/rejected": -397.6134948730469, "loss": 0.7715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.102425575256348, "rewards/margins": 0.7089980244636536, "rewards/rejected": 3.393428087234497, "step": 45520 }, { "epoch": 2.1138400111425786, "grad_norm": 36.32840347290039, "learning_rate": 1.7323088351362643e-07, "logits/chosen": -19.848373413085938, "logits/rejected": -19.24761199951172, "logps/chosen": -359.3858947753906, "logps/rejected": -304.2044677734375, "loss": 0.5353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.177283525466919, "rewards/margins": 0.800325870513916, "rewards/rejected": 2.376957416534424, "step": 45530 }, { "epoch": 2.114304285250012, "grad_norm": 23.784997940063477, "learning_rate": 1.7320302706718044e-07, "logits/chosen": -19.388904571533203, "logits/rejected": -18.492961883544922, "logps/chosen": -436.71954345703125, "logps/rejected": -346.8855895996094, "loss": 0.6768, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7504875659942627, "rewards/margins": 1.1260544061660767, "rewards/rejected": 2.6244330406188965, "step": 45540 }, { "epoch": 2.1147685593574446, "grad_norm": 3.0307841300964355, "learning_rate": 1.7317517062073449e-07, "logits/chosen": -18.960451126098633, "logits/rejected": -18.45796775817871, "logps/chosen": -397.2898864746094, "logps/rejected": -393.98406982421875, "loss": 0.5562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.722581386566162, "rewards/margins": 0.9046451449394226, "rewards/rejected": 2.817936420440674, "step": 45550 }, { "epoch": 2.115232833464878, "grad_norm": 191.44540405273438, "learning_rate": 1.731473141742885e-07, "logits/chosen": -19.62628746032715, "logits/rejected": -18.463043212890625, "logps/chosen": -361.0859069824219, "logps/rejected": -274.99609375, "loss": 0.6404, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.575965166091919, "rewards/margins": 0.7658358812332153, "rewards/rejected": 2.810128688812256, "step": 45560 }, { "epoch": 2.1156971075723106, "grad_norm": 141.91525268554688, "learning_rate": 1.731194577278425e-07, "logits/chosen": -19.289676666259766, "logits/rejected": -18.282466888427734, "logps/chosen": -401.4374084472656, "logps/rejected": -357.14385986328125, "loss": 0.7089, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5774993896484375, "rewards/margins": 0.6041358113288879, "rewards/rejected": 2.9733638763427734, "step": 45570 }, { "epoch": 2.1161613816797438, "grad_norm": 58.20608139038086, "learning_rate": 1.7309160128139653e-07, "logits/chosen": -19.46851921081543, "logits/rejected": -18.37114715576172, "logps/chosen": -458.253662109375, "logps/rejected": -302.9862365722656, "loss": 0.3995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.935032606124878, "rewards/margins": 1.6398292779922485, "rewards/rejected": 2.295203685760498, "step": 45580 }, { "epoch": 2.1166256557871765, "grad_norm": 142.12838745117188, "learning_rate": 1.7306374483495054e-07, "logits/chosen": -18.676116943359375, "logits/rejected": -18.609127044677734, "logps/chosen": -330.0765075683594, "logps/rejected": -336.0859375, "loss": 1.0271, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.114576816558838, "rewards/margins": -0.25324639678001404, "rewards/rejected": 3.3678231239318848, "step": 45590 }, { "epoch": 2.1170899298946098, "grad_norm": 13.113527297973633, "learning_rate": 1.7303588838850458e-07, "logits/chosen": -19.147192001342773, "logits/rejected": -17.917224884033203, "logps/chosen": -413.37713623046875, "logps/rejected": -279.78607177734375, "loss": 0.345, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.508136034011841, "rewards/margins": 1.3421474695205688, "rewards/rejected": 2.1659884452819824, "step": 45600 }, { "epoch": 2.117554204002043, "grad_norm": 57.16046142578125, "learning_rate": 
1.7300803194205857e-07, "logits/chosen": -18.887418746948242, "logits/rejected": -18.612735748291016, "logps/chosen": -279.1496887207031, "logps/rejected": -259.02459716796875, "loss": 0.6111, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4580485820770264, "rewards/margins": 0.8705365061759949, "rewards/rejected": 1.5875122547149658, "step": 45610 }, { "epoch": 2.1180184781094757, "grad_norm": 172.23899841308594, "learning_rate": 1.7298017549561261e-07, "logits/chosen": -18.853878021240234, "logits/rejected": -17.959766387939453, "logps/chosen": -461.2688903808594, "logps/rejected": -398.86346435546875, "loss": 0.9689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.292213439941406, "rewards/margins": 1.2518326044082642, "rewards/rejected": 3.0403807163238525, "step": 45620 }, { "epoch": 2.118482752216909, "grad_norm": 25.97888946533203, "learning_rate": 1.7295231904916663e-07, "logits/chosen": -18.466209411621094, "logits/rejected": -18.19417953491211, "logps/chosen": -290.6502685546875, "logps/rejected": -329.2286376953125, "loss": 0.7546, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7036452293395996, "rewards/margins": 1.1475690603256226, "rewards/rejected": 1.556075930595398, "step": 45630 }, { "epoch": 2.1189470263243417, "grad_norm": 27.7051944732666, "learning_rate": 1.7292446260272062e-07, "logits/chosen": -19.2735652923584, "logits/rejected": -18.750003814697266, "logps/chosen": -441.7337951660156, "logps/rejected": -372.0755310058594, "loss": 0.8828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.358748197555542, "rewards/margins": 0.7022448778152466, "rewards/rejected": 2.656503200531006, "step": 45640 }, { "epoch": 2.119411300431775, "grad_norm": 132.40396118164062, "learning_rate": 1.7289660615627466e-07, "logits/chosen": -19.753753662109375, "logits/rejected": -18.889606475830078, "logps/chosen": -384.4528503417969, "logps/rejected": -345.09698486328125, "loss": 0.5776, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.6109745502471924, "rewards/margins": 0.9206514358520508, "rewards/rejected": 2.6903228759765625, "step": 45650 }, { "epoch": 2.119875574539208, "grad_norm": 0.24665723741054535, "learning_rate": 1.7286874970982867e-07, "logits/chosen": -19.134376525878906, "logits/rejected": -18.483495712280273, "logps/chosen": -316.6400146484375, "logps/rejected": -289.04412841796875, "loss": 1.0164, "rewards/accuracies": 0.5, "rewards/chosen": 2.890059471130371, "rewards/margins": 0.6591617465019226, "rewards/rejected": 2.2308976650238037, "step": 45660 }, { "epoch": 2.120339848646641, "grad_norm": 169.8780059814453, "learning_rate": 1.728408932633827e-07, "logits/chosen": -19.37102699279785, "logits/rejected": -19.4296875, "logps/chosen": -351.49310302734375, "logps/rejected": -390.42852783203125, "loss": 1.0126, "rewards/accuracies": 0.5, "rewards/chosen": 3.742690324783325, "rewards/margins": 0.23485930263996124, "rewards/rejected": 3.507830858230591, "step": 45670 }, { "epoch": 2.120804122754074, "grad_norm": 55.52506637573242, "learning_rate": 1.728130368169367e-07, "logits/chosen": -18.651199340820312, "logits/rejected": -17.765403747558594, "logps/chosen": -475.3191833496094, "logps/rejected": -339.4200134277344, "loss": 0.6179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.647439479827881, "rewards/margins": 1.0837814807891846, "rewards/rejected": 2.5636582374572754, "step": 45680 }, { "epoch": 2.121268396861507, "grad_norm": 138.62774658203125, "learning_rate": 1.7278518037049072e-07, "logits/chosen": -19.410099029541016, "logits/rejected": -18.507808685302734, "logps/chosen": -379.132568359375, "logps/rejected": -245.0394287109375, "loss": 0.4891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7097110748291016, "rewards/margins": 1.8887840509414673, "rewards/rejected": 1.8209272623062134, "step": 45690 }, { "epoch": 2.12173267096894, "grad_norm": 37.06282424926758, "learning_rate": 
1.7275732392404476e-07, "logits/chosen": -19.17105484008789, "logits/rejected": -17.938915252685547, "logps/chosen": -456.09857177734375, "logps/rejected": -335.29144287109375, "loss": 0.3474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.112212181091309, "rewards/margins": 1.7117271423339844, "rewards/rejected": 2.4004852771759033, "step": 45700 }, { "epoch": 2.122196945076373, "grad_norm": 10.796565055847168, "learning_rate": 1.7272946747759877e-07, "logits/chosen": -19.453861236572266, "logits/rejected": -18.654071807861328, "logps/chosen": -416.50634765625, "logps/rejected": -278.774658203125, "loss": 0.5979, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.068761110305786, "rewards/margins": 0.896277129650116, "rewards/rejected": 2.1724839210510254, "step": 45710 }, { "epoch": 2.122661219183806, "grad_norm": 58.81774139404297, "learning_rate": 1.7270161103115276e-07, "logits/chosen": -19.085012435913086, "logits/rejected": -20.082294464111328, "logps/chosen": -288.066162109375, "logps/rejected": -381.7801818847656, "loss": 1.5505, "rewards/accuracies": 0.5, "rewards/chosen": 2.9253809452056885, "rewards/margins": -0.8525185585021973, "rewards/rejected": 3.7778992652893066, "step": 45720 }, { "epoch": 2.1231254932912393, "grad_norm": 49.92389678955078, "learning_rate": 1.726737545847068e-07, "logits/chosen": -18.919992446899414, "logits/rejected": -17.92549705505371, "logps/chosen": -429.491455078125, "logps/rejected": -381.40130615234375, "loss": 0.4663, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9477438926696777, "rewards/margins": 1.6728063821792603, "rewards/rejected": 2.274937152862549, "step": 45730 }, { "epoch": 2.123589767398672, "grad_norm": 33.90694046020508, "learning_rate": 1.7264589813826081e-07, "logits/chosen": -20.650514602661133, "logits/rejected": -18.961822509765625, "logps/chosen": -374.42327880859375, "logps/rejected": -314.4123229980469, "loss": 0.6287, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 2.87599778175354, "rewards/margins": 0.920724093914032, "rewards/rejected": 1.9552736282348633, "step": 45740 }, { "epoch": 2.1240540415061053, "grad_norm": 9.28669548034668, "learning_rate": 1.7261804169181486e-07, "logits/chosen": -19.607234954833984, "logits/rejected": -19.023664474487305, "logps/chosen": -487.32757568359375, "logps/rejected": -354.5532531738281, "loss": 0.3217, "rewards/accuracies": 1.0, "rewards/chosen": 3.7537219524383545, "rewards/margins": 1.2369558811187744, "rewards/rejected": 2.51676607131958, "step": 45750 }, { "epoch": 2.124518315613538, "grad_norm": 53.36229705810547, "learning_rate": 1.7259018524536884e-07, "logits/chosen": -18.93229866027832, "logits/rejected": -18.217859268188477, "logps/chosen": -363.70904541015625, "logps/rejected": -317.98699951171875, "loss": 0.7301, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5717437267303467, "rewards/margins": 1.1982452869415283, "rewards/rejected": 2.3734984397888184, "step": 45760 }, { "epoch": 2.1249825897209713, "grad_norm": 71.92171478271484, "learning_rate": 1.7256232879892288e-07, "logits/chosen": -18.39883804321289, "logits/rejected": -17.8149471282959, "logps/chosen": -412.4971618652344, "logps/rejected": -323.2561340332031, "loss": 0.6933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9593846797943115, "rewards/margins": 0.7466030716896057, "rewards/rejected": 2.2127819061279297, "step": 45770 }, { "epoch": 2.125446863828404, "grad_norm": 11.872344017028809, "learning_rate": 1.725344723524769e-07, "logits/chosen": -19.535900115966797, "logits/rejected": -17.528827667236328, "logps/chosen": -438.7127990722656, "logps/rejected": -240.2060089111328, "loss": 0.499, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.109838962554932, "rewards/margins": 2.3002734184265137, "rewards/rejected": 1.8095651865005493, "step": 45780 }, { "epoch": 2.1259111379358373, "grad_norm": 77.24695587158203, "learning_rate": 
1.7250661590603094e-07, "logits/chosen": -18.98769760131836, "logits/rejected": -18.808513641357422, "logps/chosen": -438.527587890625, "logps/rejected": -346.3056640625, "loss": 0.5985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.453575849533081, "rewards/margins": 1.3813120126724243, "rewards/rejected": 2.072263717651367, "step": 45790 }, { "epoch": 2.1263754120432705, "grad_norm": 54.28952407836914, "learning_rate": 1.7247875945958493e-07, "logits/chosen": -18.14076805114746, "logits/rejected": -17.140966415405273, "logps/chosen": -379.28033447265625, "logps/rejected": -305.53216552734375, "loss": 0.2394, "rewards/accuracies": 1.0, "rewards/chosen": 3.5776424407958984, "rewards/margins": 2.110201120376587, "rewards/rejected": 1.4674413204193115, "step": 45800 }, { "epoch": 2.1268396861507033, "grad_norm": 0.45112934708595276, "learning_rate": 1.7245090301313894e-07, "logits/chosen": -19.913026809692383, "logits/rejected": -18.937183380126953, "logps/chosen": -317.4830627441406, "logps/rejected": -221.70993041992188, "loss": 0.5936, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0652318000793457, "rewards/margins": 0.9212487936019897, "rewards/rejected": 2.1439826488494873, "step": 45810 }, { "epoch": 2.1273039602581365, "grad_norm": 2.5476064682006836, "learning_rate": 1.7242304656669298e-07, "logits/chosen": -17.96344566345215, "logits/rejected": -17.945384979248047, "logps/chosen": -236.30819702148438, "logps/rejected": -233.51168823242188, "loss": 0.6325, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9170223474502563, "rewards/margins": 0.6965854167938232, "rewards/rejected": 1.220436930656433, "step": 45820 }, { "epoch": 2.1277682343655693, "grad_norm": 22.24012565612793, "learning_rate": 1.7239519012024697e-07, "logits/chosen": -19.255741119384766, "logits/rejected": -19.230289459228516, "logps/chosen": -343.564453125, "logps/rejected": -368.81536865234375, "loss": 0.9302, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.6023855209350586, "rewards/margins": 0.01702108420431614, "rewards/rejected": 2.585364818572998, "step": 45830 }, { "epoch": 2.1282325084730025, "grad_norm": 21.109350204467773, "learning_rate": 1.7236733367380099e-07, "logits/chosen": -19.61456871032715, "logits/rejected": -19.0468692779541, "logps/chosen": -389.3531188964844, "logps/rejected": -401.6031494140625, "loss": 0.4421, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.014714241027832, "rewards/margins": 1.458451271057129, "rewards/rejected": 2.5562632083892822, "step": 45840 }, { "epoch": 2.1286967825804357, "grad_norm": 49.03371047973633, "learning_rate": 1.7233947722735503e-07, "logits/chosen": -18.567489624023438, "logits/rejected": -17.181604385375977, "logps/chosen": -390.04461669921875, "logps/rejected": -191.0416259765625, "loss": 0.5097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.751366376876831, "rewards/margins": 2.2073583602905273, "rewards/rejected": 1.5440081357955933, "step": 45850 }, { "epoch": 2.1291610566878685, "grad_norm": 82.1938247680664, "learning_rate": 1.7231162078090904e-07, "logits/chosen": -18.68456268310547, "logits/rejected": -18.1904239654541, "logps/chosen": -321.285400390625, "logps/rejected": -247.67105102539062, "loss": 0.574, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5479602813720703, "rewards/margins": 0.9649931192398071, "rewards/rejected": 1.5829672813415527, "step": 45860 }, { "epoch": 2.1296253307953017, "grad_norm": 60.06254196166992, "learning_rate": 1.7228376433446306e-07, "logits/chosen": -18.43539047241211, "logits/rejected": -18.023426055908203, "logps/chosen": -386.0320129394531, "logps/rejected": -338.44677734375, "loss": 0.7925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.912180185317993, "rewards/margins": 0.5985062122344971, "rewards/rejected": 2.313673734664917, "step": 45870 }, { "epoch": 2.1300896049027345, "grad_norm": 10.346925735473633, 
"learning_rate": 1.7225590788801707e-07, "logits/chosen": -19.25833511352539, "logits/rejected": -18.723346710205078, "logps/chosen": -408.4331970214844, "logps/rejected": -472.02532958984375, "loss": 0.8194, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.234072208404541, "rewards/margins": 0.465991348028183, "rewards/rejected": 3.768080472946167, "step": 45880 }, { "epoch": 2.1305538790101677, "grad_norm": 2.9451522827148438, "learning_rate": 1.7222805144157109e-07, "logits/chosen": -18.845794677734375, "logits/rejected": -18.197946548461914, "logps/chosen": -373.8187561035156, "logps/rejected": -278.9246826171875, "loss": 0.6027, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3235530853271484, "rewards/margins": 1.7341797351837158, "rewards/rejected": 1.589373230934143, "step": 45890 }, { "epoch": 2.1310181531176005, "grad_norm": 184.27639770507812, "learning_rate": 1.7220019499512513e-07, "logits/chosen": -19.21266746520996, "logits/rejected": -18.254608154296875, "logps/chosen": -361.91302490234375, "logps/rejected": -312.481201171875, "loss": 0.8241, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5775749683380127, "rewards/margins": 1.025637149810791, "rewards/rejected": 2.551938056945801, "step": 45900 }, { "epoch": 2.1314824272250337, "grad_norm": 105.84527587890625, "learning_rate": 1.7217233854867911e-07, "logits/chosen": -19.567508697509766, "logits/rejected": -18.738304138183594, "logps/chosen": -396.2769775390625, "logps/rejected": -377.10064697265625, "loss": 0.5059, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4337258338928223, "rewards/margins": 0.8221198320388794, "rewards/rejected": 2.6116061210632324, "step": 45910 }, { "epoch": 2.131946701332467, "grad_norm": 28.839998245239258, "learning_rate": 1.7214448210223316e-07, "logits/chosen": -19.452770233154297, "logits/rejected": -18.93161392211914, "logps/chosen": -439.110107421875, "logps/rejected": -376.2578430175781, "loss": 1.0286, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3642654418945312, "rewards/margins": 0.392634779214859, "rewards/rejected": 2.971630334854126, "step": 45920 }, { "epoch": 2.1324109754398997, "grad_norm": 220.31288146972656, "learning_rate": 1.7211662565578717e-07, "logits/chosen": -19.406124114990234, "logits/rejected": -19.314340591430664, "logps/chosen": -474.18450927734375, "logps/rejected": -471.45672607421875, "loss": 1.0189, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6579203605651855, "rewards/margins": 0.310660183429718, "rewards/rejected": 3.3472607135772705, "step": 45930 }, { "epoch": 2.132875249547333, "grad_norm": 141.91934204101562, "learning_rate": 1.720887692093412e-07, "logits/chosen": -19.650606155395508, "logits/rejected": -19.0139102935791, "logps/chosen": -427.2084045410156, "logps/rejected": -381.7682189941406, "loss": 0.8623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.455528736114502, "rewards/margins": 0.9456182718276978, "rewards/rejected": 3.5099101066589355, "step": 45940 }, { "epoch": 2.1333395236547656, "grad_norm": 13.919329643249512, "learning_rate": 1.720609127628952e-07, "logits/chosen": -19.27979278564453, "logits/rejected": -17.870698928833008, "logps/chosen": -500.95709228515625, "logps/rejected": -400.8097229003906, "loss": 0.6658, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.368979454040527, "rewards/margins": 1.13419771194458, "rewards/rejected": 3.2347817420959473, "step": 45950 }, { "epoch": 2.133803797762199, "grad_norm": 27.457975387573242, "learning_rate": 1.7203305631644921e-07, "logits/chosen": -19.493656158447266, "logits/rejected": -18.62185287475586, "logps/chosen": -334.14764404296875, "logps/rejected": -275.8680114746094, "loss": 0.5393, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.891434907913208, "rewards/margins": 1.3670787811279297, "rewards/rejected": 1.5243560075759888, "step": 45960 }, { "epoch": 2.1342680718696316, 
"grad_norm": 49.41316604614258, "learning_rate": 1.7200519987000325e-07, "logits/chosen": -19.459869384765625, "logits/rejected": -18.214401245117188, "logps/chosen": -477.5943298339844, "logps/rejected": -348.6683044433594, "loss": 0.5817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.673086643218994, "rewards/margins": 0.9754694104194641, "rewards/rejected": 2.6976170539855957, "step": 45970 }, { "epoch": 2.134732345977065, "grad_norm": 15.890701293945312, "learning_rate": 1.7197734342355727e-07, "logits/chosen": -19.513381958007812, "logits/rejected": -18.563888549804688, "logps/chosen": -353.11016845703125, "logps/rejected": -268.3609313964844, "loss": 0.5897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5766053199768066, "rewards/margins": 1.2268999814987183, "rewards/rejected": 2.349705219268799, "step": 45980 }, { "epoch": 2.135196620084498, "grad_norm": 0.16768433153629303, "learning_rate": 1.7194948697711126e-07, "logits/chosen": -20.07394790649414, "logits/rejected": -18.319828033447266, "logps/chosen": -494.5849609375, "logps/rejected": -341.7398986816406, "loss": 0.2953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.300721168518066, "rewards/margins": 2.883134365081787, "rewards/rejected": 2.4175868034362793, "step": 45990 }, { "epoch": 2.135660894191931, "grad_norm": 1.7598127126693726, "learning_rate": 1.719216305306653e-07, "logits/chosen": -19.459558486938477, "logits/rejected": -18.20254898071289, "logps/chosen": -400.88958740234375, "logps/rejected": -305.08392333984375, "loss": 0.4102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9961588382720947, "rewards/margins": 1.5354187488555908, "rewards/rejected": 1.4607399702072144, "step": 46000 }, { "epoch": 2.136125168299364, "grad_norm": 221.66851806640625, "learning_rate": 1.718937740842193e-07, "logits/chosen": -18.634695053100586, "logits/rejected": -18.627622604370117, "logps/chosen": -369.3438720703125, "logps/rejected": 
-345.2467956542969, "loss": 0.7215, "rewards/accuracies": 0.5, "rewards/chosen": 2.896918296813965, "rewards/margins": 0.0916905552148819, "rewards/rejected": 2.8052279949188232, "step": 46010 }, { "epoch": 2.136589442406797, "grad_norm": 144.78128051757812, "learning_rate": 1.7186591763777335e-07, "logits/chosen": -19.173152923583984, "logits/rejected": -18.07948875427246, "logps/chosen": -295.1409912109375, "logps/rejected": -261.3563232421875, "loss": 0.5541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.606271266937256, "rewards/margins": 0.626979410648346, "rewards/rejected": 1.979291558265686, "step": 46020 }, { "epoch": 2.13705371651423, "grad_norm": 37.854148864746094, "learning_rate": 1.7183806119132734e-07, "logits/chosen": -19.930767059326172, "logits/rejected": -18.666248321533203, "logps/chosen": -445.21844482421875, "logps/rejected": -352.1050720214844, "loss": 0.4198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9296627044677734, "rewards/margins": 1.4227404594421387, "rewards/rejected": 2.506922483444214, "step": 46030 }, { "epoch": 2.137517990621663, "grad_norm": 210.230712890625, "learning_rate": 1.7181020474488138e-07, "logits/chosen": -19.885446548461914, "logits/rejected": -19.144044876098633, "logps/chosen": -412.47491455078125, "logps/rejected": -364.3799133300781, "loss": 0.7476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.919142484664917, "rewards/margins": 0.6655365228652954, "rewards/rejected": 3.253606081008911, "step": 46040 }, { "epoch": 2.137982264729096, "grad_norm": 36.818031311035156, "learning_rate": 1.717823482984354e-07, "logits/chosen": -19.032032012939453, "logits/rejected": -18.278871536254883, "logps/chosen": -384.74188232421875, "logps/rejected": -301.6715393066406, "loss": 0.4309, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7249488830566406, "rewards/margins": 1.642331838607788, "rewards/rejected": 2.0826168060302734, "step": 46050 }, { "epoch": 
2.1384465388365292, "grad_norm": 0.7087934017181396, "learning_rate": 1.7175449185198939e-07, "logits/chosen": -18.887516021728516, "logits/rejected": -18.071308135986328, "logps/chosen": -407.41571044921875, "logps/rejected": -369.6929931640625, "loss": 0.9714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.08305025100708, "rewards/margins": 1.1610071659088135, "rewards/rejected": 2.9220433235168457, "step": 46060 }, { "epoch": 2.138910812943962, "grad_norm": 37.20888137817383, "learning_rate": 1.7172663540554343e-07, "logits/chosen": -20.402236938476562, "logits/rejected": -19.726627349853516, "logps/chosen": -363.96881103515625, "logps/rejected": -268.4066467285156, "loss": 0.6479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2623703479766846, "rewards/margins": 0.5913141965866089, "rewards/rejected": 2.6710562705993652, "step": 46070 }, { "epoch": 2.1393750870513952, "grad_norm": 23.943939208984375, "learning_rate": 1.7169877895909744e-07, "logits/chosen": -18.377979278564453, "logits/rejected": -18.49165153503418, "logps/chosen": -395.1888732910156, "logps/rejected": -406.48944091796875, "loss": 1.363, "rewards/accuracies": 0.5, "rewards/chosen": 3.42834734916687, "rewards/margins": 0.02513810433447361, "rewards/rejected": 3.4032092094421387, "step": 46080 }, { "epoch": 2.139839361158828, "grad_norm": 305.1073303222656, "learning_rate": 1.7167092251265148e-07, "logits/chosen": -18.128101348876953, "logits/rejected": -17.817296981811523, "logps/chosen": -447.4606018066406, "logps/rejected": -429.5164489746094, "loss": 0.8272, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.261338710784912, "rewards/margins": 1.0850673913955688, "rewards/rejected": 3.176271438598633, "step": 46090 }, { "epoch": 2.140303635266261, "grad_norm": 102.99411010742188, "learning_rate": 1.7164306606620547e-07, "logits/chosen": -18.677330017089844, "logits/rejected": -18.282672882080078, "logps/chosen": -436.4203186035156, "logps/rejected": 
-528.6951293945312, "loss": 0.5716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4590866565704346, "rewards/margins": 0.9134281873703003, "rewards/rejected": 2.545658588409424, "step": 46100 }, { "epoch": 2.1407679093736944, "grad_norm": 42.19325637817383, "learning_rate": 1.7161520961975948e-07, "logits/chosen": -19.57523536682129, "logits/rejected": -18.26131248474121, "logps/chosen": -444.51483154296875, "logps/rejected": -346.3723449707031, "loss": 0.3261, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.560380935668945, "rewards/margins": 1.4553158283233643, "rewards/rejected": 3.10506534576416, "step": 46110 }, { "epoch": 2.141232183481127, "grad_norm": 26.600875854492188, "learning_rate": 1.7158735317331353e-07, "logits/chosen": -19.580066680908203, "logits/rejected": -18.16070556640625, "logps/chosen": -342.6733703613281, "logps/rejected": -254.7626495361328, "loss": 0.6326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9326205253601074, "rewards/margins": 0.7995811700820923, "rewards/rejected": 2.1330389976501465, "step": 46120 }, { "epoch": 2.1416964575885604, "grad_norm": 56.82577133178711, "learning_rate": 1.7155949672686754e-07, "logits/chosen": -18.74087142944336, "logits/rejected": -17.757160186767578, "logps/chosen": -385.4190673828125, "logps/rejected": -268.40789794921875, "loss": 0.5751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0054104328155518, "rewards/margins": 1.1517492532730103, "rewards/rejected": 1.8536611795425415, "step": 46130 }, { "epoch": 2.142160731695993, "grad_norm": 21.722171783447266, "learning_rate": 1.7153164028042153e-07, "logits/chosen": -19.223167419433594, "logits/rejected": -18.128314971923828, "logps/chosen": -389.29840087890625, "logps/rejected": -301.97216796875, "loss": 0.3833, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6059577465057373, "rewards/margins": 1.7541639804840088, "rewards/rejected": 0.851793646812439, "step": 46140 }, { 
"epoch": 2.1426250058034264, "grad_norm": 13.413440704345703, "learning_rate": 1.7150378383397557e-07, "logits/chosen": -19.244001388549805, "logits/rejected": -17.368671417236328, "logps/chosen": -390.758544921875, "logps/rejected": -230.3612518310547, "loss": 0.4515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2985305786132812, "rewards/margins": 1.6827281713485718, "rewards/rejected": 1.6158024072647095, "step": 46150 }, { "epoch": 2.143089279910859, "grad_norm": 93.53143310546875, "learning_rate": 1.7147592738752958e-07, "logits/chosen": -19.695127487182617, "logits/rejected": -18.844032287597656, "logps/chosen": -406.45684814453125, "logps/rejected": -294.1186218261719, "loss": 0.5121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.906757354736328, "rewards/margins": 1.8649070262908936, "rewards/rejected": 2.0418505668640137, "step": 46160 }, { "epoch": 2.1435535540182924, "grad_norm": 51.59159469604492, "learning_rate": 1.7144807094108362e-07, "logits/chosen": -19.55611801147461, "logits/rejected": -18.45651626586914, "logps/chosen": -382.9006652832031, "logps/rejected": -315.30462646484375, "loss": 0.3505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5595555305480957, "rewards/margins": 1.4326860904693604, "rewards/rejected": 2.1268696784973145, "step": 46170 }, { "epoch": 2.1440178281257256, "grad_norm": 5.459080219268799, "learning_rate": 1.714202144946376e-07, "logits/chosen": -18.959144592285156, "logits/rejected": -18.238325119018555, "logps/chosen": -417.2935485839844, "logps/rejected": -378.2263488769531, "loss": 0.6464, "rewards/accuracies": 0.5, "rewards/chosen": 4.033223628997803, "rewards/margins": 0.5949237942695618, "rewards/rejected": 3.4382996559143066, "step": 46180 }, { "epoch": 2.1444821022331584, "grad_norm": 55.219993591308594, "learning_rate": 1.7139235804819165e-07, "logits/chosen": -18.86691665649414, "logits/rejected": -18.33242416381836, "logps/chosen": -323.6818542480469, 
"logps/rejected": -358.93621826171875, "loss": 0.8873, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.837367534637451, "rewards/margins": 0.13305151462554932, "rewards/rejected": 2.7043161392211914, "step": 46190 }, { "epoch": 2.1449463763405916, "grad_norm": 109.8018798828125, "learning_rate": 1.7136450160174567e-07, "logits/chosen": -18.664968490600586, "logits/rejected": -17.836864471435547, "logps/chosen": -265.71649169921875, "logps/rejected": -254.207275390625, "loss": 0.6244, "rewards/accuracies": 0.5, "rewards/chosen": 2.3678314685821533, "rewards/margins": 0.6275344491004944, "rewards/rejected": 1.7402970790863037, "step": 46200 }, { "epoch": 2.1454106504480244, "grad_norm": 26.395177841186523, "learning_rate": 1.713366451552997e-07, "logits/chosen": -19.67581558227539, "logits/rejected": -18.532852172851562, "logps/chosen": -272.30157470703125, "logps/rejected": -222.3636474609375, "loss": 0.6766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.132916212081909, "rewards/margins": 0.8910945057868958, "rewards/rejected": 1.2418216466903687, "step": 46210 }, { "epoch": 2.1458749245554576, "grad_norm": 38.030757904052734, "learning_rate": 1.713087887088537e-07, "logits/chosen": -20.36286163330078, "logits/rejected": -19.313182830810547, "logps/chosen": -482.3856506347656, "logps/rejected": -422.66082763671875, "loss": 0.4726, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.171942710876465, "rewards/margins": 1.3954271078109741, "rewards/rejected": 2.776515245437622, "step": 46220 }, { "epoch": 2.146339198662891, "grad_norm": 96.09930419921875, "learning_rate": 1.712809322624077e-07, "logits/chosen": -19.722082138061523, "logits/rejected": -19.199766159057617, "logps/chosen": -484.283203125, "logps/rejected": -346.05718994140625, "loss": 0.697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9110074043273926, "rewards/margins": 0.4942629933357239, "rewards/rejected": 3.4167447090148926, "step": 46230 }, { 
"epoch": 2.1468034727703236, "grad_norm": 89.41971588134766, "learning_rate": 1.7125307581596175e-07, "logits/chosen": -18.79507827758789, "logits/rejected": -18.869810104370117, "logps/chosen": -390.1151428222656, "logps/rejected": -374.73333740234375, "loss": 1.1321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2184462547302246, "rewards/margins": 0.256244033575058, "rewards/rejected": 2.9622018337249756, "step": 46240 }, { "epoch": 2.147267746877757, "grad_norm": 74.56407928466797, "learning_rate": 1.7122521936951574e-07, "logits/chosen": -20.065380096435547, "logits/rejected": -20.218250274658203, "logps/chosen": -312.5684814453125, "logps/rejected": -297.7648010253906, "loss": 1.0329, "rewards/accuracies": 0.5, "rewards/chosen": 2.7338736057281494, "rewards/margins": -0.3034355044364929, "rewards/rejected": 3.037309169769287, "step": 46250 }, { "epoch": 2.1477320209851896, "grad_norm": 37.020938873291016, "learning_rate": 1.7119736292306976e-07, "logits/chosen": -18.020313262939453, "logits/rejected": -16.83745574951172, "logps/chosen": -448.95684814453125, "logps/rejected": -303.8210144042969, "loss": 0.2799, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3863532543182373, "rewards/margins": 1.6679179668426514, "rewards/rejected": 1.718435287475586, "step": 46260 }, { "epoch": 2.1481962950926228, "grad_norm": 65.91485595703125, "learning_rate": 1.711695064766238e-07, "logits/chosen": -19.977909088134766, "logits/rejected": -19.491016387939453, "logps/chosen": -508.76947021484375, "logps/rejected": -379.48687744140625, "loss": 0.3541, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.302990913391113, "rewards/margins": 1.4835641384124756, "rewards/rejected": 2.8194265365600586, "step": 46270 }, { "epoch": 2.1486605692000555, "grad_norm": 177.8387908935547, "learning_rate": 1.711416500301778e-07, "logits/chosen": -19.870502471923828, "logits/rejected": -17.771291732788086, "logps/chosen": -484.4562072753906, 
"logps/rejected": -250.2724151611328, "loss": 0.2687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.962869167327881, "rewards/margins": 2.0408935546875, "rewards/rejected": 1.9219757318496704, "step": 46280 }, { "epoch": 2.1491248433074888, "grad_norm": 2.571260929107666, "learning_rate": 1.7111379358373183e-07, "logits/chosen": -19.100391387939453, "logits/rejected": -17.79633140563965, "logps/chosen": -345.1379699707031, "logps/rejected": -231.18569946289062, "loss": 0.2846, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.08653450012207, "rewards/margins": 2.1514360904693604, "rewards/rejected": 1.9350982904434204, "step": 46290 }, { "epoch": 2.149589117414922, "grad_norm": 73.61347198486328, "learning_rate": 1.7108593713728584e-07, "logits/chosen": -19.58816146850586, "logits/rejected": -18.975555419921875, "logps/chosen": -242.7406768798828, "logps/rejected": -214.74429321289062, "loss": 0.6084, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7271335124969482, "rewards/margins": 0.5370875597000122, "rewards/rejected": 1.1900461912155151, "step": 46300 }, { "epoch": 2.1500533915223548, "grad_norm": 23.599428176879883, "learning_rate": 1.7106086633548446e-07, "logits/chosen": -18.308298110961914, "logits/rejected": -17.73724365234375, "logps/chosen": -382.5427551269531, "logps/rejected": -293.4725646972656, "loss": 0.6342, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.48738169670105, "rewards/margins": 1.0231413841247559, "rewards/rejected": 1.4642404317855835, "step": 46310 }, { "epoch": 2.150517665629788, "grad_norm": 63.531455993652344, "learning_rate": 1.7103300988903848e-07, "logits/chosen": -20.84402847290039, "logits/rejected": -19.34067726135254, "logps/chosen": -478.85382080078125, "logps/rejected": -373.33026123046875, "loss": 0.6811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.196505546569824, "rewards/margins": 0.8490435481071472, "rewards/rejected": 3.3474624156951904, "step": 
46320 }, { "epoch": 2.1509819397372207, "grad_norm": 5.193625450134277, "learning_rate": 1.7100515344259252e-07, "logits/chosen": -19.498580932617188, "logits/rejected": -18.251358032226562, "logps/chosen": -482.14447021484375, "logps/rejected": -302.02215576171875, "loss": 0.4356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.5794677734375, "rewards/margins": 2.032860517501831, "rewards/rejected": 2.546607494354248, "step": 46330 }, { "epoch": 2.151446213844654, "grad_norm": 2.8043694496154785, "learning_rate": 1.709772969961465e-07, "logits/chosen": -18.263591766357422, "logits/rejected": -17.619029998779297, "logps/chosen": -396.4454650878906, "logps/rejected": -357.1144104003906, "loss": 0.8067, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4592437744140625, "rewards/margins": 0.8458514213562012, "rewards/rejected": 2.613391637802124, "step": 46340 }, { "epoch": 2.1519104879520867, "grad_norm": 77.61511993408203, "learning_rate": 1.7094944054970052e-07, "logits/chosen": -19.25632095336914, "logits/rejected": -18.35763931274414, "logps/chosen": -484.1024475097656, "logps/rejected": -323.08135986328125, "loss": 0.5861, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.028152942657471, "rewards/margins": 1.1140706539154053, "rewards/rejected": 2.9140820503234863, "step": 46350 }, { "epoch": 2.15237476205952, "grad_norm": 28.60233497619629, "learning_rate": 1.7092158410325456e-07, "logits/chosen": -19.204570770263672, "logits/rejected": -19.55331039428711, "logps/chosen": -373.3374938964844, "logps/rejected": -381.9899597167969, "loss": 0.8657, "rewards/accuracies": 0.5, "rewards/chosen": 3.1514782905578613, "rewards/margins": 0.2862231433391571, "rewards/rejected": 2.8652548789978027, "step": 46360 }, { "epoch": 2.152839036166953, "grad_norm": 1.2086312770843506, "learning_rate": 1.7089372765680857e-07, "logits/chosen": -18.734827041625977, "logits/rejected": -17.052532196044922, "logps/chosen": -496.80474853515625, 
"logps/rejected": -278.0742492675781, "loss": 0.2444, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.294873237609863, "rewards/margins": 2.4515299797058105, "rewards/rejected": 1.8433430194854736, "step": 46370 }, { "epoch": 2.153303310274386, "grad_norm": 42.36211395263672, "learning_rate": 1.708658712103626e-07, "logits/chosen": -19.239341735839844, "logits/rejected": -18.331928253173828, "logps/chosen": -382.9627380371094, "logps/rejected": -266.90203857421875, "loss": 0.6842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.46757435798645, "rewards/margins": 1.428405523300171, "rewards/rejected": 2.0391688346862793, "step": 46380 }, { "epoch": 2.153767584381819, "grad_norm": 94.1921615600586, "learning_rate": 1.708380147639166e-07, "logits/chosen": -19.12748146057129, "logits/rejected": -18.805889129638672, "logps/chosen": -287.3941345214844, "logps/rejected": -261.029296875, "loss": 0.4942, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6197433471679688, "rewards/margins": 1.070094108581543, "rewards/rejected": 2.549649238586426, "step": 46390 }, { "epoch": 2.154231858489252, "grad_norm": 171.34844970703125, "learning_rate": 1.7081015831747064e-07, "logits/chosen": -18.3077335357666, "logits/rejected": -18.903913497924805, "logps/chosen": -342.8224182128906, "logps/rejected": -434.26800537109375, "loss": 1.3122, "rewards/accuracies": 0.5, "rewards/chosen": 2.8881263732910156, "rewards/margins": -0.5590965151786804, "rewards/rejected": 3.447222948074341, "step": 46400 }, { "epoch": 2.154696132596685, "grad_norm": 4.914639472961426, "learning_rate": 1.7078230187102466e-07, "logits/chosen": -18.988800048828125, "logits/rejected": -18.296220779418945, "logps/chosen": -471.139404296875, "logps/rejected": -394.0014953613281, "loss": 0.5361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.391319751739502, "rewards/margins": 0.8483522534370422, "rewards/rejected": 3.5429675579071045, "step": 46410 }, { "epoch": 
2.155160406704118, "grad_norm": 98.49930572509766, "learning_rate": 1.7075444542457865e-07, "logits/chosen": -20.274551391601562, "logits/rejected": -19.766948699951172, "logps/chosen": -495.11578369140625, "logps/rejected": -461.83056640625, "loss": 0.2542, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.390803337097168, "rewards/margins": 1.7283008098602295, "rewards/rejected": 3.6625030040740967, "step": 46420 }, { "epoch": 2.155624680811551, "grad_norm": 13.989322662353516, "learning_rate": 1.707265889781327e-07, "logits/chosen": -19.093585968017578, "logits/rejected": -17.687198638916016, "logps/chosen": -371.9888000488281, "logps/rejected": -231.665283203125, "loss": 0.5387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7529423236846924, "rewards/margins": 1.333984613418579, "rewards/rejected": 1.4189579486846924, "step": 46430 }, { "epoch": 2.1560889549189843, "grad_norm": 136.69837951660156, "learning_rate": 1.706987325316867e-07, "logits/chosen": -19.10576820373535, "logits/rejected": -19.569107055664062, "logps/chosen": -399.9996337890625, "logps/rejected": -370.7572326660156, "loss": 1.4315, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3956046104431152, "rewards/margins": 0.07179746776819229, "rewards/rejected": 3.3238072395324707, "step": 46440 }, { "epoch": 2.156553229026417, "grad_norm": 125.0971450805664, "learning_rate": 1.7067087608524074e-07, "logits/chosen": -18.67241096496582, "logits/rejected": -17.913936614990234, "logps/chosen": -476.2757263183594, "logps/rejected": -375.0510559082031, "loss": 0.5353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.005951881408691, "rewards/margins": 1.4515236616134644, "rewards/rejected": 2.5544285774230957, "step": 46450 }, { "epoch": 2.1570175031338503, "grad_norm": 71.53072357177734, "learning_rate": 1.7064301963879473e-07, "logits/chosen": -19.011821746826172, "logits/rejected": -19.452980041503906, "logps/chosen": -453.88421630859375, 
"logps/rejected": -461.725341796875, "loss": 1.074, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8175721168518066, "rewards/margins": 0.329284131526947, "rewards/rejected": 3.488288164138794, "step": 46460 }, { "epoch": 2.157481777241283, "grad_norm": 42.43461227416992, "learning_rate": 1.7061516319234875e-07, "logits/chosen": -18.601173400878906, "logits/rejected": -18.129901885986328, "logps/chosen": -384.0447692871094, "logps/rejected": -319.7749938964844, "loss": 1.0144, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0712413787841797, "rewards/margins": 0.41253989934921265, "rewards/rejected": 2.658701181411743, "step": 46470 }, { "epoch": 2.1579460513487163, "grad_norm": 127.7796630859375, "learning_rate": 1.705873067459028e-07, "logits/chosen": -19.3787784576416, "logits/rejected": -18.78322982788086, "logps/chosen": -486.8350524902344, "logps/rejected": -362.3717956542969, "loss": 0.5312, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.052471160888672, "rewards/margins": 0.8852535486221313, "rewards/rejected": 3.167217493057251, "step": 46480 }, { "epoch": 2.158410325456149, "grad_norm": 2.4319231510162354, "learning_rate": 1.7055945029945678e-07, "logits/chosen": -18.671295166015625, "logits/rejected": -18.527267456054688, "logps/chosen": -361.4610900878906, "logps/rejected": -326.0965881347656, "loss": 1.1283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5436229705810547, "rewards/margins": 0.8711616396903992, "rewards/rejected": 2.6724612712860107, "step": 46490 }, { "epoch": 2.1588745995635823, "grad_norm": 40.11394500732422, "learning_rate": 1.705315938530108e-07, "logits/chosen": -18.301349639892578, "logits/rejected": -18.09547233581543, "logps/chosen": -316.26519775390625, "logps/rejected": -324.0169372558594, "loss": 0.6962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.032874822616577, "rewards/margins": 1.0708246231079102, "rewards/rejected": 1.962050199508667, "step": 
46500 }, { "epoch": 2.1593388736710155, "grad_norm": 145.8101348876953, "learning_rate": 1.7050373740656483e-07, "logits/chosen": -20.044586181640625, "logits/rejected": -18.71847915649414, "logps/chosen": -482.9347229003906, "logps/rejected": -343.9385681152344, "loss": 0.5239, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9232985973358154, "rewards/margins": 1.1004736423492432, "rewards/rejected": 2.8228249549865723, "step": 46510 }, { "epoch": 2.1598031477784483, "grad_norm": 42.4177131652832, "learning_rate": 1.7047588096011885e-07, "logits/chosen": -19.05853271484375, "logits/rejected": -17.642465591430664, "logps/chosen": -408.25225830078125, "logps/rejected": -309.89215087890625, "loss": 0.4853, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6581368446350098, "rewards/margins": 1.3935792446136475, "rewards/rejected": 2.264557361602783, "step": 46520 }, { "epoch": 2.1602674218858815, "grad_norm": 96.18278503417969, "learning_rate": 1.7044802451367286e-07, "logits/chosen": -20.111560821533203, "logits/rejected": -19.824827194213867, "logps/chosen": -512.5511474609375, "logps/rejected": -490.5945739746094, "loss": 0.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.683215141296387, "rewards/margins": 1.5521059036254883, "rewards/rejected": 3.1311097145080566, "step": 46530 }, { "epoch": 2.1607316959933143, "grad_norm": 166.32765197753906, "learning_rate": 1.7042016806722687e-07, "logits/chosen": -19.521940231323242, "logits/rejected": -19.542272567749023, "logps/chosen": -446.6333923339844, "logps/rejected": -404.4185791015625, "loss": 1.0693, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.7355103492736816, "rewards/margins": 0.0010687947506085038, "rewards/rejected": 3.7344412803649902, "step": 46540 }, { "epoch": 2.1611959701007475, "grad_norm": 46.89192581176758, "learning_rate": 1.7039231162078092e-07, "logits/chosen": -19.310527801513672, "logits/rejected": -19.194290161132812, "logps/chosen": 
-392.06890869140625, "logps/rejected": -387.37689208984375, "loss": 0.7467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2605087757110596, "rewards/margins": 0.5353050231933594, "rewards/rejected": 2.7252037525177, "step": 46550 }, { "epoch": 2.1616602442081807, "grad_norm": 63.775390625, "learning_rate": 1.7036445517433493e-07, "logits/chosen": -19.272525787353516, "logits/rejected": -18.659343719482422, "logps/chosen": -405.75732421875, "logps/rejected": -322.20025634765625, "loss": 0.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9648184776306152, "rewards/margins": 1.0626616477966309, "rewards/rejected": 1.902157187461853, "step": 46560 }, { "epoch": 2.1621245183156135, "grad_norm": 2.646678924560547, "learning_rate": 1.7033659872788892e-07, "logits/chosen": -19.83299446105957, "logits/rejected": -18.89492416381836, "logps/chosen": -382.8473205566406, "logps/rejected": -279.8808288574219, "loss": 0.6563, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5995864868164062, "rewards/margins": 1.0199346542358398, "rewards/rejected": 2.579651355743408, "step": 46570 }, { "epoch": 2.1625887924230467, "grad_norm": 93.95719909667969, "learning_rate": 1.7030874228144296e-07, "logits/chosen": -19.0543270111084, "logits/rejected": -17.456727981567383, "logps/chosen": -509.494873046875, "logps/rejected": -261.6358947753906, "loss": 0.6681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3109867572784424, "rewards/margins": 1.4064295291900635, "rewards/rejected": 1.904557228088379, "step": 46580 }, { "epoch": 2.1630530665304795, "grad_norm": 35.67835235595703, "learning_rate": 1.7028088583499697e-07, "logits/chosen": -18.68085479736328, "logits/rejected": -17.9766845703125, "logps/chosen": -407.3653259277344, "logps/rejected": -321.2158508300781, "loss": 0.3517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3859031200408936, "rewards/margins": 1.371847152709961, "rewards/rejected": 
2.014056444168091, "step": 46590 }, { "epoch": 2.1635173406379127, "grad_norm": 3.5130207538604736, "learning_rate": 1.7025302938855102e-07, "logits/chosen": -18.245777130126953, "logits/rejected": -17.46229362487793, "logps/chosen": -306.8414001464844, "logps/rejected": -204.96511840820312, "loss": 0.4705, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3961102962493896, "rewards/margins": 1.1903170347213745, "rewards/rejected": 1.2057933807373047, "step": 46600 }, { "epoch": 2.1639816147453454, "grad_norm": 32.18534469604492, "learning_rate": 1.70225172942105e-07, "logits/chosen": -18.843971252441406, "logits/rejected": -16.997604370117188, "logps/chosen": -356.5840148925781, "logps/rejected": -242.15982055664062, "loss": 0.4911, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.365450382232666, "rewards/margins": 1.2357157468795776, "rewards/rejected": 1.1297346353530884, "step": 46610 }, { "epoch": 2.1644458888527787, "grad_norm": 252.42503356933594, "learning_rate": 1.7019731649565902e-07, "logits/chosen": -18.22710609436035, "logits/rejected": -18.22651481628418, "logps/chosen": -290.32000732421875, "logps/rejected": -327.35406494140625, "loss": 0.7798, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5592334270477295, "rewards/margins": 0.3068058490753174, "rewards/rejected": 2.252427577972412, "step": 46620 }, { "epoch": 2.164910162960212, "grad_norm": 53.1734733581543, "learning_rate": 1.7016946004921306e-07, "logits/chosen": -18.401599884033203, "logits/rejected": -17.811168670654297, "logps/chosen": -363.3431091308594, "logps/rejected": -253.44833374023438, "loss": 0.6063, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9105842113494873, "rewards/margins": 0.565649688243866, "rewards/rejected": 2.3449344635009766, "step": 46630 }, { "epoch": 2.1653744370676447, "grad_norm": 107.83626556396484, "learning_rate": 1.7014160360276707e-07, "logits/chosen": -20.196224212646484, "logits/rejected": 
-19.508010864257812, "logps/chosen": -416.77020263671875, "logps/rejected": -346.0863037109375, "loss": 0.3602, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.267608165740967, "rewards/margins": 1.4690839052200317, "rewards/rejected": 2.7985241413116455, "step": 46640 }, { "epoch": 2.165838711175078, "grad_norm": 23.741064071655273, "learning_rate": 1.701137471563211e-07, "logits/chosen": -19.545215606689453, "logits/rejected": -19.06234359741211, "logps/chosen": -461.29229736328125, "logps/rejected": -495.68402099609375, "loss": 1.2921, "rewards/accuracies": 0.5, "rewards/chosen": 3.574660539627075, "rewards/margins": -0.410372793674469, "rewards/rejected": 3.9850330352783203, "step": 46650 }, { "epoch": 2.1663029852825106, "grad_norm": 206.27420043945312, "learning_rate": 1.700858907098751e-07, "logits/chosen": -19.276893615722656, "logits/rejected": -19.283313751220703, "logps/chosen": -357.83709716796875, "logps/rejected": -375.24725341796875, "loss": 1.0712, "rewards/accuracies": 0.5, "rewards/chosen": 2.3179423809051514, "rewards/margins": -0.19676198065280914, "rewards/rejected": 2.514704465866089, "step": 46660 }, { "epoch": 2.166767259389944, "grad_norm": 171.20233154296875, "learning_rate": 1.7005803426342912e-07, "logits/chosen": -19.56228256225586, "logits/rejected": -18.707286834716797, "logps/chosen": -341.6275939941406, "logps/rejected": -301.1710205078125, "loss": 0.3451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7221016883850098, "rewards/margins": 1.6144342422485352, "rewards/rejected": 2.1076674461364746, "step": 46670 }, { "epoch": 2.167231533497377, "grad_norm": 2.9684035778045654, "learning_rate": 1.7003017781698316e-07, "logits/chosen": -18.971576690673828, "logits/rejected": -18.362651824951172, "logps/chosen": -450.6337890625, "logps/rejected": -441.14898681640625, "loss": 0.5542, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.047079563140869, "rewards/margins": 1.4073035717010498, 
"rewards/rejected": 3.6397769451141357, "step": 46680 }, { "epoch": 2.16769580760481, "grad_norm": 44.330650329589844, "learning_rate": 1.7000232137053715e-07, "logits/chosen": -18.916852951049805, "logits/rejected": -18.81793212890625, "logps/chosen": -357.14263916015625, "logps/rejected": -320.3531494140625, "loss": 0.6994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.990426540374756, "rewards/margins": 0.46299800276756287, "rewards/rejected": 2.52742862701416, "step": 46690 }, { "epoch": 2.168160081712243, "grad_norm": 40.632781982421875, "learning_rate": 1.699744649240912e-07, "logits/chosen": -18.249860763549805, "logits/rejected": -18.114154815673828, "logps/chosen": -286.03204345703125, "logps/rejected": -260.71856689453125, "loss": 0.5134, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.306225538253784, "rewards/margins": 1.0991735458374023, "rewards/rejected": 2.207052230834961, "step": 46700 }, { "epoch": 2.168624355819676, "grad_norm": 187.03860473632812, "learning_rate": 1.699466084776452e-07, "logits/chosen": -18.235198974609375, "logits/rejected": -18.280424118041992, "logps/chosen": -284.9580078125, "logps/rejected": -295.69781494140625, "loss": 1.3494, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5428032875061035, "rewards/margins": -0.3696404695510864, "rewards/rejected": 2.9124438762664795, "step": 46710 }, { "epoch": 2.169088629927109, "grad_norm": 29.18621826171875, "learning_rate": 1.699187520311992e-07, "logits/chosen": -18.64242172241211, "logits/rejected": -18.62468910217285, "logps/chosen": -419.7662048339844, "logps/rejected": -389.0875549316406, "loss": 0.7663, "rewards/accuracies": 0.5, "rewards/chosen": 3.8948235511779785, "rewards/margins": 0.23848433792591095, "rewards/rejected": 3.656339168548584, "step": 46720 }, { "epoch": 2.169552904034542, "grad_norm": 17.905914306640625, "learning_rate": 1.6989089558475323e-07, "logits/chosen": -19.198511123657227, "logits/rejected": 
-18.116840362548828, "logps/chosen": -321.0169372558594, "logps/rejected": -215.91244506835938, "loss": 0.4945, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.968900203704834, "rewards/margins": 1.5157945156097412, "rewards/rejected": 1.4531055688858032, "step": 46730 }, { "epoch": 2.170017178141975, "grad_norm": 12.608931541442871, "learning_rate": 1.6986303913830725e-07, "logits/chosen": -19.23806381225586, "logits/rejected": -18.594942092895508, "logps/chosen": -356.6492919921875, "logps/rejected": -313.10870361328125, "loss": 0.7425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.73472261428833, "rewards/margins": 0.6552847027778625, "rewards/rejected": 2.0794379711151123, "step": 46740 }, { "epoch": 2.1704814522494082, "grad_norm": 130.17019653320312, "learning_rate": 1.6983518269186129e-07, "logits/chosen": -19.094600677490234, "logits/rejected": -18.544416427612305, "logps/chosen": -342.3444519042969, "logps/rejected": -364.60546875, "loss": 0.9952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.683922529220581, "rewards/margins": 0.35021400451660156, "rewards/rejected": 2.3337085247039795, "step": 46750 }, { "epoch": 2.170945726356841, "grad_norm": 73.31780242919922, "learning_rate": 1.6980732624541527e-07, "logits/chosen": -18.466585159301758, "logits/rejected": -18.390077590942383, "logps/chosen": -425.17535400390625, "logps/rejected": -392.6142883300781, "loss": 0.7648, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.317885160446167, "rewards/margins": 0.8852014541625977, "rewards/rejected": 2.4326834678649902, "step": 46760 }, { "epoch": 2.1714100004642742, "grad_norm": 56.70231246948242, "learning_rate": 1.697794697989693e-07, "logits/chosen": -18.613359451293945, "logits/rejected": -18.161718368530273, "logps/chosen": -415.8006286621094, "logps/rejected": -365.2512512207031, "loss": 0.835, "rewards/accuracies": 0.5, "rewards/chosen": 2.913926601409912, "rewards/margins": 0.7249541282653809, 
"rewards/rejected": 2.1889724731445312, "step": 46770 }, { "epoch": 2.171874274571707, "grad_norm": 8.500458717346191, "learning_rate": 1.6975161335252333e-07, "logits/chosen": -19.291645050048828, "logits/rejected": -18.316082000732422, "logps/chosen": -435.2998962402344, "logps/rejected": -323.42047119140625, "loss": 0.6133, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.799973726272583, "rewards/margins": 1.2875648736953735, "rewards/rejected": 2.51240873336792, "step": 46780 }, { "epoch": 2.1723385486791402, "grad_norm": 55.029212951660156, "learning_rate": 1.6972375690607734e-07, "logits/chosen": -19.170610427856445, "logits/rejected": -18.34604263305664, "logps/chosen": -343.63775634765625, "logps/rejected": -316.71881103515625, "loss": 1.0926, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8743367195129395, "rewards/margins": 0.05319960042834282, "rewards/rejected": 2.8211371898651123, "step": 46790 }, { "epoch": 2.172802822786573, "grad_norm": 207.74301147460938, "learning_rate": 1.6969590045963136e-07, "logits/chosen": -18.59162712097168, "logits/rejected": -18.269271850585938, "logps/chosen": -478.80889892578125, "logps/rejected": -461.13873291015625, "loss": 0.9266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.106341361999512, "rewards/margins": 0.6055158972740173, "rewards/rejected": 3.5008251667022705, "step": 46800 }, { "epoch": 2.173267096894006, "grad_norm": 233.1119842529297, "learning_rate": 1.6966804401318537e-07, "logits/chosen": -19.216182708740234, "logits/rejected": -18.64848518371582, "logps/chosen": -351.93231201171875, "logps/rejected": -405.2386169433594, "loss": 0.5881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.107818365097046, "rewards/margins": 0.5088468790054321, "rewards/rejected": 2.5989716053009033, "step": 46810 }, { "epoch": 2.1737313710014394, "grad_norm": 69.75457763671875, "learning_rate": 1.6964018756673941e-07, "logits/chosen": -19.562355041503906, 
"logits/rejected": -18.806806564331055, "logps/chosen": -506.70379638671875, "logps/rejected": -391.4352111816406, "loss": 0.6376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.809255599975586, "rewards/margins": 1.4294222593307495, "rewards/rejected": 2.379833698272705, "step": 46820 }, { "epoch": 2.174195645108872, "grad_norm": 91.29471588134766, "learning_rate": 1.6961233112029343e-07, "logits/chosen": -18.327377319335938, "logits/rejected": -18.535907745361328, "logps/chosen": -377.18817138671875, "logps/rejected": -326.66534423828125, "loss": 1.0577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5349130630493164, "rewards/margins": -0.18296518921852112, "rewards/rejected": 2.717878580093384, "step": 46830 }, { "epoch": 2.1746599192163054, "grad_norm": 27.89600372314453, "learning_rate": 1.6958447467384742e-07, "logits/chosen": -20.26502799987793, "logits/rejected": -19.61317253112793, "logps/chosen": -330.27227783203125, "logps/rejected": -292.10894775390625, "loss": 0.8317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.999387264251709, "rewards/margins": 0.4948412775993347, "rewards/rejected": 2.5045461654663086, "step": 46840 }, { "epoch": 2.175124193323738, "grad_norm": 198.0425262451172, "learning_rate": 1.6955661822740146e-07, "logits/chosen": -18.904659271240234, "logits/rejected": -18.619592666625977, "logps/chosen": -315.9808349609375, "logps/rejected": -300.75164794921875, "loss": 1.4998, "rewards/accuracies": 0.5, "rewards/chosen": 2.1400957107543945, "rewards/margins": -0.7688528299331665, "rewards/rejected": 2.9089484214782715, "step": 46850 }, { "epoch": 2.1755884674311714, "grad_norm": 0.1300375759601593, "learning_rate": 1.6952876178095547e-07, "logits/chosen": -19.437158584594727, "logits/rejected": -18.331554412841797, "logps/chosen": -456.94091796875, "logps/rejected": -433.23956298828125, "loss": 0.7803, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.979918003082275, 
"rewards/margins": 1.2076750993728638, "rewards/rejected": 3.7722434997558594, "step": 46860 }, { "epoch": 2.176052741538604, "grad_norm": 274.2079772949219, "learning_rate": 1.6950090533450951e-07, "logits/chosen": -18.900936126708984, "logits/rejected": -18.227235794067383, "logps/chosen": -285.8631286621094, "logps/rejected": -281.7510681152344, "loss": 0.7679, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.191188097000122, "rewards/margins": 1.374762773513794, "rewards/rejected": 1.8164256811141968, "step": 46870 }, { "epoch": 2.1765170156460374, "grad_norm": 49.48956298828125, "learning_rate": 1.694730488880635e-07, "logits/chosen": -18.18233299255371, "logits/rejected": -17.74969482421875, "logps/chosen": -342.74212646484375, "logps/rejected": -286.6898193359375, "loss": 0.7073, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1450095176696777, "rewards/margins": 1.1544036865234375, "rewards/rejected": 1.9906055927276611, "step": 46880 }, { "epoch": 2.1769812897534706, "grad_norm": 84.0687026977539, "learning_rate": 1.6944519244161752e-07, "logits/chosen": -19.332901000976562, "logits/rejected": -17.321537017822266, "logps/chosen": -401.2219543457031, "logps/rejected": -256.95013427734375, "loss": 0.4171, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.3730645179748535, "rewards/margins": 2.3243367671966553, "rewards/rejected": 3.04872727394104, "step": 46890 }, { "epoch": 2.1774455638609034, "grad_norm": 131.4071807861328, "learning_rate": 1.6941733599517156e-07, "logits/chosen": -19.081756591796875, "logits/rejected": -18.873502731323242, "logps/chosen": -354.79296875, "logps/rejected": -316.4072265625, "loss": 0.6589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0156686305999756, "rewards/margins": 1.3287832736968994, "rewards/rejected": 1.6868852376937866, "step": 46900 }, { "epoch": 2.1779098379683366, "grad_norm": 83.10690307617188, "learning_rate": 1.6938947954872555e-07, "logits/chosen": 
-19.158300399780273, "logits/rejected": -17.664093017578125, "logps/chosen": -352.79766845703125, "logps/rejected": -210.7409210205078, "loss": 0.5003, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.653125047683716, "rewards/margins": 1.560003399848938, "rewards/rejected": 2.0931215286254883, "step": 46910 }, { "epoch": 2.1783741120757694, "grad_norm": 67.60855102539062, "learning_rate": 1.6936162310227956e-07, "logits/chosen": -19.175580978393555, "logits/rejected": -18.8228702545166, "logps/chosen": -363.58966064453125, "logps/rejected": -353.01385498046875, "loss": 0.7342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.446525812149048, "rewards/margins": 0.5956114530563354, "rewards/rejected": 1.8509142398834229, "step": 46920 }, { "epoch": 2.1788383861832026, "grad_norm": 22.882801055908203, "learning_rate": 1.693337666558336e-07, "logits/chosen": -18.89940643310547, "logits/rejected": -18.332111358642578, "logps/chosen": -352.8665771484375, "logps/rejected": -304.7383728027344, "loss": 0.8759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.590388774871826, "rewards/margins": 0.5264737010002136, "rewards/rejected": 3.0639147758483887, "step": 46930 }, { "epoch": 2.179302660290636, "grad_norm": 106.40463256835938, "learning_rate": 1.6930591020938762e-07, "logits/chosen": -18.411283493041992, "logits/rejected": -17.023052215576172, "logps/chosen": -355.6224670410156, "logps/rejected": -217.2815399169922, "loss": 0.3448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.786520481109619, "rewards/margins": 1.9397468566894531, "rewards/rejected": 0.8467734456062317, "step": 46940 }, { "epoch": 2.1797669343980686, "grad_norm": 83.42806243896484, "learning_rate": 1.6927805376294163e-07, "logits/chosen": -19.393659591674805, "logits/rejected": -17.96753692626953, "logps/chosen": -513.1704711914062, "logps/rejected": -352.1658020019531, "loss": 0.6203, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
4.2887797355651855, "rewards/margins": 1.6947200298309326, "rewards/rejected": 2.594059467315674, "step": 46950 }, { "epoch": 2.180231208505502, "grad_norm": 34.51558303833008, "learning_rate": 1.6925019731649564e-07, "logits/chosen": -18.515832901000977, "logits/rejected": -17.813032150268555, "logps/chosen": -300.0561218261719, "logps/rejected": -216.5570831298828, "loss": 0.453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6992180347442627, "rewards/margins": 1.3833107948303223, "rewards/rejected": 1.31590735912323, "step": 46960 }, { "epoch": 2.1806954826129346, "grad_norm": 18.815876007080078, "learning_rate": 1.6922234087004969e-07, "logits/chosen": -18.98488426208496, "logits/rejected": -17.736677169799805, "logps/chosen": -399.29827880859375, "logps/rejected": -318.1844177246094, "loss": 0.4365, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.6492815017700195, "rewards/margins": 1.8920180797576904, "rewards/rejected": 2.7572638988494873, "step": 46970 }, { "epoch": 2.1811597567203678, "grad_norm": 5.5228166580200195, "learning_rate": 1.691944844236037e-07, "logits/chosen": -20.06541633605957, "logits/rejected": -18.01642608642578, "logps/chosen": -451.85980224609375, "logps/rejected": -288.75830078125, "loss": 0.5487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.175912380218506, "rewards/margins": 1.6296989917755127, "rewards/rejected": 2.5462136268615723, "step": 46980 }, { "epoch": 2.1816240308278005, "grad_norm": 177.76309204101562, "learning_rate": 1.691666279771577e-07, "logits/chosen": -18.95390510559082, "logits/rejected": -18.3514461517334, "logps/chosen": -368.18878173828125, "logps/rejected": -303.14923095703125, "loss": 0.7291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.641204357147217, "rewards/margins": 0.3964725434780121, "rewards/rejected": 2.244732141494751, "step": 46990 }, { "epoch": 2.1820883049352338, "grad_norm": 26.59811782836914, "learning_rate": 1.6913877153071173e-07, 
"logits/chosen": -19.879512786865234, "logits/rejected": -18.605051040649414, "logps/chosen": -378.9024353027344, "logps/rejected": -295.6720886230469, "loss": 0.5176, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.652568817138672, "rewards/margins": 1.555344581604004, "rewards/rejected": 2.097224235534668, "step": 47000 }, { "epoch": 2.182552579042667, "grad_norm": 119.36187744140625, "learning_rate": 1.6911091508426574e-07, "logits/chosen": -18.035015106201172, "logits/rejected": -17.865934371948242, "logps/chosen": -381.8631896972656, "logps/rejected": -383.7899475097656, "loss": 0.6612, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2699973583221436, "rewards/margins": 1.1742522716522217, "rewards/rejected": 2.09574556350708, "step": 47010 }, { "epoch": 2.1830168531500997, "grad_norm": 16.519428253173828, "learning_rate": 1.6908305863781978e-07, "logits/chosen": -20.465778350830078, "logits/rejected": -19.589183807373047, "logps/chosen": -408.32977294921875, "logps/rejected": -378.99151611328125, "loss": 0.3374, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.829263210296631, "rewards/margins": 2.0357003211975098, "rewards/rejected": 1.7935625314712524, "step": 47020 }, { "epoch": 2.183481127257533, "grad_norm": 112.21993255615234, "learning_rate": 1.6905520219137377e-07, "logits/chosen": -18.51295280456543, "logits/rejected": -17.631973266601562, "logps/chosen": -329.02081298828125, "logps/rejected": -252.0664520263672, "loss": 0.7998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8415703773498535, "rewards/margins": 1.271079421043396, "rewards/rejected": 1.5704905986785889, "step": 47030 }, { "epoch": 2.1839454013649657, "grad_norm": 95.62097930908203, "learning_rate": 1.690273457449278e-07, "logits/chosen": -19.715564727783203, "logits/rejected": -18.367326736450195, "logps/chosen": -339.3900451660156, "logps/rejected": -228.7073211669922, "loss": 0.4068, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 3.1056127548217773, "rewards/margins": 1.6054646968841553, "rewards/rejected": 1.500147819519043, "step": 47040 }, { "epoch": 2.184409675472399, "grad_norm": 153.2001495361328, "learning_rate": 1.6899948929848183e-07, "logits/chosen": -19.118310928344727, "logits/rejected": -18.125654220581055, "logps/chosen": -376.0675354003906, "logps/rejected": -317.5582580566406, "loss": 0.3873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1141762733459473, "rewards/margins": 1.3708919286727905, "rewards/rejected": 1.7432844638824463, "step": 47050 }, { "epoch": 2.184873949579832, "grad_norm": 154.65748596191406, "learning_rate": 1.6897163285203584e-07, "logits/chosen": -18.744075775146484, "logits/rejected": -19.23380470275879, "logps/chosen": -452.2005920410156, "logps/rejected": -580.7679443359375, "loss": 1.1169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.058138370513916, "rewards/margins": -0.05319490283727646, "rewards/rejected": 4.111332893371582, "step": 47060 }, { "epoch": 2.185338223687265, "grad_norm": 38.998085021972656, "learning_rate": 1.6894377640558986e-07, "logits/chosen": -19.690427780151367, "logits/rejected": -19.07247543334961, "logps/chosen": -428.13641357421875, "logps/rejected": -358.81231689453125, "loss": 0.2633, "rewards/accuracies": 1.0, "rewards/chosen": 3.89178204536438, "rewards/margins": 1.6439628601074219, "rewards/rejected": 2.247819423675537, "step": 47070 }, { "epoch": 2.185802497794698, "grad_norm": 140.06707763671875, "learning_rate": 1.6891591995914387e-07, "logits/chosen": -18.695955276489258, "logits/rejected": -18.990657806396484, "logps/chosen": -335.36175537109375, "logps/rejected": -329.52630615234375, "loss": 0.8321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.004462718963623, "rewards/margins": 0.5921538472175598, "rewards/rejected": 2.412308931350708, "step": 47080 }, { "epoch": 2.186266771902131, "grad_norm": 52.75849151611328, "learning_rate": 
1.6888806351269789e-07, "logits/chosen": -19.288646697998047, "logits/rejected": -19.161203384399414, "logps/chosen": -439.16278076171875, "logps/rejected": -417.99835205078125, "loss": 0.871, "rewards/accuracies": 0.5, "rewards/chosen": 3.833354949951172, "rewards/margins": 0.367292195558548, "rewards/rejected": 3.4660630226135254, "step": 47090 }, { "epoch": 2.186731046009564, "grad_norm": 88.79447937011719, "learning_rate": 1.688602070662519e-07, "logits/chosen": -18.530061721801758, "logits/rejected": -17.923450469970703, "logps/chosen": -422.8340759277344, "logps/rejected": -317.6113586425781, "loss": 0.8766, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.100831031799316, "rewards/margins": 0.6765937805175781, "rewards/rejected": 3.424237012863159, "step": 47100 }, { "epoch": 2.187195320116997, "grad_norm": 18.501218795776367, "learning_rate": 1.6883235061980592e-07, "logits/chosen": -19.138317108154297, "logits/rejected": -18.49042320251465, "logps/chosen": -425.08624267578125, "logps/rejected": -345.5802001953125, "loss": 0.233, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.086781978607178, "rewards/margins": 2.1159591674804688, "rewards/rejected": 1.9708226919174194, "step": 47110 }, { "epoch": 2.18765959422443, "grad_norm": 48.778865814208984, "learning_rate": 1.6880449417335996e-07, "logits/chosen": -18.325603485107422, "logits/rejected": -17.721492767333984, "logps/chosen": -364.468505859375, "logps/rejected": -350.26824951171875, "loss": 0.7876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.258970260620117, "rewards/margins": 0.9717505574226379, "rewards/rejected": 2.287220001220703, "step": 47120 }, { "epoch": 2.1881238683318633, "grad_norm": 91.52851104736328, "learning_rate": 1.6877663772691397e-07, "logits/chosen": -18.711454391479492, "logits/rejected": -18.53794288635254, "logps/chosen": -375.52777099609375, "logps/rejected": -381.2413024902344, "loss": 0.9009, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.2510673999786377, "rewards/margins": 0.23887956142425537, "rewards/rejected": 3.0121877193450928, "step": 47130 }, { "epoch": 2.188588142439296, "grad_norm": 23.18983268737793, "learning_rate": 1.6874878128046796e-07, "logits/chosen": -18.91048240661621, "logits/rejected": -18.320331573486328, "logps/chosen": -351.9544982910156, "logps/rejected": -262.0375671386719, "loss": 0.5181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.25895881652832, "rewards/margins": 1.9388643503189087, "rewards/rejected": 2.320094585418701, "step": 47140 }, { "epoch": 2.1890524165467293, "grad_norm": 18.032766342163086, "learning_rate": 1.68720924834022e-07, "logits/chosen": -18.140361785888672, "logits/rejected": -19.19200325012207, "logps/chosen": -326.65020751953125, "logps/rejected": -343.85382080078125, "loss": 1.6155, "rewards/accuracies": 0.5, "rewards/chosen": 2.4459073543548584, "rewards/margins": -0.8751307725906372, "rewards/rejected": 3.321038007736206, "step": 47150 }, { "epoch": 2.189516690654162, "grad_norm": 122.28203582763672, "learning_rate": 1.6869306838757601e-07, "logits/chosen": -19.425655364990234, "logits/rejected": -18.831430435180664, "logps/chosen": -474.56646728515625, "logps/rejected": -400.13885498046875, "loss": 0.6281, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.700155258178711, "rewards/margins": 0.842342734336853, "rewards/rejected": 2.8578126430511475, "step": 47160 }, { "epoch": 2.1899809647615953, "grad_norm": 70.58617401123047, "learning_rate": 1.6866521194113006e-07, "logits/chosen": -18.864852905273438, "logits/rejected": -18.992998123168945, "logps/chosen": -435.3695373535156, "logps/rejected": -358.9839782714844, "loss": 0.5746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.630072593688965, "rewards/margins": 0.6027865409851074, "rewards/rejected": 2.0272858142852783, "step": 47170 }, { "epoch": 2.190445238869028, "grad_norm": 143.6057586669922, "learning_rate": 
1.6863735549468404e-07, "logits/chosen": -18.370304107666016, "logits/rejected": -18.24395179748535, "logps/chosen": -399.4139709472656, "logps/rejected": -320.8525390625, "loss": 0.874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.996122360229492, "rewards/margins": -0.04518892616033554, "rewards/rejected": 3.041311264038086, "step": 47180 }, { "epoch": 2.1909095129764613, "grad_norm": 1.2070621252059937, "learning_rate": 1.6860949904823806e-07, "logits/chosen": -18.67048454284668, "logits/rejected": -17.89870262145996, "logps/chosen": -286.2237548828125, "logps/rejected": -257.97540283203125, "loss": 0.593, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2530837059020996, "rewards/margins": 1.0533744096755981, "rewards/rejected": 2.199709415435791, "step": 47190 }, { "epoch": 2.1913737870838945, "grad_norm": 89.69815063476562, "learning_rate": 1.685816426017921e-07, "logits/chosen": -18.3403263092041, "logits/rejected": -18.214868545532227, "logps/chosen": -281.1751403808594, "logps/rejected": -239.7638702392578, "loss": 0.4742, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.324962615966797, "rewards/margins": 1.0031286478042603, "rewards/rejected": 1.3218339681625366, "step": 47200 }, { "epoch": 2.1918380611913273, "grad_norm": 68.56610107421875, "learning_rate": 1.6855378615534611e-07, "logits/chosen": -18.65793228149414, "logits/rejected": -17.72597312927246, "logps/chosen": -325.0231628417969, "logps/rejected": -198.6039581298828, "loss": 0.5242, "rewards/accuracies": 0.5, "rewards/chosen": 2.930286407470703, "rewards/margins": 1.2839449644088745, "rewards/rejected": 1.646341323852539, "step": 47210 }, { "epoch": 2.1923023352987605, "grad_norm": 32.57686233520508, "learning_rate": 1.6852592970890013e-07, "logits/chosen": -19.898372650146484, "logits/rejected": -18.826261520385742, "logps/chosen": -336.9012145996094, "logps/rejected": -225.54421997070312, "loss": 0.3294, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 3.4655601978302, "rewards/margins": 1.6690326929092407, "rewards/rejected": 1.7965275049209595, "step": 47220 }, { "epoch": 2.1927666094061933, "grad_norm": 140.99526977539062, "learning_rate": 1.6849807326245414e-07, "logits/chosen": -18.79836082458496, "logits/rejected": -18.179176330566406, "logps/chosen": -501.98565673828125, "logps/rejected": -377.43719482421875, "loss": 0.4471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.279526948928833, "rewards/margins": 1.5772130489349365, "rewards/rejected": 1.7023141384124756, "step": 47230 }, { "epoch": 2.1932308835136265, "grad_norm": 2.5503854751586914, "learning_rate": 1.6847021681600818e-07, "logits/chosen": -19.41801643371582, "logits/rejected": -18.936866760253906, "logps/chosen": -292.09967041015625, "logps/rejected": -220.08779907226562, "loss": 0.9256, "rewards/accuracies": 0.5, "rewards/chosen": 2.232018232345581, "rewards/margins": 0.38264960050582886, "rewards/rejected": 1.849368691444397, "step": 47240 }, { "epoch": 2.1936951576210593, "grad_norm": 46.96310043334961, "learning_rate": 1.684423603695622e-07, "logits/chosen": -19.802284240722656, "logits/rejected": -18.29914093017578, "logps/chosen": -373.7742614746094, "logps/rejected": -269.72882080078125, "loss": 0.4303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.65516996383667, "rewards/margins": 1.0825732946395874, "rewards/rejected": 2.572596549987793, "step": 47250 }, { "epoch": 2.1941594317284925, "grad_norm": 36.82844161987305, "learning_rate": 1.6841450392311619e-07, "logits/chosen": -18.95199966430664, "logits/rejected": -18.629148483276367, "logps/chosen": -358.4596252441406, "logps/rejected": -292.45330810546875, "loss": 0.8573, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.089303493499756, "rewards/margins": 0.9146442413330078, "rewards/rejected": 2.174659252166748, "step": 47260 }, { "epoch": 2.1946237058359257, "grad_norm": 18.756793975830078, "learning_rate": 
1.6838664747667023e-07, "logits/chosen": -19.139232635498047, "logits/rejected": -16.974811553955078, "logps/chosen": -497.4737243652344, "logps/rejected": -300.5422668457031, "loss": 0.2188, "rewards/accuracies": 1.0, "rewards/chosen": 4.704802513122559, "rewards/margins": 2.6993942260742188, "rewards/rejected": 2.0054080486297607, "step": 47270 }, { "epoch": 2.1950879799433585, "grad_norm": 1.0148680210113525, "learning_rate": 1.6835879103022424e-07, "logits/chosen": -19.50525665283203, "logits/rejected": -18.6723690032959, "logps/chosen": -356.84869384765625, "logps/rejected": -347.20452880859375, "loss": 0.9269, "rewards/accuracies": 0.5, "rewards/chosen": 2.4829211235046387, "rewards/margins": 0.38062095642089844, "rewards/rejected": 2.1023001670837402, "step": 47280 }, { "epoch": 2.1955522540507917, "grad_norm": 17.096364974975586, "learning_rate": 1.6833093458377823e-07, "logits/chosen": -18.490840911865234, "logits/rejected": -17.382648468017578, "logps/chosen": -291.0995788574219, "logps/rejected": -189.12399291992188, "loss": 0.2244, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.531895399093628, "rewards/margins": 2.190974712371826, "rewards/rejected": 1.34092116355896, "step": 47290 }, { "epoch": 2.1960165281582245, "grad_norm": 42.47968292236328, "learning_rate": 1.6830307813733227e-07, "logits/chosen": -18.907934188842773, "logits/rejected": -18.05666160583496, "logps/chosen": -415.8619689941406, "logps/rejected": -323.44573974609375, "loss": 0.4214, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.961616277694702, "rewards/margins": 0.8751066327095032, "rewards/rejected": 2.0865097045898438, "step": 47300 }, { "epoch": 2.1964808022656577, "grad_norm": 1.943888545036316, "learning_rate": 1.6827522169088629e-07, "logits/chosen": -19.584163665771484, "logits/rejected": -18.09335708618164, "logps/chosen": -374.2776184082031, "logps/rejected": -255.19216918945312, "loss": 0.5067, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 4.260301113128662, "rewards/margins": 1.6506942510604858, "rewards/rejected": 2.6096067428588867, "step": 47310 }, { "epoch": 2.1969450763730904, "grad_norm": 117.74173736572266, "learning_rate": 1.6824736524444033e-07, "logits/chosen": -18.453998565673828, "logits/rejected": -18.28426742553711, "logps/chosen": -363.71954345703125, "logps/rejected": -363.2632751464844, "loss": 0.8507, "rewards/accuracies": 0.5, "rewards/chosen": 3.495373249053955, "rewards/margins": 0.1503111869096756, "rewards/rejected": 3.345061779022217, "step": 47320 }, { "epoch": 2.1974093504805237, "grad_norm": 0.4699890911579132, "learning_rate": 1.6821950879799431e-07, "logits/chosen": -18.39668083190918, "logits/rejected": -17.428348541259766, "logps/chosen": -349.26416015625, "logps/rejected": -235.5431365966797, "loss": 0.4109, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.039537191390991, "rewards/margins": 1.6878302097320557, "rewards/rejected": 1.3517073392868042, "step": 47330 }, { "epoch": 2.197873624587957, "grad_norm": 42.0103759765625, "learning_rate": 1.6819165235154833e-07, "logits/chosen": -18.80966567993164, "logits/rejected": -17.749032974243164, "logps/chosen": -481.58892822265625, "logps/rejected": -369.913330078125, "loss": 0.8904, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.537856578826904, "rewards/margins": 0.8999286890029907, "rewards/rejected": 3.637928009033203, "step": 47340 }, { "epoch": 2.1983378986953896, "grad_norm": 89.88130950927734, "learning_rate": 1.6816379590510237e-07, "logits/chosen": -18.503543853759766, "logits/rejected": -18.0627498626709, "logps/chosen": -403.10137939453125, "logps/rejected": -318.85260009765625, "loss": 0.5714, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8564469814300537, "rewards/margins": 0.6548963785171509, "rewards/rejected": 2.2015509605407715, "step": 47350 }, { "epoch": 2.198802172802823, "grad_norm": 127.51848602294922, "learning_rate": 
1.6813593945865638e-07, "logits/chosen": -19.02259635925293, "logits/rejected": -19.165937423706055, "logps/chosen": -376.7886657714844, "logps/rejected": -434.9598083496094, "loss": 1.018, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.041992664337158, "rewards/margins": -0.33456867933273315, "rewards/rejected": 3.376561403274536, "step": 47360 }, { "epoch": 2.1992664469102556, "grad_norm": 169.3943328857422, "learning_rate": 1.681080830122104e-07, "logits/chosen": -18.433456420898438, "logits/rejected": -17.483829498291016, "logps/chosen": -423.2861328125, "logps/rejected": -328.4618225097656, "loss": 0.8459, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3533973693847656, "rewards/margins": 1.1006755828857422, "rewards/rejected": 2.2527220249176025, "step": 47370 }, { "epoch": 2.199730721017689, "grad_norm": 234.97708129882812, "learning_rate": 1.6808022656576441e-07, "logits/chosen": -17.862577438354492, "logits/rejected": -17.917142868041992, "logps/chosen": -268.681640625, "logps/rejected": -274.260498046875, "loss": 0.9595, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6485047340393066, "rewards/margins": 0.5327457189559937, "rewards/rejected": 2.1157591342926025, "step": 47380 }, { "epoch": 2.200194995125122, "grad_norm": 195.1226806640625, "learning_rate": 1.6805237011931845e-07, "logits/chosen": -19.387889862060547, "logits/rejected": -18.835628509521484, "logps/chosen": -441.3561096191406, "logps/rejected": -391.42327880859375, "loss": 1.1652, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.141620397567749, "rewards/margins": 0.014699148945510387, "rewards/rejected": 3.1269211769104004, "step": 47390 }, { "epoch": 2.200659269232555, "grad_norm": 0.7305449843406677, "learning_rate": 1.6802451367287247e-07, "logits/chosen": -18.971614837646484, "logits/rejected": -17.79195785522461, "logps/chosen": -558.3480834960938, "logps/rejected": -410.399169921875, "loss": 0.5912, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 4.428932189941406, "rewards/margins": 1.3071577548980713, "rewards/rejected": 3.121774911880493, "step": 47400 }, { "epoch": 2.201123543339988, "grad_norm": 17.027841567993164, "learning_rate": 1.6799665722642646e-07, "logits/chosen": -18.831096649169922, "logits/rejected": -18.104209899902344, "logps/chosen": -312.21173095703125, "logps/rejected": -228.2025604248047, "loss": 0.5445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.617382764816284, "rewards/margins": 1.2462002038955688, "rewards/rejected": 1.3711825609207153, "step": 47410 }, { "epoch": 2.201587817447421, "grad_norm": 37.217803955078125, "learning_rate": 1.679688007799805e-07, "logits/chosen": -19.407747268676758, "logits/rejected": -19.59238052368164, "logps/chosen": -330.6898193359375, "logps/rejected": -319.8103942871094, "loss": 1.0729, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.027930974960327, "rewards/margins": 0.283097505569458, "rewards/rejected": 2.7448337078094482, "step": 47420 }, { "epoch": 2.202052091554854, "grad_norm": 73.04273986816406, "learning_rate": 1.679409443335345e-07, "logits/chosen": -18.202007293701172, "logits/rejected": -17.684213638305664, "logps/chosen": -343.9331359863281, "logps/rejected": -295.2978210449219, "loss": 0.5181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5307867527008057, "rewards/margins": 0.7221409678459167, "rewards/rejected": 1.8086456060409546, "step": 47430 }, { "epoch": 2.202516365662287, "grad_norm": 183.6783905029297, "learning_rate": 1.6791308788708855e-07, "logits/chosen": -18.242050170898438, "logits/rejected": -18.056438446044922, "logps/chosen": -306.70233154296875, "logps/rejected": -330.2645263671875, "loss": 0.8961, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6458568572998047, "rewards/margins": 0.45893988013267517, "rewards/rejected": 2.1869168281555176, "step": 47440 }, { "epoch": 2.20298063976972, "grad_norm": 108.18648529052734, 
"learning_rate": 1.6788523144064254e-07, "logits/chosen": -18.349287033081055, "logits/rejected": -17.83621597290039, "logps/chosen": -287.1534423828125, "logps/rejected": -247.02560424804688, "loss": 0.5631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5462615489959717, "rewards/margins": 1.0333054065704346, "rewards/rejected": 1.512956142425537, "step": 47450 }, { "epoch": 2.2034449138771532, "grad_norm": 1.4230983257293701, "learning_rate": 1.6785737499419656e-07, "logits/chosen": -19.326496124267578, "logits/rejected": -18.236923217773438, "logps/chosen": -469.8582458496094, "logps/rejected": -372.7039489746094, "loss": 0.2894, "rewards/accuracies": 1.0, "rewards/chosen": 5.405754089355469, "rewards/margins": 1.9951465129852295, "rewards/rejected": 3.4106075763702393, "step": 47460 }, { "epoch": 2.203909187984586, "grad_norm": 120.57463073730469, "learning_rate": 1.678295185477506e-07, "logits/chosen": -18.196792602539062, "logits/rejected": -17.360532760620117, "logps/chosen": -387.59429931640625, "logps/rejected": -296.82708740234375, "loss": 0.3724, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3062961101531982, "rewards/margins": 1.5686960220336914, "rewards/rejected": 1.7375999689102173, "step": 47470 }, { "epoch": 2.2043734620920192, "grad_norm": 32.58550262451172, "learning_rate": 1.6780166210130459e-07, "logits/chosen": -18.786624908447266, "logits/rejected": -17.786935806274414, "logps/chosen": -370.3779296875, "logps/rejected": -240.68228149414062, "loss": 0.3826, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.860572099685669, "rewards/margins": 1.1944873332977295, "rewards/rejected": 1.6660845279693604, "step": 47480 }, { "epoch": 2.204837736199452, "grad_norm": 39.87187194824219, "learning_rate": 1.6777380565485863e-07, "logits/chosen": -19.519222259521484, "logits/rejected": -18.15399169921875, "logps/chosen": -311.12200927734375, "logps/rejected": -250.31729125976562, "loss": 0.8613, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4591145515441895, "rewards/margins": 0.5815444588661194, "rewards/rejected": 1.8775699138641357, "step": 47490 }, { "epoch": 2.205302010306885, "grad_norm": 6.805274486541748, "learning_rate": 1.6774594920841264e-07, "logits/chosen": -18.41883087158203, "logits/rejected": -17.276580810546875, "logps/chosen": -453.7372131347656, "logps/rejected": -244.81759643554688, "loss": 0.5428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.229006290435791, "rewards/margins": 1.2318780422210693, "rewards/rejected": 1.9971277713775635, "step": 47500 }, { "epoch": 2.2057662844143184, "grad_norm": 65.62383270263672, "learning_rate": 1.6771809276196666e-07, "logits/chosen": -18.204303741455078, "logits/rejected": -18.666492462158203, "logps/chosen": -331.19293212890625, "logps/rejected": -334.76190185546875, "loss": 0.6709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2829654216766357, "rewards/margins": 0.9409864544868469, "rewards/rejected": 2.341979503631592, "step": 47510 }, { "epoch": 2.206230558521751, "grad_norm": 21.885255813598633, "learning_rate": 1.6769023631552067e-07, "logits/chosen": -19.133602142333984, "logits/rejected": -18.315065383911133, "logps/chosen": -388.636474609375, "logps/rejected": -318.5108337402344, "loss": 0.8001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.617244243621826, "rewards/margins": 0.9435936808586121, "rewards/rejected": 3.673650026321411, "step": 47520 }, { "epoch": 2.2066948326291844, "grad_norm": 266.59527587890625, "learning_rate": 1.6766237986907468e-07, "logits/chosen": -18.680213928222656, "logits/rejected": -18.28840446472168, "logps/chosen": -453.75433349609375, "logps/rejected": -405.92242431640625, "loss": 0.8963, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.292952537536621, "rewards/margins": 0.674453616142273, "rewards/rejected": 3.618499279022217, "step": 47530 }, { "epoch": 2.207159106736617, 
"grad_norm": 6.225310802459717, "learning_rate": 1.6763452342262873e-07, "logits/chosen": -18.041362762451172, "logits/rejected": -16.973535537719727, "logps/chosen": -381.62127685546875, "logps/rejected": -269.93511962890625, "loss": 0.6612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0676984786987305, "rewards/margins": 1.7068901062011719, "rewards/rejected": 2.3608083724975586, "step": 47540 }, { "epoch": 2.2076233808440504, "grad_norm": 107.86458587646484, "learning_rate": 1.6760666697618274e-07, "logits/chosen": -19.153282165527344, "logits/rejected": -17.8375244140625, "logps/chosen": -445.706787109375, "logps/rejected": -309.4840393066406, "loss": 0.7963, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.272993803024292, "rewards/margins": 1.3148701190948486, "rewards/rejected": 1.958123803138733, "step": 47550 }, { "epoch": 2.208087654951483, "grad_norm": 211.22157287597656, "learning_rate": 1.6757881052973673e-07, "logits/chosen": -19.30478286743164, "logits/rejected": -18.54574203491211, "logps/chosen": -431.12579345703125, "logps/rejected": -349.1520690917969, "loss": 0.6674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2284908294677734, "rewards/margins": 0.6747244000434875, "rewards/rejected": 2.5537667274475098, "step": 47560 }, { "epoch": 2.2085519290589164, "grad_norm": 87.05888366699219, "learning_rate": 1.6755095408329077e-07, "logits/chosen": -18.326297760009766, "logits/rejected": -18.374759674072266, "logps/chosen": -303.72607421875, "logps/rejected": -279.58013916015625, "loss": 0.8029, "rewards/accuracies": 0.5, "rewards/chosen": 2.788930892944336, "rewards/margins": 0.6848338842391968, "rewards/rejected": 2.1040968894958496, "step": 47570 }, { "epoch": 2.2090162031663496, "grad_norm": 58.944095611572266, "learning_rate": 1.6752309763684478e-07, "logits/chosen": -19.354686737060547, "logits/rejected": -18.723533630371094, "logps/chosen": -391.6367492675781, "logps/rejected": -342.7109375, "loss": 
0.7757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4050750732421875, "rewards/margins": 0.21438440680503845, "rewards/rejected": 3.190690517425537, "step": 47580 }, { "epoch": 2.2094804772737824, "grad_norm": 115.08682250976562, "learning_rate": 1.6749524119039882e-07, "logits/chosen": -18.72374725341797, "logits/rejected": -17.895328521728516, "logps/chosen": -404.18035888671875, "logps/rejected": -273.0128479003906, "loss": 0.3907, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.443396806716919, "rewards/margins": 1.4419373273849487, "rewards/rejected": 2.0014593601226807, "step": 47590 }, { "epoch": 2.2099447513812156, "grad_norm": 48.98170471191406, "learning_rate": 1.674673847439528e-07, "logits/chosen": -18.079570770263672, "logits/rejected": -17.361618041992188, "logps/chosen": -350.3414306640625, "logps/rejected": -261.6790466308594, "loss": 0.9025, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.666602373123169, "rewards/margins": 0.5398561358451843, "rewards/rejected": 2.126746416091919, "step": 47600 }, { "epoch": 2.2104090254886484, "grad_norm": 168.48985290527344, "learning_rate": 1.6743952829750683e-07, "logits/chosen": -18.90281105041504, "logits/rejected": -18.53334617614746, "logps/chosen": -461.69049072265625, "logps/rejected": -422.7080993652344, "loss": 1.0729, "rewards/accuracies": 0.5, "rewards/chosen": 3.586587905883789, "rewards/margins": -0.11254777759313583, "rewards/rejected": 3.6991355419158936, "step": 47610 }, { "epoch": 2.2108732995960816, "grad_norm": 46.84124755859375, "learning_rate": 1.6741167185106087e-07, "logits/chosen": -19.542999267578125, "logits/rejected": -19.042570114135742, "logps/chosen": -340.15185546875, "logps/rejected": -225.88735961914062, "loss": 0.6174, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8194193840026855, "rewards/margins": 1.4676660299301147, "rewards/rejected": 2.3517537117004395, "step": 47620 }, { "epoch": 2.2113375737035144, "grad_norm": 
100.28666687011719, "learning_rate": 1.6738381540461488e-07, "logits/chosen": -18.822689056396484, "logits/rejected": -17.11705780029297, "logps/chosen": -325.527587890625, "logps/rejected": -192.5873565673828, "loss": 0.4561, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6434223651885986, "rewards/margins": 2.777362585067749, "rewards/rejected": 0.8660597801208496, "step": 47630 }, { "epoch": 2.2118018478109476, "grad_norm": 15.67757797241211, "learning_rate": 1.673559589581689e-07, "logits/chosen": -18.2895565032959, "logits/rejected": -17.673681259155273, "logps/chosen": -350.8534240722656, "logps/rejected": -389.45452880859375, "loss": 1.3071, "rewards/accuracies": 0.5, "rewards/chosen": 2.7074408531188965, "rewards/margins": -0.14272598922252655, "rewards/rejected": 2.8501670360565186, "step": 47640 }, { "epoch": 2.212266121918381, "grad_norm": 213.88955688476562, "learning_rate": 1.673281025117229e-07, "logits/chosen": -19.07504653930664, "logits/rejected": -19.23533821105957, "logps/chosen": -501.38922119140625, "logps/rejected": -411.8392639160156, "loss": 0.4449, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2566022872924805, "rewards/margins": 1.4279652833938599, "rewards/rejected": 2.828636646270752, "step": 47650 }, { "epoch": 2.2127303960258136, "grad_norm": 140.2361602783203, "learning_rate": 1.6730024606527695e-07, "logits/chosen": -19.721397399902344, "logits/rejected": -18.484098434448242, "logps/chosen": -410.5135192871094, "logps/rejected": -331.381591796875, "loss": 0.3218, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6874217987060547, "rewards/margins": 1.5853850841522217, "rewards/rejected": 2.102036952972412, "step": 47660 }, { "epoch": 2.2131946701332468, "grad_norm": 6.72937536239624, "learning_rate": 1.6727238961883094e-07, "logits/chosen": -18.559680938720703, "logits/rejected": -17.404674530029297, "logps/chosen": -433.3363342285156, "logps/rejected": -348.34564208984375, "loss": 0.4763, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.493961811065674, "rewards/margins": 1.9261786937713623, "rewards/rejected": 2.5677833557128906, "step": 47670 }, { "epoch": 2.2136589442406795, "grad_norm": 176.24710083007812, "learning_rate": 1.6724453317238496e-07, "logits/chosen": -19.37667465209961, "logits/rejected": -18.584077835083008, "logps/chosen": -460.3726501464844, "logps/rejected": -430.71807861328125, "loss": 0.5887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9498343467712402, "rewards/margins": 0.9295404553413391, "rewards/rejected": 3.020294427871704, "step": 47680 }, { "epoch": 2.2141232183481128, "grad_norm": 98.59513854980469, "learning_rate": 1.67216676725939e-07, "logits/chosen": -18.785919189453125, "logits/rejected": -18.253345489501953, "logps/chosen": -414.836181640625, "logps/rejected": -303.0775146484375, "loss": 0.4948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1162619590759277, "rewards/margins": 1.4237877130508423, "rewards/rejected": 1.692474126815796, "step": 47690 }, { "epoch": 2.2145874924555455, "grad_norm": 92.82511138916016, "learning_rate": 1.67188820279493e-07, "logits/chosen": -18.89928436279297, "logits/rejected": -18.469717025756836, "logps/chosen": -362.6381530761719, "logps/rejected": -320.5865783691406, "loss": 0.6799, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8799943923950195, "rewards/margins": 0.5575836896896362, "rewards/rejected": 2.3224103450775146, "step": 47700 }, { "epoch": 2.2150517665629788, "grad_norm": 37.65372085571289, "learning_rate": 1.67160963833047e-07, "logits/chosen": -18.35175132751465, "logits/rejected": -18.234357833862305, "logps/chosen": -440.92462158203125, "logps/rejected": -459.65887451171875, "loss": 0.6566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5815200805664062, "rewards/margins": 0.5669606328010559, "rewards/rejected": 3.014559507369995, "step": 47710 }, { "epoch": 2.215516040670412, "grad_norm": 
1.3259854316711426, "learning_rate": 1.6713310738660104e-07, "logits/chosen": -18.82707405090332, "logits/rejected": -18.668041229248047, "logps/chosen": -413.4007873535156, "logps/rejected": -347.85345458984375, "loss": 0.8222, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.562784194946289, "rewards/margins": 0.6986575126647949, "rewards/rejected": 2.8641269207000732, "step": 47720 }, { "epoch": 2.2159803147778447, "grad_norm": 148.7829132080078, "learning_rate": 1.6710525094015505e-07, "logits/chosen": -19.756864547729492, "logits/rejected": -19.60774040222168, "logps/chosen": -445.93377685546875, "logps/rejected": -438.1681213378906, "loss": 0.7069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4690959453582764, "rewards/margins": 0.5351706743240356, "rewards/rejected": 2.9339258670806885, "step": 47730 }, { "epoch": 2.216444588885278, "grad_norm": 23.425457000732422, "learning_rate": 1.670773944937091e-07, "logits/chosen": -18.240711212158203, "logits/rejected": -18.279565811157227, "logps/chosen": -385.4905700683594, "logps/rejected": -472.42559814453125, "loss": 0.8882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.428110122680664, "rewards/margins": 0.321452796459198, "rewards/rejected": 3.1066572666168213, "step": 47740 }, { "epoch": 2.2169088629927107, "grad_norm": 39.37617111206055, "learning_rate": 1.6704953804726308e-07, "logits/chosen": -18.021625518798828, "logits/rejected": -17.683069229125977, "logps/chosen": -499.3292541503906, "logps/rejected": -435.39532470703125, "loss": 0.9937, "rewards/accuracies": 0.5, "rewards/chosen": 3.329882860183716, "rewards/margins": 0.39445438981056213, "rewards/rejected": 2.9354286193847656, "step": 47750 }, { "epoch": 2.217373137100144, "grad_norm": 34.10846710205078, "learning_rate": 1.670216816008171e-07, "logits/chosen": -18.752628326416016, "logits/rejected": -17.69503402709961, "logps/chosen": -367.96954345703125, "logps/rejected": -226.24880981445312, "loss": 
0.4518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4492945671081543, "rewards/margins": 0.7598556280136108, "rewards/rejected": 1.689439058303833, "step": 47760 }, { "epoch": 2.217837411207577, "grad_norm": 227.03158569335938, "learning_rate": 1.6699382515437114e-07, "logits/chosen": -18.683744430541992, "logits/rejected": -18.09039306640625, "logps/chosen": -457.0647888183594, "logps/rejected": -380.4178161621094, "loss": 0.5076, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8132760524749756, "rewards/margins": 1.1667150259017944, "rewards/rejected": 1.6465610265731812, "step": 47770 }, { "epoch": 2.21830168531501, "grad_norm": 162.32778930664062, "learning_rate": 1.6696596870792515e-07, "logits/chosen": -18.92508316040039, "logits/rejected": -18.78371238708496, "logps/chosen": -464.2725524902344, "logps/rejected": -418.6844787597656, "loss": 0.6841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.711587429046631, "rewards/margins": 1.0705294609069824, "rewards/rejected": 3.6410574913024902, "step": 47780 }, { "epoch": 2.218765959422443, "grad_norm": 131.852783203125, "learning_rate": 1.6693811226147917e-07, "logits/chosen": -18.660795211791992, "logits/rejected": -18.716045379638672, "logps/chosen": -406.74853515625, "logps/rejected": -367.6003112792969, "loss": 0.7482, "rewards/accuracies": 0.5, "rewards/chosen": 3.733588695526123, "rewards/margins": 1.1011378765106201, "rewards/rejected": 2.632450819015503, "step": 47790 }, { "epoch": 2.219230233529876, "grad_norm": 63.57435607910156, "learning_rate": 1.6691025581503318e-07, "logits/chosen": -19.90142250061035, "logits/rejected": -18.319238662719727, "logps/chosen": -403.727783203125, "logps/rejected": -344.2961120605469, "loss": 0.4391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9742679595947266, "rewards/margins": 1.9912135601043701, "rewards/rejected": 1.9830543994903564, "step": 47800 }, { "epoch": 2.219694507637309, "grad_norm": 
9.991031646728516, "learning_rate": 1.6688239936858722e-07, "logits/chosen": -19.058433532714844, "logits/rejected": -17.9116268157959, "logps/chosen": -415.8326721191406, "logps/rejected": -267.0153503417969, "loss": 0.3735, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.164167881011963, "rewards/margins": 1.0869334936141968, "rewards/rejected": 2.0772345066070557, "step": 47810 }, { "epoch": 2.220158781744742, "grad_norm": 1.762584924697876, "learning_rate": 1.6685454292214124e-07, "logits/chosen": -18.294208526611328, "logits/rejected": -17.67905044555664, "logps/chosen": -246.80331420898438, "logps/rejected": -189.5526580810547, "loss": 0.9057, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1661887168884277, "rewards/margins": 1.1082446575164795, "rewards/rejected": 1.0579442977905273, "step": 47820 }, { "epoch": 2.220623055852175, "grad_norm": 38.714263916015625, "learning_rate": 1.6682668647569523e-07, "logits/chosen": -18.980560302734375, "logits/rejected": -18.982282638549805, "logps/chosen": -379.8348083496094, "logps/rejected": -377.320556640625, "loss": 0.7526, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1171011924743652, "rewards/margins": 0.20032265782356262, "rewards/rejected": 2.916778564453125, "step": 47830 }, { "epoch": 2.2210873299596083, "grad_norm": 62.6083869934082, "learning_rate": 1.6679883002924927e-07, "logits/chosen": -19.219928741455078, "logits/rejected": -18.44647979736328, "logps/chosen": -375.5965270996094, "logps/rejected": -292.3179931640625, "loss": 0.4796, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.629207134246826, "rewards/margins": 1.0926960706710815, "rewards/rejected": 2.536510944366455, "step": 47840 }, { "epoch": 2.221551604067041, "grad_norm": 23.230520248413086, "learning_rate": 1.6677097358280328e-07, "logits/chosen": -18.449649810791016, "logits/rejected": -18.032672882080078, "logps/chosen": -396.71075439453125, "logps/rejected": -416.96429443359375, 
"loss": 0.8613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8002688884735107, "rewards/margins": 0.8084951639175415, "rewards/rejected": 2.991774082183838, "step": 47850 }, { "epoch": 2.2220158781744743, "grad_norm": 82.25534057617188, "learning_rate": 1.6674311713635732e-07, "logits/chosen": -18.73946189880371, "logits/rejected": -18.808002471923828, "logps/chosen": -454.4136657714844, "logps/rejected": -481.69720458984375, "loss": 0.609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5654759407043457, "rewards/margins": 0.4368619918823242, "rewards/rejected": 3.128613233566284, "step": 47860 }, { "epoch": 2.222480152281907, "grad_norm": 21.65461540222168, "learning_rate": 1.667152606899113e-07, "logits/chosen": -18.993370056152344, "logits/rejected": -18.141658782958984, "logps/chosen": -448.96392822265625, "logps/rejected": -363.68670654296875, "loss": 0.3124, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.236950397491455, "rewards/margins": 1.3128875494003296, "rewards/rejected": 2.924062967300415, "step": 47870 }, { "epoch": 2.2229444263893403, "grad_norm": 30.00179672241211, "learning_rate": 1.6668740424346533e-07, "logits/chosen": -19.727304458618164, "logits/rejected": -18.93492317199707, "logps/chosen": -421.9727478027344, "logps/rejected": -356.06097412109375, "loss": 0.3937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.487586975097656, "rewards/margins": 1.302287220954895, "rewards/rejected": 3.18530011177063, "step": 47880 }, { "epoch": 2.2234087004967735, "grad_norm": 1.922605037689209, "learning_rate": 1.6665954779701937e-07, "logits/chosen": -18.937423706054688, "logits/rejected": -19.32012367248535, "logps/chosen": -423.77166748046875, "logps/rejected": -437.9232482910156, "loss": 0.8538, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.2077226638793945, "rewards/margins": 0.2815554141998291, "rewards/rejected": 3.9261677265167236, "step": 47890 }, { "epoch": 
2.2238729746042063, "grad_norm": 54.92503356933594, "learning_rate": 1.6663169135057335e-07, "logits/chosen": -19.336856842041016, "logits/rejected": -19.205982208251953, "logps/chosen": -410.59283447265625, "logps/rejected": -399.40191650390625, "loss": 0.88, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2440998554229736, "rewards/margins": 0.07608147710561752, "rewards/rejected": 3.168018341064453, "step": 47900 }, { "epoch": 2.2243372487116395, "grad_norm": 84.74044036865234, "learning_rate": 1.666038349041274e-07, "logits/chosen": -18.60051155090332, "logits/rejected": -17.719478607177734, "logps/chosen": -432.39013671875, "logps/rejected": -404.5492248535156, "loss": 0.4686, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9542109966278076, "rewards/margins": 0.9631515741348267, "rewards/rejected": 2.9910595417022705, "step": 47910 }, { "epoch": 2.2248015228190723, "grad_norm": 158.71109008789062, "learning_rate": 1.665759784576814e-07, "logits/chosen": -18.821596145629883, "logits/rejected": -18.202375411987305, "logps/chosen": -335.32806396484375, "logps/rejected": -270.2469177246094, "loss": 0.7555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5858120918273926, "rewards/margins": 1.2083561420440674, "rewards/rejected": 1.3774559497833252, "step": 47920 }, { "epoch": 2.2252657969265055, "grad_norm": 135.7685089111328, "learning_rate": 1.6654812201123542e-07, "logits/chosen": -19.601823806762695, "logits/rejected": -18.89813232421875, "logps/chosen": -423.1959533691406, "logps/rejected": -362.739990234375, "loss": 0.6083, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1258089542388916, "rewards/margins": 0.43036574125289917, "rewards/rejected": 2.6954429149627686, "step": 47930 }, { "epoch": 2.2257300710339383, "grad_norm": 67.90322875976562, "learning_rate": 1.6652026556478944e-07, "logits/chosen": -18.608036041259766, "logits/rejected": -18.245365142822266, "logps/chosen": -411.123779296875, 
"logps/rejected": -321.05523681640625, "loss": 0.7514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0635578632354736, "rewards/margins": 0.8059101104736328, "rewards/rejected": 2.257647752761841, "step": 47940 }, { "epoch": 2.2261943451413715, "grad_norm": 161.52810668945312, "learning_rate": 1.6649240911834345e-07, "logits/chosen": -19.532974243164062, "logits/rejected": -18.865161895751953, "logps/chosen": -475.821533203125, "logps/rejected": -398.72454833984375, "loss": 0.3125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.256463527679443, "rewards/margins": 1.4871212244033813, "rewards/rejected": 2.7693426609039307, "step": 47950 }, { "epoch": 2.2266586192488047, "grad_norm": 3.934217691421509, "learning_rate": 1.664645526718975e-07, "logits/chosen": -19.490253448486328, "logits/rejected": -17.748699188232422, "logps/chosen": -500.24066162109375, "logps/rejected": -413.688232421875, "loss": 0.6262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.500920295715332, "rewards/margins": 1.5962013006210327, "rewards/rejected": 2.9047188758850098, "step": 47960 }, { "epoch": 2.2271228933562375, "grad_norm": 82.4210433959961, "learning_rate": 1.664366962254515e-07, "logits/chosen": -20.00075912475586, "logits/rejected": -18.315196990966797, "logps/chosen": -304.4004821777344, "logps/rejected": -188.94290161132812, "loss": 0.5186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.359476089477539, "rewards/margins": 1.2116937637329102, "rewards/rejected": 1.1477824449539185, "step": 47970 }, { "epoch": 2.2275871674636707, "grad_norm": 35.71485137939453, "learning_rate": 1.664088397790055e-07, "logits/chosen": -19.28148651123047, "logits/rejected": -18.818859100341797, "logps/chosen": -282.2428283691406, "logps/rejected": -288.48748779296875, "loss": 0.6624, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.391072988510132, "rewards/margins": 0.6575314998626709, "rewards/rejected": 1.733541488647461, "step": 
47980 }, { "epoch": 2.2280514415711035, "grad_norm": 35.36197280883789, "learning_rate": 1.6638098333255954e-07, "logits/chosen": -18.577823638916016, "logits/rejected": -18.580259323120117, "logps/chosen": -322.1528625488281, "logps/rejected": -331.18572998046875, "loss": 0.7068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9815452098846436, "rewards/margins": 0.2772842049598694, "rewards/rejected": 2.704261302947998, "step": 47990 }, { "epoch": 2.2285157156785367, "grad_norm": 3.758636951446533, "learning_rate": 1.6635312688611355e-07, "logits/chosen": -19.19605827331543, "logits/rejected": -18.769994735717773, "logps/chosen": -327.69140625, "logps/rejected": -286.4533386230469, "loss": 0.6599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8638479709625244, "rewards/margins": 0.9350635409355164, "rewards/rejected": 1.9287840127944946, "step": 48000 }, { "epoch": 2.2289799897859695, "grad_norm": 169.49305725097656, "learning_rate": 1.663252704396676e-07, "logits/chosen": -19.01341438293457, "logits/rejected": -18.84111976623535, "logps/chosen": -273.9248352050781, "logps/rejected": -245.80032348632812, "loss": 0.8466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.479902744293213, "rewards/margins": 0.7984222769737244, "rewards/rejected": 2.681480646133423, "step": 48010 }, { "epoch": 2.2294442638934027, "grad_norm": 53.377315521240234, "learning_rate": 1.6629741399322158e-07, "logits/chosen": -18.115177154541016, "logits/rejected": -17.793621063232422, "logps/chosen": -405.0979309082031, "logps/rejected": -315.56475830078125, "loss": 1.0417, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.626760721206665, "rewards/margins": 0.09982363879680634, "rewards/rejected": 2.5269370079040527, "step": 48020 }, { "epoch": 2.229908538000836, "grad_norm": 136.75360107421875, "learning_rate": 1.662695575467756e-07, "logits/chosen": -19.203397750854492, "logits/rejected": -19.648374557495117, "logps/chosen": 
-355.57916259765625, "logps/rejected": -378.189453125, "loss": 1.6348, "rewards/accuracies": 0.5, "rewards/chosen": 2.181093454360962, "rewards/margins": -0.9646949768066406, "rewards/rejected": 3.1457884311676025, "step": 48030 }, { "epoch": 2.2303728121082687, "grad_norm": 60.69009017944336, "learning_rate": 1.6624170110032964e-07, "logits/chosen": -19.01850700378418, "logits/rejected": -18.288097381591797, "logps/chosen": -394.53131103515625, "logps/rejected": -339.47003173828125, "loss": 0.4283, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.637907028198242, "rewards/margins": 1.2252830266952515, "rewards/rejected": 2.412623643875122, "step": 48040 }, { "epoch": 2.230837086215702, "grad_norm": 230.18406677246094, "learning_rate": 1.6621384465388365e-07, "logits/chosen": -19.48006248474121, "logits/rejected": -17.91189193725586, "logps/chosen": -506.9607849121094, "logps/rejected": -357.16986083984375, "loss": 0.5575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.6339850425720215, "rewards/margins": 1.7369922399520874, "rewards/rejected": 2.8969924449920654, "step": 48050 }, { "epoch": 2.2313013603231346, "grad_norm": 180.04000854492188, "learning_rate": 1.6618598820743767e-07, "logits/chosen": -19.119680404663086, "logits/rejected": -18.537220001220703, "logps/chosen": -426.9842834472656, "logps/rejected": -323.64501953125, "loss": 0.8485, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.443127632141113, "rewards/margins": 1.4600883722305298, "rewards/rejected": 2.983039379119873, "step": 48060 }, { "epoch": 2.231765634430568, "grad_norm": 113.78357696533203, "learning_rate": 1.6615813176099168e-07, "logits/chosen": -18.779611587524414, "logits/rejected": -18.716848373413086, "logps/chosen": -436.4810485839844, "logps/rejected": -395.1403503417969, "loss": 0.8556, "rewards/accuracies": 0.5, "rewards/chosen": 3.253627300262451, "rewards/margins": 0.3303855061531067, "rewards/rejected": 2.9232420921325684, "step": 48070 
}, { "epoch": 2.2322299085380006, "grad_norm": 111.14566802978516, "learning_rate": 1.6613027531454572e-07, "logits/chosen": -19.400970458984375, "logits/rejected": -19.19974708557129, "logps/chosen": -439.098876953125, "logps/rejected": -449.69793701171875, "loss": 0.7305, "rewards/accuracies": 0.5, "rewards/chosen": 3.4997963905334473, "rewards/margins": 0.2608281970024109, "rewards/rejected": 3.2389683723449707, "step": 48080 }, { "epoch": 2.232694182645434, "grad_norm": 30.487951278686523, "learning_rate": 1.661024188680997e-07, "logits/chosen": -18.05399513244629, "logits/rejected": -17.872974395751953, "logps/chosen": -405.47796630859375, "logps/rejected": -344.00970458984375, "loss": 0.8541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9233667850494385, "rewards/margins": 0.32070592045783997, "rewards/rejected": 2.602660894393921, "step": 48090 }, { "epoch": 2.233158456752867, "grad_norm": 2.351306676864624, "learning_rate": 1.6607456242165372e-07, "logits/chosen": -17.643972396850586, "logits/rejected": -16.579954147338867, "logps/chosen": -378.9549560546875, "logps/rejected": -273.21820068359375, "loss": 0.7813, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4612343311309814, "rewards/margins": 1.1357333660125732, "rewards/rejected": 2.32550048828125, "step": 48100 }, { "epoch": 2.2336227308603, "grad_norm": 37.738059997558594, "learning_rate": 1.6604670597520777e-07, "logits/chosen": -18.381099700927734, "logits/rejected": -17.418872833251953, "logps/chosen": -575.580322265625, "logps/rejected": -364.98028564453125, "loss": 0.316, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.811367511749268, "rewards/margins": 1.9269310235977173, "rewards/rejected": 2.8844361305236816, "step": 48110 }, { "epoch": 2.234087004967733, "grad_norm": 82.97911071777344, "learning_rate": 1.6601884952876178e-07, "logits/chosen": -18.62892723083496, "logits/rejected": -17.77635383605957, "logps/chosen": -365.69000244140625, 
"logps/rejected": -311.54730224609375, "loss": 0.5384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.193965435028076, "rewards/margins": 1.3623454570770264, "rewards/rejected": 1.8316199779510498, "step": 48120 }, { "epoch": 2.234551279075166, "grad_norm": 14.571691513061523, "learning_rate": 1.6599099308231577e-07, "logits/chosen": -18.707523345947266, "logits/rejected": -17.470352172851562, "logps/chosen": -420.1659240722656, "logps/rejected": -303.13104248046875, "loss": 0.2911, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.598934173583984, "rewards/margins": 2.9707868099212646, "rewards/rejected": 1.6281474828720093, "step": 48130 }, { "epoch": 2.235015553182599, "grad_norm": 28.516355514526367, "learning_rate": 1.659631366358698e-07, "logits/chosen": -17.988243103027344, "logits/rejected": -18.16360855102539, "logps/chosen": -311.4550476074219, "logps/rejected": -226.9312286376953, "loss": 0.603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.585476875305176, "rewards/margins": 0.9624084234237671, "rewards/rejected": 1.6230682134628296, "step": 48140 }, { "epoch": 2.235479827290032, "grad_norm": 29.812971115112305, "learning_rate": 1.6593528018942382e-07, "logits/chosen": -19.83292579650879, "logits/rejected": -18.372600555419922, "logps/chosen": -349.54248046875, "logps/rejected": -204.13314819335938, "loss": 0.3032, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6248550415039062, "rewards/margins": 2.381493330001831, "rewards/rejected": 1.2433619499206543, "step": 48150 }, { "epoch": 2.235944101397465, "grad_norm": 28.056764602661133, "learning_rate": 1.6590742374297787e-07, "logits/chosen": -19.520559310913086, "logits/rejected": -18.428573608398438, "logps/chosen": -355.8650817871094, "logps/rejected": -249.75390625, "loss": 0.3526, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4933464527130127, "rewards/margins": 1.2759835720062256, "rewards/rejected": 2.217362880706787, "step": 
48160 }, { "epoch": 2.2364083755048982, "grad_norm": 4.1512064933776855, "learning_rate": 1.6587956729653185e-07, "logits/chosen": -19.72675323486328, "logits/rejected": -18.471454620361328, "logps/chosen": -373.80615234375, "logps/rejected": -300.38372802734375, "loss": 0.3364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5643062591552734, "rewards/margins": 1.8788127899169922, "rewards/rejected": 1.6854934692382812, "step": 48170 }, { "epoch": 2.236872649612331, "grad_norm": 137.3043212890625, "learning_rate": 1.6585171085008587e-07, "logits/chosen": -19.13745880126953, "logits/rejected": -18.900550842285156, "logps/chosen": -358.5599365234375, "logps/rejected": -291.78753662109375, "loss": 0.5959, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.286165714263916, "rewards/margins": 0.6575471758842468, "rewards/rejected": 2.6286182403564453, "step": 48180 }, { "epoch": 2.2373369237197642, "grad_norm": 78.75800323486328, "learning_rate": 1.658238544036399e-07, "logits/chosen": -19.698923110961914, "logits/rejected": -18.617504119873047, "logps/chosen": -425.7608337402344, "logps/rejected": -326.1921081542969, "loss": 0.2929, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9842000007629395, "rewards/margins": 1.6729938983917236, "rewards/rejected": 2.311206102371216, "step": 48190 }, { "epoch": 2.237801197827197, "grad_norm": 137.4241943359375, "learning_rate": 1.6579599795719392e-07, "logits/chosen": -19.441823959350586, "logits/rejected": -18.53993034362793, "logps/chosen": -582.013916015625, "logps/rejected": -399.1526794433594, "loss": 0.3526, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.601987361907959, "rewards/margins": 2.057605504989624, "rewards/rejected": 2.544381618499756, "step": 48200 }, { "epoch": 2.23826547193463, "grad_norm": 1.8822094202041626, "learning_rate": 1.6576814151074794e-07, "logits/chosen": -18.478784561157227, "logits/rejected": -17.058313369750977, "logps/chosen": 
-392.3755798339844, "logps/rejected": -254.05484008789062, "loss": 0.4856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.28973126411438, "rewards/margins": 1.6931240558624268, "rewards/rejected": 1.5966074466705322, "step": 48210 }, { "epoch": 2.2387297460420634, "grad_norm": 8.620141983032227, "learning_rate": 1.6574028506430195e-07, "logits/chosen": -19.008747100830078, "logits/rejected": -18.303003311157227, "logps/chosen": -333.35614013671875, "logps/rejected": -292.317138671875, "loss": 0.3776, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7640209197998047, "rewards/margins": 1.637840986251831, "rewards/rejected": 2.1261794567108154, "step": 48220 }, { "epoch": 2.239194020149496, "grad_norm": 195.30242919921875, "learning_rate": 1.65712428617856e-07, "logits/chosen": -19.39829444885254, "logits/rejected": -18.405061721801758, "logps/chosen": -396.0462951660156, "logps/rejected": -330.8504333496094, "loss": 0.4982, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6997172832489014, "rewards/margins": 1.584109902381897, "rewards/rejected": 2.115607500076294, "step": 48230 }, { "epoch": 2.2396582942569294, "grad_norm": 59.93669891357422, "learning_rate": 1.6568457217141e-07, "logits/chosen": -18.42801856994629, "logits/rejected": -18.225643157958984, "logps/chosen": -269.0589904785156, "logps/rejected": -208.310302734375, "loss": 0.4011, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.696167469024658, "rewards/margins": 1.1617317199707031, "rewards/rejected": 1.534435749053955, "step": 48240 }, { "epoch": 2.240122568364362, "grad_norm": 83.75255584716797, "learning_rate": 1.65656715724964e-07, "logits/chosen": -18.312084197998047, "logits/rejected": -17.171125411987305, "logps/chosen": -452.27447509765625, "logps/rejected": -330.8699035644531, "loss": 0.4103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4060940742492676, "rewards/margins": 1.5052348375320435, "rewards/rejected": 
1.9008591175079346, "step": 48250 }, { "epoch": 2.2405868424717954, "grad_norm": 209.44725036621094, "learning_rate": 1.6562885927851804e-07, "logits/chosen": -18.09025764465332, "logits/rejected": -18.067270278930664, "logps/chosen": -396.4805603027344, "logps/rejected": -399.5520324707031, "loss": 0.6503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6399223804473877, "rewards/margins": 0.8569595217704773, "rewards/rejected": 2.7829623222351074, "step": 48260 }, { "epoch": 2.241051116579228, "grad_norm": 3.543355703353882, "learning_rate": 1.6560100283207205e-07, "logits/chosen": -18.480484008789062, "logits/rejected": -17.565053939819336, "logps/chosen": -434.9612731933594, "logps/rejected": -344.14068603515625, "loss": 0.5698, "rewards/accuracies": 0.5, "rewards/chosen": 3.3348495960235596, "rewards/margins": 1.4053019285202026, "rewards/rejected": 1.9295473098754883, "step": 48270 }, { "epoch": 2.2415153906866614, "grad_norm": 14.885058403015137, "learning_rate": 1.6557314638562604e-07, "logits/chosen": -17.941478729248047, "logits/rejected": -17.357711791992188, "logps/chosen": -416.9423828125, "logps/rejected": -300.1816101074219, "loss": 0.3734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9856438636779785, "rewards/margins": 1.8839439153671265, "rewards/rejected": 2.1017000675201416, "step": 48280 }, { "epoch": 2.2419796647940946, "grad_norm": 142.04367065429688, "learning_rate": 1.6554528993918008e-07, "logits/chosen": -18.774534225463867, "logits/rejected": -18.449382781982422, "logps/chosen": -371.31353759765625, "logps/rejected": -307.68927001953125, "loss": 0.9075, "rewards/accuracies": 0.5, "rewards/chosen": 2.7072207927703857, "rewards/margins": 0.10405542701482773, "rewards/rejected": 2.6031651496887207, "step": 48290 }, { "epoch": 2.2424439389015274, "grad_norm": 84.6504898071289, "learning_rate": 1.655174334927341e-07, "logits/chosen": -18.655834197998047, "logits/rejected": -18.30595588684082, "logps/chosen": 
-400.1907043457031, "logps/rejected": -370.8265380859375, "loss": 0.39, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.032278060913086, "rewards/margins": 1.2131695747375488, "rewards/rejected": 2.819108247756958, "step": 48300 }, { "epoch": 2.2429082130089606, "grad_norm": 99.70349884033203, "learning_rate": 1.6548957704628814e-07, "logits/chosen": -18.599815368652344, "logits/rejected": -18.753854751586914, "logps/chosen": -386.0869445800781, "logps/rejected": -358.2931213378906, "loss": 1.412, "rewards/accuracies": 0.5, "rewards/chosen": 2.709794044494629, "rewards/margins": -0.4136075973510742, "rewards/rejected": 3.123401641845703, "step": 48310 }, { "epoch": 2.2433724871163934, "grad_norm": 27.73550033569336, "learning_rate": 1.6546172059984212e-07, "logits/chosen": -18.657224655151367, "logits/rejected": -17.846221923828125, "logps/chosen": -431.5341796875, "logps/rejected": -419.6558532714844, "loss": 0.9233, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.963758945465088, "rewards/margins": -0.02640848234295845, "rewards/rejected": 3.9901671409606934, "step": 48320 }, { "epoch": 2.2438367612238266, "grad_norm": 35.25945281982422, "learning_rate": 1.6543386415339617e-07, "logits/chosen": -18.541715621948242, "logits/rejected": -16.588966369628906, "logps/chosen": -466.2093811035156, "logps/rejected": -252.90957641601562, "loss": 0.1995, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8572754859924316, "rewards/margins": 3.2159507274627686, "rewards/rejected": 0.6413246989250183, "step": 48330 }, { "epoch": 2.24430103533126, "grad_norm": 8.498299598693848, "learning_rate": 1.6540600770695018e-07, "logits/chosen": -19.700754165649414, "logits/rejected": -18.785734176635742, "logps/chosen": -363.4366760253906, "logps/rejected": -307.6552734375, "loss": 0.6014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.403933525085449, "rewards/margins": 1.3117125034332275, "rewards/rejected": 3.0922210216522217, 
"step": 48340 }, { "epoch": 2.2447653094386926, "grad_norm": 4.7256340980529785, "learning_rate": 1.653781512605042e-07, "logits/chosen": -17.946565628051758, "logits/rejected": -18.365583419799805, "logps/chosen": -326.86798095703125, "logps/rejected": -392.5791015625, "loss": 1.1986, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4250094890594482, "rewards/margins": -0.09216475486755371, "rewards/rejected": 2.517174243927002, "step": 48350 }, { "epoch": 2.245229583546126, "grad_norm": 138.07643127441406, "learning_rate": 1.653502948140582e-07, "logits/chosen": -17.76523208618164, "logits/rejected": -17.53011703491211, "logps/chosen": -298.1828918457031, "logps/rejected": -333.950927734375, "loss": 0.8, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5347790718078613, "rewards/margins": 0.32550156116485596, "rewards/rejected": 2.209277629852295, "step": 48360 }, { "epoch": 2.2456938576535586, "grad_norm": 78.95170593261719, "learning_rate": 1.6532243836761222e-07, "logits/chosen": -18.353601455688477, "logits/rejected": -18.710880279541016, "logps/chosen": -301.90899658203125, "logps/rejected": -366.87548828125, "loss": 1.7437, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.268273115158081, "rewards/margins": -1.0982939004898071, "rewards/rejected": 3.3665668964385986, "step": 48370 }, { "epoch": 2.2461581317609918, "grad_norm": 77.5827865600586, "learning_rate": 1.6529458192116626e-07, "logits/chosen": -17.9072265625, "logits/rejected": -17.955183029174805, "logps/chosen": -404.45721435546875, "logps/rejected": -381.5264892578125, "loss": 0.8173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6697583198547363, "rewards/margins": 0.8651461601257324, "rewards/rejected": 2.804612398147583, "step": 48380 }, { "epoch": 2.2466224058684245, "grad_norm": 9.173998832702637, "learning_rate": 1.6526672547472028e-07, "logits/chosen": -18.921062469482422, "logits/rejected": -17.876693725585938, "logps/chosen": 
-426.5523986816406, "logps/rejected": -313.8823547363281, "loss": 0.4272, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5998120307922363, "rewards/margins": 2.021212100982666, "rewards/rejected": 1.5786001682281494, "step": 48390 }, { "epoch": 2.2470866799758578, "grad_norm": 6.013619899749756, "learning_rate": 1.6523886902827427e-07, "logits/chosen": -18.857166290283203, "logits/rejected": -18.17901039123535, "logps/chosen": -407.80023193359375, "logps/rejected": -348.44744873046875, "loss": 0.6401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4329445362091064, "rewards/margins": 1.1592044830322266, "rewards/rejected": 2.27374005317688, "step": 48400 }, { "epoch": 2.247550954083291, "grad_norm": 22.784420013427734, "learning_rate": 1.652110125818283e-07, "logits/chosen": -19.208099365234375, "logits/rejected": -18.66232681274414, "logps/chosen": -311.6127624511719, "logps/rejected": -251.71267700195312, "loss": 0.7355, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6301631927490234, "rewards/margins": 0.8082143068313599, "rewards/rejected": 1.8219490051269531, "step": 48410 }, { "epoch": 2.2480152281907237, "grad_norm": 85.07467651367188, "learning_rate": 1.6518315613538232e-07, "logits/chosen": -18.60073471069336, "logits/rejected": -18.074737548828125, "logps/chosen": -452.1201171875, "logps/rejected": -377.4635314941406, "loss": 0.6979, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.583266496658325, "rewards/margins": 0.4584329128265381, "rewards/rejected": 3.124833583831787, "step": 48420 }, { "epoch": 2.248479502298157, "grad_norm": 90.99738311767578, "learning_rate": 1.6515529968893636e-07, "logits/chosen": -19.34781837463379, "logits/rejected": -18.20938491821289, "logps/chosen": -409.8372497558594, "logps/rejected": -277.35211181640625, "loss": 0.3367, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3330090045928955, "rewards/margins": 1.3649437427520752, "rewards/rejected": 
1.9680650234222412, "step": 48430 }, { "epoch": 2.2489437764055897, "grad_norm": 48.252010345458984, "learning_rate": 1.6512744324249035e-07, "logits/chosen": -19.01547622680664, "logits/rejected": -17.86257553100586, "logps/chosen": -373.13946533203125, "logps/rejected": -272.36956787109375, "loss": 0.1877, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.574317455291748, "rewards/margins": 2.174164295196533, "rewards/rejected": 1.4001531600952148, "step": 48440 }, { "epoch": 2.249408050513023, "grad_norm": 14.9766263961792, "learning_rate": 1.6509958679604437e-07, "logits/chosen": -19.80295181274414, "logits/rejected": -18.24631118774414, "logps/chosen": -377.9588317871094, "logps/rejected": -333.9394836425781, "loss": 0.3843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.225769996643066, "rewards/margins": 1.745125412940979, "rewards/rejected": 2.480644702911377, "step": 48450 }, { "epoch": 2.2498723246204557, "grad_norm": 47.184078216552734, "learning_rate": 1.650717303495984e-07, "logits/chosen": -20.079259872436523, "logits/rejected": -19.456205368041992, "logps/chosen": -422.04461669921875, "logps/rejected": -333.2691955566406, "loss": 0.8515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.688096284866333, "rewards/margins": 0.9547693133354187, "rewards/rejected": 2.7333264350891113, "step": 48460 }, { "epoch": 2.250336598727889, "grad_norm": 77.99734497070312, "learning_rate": 1.650438739031524e-07, "logits/chosen": -18.103721618652344, "logits/rejected": -17.820083618164062, "logps/chosen": -374.6256408691406, "logps/rejected": -364.64971923828125, "loss": 0.9761, "rewards/accuracies": 0.5, "rewards/chosen": 2.9807188510894775, "rewards/margins": 0.6735067963600159, "rewards/rejected": 2.3072123527526855, "step": 48470 }, { "epoch": 2.250800872835322, "grad_norm": 0.11802612245082855, "learning_rate": 1.6501601745670644e-07, "logits/chosen": -19.438732147216797, "logits/rejected": -16.897510528564453, 
"logps/chosen": -467.7049255371094, "logps/rejected": -182.79917907714844, "loss": 0.1712, "rewards/accuracies": 1.0, "rewards/chosen": 5.040026664733887, "rewards/margins": 3.710045337677002, "rewards/rejected": 1.3299816846847534, "step": 48480 }, { "epoch": 2.251265146942755, "grad_norm": 128.45172119140625, "learning_rate": 1.6498816101026045e-07, "logits/chosen": -19.391681671142578, "logits/rejected": -19.10555648803711, "logps/chosen": -393.89337158203125, "logps/rejected": -356.0325012207031, "loss": 0.6948, "rewards/accuracies": 0.5, "rewards/chosen": 3.3514294624328613, "rewards/margins": 0.7773544192314148, "rewards/rejected": 2.574075222015381, "step": 48490 }, { "epoch": 2.251729421050188, "grad_norm": 53.27146530151367, "learning_rate": 1.649603045638145e-07, "logits/chosen": -19.541208267211914, "logits/rejected": -18.63340950012207, "logps/chosen": -378.6891174316406, "logps/rejected": -267.25323486328125, "loss": 0.4366, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.30340313911438, "rewards/margins": 1.3631505966186523, "rewards/rejected": 1.9402520656585693, "step": 48500 }, { "epoch": 2.252193695157621, "grad_norm": 77.07122039794922, "learning_rate": 1.6493244811736848e-07, "logits/chosen": -19.640037536621094, "logits/rejected": -19.443790435791016, "logps/chosen": -513.9044799804688, "logps/rejected": -378.6018981933594, "loss": 1.0671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.4738054275512695, "rewards/margins": 0.2621155381202698, "rewards/rejected": 4.2116899490356445, "step": 48510 }, { "epoch": 2.252657969265054, "grad_norm": 120.4674072265625, "learning_rate": 1.649045916709225e-07, "logits/chosen": -18.868408203125, "logits/rejected": -18.266525268554688, "logps/chosen": -422.71319580078125, "logps/rejected": -322.32110595703125, "loss": 0.4097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.148461818695068, "rewards/margins": 1.304602026939392, "rewards/rejected": 2.843860149383545, 
"step": 48520 }, { "epoch": 2.253122243372487, "grad_norm": 38.41083526611328, "learning_rate": 1.6487673522447654e-07, "logits/chosen": -19.719858169555664, "logits/rejected": -18.625978469848633, "logps/chosen": -454.933349609375, "logps/rejected": -306.5509948730469, "loss": 0.2868, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.802414894104004, "rewards/margins": 2.248544692993164, "rewards/rejected": 2.5538699626922607, "step": 48530 }, { "epoch": 2.25358651747992, "grad_norm": 51.05217361450195, "learning_rate": 1.6484887877803055e-07, "logits/chosen": -19.341259002685547, "logits/rejected": -18.295900344848633, "logps/chosen": -504.68011474609375, "logps/rejected": -360.92242431640625, "loss": 0.4837, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.82021427154541, "rewards/margins": 1.7805076837539673, "rewards/rejected": 3.0397067070007324, "step": 48540 }, { "epoch": 2.2540507915873533, "grad_norm": 16.196855545043945, "learning_rate": 1.6482102233158454e-07, "logits/chosen": -18.732044219970703, "logits/rejected": -18.15346908569336, "logps/chosen": -421.0352478027344, "logps/rejected": -301.23455810546875, "loss": 0.7228, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4917125701904297, "rewards/margins": 0.7820780277252197, "rewards/rejected": 2.709634304046631, "step": 48550 }, { "epoch": 2.254515065694786, "grad_norm": 117.30980682373047, "learning_rate": 1.6479316588513858e-07, "logits/chosen": -18.290367126464844, "logits/rejected": -18.497486114501953, "logps/chosen": -382.8450012207031, "logps/rejected": -428.59368896484375, "loss": 0.8015, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5566444396972656, "rewards/margins": 0.6352149248123169, "rewards/rejected": 2.921429395675659, "step": 48560 }, { "epoch": 2.2549793398022193, "grad_norm": 44.07389831542969, "learning_rate": 1.647653094386926e-07, "logits/chosen": -18.891427993774414, "logits/rejected": -18.401020050048828, "logps/chosen": 
-391.632080078125, "logps/rejected": -344.475830078125, "loss": 0.7345, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.741426467895508, "rewards/margins": 0.8823347091674805, "rewards/rejected": 2.8590917587280273, "step": 48570 }, { "epoch": 2.255443613909652, "grad_norm": 267.94091796875, "learning_rate": 1.6473745299224663e-07, "logits/chosen": -19.235214233398438, "logits/rejected": -17.775869369506836, "logps/chosen": -386.4110107421875, "logps/rejected": -288.04388427734375, "loss": 0.702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9021191596984863, "rewards/margins": 0.8332866430282593, "rewards/rejected": 3.0688321590423584, "step": 48580 }, { "epoch": 2.2559078880170853, "grad_norm": 144.6549835205078, "learning_rate": 1.6470959654580062e-07, "logits/chosen": -18.26107406616211, "logits/rejected": -17.71600341796875, "logps/chosen": -307.24639892578125, "logps/rejected": -261.9051818847656, "loss": 0.7643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3531627655029297, "rewards/margins": 0.6355351805686951, "rewards/rejected": 1.717627763748169, "step": 48590 }, { "epoch": 2.256372162124518, "grad_norm": 27.327224731445312, "learning_rate": 1.6468174009935464e-07, "logits/chosen": -18.519638061523438, "logits/rejected": -17.644100189208984, "logps/chosen": -393.6795654296875, "logps/rejected": -300.6549377441406, "loss": 0.4172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.234814167022705, "rewards/margins": 1.8103145360946655, "rewards/rejected": 1.424499750137329, "step": 48600 }, { "epoch": 2.2568364362319513, "grad_norm": 0.4969560503959656, "learning_rate": 1.6465388365290868e-07, "logits/chosen": -18.00343894958496, "logits/rejected": -17.676925659179688, "logps/chosen": -313.37603759765625, "logps/rejected": -382.7843322753906, "loss": 1.6366, "rewards/accuracies": 0.5, "rewards/chosen": 3.1789891719818115, "rewards/margins": 0.04376616328954697, "rewards/rejected": 3.135222911834717, 
"step": 48610 }, { "epoch": 2.2573007103393845, "grad_norm": 24.928834915161133, "learning_rate": 1.646260272064627e-07, "logits/chosen": -18.6438045501709, "logits/rejected": -18.12594985961914, "logps/chosen": -400.664794921875, "logps/rejected": -321.510986328125, "loss": 0.9345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7984542846679688, "rewards/margins": 0.4387974739074707, "rewards/rejected": 3.359656810760498, "step": 48620 }, { "epoch": 2.2577649844468173, "grad_norm": 105.84502410888672, "learning_rate": 1.645981707600167e-07, "logits/chosen": -18.637096405029297, "logits/rejected": -18.60091781616211, "logps/chosen": -487.543212890625, "logps/rejected": -395.7742004394531, "loss": 0.9257, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.981674909591675, "rewards/margins": 0.3746131956577301, "rewards/rejected": 3.6070618629455566, "step": 48630 }, { "epoch": 2.2582292585542505, "grad_norm": 28.1889591217041, "learning_rate": 1.6457031431357072e-07, "logits/chosen": -18.627647399902344, "logits/rejected": -18.541873931884766, "logps/chosen": -370.74237060546875, "logps/rejected": -350.23748779296875, "loss": 0.9271, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.56693959236145, "rewards/margins": 0.02064533159136772, "rewards/rejected": 3.5462944507598877, "step": 48640 }, { "epoch": 2.2586935326616833, "grad_norm": 21.6577205657959, "learning_rate": 1.6454245786712476e-07, "logits/chosen": -17.520381927490234, "logits/rejected": -17.106908798217773, "logps/chosen": -385.0441589355469, "logps/rejected": -315.460693359375, "loss": 0.679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.822833299636841, "rewards/margins": 1.0793894529342651, "rewards/rejected": 2.743443727493286, "step": 48650 }, { "epoch": 2.2591578067691165, "grad_norm": 1.8666248321533203, "learning_rate": 1.6451460142067875e-07, "logits/chosen": -20.32731819152832, "logits/rejected": -19.08757972717285, "logps/chosen": 
-373.20050048828125, "logps/rejected": -313.3849182128906, "loss": 0.4869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.66033673286438, "rewards/margins": 1.6945546865463257, "rewards/rejected": 1.9657821655273438, "step": 48660 }, { "epoch": 2.2596220808765497, "grad_norm": 19.10965919494629, "learning_rate": 1.6448674497423277e-07, "logits/chosen": -19.724124908447266, "logits/rejected": -17.627559661865234, "logps/chosen": -380.1712341308594, "logps/rejected": -237.03158569335938, "loss": 0.4153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.344943046569824, "rewards/margins": 2.4900338649749756, "rewards/rejected": 1.8549093008041382, "step": 48670 }, { "epoch": 2.2600863549839825, "grad_norm": 1.5595277547836304, "learning_rate": 1.644588885277868e-07, "logits/chosen": -17.69746208190918, "logits/rejected": -17.226837158203125, "logps/chosen": -310.56243896484375, "logps/rejected": -325.32232666015625, "loss": 0.8245, "rewards/accuracies": 0.5, "rewards/chosen": 2.96067476272583, "rewards/margins": 0.6696981191635132, "rewards/rejected": 2.2909770011901855, "step": 48680 }, { "epoch": 2.2605506290914157, "grad_norm": 96.57598876953125, "learning_rate": 1.6443103208134082e-07, "logits/chosen": -18.644372940063477, "logits/rejected": -18.706607818603516, "logps/chosen": -454.9495544433594, "logps/rejected": -400.8262634277344, "loss": 0.7474, "rewards/accuracies": 0.5, "rewards/chosen": 3.2956416606903076, "rewards/margins": 0.5997424125671387, "rewards/rejected": 2.69589900970459, "step": 48690 }, { "epoch": 2.2610149031988485, "grad_norm": 116.6360092163086, "learning_rate": 1.644031756348948e-07, "logits/chosen": -19.154983520507812, "logits/rejected": -18.810354232788086, "logps/chosen": -322.41925048828125, "logps/rejected": -288.14666748046875, "loss": 0.3278, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5990118980407715, "rewards/margins": 1.4413974285125732, "rewards/rejected": 2.1576147079467773, "step": 
48700 }, { "epoch": 2.2614791773062817, "grad_norm": 91.39971923828125, "learning_rate": 1.6437531918844885e-07, "logits/chosen": -18.75885772705078, "logits/rejected": -17.5874080657959, "logps/chosen": -428.4752502441406, "logps/rejected": -373.874755859375, "loss": 0.6965, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.344655990600586, "rewards/margins": 0.7784962058067322, "rewards/rejected": 3.56615948677063, "step": 48710 }, { "epoch": 2.261943451413715, "grad_norm": 83.1126480102539, "learning_rate": 1.6434746274200286e-07, "logits/chosen": -18.68179702758789, "logits/rejected": -17.704530715942383, "logps/chosen": -420.6785583496094, "logps/rejected": -291.24566650390625, "loss": 0.5302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7649447917938232, "rewards/margins": 1.0401197671890259, "rewards/rejected": 2.724825382232666, "step": 48720 }, { "epoch": 2.2624077255211477, "grad_norm": 51.43518829345703, "learning_rate": 1.643196062955569e-07, "logits/chosen": -18.847410202026367, "logits/rejected": -18.385845184326172, "logps/chosen": -612.6915283203125, "logps/rejected": -464.3096618652344, "loss": 0.4905, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.650788307189941, "rewards/margins": 1.0192795991897583, "rewards/rejected": 3.6315090656280518, "step": 48730 }, { "epoch": 2.262871999628581, "grad_norm": 172.30947875976562, "learning_rate": 1.642917498491109e-07, "logits/chosen": -19.413244247436523, "logits/rejected": -19.409008026123047, "logps/chosen": -461.4502868652344, "logps/rejected": -515.5863037109375, "loss": 1.1727, "rewards/accuracies": 0.5, "rewards/chosen": 3.9240219593048096, "rewards/margins": -0.19327135384082794, "rewards/rejected": 4.117293357849121, "step": 48740 }, { "epoch": 2.2633362737360136, "grad_norm": 83.02571868896484, "learning_rate": 1.6426389340266493e-07, "logits/chosen": -18.5694522857666, "logits/rejected": -18.159391403198242, "logps/chosen": -365.2480773925781, 
"logps/rejected": -264.10272216796875, "loss": 0.514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.972957968711853, "rewards/margins": 0.5927032828330994, "rewards/rejected": 1.3802545070648193, "step": 48750 }, { "epoch": 2.263800547843447, "grad_norm": 56.00358963012695, "learning_rate": 1.6423603695621895e-07, "logits/chosen": -18.949100494384766, "logits/rejected": -17.83390998840332, "logps/chosen": -294.4528503417969, "logps/rejected": -169.55682373046875, "loss": 0.4913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2858829498291016, "rewards/margins": 1.506654143333435, "rewards/rejected": 0.7792288661003113, "step": 48760 }, { "epoch": 2.2642648219508796, "grad_norm": 63.696632385253906, "learning_rate": 1.6420818050977296e-07, "logits/chosen": -19.54880142211914, "logits/rejected": -19.476537704467773, "logps/chosen": -444.59942626953125, "logps/rejected": -397.84906005859375, "loss": 0.4938, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.424704551696777, "rewards/margins": 1.0001753568649292, "rewards/rejected": 3.4245293140411377, "step": 48770 }, { "epoch": 2.264729096058313, "grad_norm": 287.9057312011719, "learning_rate": 1.6418032406332698e-07, "logits/chosen": -19.136791229248047, "logits/rejected": -18.289323806762695, "logps/chosen": -332.26580810546875, "logps/rejected": -311.79998779296875, "loss": 0.8472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.927753210067749, "rewards/margins": -0.028779661282896996, "rewards/rejected": 1.9565327167510986, "step": 48780 }, { "epoch": 2.265193370165746, "grad_norm": 150.08570861816406, "learning_rate": 1.64152467616881e-07, "logits/chosen": -18.49533462524414, "logits/rejected": -17.42342185974121, "logps/chosen": -415.7464294433594, "logps/rejected": -277.7569274902344, "loss": 0.6128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.292095184326172, "rewards/margins": 1.757584571838379, "rewards/rejected": 2.534510612487793, 
"step": 48790 }, { "epoch": 2.265657644273179, "grad_norm": 31.600360870361328, "learning_rate": 1.6412461117043503e-07, "logits/chosen": -18.562786102294922, "logits/rejected": -17.446754455566406, "logps/chosen": -336.9914245605469, "logps/rejected": -291.49163818359375, "loss": 0.8715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.673126697540283, "rewards/margins": 0.9833072423934937, "rewards/rejected": 1.689819574356079, "step": 48800 }, { "epoch": 2.266121918380612, "grad_norm": 29.136871337890625, "learning_rate": 1.6409675472398905e-07, "logits/chosen": -19.705310821533203, "logits/rejected": -19.40933609008789, "logps/chosen": -399.32147216796875, "logps/rejected": -319.85504150390625, "loss": 0.6918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8534843921661377, "rewards/margins": 1.4766945838928223, "rewards/rejected": 2.3767895698547363, "step": 48810 }, { "epoch": 2.266586192488045, "grad_norm": 107.84008026123047, "learning_rate": 1.6406889827754304e-07, "logits/chosen": -18.386442184448242, "logits/rejected": -17.391613006591797, "logps/chosen": -479.65191650390625, "logps/rejected": -291.26702880859375, "loss": 0.5458, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.089108943939209, "rewards/margins": 1.8379042148590088, "rewards/rejected": 2.2512047290802, "step": 48820 }, { "epoch": 2.267050466595478, "grad_norm": 95.69398498535156, "learning_rate": 1.6404104183109708e-07, "logits/chosen": -19.038799285888672, "logits/rejected": -17.81801986694336, "logps/chosen": -378.81951904296875, "logps/rejected": -296.39190673828125, "loss": 0.6917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8048222064971924, "rewards/margins": 1.5518848896026611, "rewards/rejected": 2.252936840057373, "step": 48830 }, { "epoch": 2.267514740702911, "grad_norm": 106.30868530273438, "learning_rate": 1.640131853846511e-07, "logits/chosen": -19.310123443603516, "logits/rejected": -19.41769027709961, 
"logps/chosen": -394.7460021972656, "logps/rejected": -394.0754089355469, "loss": 0.5949, "rewards/accuracies": 0.5, "rewards/chosen": 4.31564998626709, "rewards/margins": 0.9491463899612427, "rewards/rejected": 3.3665034770965576, "step": 48840 }, { "epoch": 2.267979014810344, "grad_norm": 98.37886810302734, "learning_rate": 1.6398532893820513e-07, "logits/chosen": -19.865680694580078, "logits/rejected": -19.056079864501953, "logps/chosen": -385.04510498046875, "logps/rejected": -368.0614318847656, "loss": 0.5477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.741658926010132, "rewards/margins": 0.9555648565292358, "rewards/rejected": 1.786094069480896, "step": 48850 }, { "epoch": 2.2684432889177772, "grad_norm": 1.9782272577285767, "learning_rate": 1.6395747249175912e-07, "logits/chosen": -18.563268661499023, "logits/rejected": -17.71406364440918, "logps/chosen": -448.81475830078125, "logps/rejected": -299.58642578125, "loss": 0.9475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.863870620727539, "rewards/margins": 1.3187386989593506, "rewards/rejected": 2.5451319217681885, "step": 48860 }, { "epoch": 2.26890756302521, "grad_norm": 163.4766082763672, "learning_rate": 1.6392961604531314e-07, "logits/chosen": -19.963109970092773, "logits/rejected": -19.089771270751953, "logps/chosen": -290.67352294921875, "logps/rejected": -295.1371765136719, "loss": 0.8148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9734292030334473, "rewards/margins": 0.6112080812454224, "rewards/rejected": 2.3622207641601562, "step": 48870 }, { "epoch": 2.2693718371326432, "grad_norm": 145.08143615722656, "learning_rate": 1.6390175959886718e-07, "logits/chosen": -18.6130313873291, "logits/rejected": -17.864458084106445, "logps/chosen": -399.3536071777344, "logps/rejected": -363.5687561035156, "loss": 0.4449, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.417566299438477, "rewards/margins": 2.1477389335632324, "rewards/rejected": 
2.269827365875244, "step": 48880 }, { "epoch": 2.269836111240076, "grad_norm": 0.08383393287658691, "learning_rate": 1.6387390315242116e-07, "logits/chosen": -18.623062133789062, "logits/rejected": -17.649930953979492, "logps/chosen": -427.1300354003906, "logps/rejected": -348.255126953125, "loss": 0.425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9008915424346924, "rewards/margins": 1.903673529624939, "rewards/rejected": 1.9972177743911743, "step": 48890 }, { "epoch": 2.270300385347509, "grad_norm": 33.380096435546875, "learning_rate": 1.638460467059752e-07, "logits/chosen": -18.06865119934082, "logits/rejected": -17.228206634521484, "logps/chosen": -436.61865234375, "logps/rejected": -309.6601867675781, "loss": 0.9727, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.376777172088623, "rewards/margins": 1.0884867906570435, "rewards/rejected": 2.2882907390594482, "step": 48900 }, { "epoch": 2.270764659454942, "grad_norm": 2.332270383834839, "learning_rate": 1.6381819025952922e-07, "logits/chosen": -18.900182723999023, "logits/rejected": -17.73269271850586, "logps/chosen": -341.1622619628906, "logps/rejected": -277.112548828125, "loss": 0.7375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.320876359939575, "rewards/margins": 1.2393139600753784, "rewards/rejected": 2.0815625190734863, "step": 48910 }, { "epoch": 2.271228933562375, "grad_norm": 191.23117065429688, "learning_rate": 1.6379033381308326e-07, "logits/chosen": -18.782093048095703, "logits/rejected": -18.269699096679688, "logps/chosen": -282.569091796875, "logps/rejected": -240.8099365234375, "loss": 0.852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.638423204421997, "rewards/margins": 0.625799834728241, "rewards/rejected": 2.0126230716705322, "step": 48920 }, { "epoch": 2.2716932076698084, "grad_norm": 50.93635940551758, "learning_rate": 1.6376247736663725e-07, "logits/chosen": -19.930583953857422, "logits/rejected": -18.924043655395508, 
"logps/chosen": -392.78179931640625, "logps/rejected": -258.5283203125, "loss": 0.674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.204019546508789, "rewards/margins": 1.0856068134307861, "rewards/rejected": 2.118412494659424, "step": 48930 }, { "epoch": 2.272157481777241, "grad_norm": 74.04155731201172, "learning_rate": 1.6373462092019126e-07, "logits/chosen": -18.535953521728516, "logits/rejected": -17.916284561157227, "logps/chosen": -392.20416259765625, "logps/rejected": -313.83868408203125, "loss": 0.5067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6738224029541016, "rewards/margins": 0.7609565854072571, "rewards/rejected": 1.9128658771514893, "step": 48940 }, { "epoch": 2.2726217558846744, "grad_norm": 68.46028137207031, "learning_rate": 1.637067644737453e-07, "logits/chosen": -19.05971336364746, "logits/rejected": -18.907188415527344, "logps/chosen": -367.1801452636719, "logps/rejected": -278.9743347167969, "loss": 0.621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.912856101989746, "rewards/margins": 0.5636226534843445, "rewards/rejected": 2.3492331504821777, "step": 48950 }, { "epoch": 2.273086029992107, "grad_norm": 232.4998016357422, "learning_rate": 1.6367890802729932e-07, "logits/chosen": -19.27101707458496, "logits/rejected": -18.40444564819336, "logps/chosen": -394.39129638671875, "logps/rejected": -346.09100341796875, "loss": 0.4905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.256119966506958, "rewards/margins": 0.866334080696106, "rewards/rejected": 2.3897862434387207, "step": 48960 }, { "epoch": 2.2735503040995404, "grad_norm": 78.11473846435547, "learning_rate": 1.636510515808533e-07, "logits/chosen": -18.953466415405273, "logits/rejected": -18.39809226989746, "logps/chosen": -457.84716796875, "logps/rejected": -369.35382080078125, "loss": 0.7011, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7080764770507812, "rewards/margins": 1.022477149963379, 
"rewards/rejected": 2.6855995655059814, "step": 48970 }, { "epoch": 2.274014578206973, "grad_norm": 211.84384155273438, "learning_rate": 1.6362319513440735e-07, "logits/chosen": -18.574668884277344, "logits/rejected": -18.79067611694336, "logps/chosen": -314.66278076171875, "logps/rejected": -298.01226806640625, "loss": 0.9234, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8505349159240723, "rewards/margins": 0.13329385221004486, "rewards/rejected": 2.7172412872314453, "step": 48980 }, { "epoch": 2.2744788523144064, "grad_norm": 193.34039306640625, "learning_rate": 1.6359533868796136e-07, "logits/chosen": -20.052753448486328, "logits/rejected": -18.195140838623047, "logps/chosen": -297.51171875, "logps/rejected": -234.5531005859375, "loss": 0.4401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.657559633255005, "rewards/margins": 1.2943065166473389, "rewards/rejected": 1.3632527589797974, "step": 48990 }, { "epoch": 2.2749431264218396, "grad_norm": 178.0741729736328, "learning_rate": 1.635674822415154e-07, "logits/chosen": -18.73385238647461, "logits/rejected": -18.51918601989746, "logps/chosen": -346.0289001464844, "logps/rejected": -300.59600830078125, "loss": 0.6093, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.934816360473633, "rewards/margins": 0.8537648320198059, "rewards/rejected": 2.0810513496398926, "step": 49000 }, { "epoch": 2.2754074005292724, "grad_norm": 86.76314544677734, "learning_rate": 1.635396257950694e-07, "logits/chosen": -19.144977569580078, "logits/rejected": -18.37784194946289, "logps/chosen": -446.95001220703125, "logps/rejected": -292.3216552734375, "loss": 0.6264, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.544830322265625, "rewards/margins": 1.927825927734375, "rewards/rejected": 2.617004871368408, "step": 49010 }, { "epoch": 2.2758716746367056, "grad_norm": 108.11788940429688, "learning_rate": 1.635117693486234e-07, "logits/chosen": -19.20926284790039, "logits/rejected": 
-18.297321319580078, "logps/chosen": -413.4103088378906, "logps/rejected": -291.0352478027344, "loss": 0.6307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.619849681854248, "rewards/margins": 1.418526291847229, "rewards/rejected": 2.2013237476348877, "step": 49020 }, { "epoch": 2.2763359487441384, "grad_norm": 67.1060791015625, "learning_rate": 1.6348391290217745e-07, "logits/chosen": -19.34516716003418, "logits/rejected": -19.133869171142578, "logps/chosen": -343.6422424316406, "logps/rejected": -290.0654296875, "loss": 0.7719, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.999838352203369, "rewards/margins": 0.8926283717155457, "rewards/rejected": 2.1072099208831787, "step": 49030 }, { "epoch": 2.2768002228515716, "grad_norm": 130.65206909179688, "learning_rate": 1.6345605645573146e-07, "logits/chosen": -17.62017250061035, "logits/rejected": -17.38024139404297, "logps/chosen": -302.27447509765625, "logps/rejected": -271.68560791015625, "loss": 0.8478, "rewards/accuracies": 0.5, "rewards/chosen": 1.780347466468811, "rewards/margins": 0.47323593497276306, "rewards/rejected": 1.3071117401123047, "step": 49040 }, { "epoch": 2.2772644969590043, "grad_norm": 86.7197265625, "learning_rate": 1.6342820000928548e-07, "logits/chosen": -18.928190231323242, "logits/rejected": -17.313087463378906, "logps/chosen": -460.8766174316406, "logps/rejected": -284.7972412109375, "loss": 0.2743, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.909811496734619, "rewards/margins": 2.078367233276367, "rewards/rejected": 1.831444501876831, "step": 49050 }, { "epoch": 2.2777287710664376, "grad_norm": 26.928329467773438, "learning_rate": 1.634003435628395e-07, "logits/chosen": -18.85284423828125, "logits/rejected": -18.498783111572266, "logps/chosen": -341.2564392089844, "logps/rejected": -308.3751525878906, "loss": 0.3409, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2513070106506348, "rewards/margins": 1.2256765365600586, 
"rewards/rejected": 2.0256307125091553, "step": 49060 }, { "epoch": 2.278193045173871, "grad_norm": 23.478363037109375, "learning_rate": 1.6337248711639353e-07, "logits/chosen": -19.35442352294922, "logits/rejected": -17.917858123779297, "logps/chosen": -430.45660400390625, "logps/rejected": -337.79266357421875, "loss": 0.8215, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6076388359069824, "rewards/margins": 0.5638179779052734, "rewards/rejected": 3.04382061958313, "step": 49070 }, { "epoch": 2.2786573192813036, "grad_norm": 23.062349319458008, "learning_rate": 1.6334463066994752e-07, "logits/chosen": -19.674646377563477, "logits/rejected": -18.603199005126953, "logps/chosen": -402.94970703125, "logps/rejected": -374.3115539550781, "loss": 0.6689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9787039756774902, "rewards/margins": 1.483199954032898, "rewards/rejected": 2.495504379272461, "step": 49080 }, { "epoch": 2.2791215933887368, "grad_norm": 48.84210205078125, "learning_rate": 1.6331677422350153e-07, "logits/chosen": -19.230459213256836, "logits/rejected": -18.179609298706055, "logps/chosen": -350.7208251953125, "logps/rejected": -258.08233642578125, "loss": 0.5868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.656712770462036, "rewards/margins": 1.3424001932144165, "rewards/rejected": 2.31431245803833, "step": 49090 }, { "epoch": 2.27958586749617, "grad_norm": 146.8787384033203, "learning_rate": 1.6328891777705558e-07, "logits/chosen": -19.147518157958984, "logits/rejected": -18.787567138671875, "logps/chosen": -276.34051513671875, "logps/rejected": -268.96185302734375, "loss": 0.7875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.076603412628174, "rewards/margins": 0.5274235606193542, "rewards/rejected": 1.5491797924041748, "step": 49100 }, { "epoch": 2.2800501416036028, "grad_norm": null, "learning_rate": 1.6326384697525417e-07, "logits/chosen": -18.536731719970703, "logits/rejected": 
-18.778663635253906, "logps/chosen": -410.37091064453125, "logps/rejected": -369.38677978515625, "loss": 0.6365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.915635585784912, "rewards/margins": 0.8009145855903625, "rewards/rejected": 3.1147212982177734, "step": 49110 }, { "epoch": 2.280514415711036, "grad_norm": 60.15372848510742, "learning_rate": 1.632359905288082e-07, "logits/chosen": -19.00594711303711, "logits/rejected": -16.867782592773438, "logps/chosen": -463.97662353515625, "logps/rejected": -218.14431762695312, "loss": 0.1695, "rewards/accuracies": 1.0, "rewards/chosen": 4.206004619598389, "rewards/margins": 2.673637628555298, "rewards/rejected": 1.532367467880249, "step": 49120 }, { "epoch": 2.2809786898184687, "grad_norm": 67.78812408447266, "learning_rate": 1.632081340823622e-07, "logits/chosen": -18.615535736083984, "logits/rejected": -17.420568466186523, "logps/chosen": -469.32342529296875, "logps/rejected": -332.99554443359375, "loss": 0.5179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.129096984863281, "rewards/margins": 1.3869794607162476, "rewards/rejected": 2.742117166519165, "step": 49130 }, { "epoch": 2.281442963925902, "grad_norm": 70.8221435546875, "learning_rate": 1.6318027763591624e-07, "logits/chosen": -20.086809158325195, "logits/rejected": -18.121055603027344, "logps/chosen": -322.269287109375, "logps/rejected": -279.60369873046875, "loss": 0.5293, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7287232875823975, "rewards/margins": 0.9803308248519897, "rewards/rejected": 1.7483923435211182, "step": 49140 }, { "epoch": 2.2819072380333347, "grad_norm": 0.7354686260223389, "learning_rate": 1.6315242118947025e-07, "logits/chosen": -19.699392318725586, "logits/rejected": -19.66592788696289, "logps/chosen": -432.48370361328125, "logps/rejected": -430.7501525878906, "loss": 0.7797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9280953407287598, "rewards/margins": 1.0002901554107666, 
"rewards/rejected": 2.927804946899414, "step": 49150 }, { "epoch": 2.282371512140768, "grad_norm": 314.5116882324219, "learning_rate": 1.631245647430243e-07, "logits/chosen": -18.570634841918945, "logits/rejected": -19.011844635009766, "logps/chosen": -305.3511657714844, "logps/rejected": -339.31707763671875, "loss": 0.8698, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.119028329849243, "rewards/margins": 0.6885952949523926, "rewards/rejected": 2.4304327964782715, "step": 49160 }, { "epoch": 2.282835786248201, "grad_norm": 13.75861644744873, "learning_rate": 1.6309670829657828e-07, "logits/chosen": -19.20992660522461, "logits/rejected": -18.244327545166016, "logps/chosen": -413.65277099609375, "logps/rejected": -321.1776428222656, "loss": 0.4439, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.692505359649658, "rewards/margins": 1.657573938369751, "rewards/rejected": 2.0349316596984863, "step": 49170 }, { "epoch": 2.283300060355634, "grad_norm": 282.8756103515625, "learning_rate": 1.630688518501323e-07, "logits/chosen": -18.467529296875, "logits/rejected": -18.263046264648438, "logps/chosen": -345.429443359375, "logps/rejected": -312.8924255371094, "loss": 0.7998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1338391304016113, "rewards/margins": 0.6433666348457336, "rewards/rejected": 2.4904725551605225, "step": 49180 }, { "epoch": 2.283764334463067, "grad_norm": 31.999706268310547, "learning_rate": 1.6304099540368634e-07, "logits/chosen": -18.53671646118164, "logits/rejected": -18.529041290283203, "logps/chosen": -363.6177978515625, "logps/rejected": -383.8226623535156, "loss": 0.588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3228232860565186, "rewards/margins": 0.6402347087860107, "rewards/rejected": 2.6825883388519287, "step": 49190 }, { "epoch": 2.2842286085705, "grad_norm": 88.99905395507812, "learning_rate": 1.6301313895724035e-07, "logits/chosen": -19.41921043395996, "logits/rejected": 
-18.463069915771484, "logps/chosen": -394.43768310546875, "logps/rejected": -259.41485595703125, "loss": 0.3909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.321813583374023, "rewards/margins": 2.0666491985321045, "rewards/rejected": 2.255164623260498, "step": 49200 }, { "epoch": 2.284692882677933, "grad_norm": 42.757240295410156, "learning_rate": 1.6298528251079434e-07, "logits/chosen": -19.254600524902344, "logits/rejected": -18.888835906982422, "logps/chosen": -388.186767578125, "logps/rejected": -327.3891296386719, "loss": 0.4565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.79859733581543, "rewards/margins": 1.3565328121185303, "rewards/rejected": 3.4420642852783203, "step": 49210 }, { "epoch": 2.285157156785366, "grad_norm": 33.36330795288086, "learning_rate": 1.6295742606434838e-07, "logits/chosen": -18.95472526550293, "logits/rejected": -17.871349334716797, "logps/chosen": -478.21319580078125, "logps/rejected": -294.91864013671875, "loss": 0.2515, "rewards/accuracies": 1.0, "rewards/chosen": 3.8687281608581543, "rewards/margins": 2.0654444694519043, "rewards/rejected": 1.80328369140625, "step": 49220 }, { "epoch": 2.285621430892799, "grad_norm": 65.7088394165039, "learning_rate": 1.629295696179024e-07, "logits/chosen": -19.250749588012695, "logits/rejected": -19.363811492919922, "logps/chosen": -342.5984191894531, "logps/rejected": -286.06707763671875, "loss": 0.4709, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0484797954559326, "rewards/margins": 1.188857913017273, "rewards/rejected": 1.8596220016479492, "step": 49230 }, { "epoch": 2.2860857050002323, "grad_norm": 5.078979969024658, "learning_rate": 1.6290171317145644e-07, "logits/chosen": -19.020227432250977, "logits/rejected": -19.065847396850586, "logps/chosen": -435.7294921875, "logps/rejected": -360.86724853515625, "loss": 1.0588, "rewards/accuracies": 0.5, "rewards/chosen": 3.444056749343872, "rewards/margins": 0.46478691697120667, "rewards/rejected": 
2.9792699813842773, "step": 49240 }, { "epoch": 2.286549979107665, "grad_norm": 190.04978942871094, "learning_rate": 1.6287385672501043e-07, "logits/chosen": -18.434539794921875, "logits/rejected": -17.837993621826172, "logps/chosen": -441.1524963378906, "logps/rejected": -609.83935546875, "loss": 1.2573, "rewards/accuracies": 0.5, "rewards/chosen": 3.6238582134246826, "rewards/margins": 0.331177294254303, "rewards/rejected": 3.2926807403564453, "step": 49250 }, { "epoch": 2.2870142532150983, "grad_norm": 55.00882339477539, "learning_rate": 1.6284600027856447e-07, "logits/chosen": -19.06004524230957, "logits/rejected": -17.810949325561523, "logps/chosen": -382.59515380859375, "logps/rejected": -282.679931640625, "loss": 0.3576, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.701200008392334, "rewards/margins": 1.5022289752960205, "rewards/rejected": 2.1989710330963135, "step": 49260 }, { "epoch": 2.287478527322531, "grad_norm": 109.70894622802734, "learning_rate": 1.6281814383211848e-07, "logits/chosen": -18.970500946044922, "logits/rejected": -19.054889678955078, "logps/chosen": -347.65472412109375, "logps/rejected": -389.4097595214844, "loss": 0.9047, "rewards/accuracies": 0.5, "rewards/chosen": 3.158388614654541, "rewards/margins": -0.0631907731294632, "rewards/rejected": 3.221579074859619, "step": 49270 }, { "epoch": 2.2879428014299643, "grad_norm": 5.06525993347168, "learning_rate": 1.627902873856725e-07, "logits/chosen": -19.306245803833008, "logits/rejected": -18.149513244628906, "logps/chosen": -532.8405151367188, "logps/rejected": -375.62725830078125, "loss": 0.4557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.829448223114014, "rewards/margins": 1.7112197875976562, "rewards/rejected": 3.1182284355163574, "step": 49280 }, { "epoch": 2.288407075537397, "grad_norm": 0.36635205149650574, "learning_rate": 1.627624309392265e-07, "logits/chosen": -19.987995147705078, "logits/rejected": -18.563508987426758, "logps/chosen": 
-365.017578125, "logps/rejected": -249.05972290039062, "loss": 0.27, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.68511700630188, "rewards/margins": 2.301361560821533, "rewards/rejected": 1.3837552070617676, "step": 49290 }, { "epoch": 2.2888713496448303, "grad_norm": 18.107818603515625, "learning_rate": 1.6273457449278053e-07, "logits/chosen": -18.019376754760742, "logits/rejected": -17.10061264038086, "logps/chosen": -430.47491455078125, "logps/rejected": -304.59124755859375, "loss": 0.671, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.133476734161377, "rewards/margins": 1.6503413915634155, "rewards/rejected": 1.4831349849700928, "step": 49300 }, { "epoch": 2.2893356237522635, "grad_norm": 16.82773208618164, "learning_rate": 1.6270671804633457e-07, "logits/chosen": -18.234771728515625, "logits/rejected": -17.74142074584961, "logps/chosen": -304.17340087890625, "logps/rejected": -259.227783203125, "loss": 1.0158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1878743171691895, "rewards/margins": 0.529249370098114, "rewards/rejected": 1.6586250066757202, "step": 49310 }, { "epoch": 2.2897998978596963, "grad_norm": 36.57071304321289, "learning_rate": 1.6267886159988855e-07, "logits/chosen": -18.913501739501953, "logits/rejected": -19.27694320678711, "logps/chosen": -310.5836181640625, "logps/rejected": -316.6380920410156, "loss": 0.8947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1917808055877686, "rewards/margins": 0.03906048461794853, "rewards/rejected": 2.1527202129364014, "step": 49320 }, { "epoch": 2.2902641719671295, "grad_norm": 60.0829963684082, "learning_rate": 1.6265100515344257e-07, "logits/chosen": -18.556358337402344, "logits/rejected": -17.563610076904297, "logps/chosen": -377.2724609375, "logps/rejected": -286.7655334472656, "loss": 0.4966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0048530101776123, "rewards/margins": 1.005602478981018, "rewards/rejected": 
1.9992506504058838, "step": 49330 }, { "epoch": 2.2907284460745623, "grad_norm": 184.08663940429688, "learning_rate": 1.626231487069966e-07, "logits/chosen": -19.551776885986328, "logits/rejected": -18.160907745361328, "logps/chosen": -443.7151794433594, "logps/rejected": -336.89239501953125, "loss": 0.7803, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.537179946899414, "rewards/margins": 0.9056426882743835, "rewards/rejected": 2.6315367221832275, "step": 49340 }, { "epoch": 2.2911927201819955, "grad_norm": 5.215745449066162, "learning_rate": 1.6259529226055062e-07, "logits/chosen": -18.473716735839844, "logits/rejected": -18.456708908081055, "logps/chosen": -440.43927001953125, "logps/rejected": -417.2945861816406, "loss": 0.8171, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6881747245788574, "rewards/margins": 0.7285931706428528, "rewards/rejected": 2.9595813751220703, "step": 49350 }, { "epoch": 2.2916569942894283, "grad_norm": 39.39735794067383, "learning_rate": 1.625674358141046e-07, "logits/chosen": -19.227603912353516, "logits/rejected": -18.53204917907715, "logps/chosen": -396.03912353515625, "logps/rejected": -283.4632873535156, "loss": 0.5622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.437981367111206, "rewards/margins": 0.9386483430862427, "rewards/rejected": 2.499333381652832, "step": 49360 }, { "epoch": 2.2921212683968615, "grad_norm": 138.7121124267578, "learning_rate": 1.6253957936765865e-07, "logits/chosen": -18.325284957885742, "logits/rejected": -17.307588577270508, "logps/chosen": -377.4870300292969, "logps/rejected": -262.2472229003906, "loss": 0.9019, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6485564708709717, "rewards/margins": 1.2454450130462646, "rewards/rejected": 2.403110980987549, "step": 49370 }, { "epoch": 2.2925855425042947, "grad_norm": 25.873157501220703, "learning_rate": 1.6251172292121267e-07, "logits/chosen": -18.664106369018555, "logits/rejected": 
-18.658275604248047, "logps/chosen": -456.77197265625, "logps/rejected": -445.499755859375, "loss": 0.8317, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.734830141067505, "rewards/margins": 0.36750417947769165, "rewards/rejected": 3.367326021194458, "step": 49380 }, { "epoch": 2.2930498166117275, "grad_norm": 34.92900848388672, "learning_rate": 1.624838664747667e-07, "logits/chosen": -19.607175827026367, "logits/rejected": -19.012197494506836, "logps/chosen": -344.6950988769531, "logps/rejected": -315.63287353515625, "loss": 0.8086, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.951167106628418, "rewards/margins": 0.5857528448104858, "rewards/rejected": 2.3654141426086426, "step": 49390 }, { "epoch": 2.2935140907191607, "grad_norm": 1.7473493814468384, "learning_rate": 1.624560100283207e-07, "logits/chosen": -19.970449447631836, "logits/rejected": -18.924266815185547, "logps/chosen": -434.45843505859375, "logps/rejected": -340.556640625, "loss": 0.4313, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6240615844726562, "rewards/margins": 1.3034050464630127, "rewards/rejected": 2.3206562995910645, "step": 49400 }, { "epoch": 2.2939783648265935, "grad_norm": 42.987159729003906, "learning_rate": 1.6242815358187474e-07, "logits/chosen": -19.44777488708496, "logits/rejected": -18.696428298950195, "logps/chosen": -388.5439147949219, "logps/rejected": -318.1443786621094, "loss": 0.7141, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9237658977508545, "rewards/margins": 0.5396803617477417, "rewards/rejected": 3.3840858936309814, "step": 49410 }, { "epoch": 2.2944426389340267, "grad_norm": 21.173603057861328, "learning_rate": 1.6240029713542875e-07, "logits/chosen": -17.92437744140625, "logits/rejected": -17.820688247680664, "logps/chosen": -372.74114990234375, "logps/rejected": -383.0227355957031, "loss": 1.0669, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9429545402526855, "rewards/margins": 
0.6693257093429565, "rewards/rejected": 3.2736287117004395, "step": 49420 }, { "epoch": 2.2949069130414594, "grad_norm": 14.013479232788086, "learning_rate": 1.623724406889828e-07, "logits/chosen": -18.437084197998047, "logits/rejected": -17.26442527770996, "logps/chosen": -383.6871643066406, "logps/rejected": -263.0024108886719, "loss": 0.2466, "rewards/accuracies": 1.0, "rewards/chosen": 3.4224014282226562, "rewards/margins": 1.968515396118164, "rewards/rejected": 1.4538863897323608, "step": 49430 }, { "epoch": 2.2953711871488927, "grad_norm": 3.936487913131714, "learning_rate": 1.6234458424253678e-07, "logits/chosen": -18.596240997314453, "logits/rejected": -17.2259464263916, "logps/chosen": -395.60015869140625, "logps/rejected": -273.30029296875, "loss": 0.2873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.578607082366943, "rewards/margins": 2.666659116744995, "rewards/rejected": 1.9119478464126587, "step": 49440 }, { "epoch": 2.295835461256326, "grad_norm": 262.1181640625, "learning_rate": 1.623167277960908e-07, "logits/chosen": -19.287195205688477, "logits/rejected": -18.387033462524414, "logps/chosen": -382.3617248535156, "logps/rejected": -362.8486328125, "loss": 1.0772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.179795742034912, "rewards/margins": 0.007984387688338757, "rewards/rejected": 3.171811103820801, "step": 49450 }, { "epoch": 2.2962997353637586, "grad_norm": 34.68914794921875, "learning_rate": 1.6228887134964484e-07, "logits/chosen": -18.46192741394043, "logits/rejected": -18.40028953552246, "logps/chosen": -225.6632537841797, "logps/rejected": -272.69580078125, "loss": 0.6754, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9840444326400757, "rewards/margins": 0.4148143231868744, "rewards/rejected": 1.569230079650879, "step": 49460 }, { "epoch": 2.296764009471192, "grad_norm": 120.84185791015625, "learning_rate": 1.6226101490319885e-07, "logits/chosen": -19.6789608001709, "logits/rejected": 
-19.12946891784668, "logps/chosen": -414.75030517578125, "logps/rejected": -269.4105529785156, "loss": 1.2604, "rewards/accuracies": 0.5, "rewards/chosen": 3.385720729827881, "rewards/margins": 0.19531427323818207, "rewards/rejected": 3.190406322479248, "step": 49470 }, { "epoch": 2.2972282835786246, "grad_norm": 116.22923278808594, "learning_rate": 1.6223315845675284e-07, "logits/chosen": -19.586896896362305, "logits/rejected": -19.289348602294922, "logps/chosen": -489.04461669921875, "logps/rejected": -467.44854736328125, "loss": 1.2821, "rewards/accuracies": 0.5, "rewards/chosen": 4.317324161529541, "rewards/margins": -0.07736794650554657, "rewards/rejected": 4.394691467285156, "step": 49480 }, { "epoch": 2.297692557686058, "grad_norm": 36.21562957763672, "learning_rate": 1.6220530201030688e-07, "logits/chosen": -18.78896141052246, "logits/rejected": -18.25457763671875, "logps/chosen": -404.49993896484375, "logps/rejected": -296.91357421875, "loss": 0.599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5098392963409424, "rewards/margins": 0.7323933839797974, "rewards/rejected": 1.7774460315704346, "step": 49490 }, { "epoch": 2.298156831793491, "grad_norm": 3.9477739334106445, "learning_rate": 1.621774455638609e-07, "logits/chosen": -18.625520706176758, "logits/rejected": -17.70989990234375, "logps/chosen": -417.05584716796875, "logps/rejected": -387.7267150878906, "loss": 0.6265, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.02166223526001, "rewards/margins": 0.9624443054199219, "rewards/rejected": 3.0592174530029297, "step": 49500 }, { "epoch": 2.298621105900924, "grad_norm": 75.1761474609375, "learning_rate": 1.6214958911741494e-07, "logits/chosen": -19.836244583129883, "logits/rejected": -19.84661293029785, "logps/chosen": -270.0021667480469, "logps/rejected": -283.53350830078125, "loss": 1.1794, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8200812339782715, "rewards/margins": 0.008040213957428932, 
"rewards/rejected": 2.8120410442352295, "step": 49510 }, { "epoch": 2.299085380008357, "grad_norm": 27.265729904174805, "learning_rate": 1.6212173267096893e-07, "logits/chosen": -18.616785049438477, "logits/rejected": -18.22168731689453, "logps/chosen": -374.25079345703125, "logps/rejected": -281.0530090332031, "loss": 0.5194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.571166515350342, "rewards/margins": 1.4296724796295166, "rewards/rejected": 2.1414942741394043, "step": 49520 }, { "epoch": 2.29954965411579, "grad_norm": 0.002462995471432805, "learning_rate": 1.6209387622452294e-07, "logits/chosen": -18.617706298828125, "logits/rejected": -17.387916564941406, "logps/chosen": -347.2183532714844, "logps/rejected": -200.79116821289062, "loss": 0.3776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.32196307182312, "rewards/margins": 2.289731979370117, "rewards/rejected": 1.0322312116622925, "step": 49530 }, { "epoch": 2.300013928223223, "grad_norm": 28.118730545043945, "learning_rate": 1.6206601977807698e-07, "logits/chosen": -18.814748764038086, "logits/rejected": -18.35603904724121, "logps/chosen": -450.6142578125, "logps/rejected": -391.8257751464844, "loss": 0.6998, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.357607841491699, "rewards/margins": 1.0460704565048218, "rewards/rejected": 3.311537504196167, "step": 49540 }, { "epoch": 2.3004782023306563, "grad_norm": 90.19886016845703, "learning_rate": 1.6203816333163097e-07, "logits/chosen": -19.185171127319336, "logits/rejected": -17.273029327392578, "logps/chosen": -425.201904296875, "logps/rejected": -296.15771484375, "loss": 0.3607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.010798454284668, "rewards/margins": 2.1345133781433105, "rewards/rejected": 1.876285195350647, "step": 49550 }, { "epoch": 2.300942476438089, "grad_norm": 90.00825500488281, "learning_rate": 1.62010306885185e-07, "logits/chosen": -19.677005767822266, "logits/rejected": 
-18.052566528320312, "logps/chosen": -349.3399353027344, "logps/rejected": -255.3409881591797, "loss": 0.5623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1589772701263428, "rewards/margins": 1.2690849304199219, "rewards/rejected": 1.8898922204971313, "step": 49560 }, { "epoch": 2.3014067505455222, "grad_norm": 6.208144664764404, "learning_rate": 1.6198245043873902e-07, "logits/chosen": -19.15997886657715, "logits/rejected": -19.016080856323242, "logps/chosen": -422.81884765625, "logps/rejected": -327.5579833984375, "loss": 0.7666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.114539861679077, "rewards/margins": 0.9008822441101074, "rewards/rejected": 2.2136576175689697, "step": 49570 }, { "epoch": 2.301871024652955, "grad_norm": 32.253692626953125, "learning_rate": 1.6195459399229307e-07, "logits/chosen": -19.103878021240234, "logits/rejected": -18.530790328979492, "logps/chosen": -470.26995849609375, "logps/rejected": -366.1445617675781, "loss": 0.287, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.549434661865234, "rewards/margins": 1.8119022846221924, "rewards/rejected": 2.737532377243042, "step": 49580 }, { "epoch": 2.3023352987603882, "grad_norm": 161.0341339111328, "learning_rate": 1.6192673754584705e-07, "logits/chosen": -19.376705169677734, "logits/rejected": -18.76301383972168, "logps/chosen": -385.21722412109375, "logps/rejected": -346.10467529296875, "loss": 0.8605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.450659990310669, "rewards/margins": 0.6442002654075623, "rewards/rejected": 2.806459903717041, "step": 49590 }, { "epoch": 2.302799572867821, "grad_norm": 47.61378479003906, "learning_rate": 1.6189888109940107e-07, "logits/chosen": -20.01178741455078, "logits/rejected": -19.429183959960938, "logps/chosen": -435.2073669433594, "logps/rejected": -413.3377990722656, "loss": 0.7356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.430232048034668, "rewards/margins": 
0.319248765707016, "rewards/rejected": 4.110983371734619, "step": 49600 }, { "epoch": 2.303263846975254, "grad_norm": 50.2229118347168, "learning_rate": 1.618710246529551e-07, "logits/chosen": -19.266307830810547, "logits/rejected": -17.50347900390625, "logps/chosen": -497.3839416503906, "logps/rejected": -275.9356384277344, "loss": 0.2011, "rewards/accuracies": 1.0, "rewards/chosen": 4.97586727142334, "rewards/margins": 2.2914185523986816, "rewards/rejected": 2.6844494342803955, "step": 49610 }, { "epoch": 2.3037281210826874, "grad_norm": 57.278404235839844, "learning_rate": 1.6184316820650912e-07, "logits/chosen": -18.59524154663086, "logits/rejected": -18.160465240478516, "logps/chosen": -329.0645446777344, "logps/rejected": -300.6087951660156, "loss": 1.1464, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7609000205993652, "rewards/margins": 0.369790643453598, "rewards/rejected": 2.3911094665527344, "step": 49620 }, { "epoch": 2.30419239519012, "grad_norm": 84.70999145507812, "learning_rate": 1.618153117600631e-07, "logits/chosen": -18.250974655151367, "logits/rejected": -16.97861099243164, "logps/chosen": -396.22552490234375, "logps/rejected": -228.0622100830078, "loss": 0.4379, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7974700927734375, "rewards/margins": 2.1224050521850586, "rewards/rejected": 1.675065279006958, "step": 49630 }, { "epoch": 2.3046566692975534, "grad_norm": 66.14907836914062, "learning_rate": 1.6178745531361715e-07, "logits/chosen": -19.280628204345703, "logits/rejected": -18.131683349609375, "logps/chosen": -430.35540771484375, "logps/rejected": -355.60382080078125, "loss": 0.544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.265681743621826, "rewards/margins": 1.4138295650482178, "rewards/rejected": 1.8518524169921875, "step": 49640 }, { "epoch": 2.305120943404986, "grad_norm": 38.620811462402344, "learning_rate": 1.6175959886717117e-07, "logits/chosen": -18.335792541503906, 
"logits/rejected": -17.779932022094727, "logps/chosen": -305.84039306640625, "logps/rejected": -273.5967712402344, "loss": 0.4514, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.771730422973633, "rewards/margins": 1.1969925165176392, "rewards/rejected": 2.574738025665283, "step": 49650 }, { "epoch": 2.3055852175124194, "grad_norm": 20.318418502807617, "learning_rate": 1.617317424207252e-07, "logits/chosen": -19.201244354248047, "logits/rejected": -18.900352478027344, "logps/chosen": -307.2659912109375, "logps/rejected": -345.0021057128906, "loss": 0.9132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5378754138946533, "rewards/margins": 0.23639026284217834, "rewards/rejected": 2.301485538482666, "step": 49660 }, { "epoch": 2.306049491619852, "grad_norm": 138.41934204101562, "learning_rate": 1.617038859742792e-07, "logits/chosen": -18.644298553466797, "logits/rejected": -18.043001174926758, "logps/chosen": -472.91119384765625, "logps/rejected": -393.74554443359375, "loss": 0.8038, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8354694843292236, "rewards/margins": 0.5139818787574768, "rewards/rejected": 3.3214874267578125, "step": 49670 }, { "epoch": 2.3065137657272854, "grad_norm": 39.83219909667969, "learning_rate": 1.6167602952783324e-07, "logits/chosen": -19.659244537353516, "logits/rejected": -18.548145294189453, "logps/chosen": -309.84130859375, "logps/rejected": -225.81936645507812, "loss": 0.412, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5376930236816406, "rewards/margins": 1.1784298419952393, "rewards/rejected": 2.3592629432678223, "step": 49680 }, { "epoch": 2.3069780398347186, "grad_norm": 0.08160936087369919, "learning_rate": 1.6164817308138725e-07, "logits/chosen": -19.06674575805664, "logits/rejected": -18.773061752319336, "logps/chosen": -383.8902587890625, "logps/rejected": -386.0126647949219, "loss": 1.0457, "rewards/accuracies": 0.5, "rewards/chosen": 3.9247817993164062, 
"rewards/margins": 0.4706289768218994, "rewards/rejected": 3.4541525840759277, "step": 49690 }, { "epoch": 2.3074423139421514, "grad_norm": 2.2629592418670654, "learning_rate": 1.6162031663494127e-07, "logits/chosen": -18.0389461517334, "logits/rejected": -17.132780075073242, "logps/chosen": -451.45062255859375, "logps/rejected": -277.9051818847656, "loss": 0.559, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.492611408233643, "rewards/margins": 2.7285075187683105, "rewards/rejected": 1.7641032934188843, "step": 49700 }, { "epoch": 2.3079065880495846, "grad_norm": 81.07379150390625, "learning_rate": 1.6159246018849528e-07, "logits/chosen": -19.365198135375977, "logits/rejected": -19.893184661865234, "logps/chosen": -363.6102600097656, "logps/rejected": -377.2747497558594, "loss": 1.0697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.077949047088623, "rewards/margins": -0.17595595121383667, "rewards/rejected": 3.2539050579071045, "step": 49710 }, { "epoch": 2.3083708621570174, "grad_norm": 29.59681510925293, "learning_rate": 1.615646037420493e-07, "logits/chosen": -19.225969314575195, "logits/rejected": -17.768909454345703, "logps/chosen": -443.48345947265625, "logps/rejected": -304.8199157714844, "loss": 0.3612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2293572425842285, "rewards/margins": 2.123176097869873, "rewards/rejected": 2.1061809062957764, "step": 49720 }, { "epoch": 2.3088351362644506, "grad_norm": 15.679200172424316, "learning_rate": 1.6153674729560334e-07, "logits/chosen": -19.213146209716797, "logits/rejected": -17.458208084106445, "logps/chosen": -429.73773193359375, "logps/rejected": -307.15789794921875, "loss": 0.4541, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.820730686187744, "rewards/margins": 1.5659440755844116, "rewards/rejected": 2.254786729812622, "step": 49730 }, { "epoch": 2.3092994103718834, "grad_norm": 51.382686614990234, "learning_rate": 1.6150889084915732e-07, 
"logits/chosen": -20.619935989379883, "logits/rejected": -19.83434295654297, "logps/chosen": -357.81170654296875, "logps/rejected": -346.97528076171875, "loss": 0.5873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.731351375579834, "rewards/margins": 1.1462876796722412, "rewards/rejected": 2.5850634574890137, "step": 49740 }, { "epoch": 2.3097636844793166, "grad_norm": 46.716697692871094, "learning_rate": 1.6148103440271134e-07, "logits/chosen": -18.960987091064453, "logits/rejected": -17.334774017333984, "logps/chosen": -559.6229248046875, "logps/rejected": -383.69439697265625, "loss": 0.165, "rewards/accuracies": 1.0, "rewards/chosen": 5.080877780914307, "rewards/margins": 2.722555160522461, "rewards/rejected": 2.3583226203918457, "step": 49750 }, { "epoch": 2.31022795858675, "grad_norm": 18.52036476135254, "learning_rate": 1.6145317795626538e-07, "logits/chosen": -19.248783111572266, "logits/rejected": -18.063608169555664, "logps/chosen": -437.4019470214844, "logps/rejected": -252.98388671875, "loss": 0.346, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.778125047683716, "rewards/margins": 1.9127590656280518, "rewards/rejected": 1.865365743637085, "step": 49760 }, { "epoch": 2.3106922326941826, "grad_norm": 45.48088073730469, "learning_rate": 1.614253215098194e-07, "logits/chosen": -20.21503448486328, "logits/rejected": -18.897666931152344, "logps/chosen": -365.5703430175781, "logps/rejected": -226.8099822998047, "loss": 0.2919, "rewards/accuracies": 1.0, "rewards/chosen": 4.353083610534668, "rewards/margins": 1.7495214939117432, "rewards/rejected": 2.603562831878662, "step": 49770 }, { "epoch": 2.3111565068016158, "grad_norm": 16.3326416015625, "learning_rate": 1.6139746506337338e-07, "logits/chosen": -20.91180992126465, "logits/rejected": -20.03644371032715, "logps/chosen": -383.7320251464844, "logps/rejected": -352.05914306640625, "loss": 0.4939, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.573421955108643, 
"rewards/margins": 1.604271650314331, "rewards/rejected": 2.9691503047943115, "step": 49780 }, { "epoch": 2.3116207809090485, "grad_norm": 42.827980041503906, "learning_rate": 1.6136960861692742e-07, "logits/chosen": -19.568065643310547, "logits/rejected": -18.054574966430664, "logps/chosen": -394.5525817871094, "logps/rejected": -272.21624755859375, "loss": 0.4887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2817740440368652, "rewards/margins": 1.3685243129730225, "rewards/rejected": 1.913250207901001, "step": 49790 }, { "epoch": 2.3120850550164818, "grad_norm": 13.784852981567383, "learning_rate": 1.6134175217048144e-07, "logits/chosen": -18.44724464416504, "logits/rejected": -16.803165435791016, "logps/chosen": -406.86090087890625, "logps/rejected": -247.404052734375, "loss": 0.3645, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.339675426483154, "rewards/margins": 2.7845213413238525, "rewards/rejected": 1.5551540851593018, "step": 49800 }, { "epoch": 2.3125493291239145, "grad_norm": 4.088434219360352, "learning_rate": 1.6131389572403548e-07, "logits/chosen": -19.36551856994629, "logits/rejected": -19.19100570678711, "logps/chosen": -428.2771911621094, "logps/rejected": -378.68804931640625, "loss": 0.8224, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.899075746536255, "rewards/margins": 0.9928451776504517, "rewards/rejected": 2.9062304496765137, "step": 49810 }, { "epoch": 2.3130136032313477, "grad_norm": 171.6305389404297, "learning_rate": 1.6128603927758947e-07, "logits/chosen": -19.47100257873535, "logits/rejected": -18.47840690612793, "logps/chosen": -342.8242492675781, "logps/rejected": -278.52783203125, "loss": 0.449, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.696930408477783, "rewards/margins": 1.0936882495880127, "rewards/rejected": 2.6032419204711914, "step": 49820 }, { "epoch": 2.313477877338781, "grad_norm": 0.10131034255027771, "learning_rate": 1.612581828311435e-07, "logits/chosen": 
-19.437091827392578, "logits/rejected": -18.55306625366211, "logps/chosen": -341.09686279296875, "logps/rejected": -246.78121948242188, "loss": 0.4367, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8204214572906494, "rewards/margins": 1.7580219507217407, "rewards/rejected": 1.0623996257781982, "step": 49830 }, { "epoch": 2.3139421514462137, "grad_norm": 113.2118911743164, "learning_rate": 1.6123032638469752e-07, "logits/chosen": -18.92619514465332, "logits/rejected": -18.289297103881836, "logps/chosen": -428.9356994628906, "logps/rejected": -428.23858642578125, "loss": 0.9713, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1442360877990723, "rewards/margins": 0.27919572591781616, "rewards/rejected": 2.8650403022766113, "step": 49840 }, { "epoch": 2.314406425553647, "grad_norm": 2.1192171573638916, "learning_rate": 1.6120246993825156e-07, "logits/chosen": -20.436481475830078, "logits/rejected": -19.012229919433594, "logps/chosen": -381.94537353515625, "logps/rejected": -233.00759887695312, "loss": 0.2451, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.048450469970703, "rewards/margins": 2.269533634185791, "rewards/rejected": 1.7789169549942017, "step": 49850 }, { "epoch": 2.3148706996610797, "grad_norm": 148.0137176513672, "learning_rate": 1.6117461349180555e-07, "logits/chosen": -19.131635665893555, "logits/rejected": -18.502025604248047, "logps/chosen": -477.049072265625, "logps/rejected": -438.90625, "loss": 0.7586, "rewards/accuracies": 0.5, "rewards/chosen": 3.8249149322509766, "rewards/margins": 0.9552105069160461, "rewards/rejected": 2.869704484939575, "step": 49860 }, { "epoch": 2.315334973768513, "grad_norm": 19.262962341308594, "learning_rate": 1.6114675704535957e-07, "logits/chosen": -18.903644561767578, "logits/rejected": -17.91689109802246, "logps/chosen": -285.0047912597656, "logps/rejected": -150.92367553710938, "loss": 0.4479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3455255031585693, 
"rewards/margins": 1.6204192638397217, "rewards/rejected": 0.7251060605049133, "step": 49870 }, { "epoch": 2.3157992478759457, "grad_norm": 207.3544158935547, "learning_rate": 1.611189005989136e-07, "logits/chosen": -19.170339584350586, "logits/rejected": -18.52260398864746, "logps/chosen": -397.571044921875, "logps/rejected": -356.42828369140625, "loss": 0.7822, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.107595205307007, "rewards/margins": 0.28949061036109924, "rewards/rejected": 2.8181052207946777, "step": 49880 }, { "epoch": 2.316263521983379, "grad_norm": 63.78715515136719, "learning_rate": 1.6109104415246762e-07, "logits/chosen": -19.17090606689453, "logits/rejected": -18.7882022857666, "logps/chosen": -386.41131591796875, "logps/rejected": -282.0798034667969, "loss": 0.9246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2473995685577393, "rewards/margins": 0.6468349695205688, "rewards/rejected": 2.600564479827881, "step": 49890 }, { "epoch": 2.316727796090812, "grad_norm": 117.25616455078125, "learning_rate": 1.610631877060216e-07, "logits/chosen": -18.827552795410156, "logits/rejected": -18.08173942565918, "logps/chosen": -453.5984802246094, "logps/rejected": -431.8389587402344, "loss": 0.957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4624722003936768, "rewards/margins": 0.23718972504138947, "rewards/rejected": 3.2252821922302246, "step": 49900 }, { "epoch": 2.317192070198245, "grad_norm": 9.752402305603027, "learning_rate": 1.6103533125957565e-07, "logits/chosen": -18.25082015991211, "logits/rejected": -17.353458404541016, "logps/chosen": -419.95880126953125, "logps/rejected": -278.3805236816406, "loss": 0.2449, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.161649703979492, "rewards/margins": 2.5088768005371094, "rewards/rejected": 1.6527729034423828, "step": 49910 }, { "epoch": 2.317656344305678, "grad_norm": 0.990190327167511, "learning_rate": 1.6100747481312967e-07, "logits/chosen": 
-19.22265625, "logits/rejected": -17.39718246459961, "logps/chosen": -371.78314208984375, "logps/rejected": -240.39120483398438, "loss": 0.2538, "rewards/accuracies": 1.0, "rewards/chosen": 3.717219114303589, "rewards/margins": 2.188483715057373, "rewards/rejected": 1.528735876083374, "step": 49920 }, { "epoch": 2.3181206184131113, "grad_norm": 19.009071350097656, "learning_rate": 1.6097961836668368e-07, "logits/chosen": -18.283710479736328, "logits/rejected": -17.438203811645508, "logps/chosen": -392.33905029296875, "logps/rejected": -283.19293212890625, "loss": 0.5098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3397414684295654, "rewards/margins": 1.5063960552215576, "rewards/rejected": 1.8333451747894287, "step": 49930 }, { "epoch": 2.318584892520544, "grad_norm": 41.49345779418945, "learning_rate": 1.609517619202377e-07, "logits/chosen": -18.119850158691406, "logits/rejected": -17.45059585571289, "logps/chosen": -459.343017578125, "logps/rejected": -380.38958740234375, "loss": 0.6095, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.036228656768799, "rewards/margins": 0.4852978587150574, "rewards/rejected": 2.5509307384490967, "step": 49940 }, { "epoch": 2.3190491666279773, "grad_norm": 37.1809196472168, "learning_rate": 1.609239054737917e-07, "logits/chosen": -19.035247802734375, "logits/rejected": -18.33576011657715, "logps/chosen": -456.50091552734375, "logps/rejected": -343.4733581542969, "loss": 0.5944, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.5209269523620605, "rewards/margins": 1.1964246034622192, "rewards/rejected": 3.324502468109131, "step": 49950 }, { "epoch": 2.31951344073541, "grad_norm": 35.946170806884766, "learning_rate": 1.6089604902734575e-07, "logits/chosen": -19.337934494018555, "logits/rejected": -18.738624572753906, "logps/chosen": -391.9515686035156, "logps/rejected": -293.21844482421875, "loss": 0.7736, "rewards/accuracies": 0.5, "rewards/chosen": 2.2866291999816895, "rewards/margins": 
0.3196951448917389, "rewards/rejected": 1.9669339656829834, "step": 49960 }, { "epoch": 2.3199777148428433, "grad_norm": 11.070830345153809, "learning_rate": 1.6086819258089974e-07, "logits/chosen": -18.162860870361328, "logits/rejected": -16.662891387939453, "logps/chosen": -364.81494140625, "logps/rejected": -210.01376342773438, "loss": 0.4003, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9493801593780518, "rewards/margins": 1.7777376174926758, "rewards/rejected": 1.1716426610946655, "step": 49970 }, { "epoch": 2.320441988950276, "grad_norm": 15.155145645141602, "learning_rate": 1.6084033613445378e-07, "logits/chosen": -19.502605438232422, "logits/rejected": -19.137638092041016, "logps/chosen": -471.18017578125, "logps/rejected": -454.3821716308594, "loss": 0.6565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.197749137878418, "rewards/margins": 0.4891657829284668, "rewards/rejected": 3.708583116531372, "step": 49980 }, { "epoch": 2.3209062630577093, "grad_norm": 72.84614562988281, "learning_rate": 1.608124796880078e-07, "logits/chosen": -19.100032806396484, "logits/rejected": -17.999481201171875, "logps/chosen": -429.077880859375, "logps/rejected": -271.2392272949219, "loss": 0.4057, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9218335151672363, "rewards/margins": 1.9662559032440186, "rewards/rejected": 1.9555776119232178, "step": 49990 }, { "epoch": 2.3213705371651425, "grad_norm": 82.93096923828125, "learning_rate": 1.6078462324156183e-07, "logits/chosen": -18.96643829345703, "logits/rejected": -18.60353660583496, "logps/chosen": -382.4389343261719, "logps/rejected": -296.6251220703125, "loss": 0.7509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6925711631774902, "rewards/margins": 0.5104337930679321, "rewards/rejected": 3.1821377277374268, "step": 50000 }, { "epoch": 2.3218348112725753, "grad_norm": 18.636533737182617, "learning_rate": 1.6075676679511582e-07, "logits/chosen": 
-18.939138412475586, "logits/rejected": -18.47483253479004, "logps/chosen": -392.03839111328125, "logps/rejected": -294.59716796875, "loss": 0.5718, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.413240432739258, "rewards/margins": 1.216570258140564, "rewards/rejected": 2.1966700553894043, "step": 50010 }, { "epoch": 2.3222990853800085, "grad_norm": 1.3712140321731567, "learning_rate": 1.6072891034866984e-07, "logits/chosen": -20.132617950439453, "logits/rejected": -19.20831871032715, "logps/chosen": -453.54931640625, "logps/rejected": -321.91790771484375, "loss": 0.7328, "rewards/accuracies": 0.5, "rewards/chosen": 3.8782799243927, "rewards/margins": 1.1777069568634033, "rewards/rejected": 2.700573205947876, "step": 50020 }, { "epoch": 2.3227633594874413, "grad_norm": 17.211854934692383, "learning_rate": 1.6070105390222388e-07, "logits/chosen": -19.043378829956055, "logits/rejected": -18.434091567993164, "logps/chosen": -417.5437927246094, "logps/rejected": -379.9689025878906, "loss": 0.3735, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.458573579788208, "rewards/margins": 0.9977332949638367, "rewards/rejected": 2.4608407020568848, "step": 50030 }, { "epoch": 2.3232276335948745, "grad_norm": 57.106807708740234, "learning_rate": 1.606731974557779e-07, "logits/chosen": -18.855100631713867, "logits/rejected": -18.146800994873047, "logps/chosen": -416.126220703125, "logps/rejected": -342.54962158203125, "loss": 0.4931, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4221959114074707, "rewards/margins": 1.3116223812103271, "rewards/rejected": 2.1105735301971436, "step": 50040 }, { "epoch": 2.3236919077023073, "grad_norm": 231.82064819335938, "learning_rate": 1.6064534100933188e-07, "logits/chosen": -19.12906265258789, "logits/rejected": -18.669727325439453, "logps/chosen": -453.85968017578125, "logps/rejected": -324.67047119140625, "loss": 0.3974, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.197579383850098, 
"rewards/margins": 1.937147855758667, "rewards/rejected": 2.2604312896728516, "step": 50050 }, { "epoch": 2.3241561818097405, "grad_norm": 48.84840393066406, "learning_rate": 1.6061748456288592e-07, "logits/chosen": -18.52515983581543, "logits/rejected": -18.667999267578125, "logps/chosen": -345.0284118652344, "logps/rejected": -356.3661193847656, "loss": 0.859, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6175811290740967, "rewards/margins": 0.1795414388179779, "rewards/rejected": 2.438039541244507, "step": 50060 }, { "epoch": 2.3246204559171737, "grad_norm": 307.81561279296875, "learning_rate": 1.6058962811643994e-07, "logits/chosen": -19.004243850708008, "logits/rejected": -17.61421775817871, "logps/chosen": -321.4962463378906, "logps/rejected": -216.68447875976562, "loss": 0.4154, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.560969114303589, "rewards/margins": 2.0072379112243652, "rewards/rejected": 1.5537309646606445, "step": 50070 }, { "epoch": 2.3250847300246065, "grad_norm": 10.139962196350098, "learning_rate": 1.6056177166999398e-07, "logits/chosen": -18.5450382232666, "logits/rejected": -18.620115280151367, "logps/chosen": -378.1143798828125, "logps/rejected": -391.56597900390625, "loss": 1.3993, "rewards/accuracies": 0.5, "rewards/chosen": 3.069120168685913, "rewards/margins": -0.1775863617658615, "rewards/rejected": 3.246706485748291, "step": 50080 }, { "epoch": 2.3255490041320397, "grad_norm": 37.59958267211914, "learning_rate": 1.6053391522354797e-07, "logits/chosen": -18.036563873291016, "logits/rejected": -18.883134841918945, "logps/chosen": -428.259033203125, "logps/rejected": -435.50811767578125, "loss": 1.3641, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 3.4762771129608154, "rewards/margins": -0.7873655557632446, "rewards/rejected": 4.263642311096191, "step": 50090 }, { "epoch": 2.3260132782394725, "grad_norm": 3.6266050338745117, "learning_rate": 1.60506058777102e-07, "logits/chosen": 
-18.56934928894043, "logits/rejected": -16.967100143432617, "logps/chosen": -419.57391357421875, "logps/rejected": -253.5518341064453, "loss": 0.4097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.455939292907715, "rewards/margins": 2.3744733333587646, "rewards/rejected": 2.0814664363861084, "step": 50100 }, { "epoch": 2.3264775523469057, "grad_norm": 149.16966247558594, "learning_rate": 1.6047820233065602e-07, "logits/chosen": -19.050052642822266, "logits/rejected": -18.490955352783203, "logps/chosen": -462.76953125, "logps/rejected": -406.3130798339844, "loss": 0.8304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.165553092956543, "rewards/margins": 0.8553472757339478, "rewards/rejected": 3.3102059364318848, "step": 50110 }, { "epoch": 2.3269418264543384, "grad_norm": 107.5709228515625, "learning_rate": 1.6045034588421e-07, "logits/chosen": -18.108055114746094, "logits/rejected": -18.22499656677246, "logps/chosen": -336.7943115234375, "logps/rejected": -374.7925720214844, "loss": 0.6921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6176376342773438, "rewards/margins": 0.6572350859642029, "rewards/rejected": 2.960402488708496, "step": 50120 }, { "epoch": 2.3274061005617717, "grad_norm": 54.0079345703125, "learning_rate": 1.6042248943776405e-07, "logits/chosen": -19.633499145507812, "logits/rejected": -18.87359619140625, "logps/chosen": -366.73248291015625, "logps/rejected": -279.4595642089844, "loss": 0.4649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2949113845825195, "rewards/margins": 2.1159725189208984, "rewards/rejected": 2.1789393424987793, "step": 50130 }, { "epoch": 2.327870374669205, "grad_norm": 46.92037582397461, "learning_rate": 1.6039463299131806e-07, "logits/chosen": -18.605852127075195, "logits/rejected": -17.533985137939453, "logps/chosen": -402.18975830078125, "logps/rejected": -264.83770751953125, "loss": 0.5063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
2.7259974479675293, "rewards/margins": 0.897411048412323, "rewards/rejected": 1.8285865783691406, "step": 50140 }, { "epoch": 2.3283346487766376, "grad_norm": 62.85940170288086, "learning_rate": 1.603667765448721e-07, "logits/chosen": -20.511152267456055, "logits/rejected": -19.21609878540039, "logps/chosen": -428.602294921875, "logps/rejected": -330.0610656738281, "loss": 0.6573, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.327829360961914, "rewards/margins": 1.391512155532837, "rewards/rejected": 2.9363174438476562, "step": 50150 }, { "epoch": 2.328798922884071, "grad_norm": 51.78130340576172, "learning_rate": 1.603389200984261e-07, "logits/chosen": -18.743408203125, "logits/rejected": -19.342693328857422, "logps/chosen": -250.40975952148438, "logps/rejected": -286.88482666015625, "loss": 1.0096, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.247788906097412, "rewards/margins": -0.046441495418548584, "rewards/rejected": 2.2942302227020264, "step": 50160 }, { "epoch": 2.3292631969915036, "grad_norm": 178.20338439941406, "learning_rate": 1.603110636519801e-07, "logits/chosen": -18.288108825683594, "logits/rejected": -18.163921356201172, "logps/chosen": -451.6343688964844, "logps/rejected": -382.81927490234375, "loss": 0.912, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.605194330215454, "rewards/margins": 0.452353298664093, "rewards/rejected": 2.152841091156006, "step": 50170 }, { "epoch": 2.329727471098937, "grad_norm": 104.68458557128906, "learning_rate": 1.6028320720553415e-07, "logits/chosen": -19.46399688720703, "logits/rejected": -17.781740188598633, "logps/chosen": -398.76702880859375, "logps/rejected": -273.57012939453125, "loss": 0.6217, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.940523147583008, "rewards/margins": 1.5089857578277588, "rewards/rejected": 2.43153715133667, "step": 50180 }, { "epoch": 2.3301917452063696, "grad_norm": 9.707619667053223, "learning_rate": 1.6025535075908816e-07, 
"logits/chosen": -19.581249237060547, "logits/rejected": -18.827856063842773, "logps/chosen": -419.49090576171875, "logps/rejected": -340.8147888183594, "loss": 0.4148, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.803370237350464, "rewards/margins": 1.6988204717636108, "rewards/rejected": 2.1045498847961426, "step": 50190 }, { "epoch": 2.330656019313803, "grad_norm": 101.55548095703125, "learning_rate": 1.6022749431264215e-07, "logits/chosen": -19.587661743164062, "logits/rejected": -18.384031295776367, "logps/chosen": -401.99725341796875, "logps/rejected": -348.6486511230469, "loss": 0.3168, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.766552925109863, "rewards/margins": 2.202498435974121, "rewards/rejected": 2.564054489135742, "step": 50200 }, { "epoch": 2.331120293421236, "grad_norm": 4.08879280090332, "learning_rate": 1.601996378661962e-07, "logits/chosen": -18.23369598388672, "logits/rejected": -18.101648330688477, "logps/chosen": -329.3234558105469, "logps/rejected": -374.3542175292969, "loss": 1.0355, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.990168809890747, "rewards/margins": 0.42388296127319336, "rewards/rejected": 3.566286087036133, "step": 50210 }, { "epoch": 2.331584567528669, "grad_norm": 76.0179672241211, "learning_rate": 1.601717814197502e-07, "logits/chosen": -19.449953079223633, "logits/rejected": -19.207162857055664, "logps/chosen": -311.3315124511719, "logps/rejected": -314.53533935546875, "loss": 0.9726, "rewards/accuracies": 0.5, "rewards/chosen": 2.7424824237823486, "rewards/margins": -0.13623137772083282, "rewards/rejected": 2.878713846206665, "step": 50220 }, { "epoch": 2.332048841636102, "grad_norm": 43.04042434692383, "learning_rate": 1.6014392497330425e-07, "logits/chosen": -18.4874210357666, "logits/rejected": -18.408884048461914, "logps/chosen": -356.717529296875, "logps/rejected": -377.6504821777344, "loss": 0.5746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.8821818828582764, "rewards/margins": 0.7640377879142761, "rewards/rejected": 3.1181440353393555, "step": 50230 }, { "epoch": 2.332513115743535, "grad_norm": 46.223609924316406, "learning_rate": 1.6011606852685824e-07, "logits/chosen": -18.214197158813477, "logits/rejected": -18.27883529663086, "logps/chosen": -372.96441650390625, "logps/rejected": -381.3620910644531, "loss": 0.8156, "rewards/accuracies": 0.5, "rewards/chosen": 2.6125667095184326, "rewards/margins": 0.20758172869682312, "rewards/rejected": 2.404984951019287, "step": 50240 }, { "epoch": 2.332977389850968, "grad_norm": 106.32125091552734, "learning_rate": 1.6008821208041228e-07, "logits/chosen": -19.324026107788086, "logits/rejected": -18.285207748413086, "logps/chosen": -356.52227783203125, "logps/rejected": -269.8247985839844, "loss": 0.465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3977253437042236, "rewards/margins": 1.6132351160049438, "rewards/rejected": 1.7844903469085693, "step": 50250 }, { "epoch": 2.333441663958401, "grad_norm": 47.5622673034668, "learning_rate": 1.600603556339663e-07, "logits/chosen": -18.31797981262207, "logits/rejected": -18.012723922729492, "logps/chosen": -454.3349609375, "logps/rejected": -321.46234130859375, "loss": 1.015, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1756937503814697, "rewards/margins": 0.8018102645874023, "rewards/rejected": 2.3738837242126465, "step": 50260 }, { "epoch": 2.333905938065834, "grad_norm": 77.23239135742188, "learning_rate": 1.6003249918752033e-07, "logits/chosen": -18.931224822998047, "logits/rejected": -19.185382843017578, "logps/chosen": -405.27166748046875, "logps/rejected": -391.6365661621094, "loss": 0.6642, "rewards/accuracies": 0.5, "rewards/chosen": 3.437171220779419, "rewards/margins": 0.8627169728279114, "rewards/rejected": 2.5744540691375732, "step": 50270 }, { "epoch": 2.3343702121732672, "grad_norm": 7.795340538024902, "learning_rate": 1.6000464274107432e-07, "logits/chosen": 
-18.59735679626465, "logits/rejected": -17.857379913330078, "logps/chosen": -356.2691955566406, "logps/rejected": -226.4646759033203, "loss": 0.743, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7108588218688965, "rewards/margins": 1.607269287109375, "rewards/rejected": 2.1035895347595215, "step": 50280 }, { "epoch": 2.3348344862807, "grad_norm": 16.55256462097168, "learning_rate": 1.5997678629462834e-07, "logits/chosen": -18.714101791381836, "logits/rejected": -18.593185424804688, "logps/chosen": -418.249755859375, "logps/rejected": -373.135009765625, "loss": 0.8982, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.779134750366211, "rewards/margins": 0.44174250960350037, "rewards/rejected": 3.3373923301696777, "step": 50290 }, { "epoch": 2.335298760388133, "grad_norm": 211.81434631347656, "learning_rate": 1.5994892984818238e-07, "logits/chosen": -19.330917358398438, "logits/rejected": -18.150146484375, "logps/chosen": -463.39849853515625, "logps/rejected": -341.07562255859375, "loss": 1.0341, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.013688087463379, "rewards/margins": 1.283158779144287, "rewards/rejected": 2.7305290699005127, "step": 50300 }, { "epoch": 2.335763034495566, "grad_norm": 183.20599365234375, "learning_rate": 1.5992107340173636e-07, "logits/chosen": -19.68037223815918, "logits/rejected": -18.0781307220459, "logps/chosen": -428.0708923339844, "logps/rejected": -369.39923095703125, "loss": 0.7179, "rewards/accuracies": 0.5, "rewards/chosen": 4.071491241455078, "rewards/margins": 1.101386547088623, "rewards/rejected": 2.970104694366455, "step": 50310 }, { "epoch": 2.336227308602999, "grad_norm": 15.298952102661133, "learning_rate": 1.5989321695529038e-07, "logits/chosen": -18.754892349243164, "logits/rejected": -18.3675537109375, "logps/chosen": -282.84906005859375, "logps/rejected": -197.60952758789062, "loss": 0.5022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9524166584014893, 
"rewards/margins": 0.7466217279434204, "rewards/rejected": 1.2057949304580688, "step": 50320 }, { "epoch": 2.3366915827104324, "grad_norm": 233.08544921875, "learning_rate": 1.5986536050884442e-07, "logits/chosen": -19.146711349487305, "logits/rejected": -18.476566314697266, "logps/chosen": -473.52001953125, "logps/rejected": -418.3614807128906, "loss": 0.6959, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.893496513366699, "rewards/margins": 0.8035159111022949, "rewards/rejected": 4.089980602264404, "step": 50330 }, { "epoch": 2.337155856817865, "grad_norm": 21.7593936920166, "learning_rate": 1.5983750406239843e-07, "logits/chosen": -19.617616653442383, "logits/rejected": -18.322952270507812, "logps/chosen": -292.58514404296875, "logps/rejected": -244.22006225585938, "loss": 0.5705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.794214963912964, "rewards/margins": 1.134965181350708, "rewards/rejected": 1.659250020980835, "step": 50340 }, { "epoch": 2.3376201309252984, "grad_norm": 53.001155853271484, "learning_rate": 1.5980964761595245e-07, "logits/chosen": -18.35513687133789, "logits/rejected": -17.761945724487305, "logps/chosen": -260.6976318359375, "logps/rejected": -236.19204711914062, "loss": 0.6801, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.409292697906494, "rewards/margins": 1.01116144657135, "rewards/rejected": 1.398131251335144, "step": 50350 }, { "epoch": 2.338084405032731, "grad_norm": 34.09529495239258, "learning_rate": 1.5978179116950646e-07, "logits/chosen": -19.228513717651367, "logits/rejected": -18.785053253173828, "logps/chosen": -471.73681640625, "logps/rejected": -342.4677734375, "loss": 0.3324, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.6152544021606445, "rewards/margins": 1.8192968368530273, "rewards/rejected": 2.795957326889038, "step": 50360 }, { "epoch": 2.3385486791401644, "grad_norm": 59.559452056884766, "learning_rate": 1.5975393472306048e-07, "logits/chosen": 
-18.8673152923584, "logits/rejected": -17.655046463012695, "logps/chosen": -336.53106689453125, "logps/rejected": -198.0478973388672, "loss": 0.4597, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3009941577911377, "rewards/margins": 1.673248291015625, "rewards/rejected": 1.6277456283569336, "step": 50370 }, { "epoch": 2.3390129532475976, "grad_norm": 131.98764038085938, "learning_rate": 1.5972607827661452e-07, "logits/chosen": -18.847213745117188, "logits/rejected": -17.671245574951172, "logps/chosen": -479.00799560546875, "logps/rejected": -351.0550537109375, "loss": 0.4452, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.41524076461792, "rewards/margins": 1.7897189855575562, "rewards/rejected": 2.625521659851074, "step": 50380 }, { "epoch": 2.3394772273550304, "grad_norm": 14.203718185424805, "learning_rate": 1.596982218301685e-07, "logits/chosen": -18.909526824951172, "logits/rejected": -17.893123626708984, "logps/chosen": -353.33184814453125, "logps/rejected": -278.6692199707031, "loss": 0.4062, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1041197776794434, "rewards/margins": 1.4749767780303955, "rewards/rejected": 1.6291431188583374, "step": 50390 }, { "epoch": 2.3399415014624636, "grad_norm": 22.191986083984375, "learning_rate": 1.5967036538372255e-07, "logits/chosen": -19.540796279907227, "logits/rejected": -18.711872100830078, "logps/chosen": -421.0687561035156, "logps/rejected": -341.263427734375, "loss": 0.49, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9291012287139893, "rewards/margins": 0.8827710151672363, "rewards/rejected": 2.046330213546753, "step": 50400 }, { "epoch": 2.3404057755698964, "grad_norm": 110.06968688964844, "learning_rate": 1.5964250893727656e-07, "logits/chosen": -18.730871200561523, "logits/rejected": -17.93021583557129, "logps/chosen": -448.30755615234375, "logps/rejected": -285.03314208984375, "loss": 0.5194, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
3.9141249656677246, "rewards/margins": 1.6915686130523682, "rewards/rejected": 2.2225568294525146, "step": 50410 }, { "epoch": 2.3408700496773296, "grad_norm": 63.9290885925293, "learning_rate": 1.596146524908306e-07, "logits/chosen": -19.22899627685547, "logits/rejected": -18.549442291259766, "logps/chosen": -390.6260070800781, "logps/rejected": -300.01129150390625, "loss": 0.3183, "rewards/accuracies": 1.0, "rewards/chosen": 3.1414880752563477, "rewards/margins": 1.4197423458099365, "rewards/rejected": 1.7217457294464111, "step": 50420 }, { "epoch": 2.3413343237847624, "grad_norm": 204.95787048339844, "learning_rate": 1.595867960443846e-07, "logits/chosen": -17.83747100830078, "logits/rejected": -18.399898529052734, "logps/chosen": -257.3872375488281, "logps/rejected": -347.7727355957031, "loss": 1.5688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.106255054473877, "rewards/margins": -0.11007878929376602, "rewards/rejected": 2.2163338661193848, "step": 50430 }, { "epoch": 2.3417985978921956, "grad_norm": 80.37519836425781, "learning_rate": 1.595589395979386e-07, "logits/chosen": -18.54876136779785, "logits/rejected": -18.40755844116211, "logps/chosen": -364.79437255859375, "logps/rejected": -388.6471862792969, "loss": 1.0827, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6037466526031494, "rewards/margins": -0.2417701929807663, "rewards/rejected": 2.8455166816711426, "step": 50440 }, { "epoch": 2.342262871999629, "grad_norm": 0.1661890745162964, "learning_rate": 1.5953108315149265e-07, "logits/chosen": -19.482402801513672, "logits/rejected": -18.443445205688477, "logps/chosen": -373.4615783691406, "logps/rejected": -273.14569091796875, "loss": 0.9589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.955873489379883, "rewards/margins": 1.0978977680206299, "rewards/rejected": 2.857975721359253, "step": 50450 }, { "epoch": 2.3427271461070616, "grad_norm": 55.17729949951172, "learning_rate": 1.5950322670504666e-07, 
"logits/chosen": -18.541030883789062, "logits/rejected": -17.80034065246582, "logps/chosen": -370.40753173828125, "logps/rejected": -235.67971801757812, "loss": 0.3876, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7726645469665527, "rewards/margins": 1.006232738494873, "rewards/rejected": 1.7664318084716797, "step": 50460 }, { "epoch": 2.343191420214495, "grad_norm": 30.060836791992188, "learning_rate": 1.5947537025860065e-07, "logits/chosen": -19.137836456298828, "logits/rejected": -18.626462936401367, "logps/chosen": -357.1856994628906, "logps/rejected": -293.368408203125, "loss": 0.9563, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.25825572013855, "rewards/margins": 0.5242732763290405, "rewards/rejected": 2.733982563018799, "step": 50470 }, { "epoch": 2.3436556943219276, "grad_norm": 37.41155242919922, "learning_rate": 1.594475138121547e-07, "logits/chosen": -18.778514862060547, "logits/rejected": -18.526689529418945, "logps/chosen": -493.23504638671875, "logps/rejected": -461.29693603515625, "loss": 1.2614, "rewards/accuracies": 0.5, "rewards/chosen": 3.805513858795166, "rewards/margins": 0.11215074360370636, "rewards/rejected": 3.6933631896972656, "step": 50480 }, { "epoch": 2.3441199684293608, "grad_norm": 28.443105697631836, "learning_rate": 1.594196573657087e-07, "logits/chosen": -18.81827163696289, "logits/rejected": -17.941064834594727, "logps/chosen": -430.642822265625, "logps/rejected": -324.66741943359375, "loss": 0.5219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9002177715301514, "rewards/margins": 1.92996084690094, "rewards/rejected": 1.9702571630477905, "step": 50490 }, { "epoch": 2.3445842425367935, "grad_norm": 7.590516090393066, "learning_rate": 1.5939180091926272e-07, "logits/chosen": -19.463253021240234, "logits/rejected": -17.86874771118164, "logps/chosen": -362.25347900390625, "logps/rejected": -291.51153564453125, "loss": 0.2242, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
3.6412911415100098, "rewards/margins": 1.8376861810684204, "rewards/rejected": 1.803605318069458, "step": 50500 }, { "epoch": 2.3450485166442268, "grad_norm": 64.24591827392578, "learning_rate": 1.5936394447281673e-07, "logits/chosen": -18.939105987548828, "logits/rejected": -18.623571395874023, "logps/chosen": -384.5340576171875, "logps/rejected": -321.47479248046875, "loss": 0.869, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.848906993865967, "rewards/margins": 1.0747902393341064, "rewards/rejected": 2.7741172313690186, "step": 50510 }, { "epoch": 2.34551279075166, "grad_norm": 8.357701301574707, "learning_rate": 1.5933608802637078e-07, "logits/chosen": -19.772579193115234, "logits/rejected": -18.554759979248047, "logps/chosen": -434.35174560546875, "logps/rejected": -351.9649658203125, "loss": 0.6991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8374085426330566, "rewards/margins": 0.9148145914077759, "rewards/rejected": 2.9225940704345703, "step": 50520 }, { "epoch": 2.3459770648590927, "grad_norm": 91.40349578857422, "learning_rate": 1.593082315799248e-07, "logits/chosen": -19.269107818603516, "logits/rejected": -17.564952850341797, "logps/chosen": -472.11419677734375, "logps/rejected": -312.50579833984375, "loss": 0.4848, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9979405403137207, "rewards/margins": 2.0467429161071777, "rewards/rejected": 1.951197862625122, "step": 50530 }, { "epoch": 2.346441338966526, "grad_norm": 12.23354721069336, "learning_rate": 1.5928037513347878e-07, "logits/chosen": -19.931562423706055, "logits/rejected": -18.33915138244629, "logps/chosen": -332.57208251953125, "logps/rejected": -197.96945190429688, "loss": 0.2719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7774569988250732, "rewards/margins": 2.9546761512756348, "rewards/rejected": 0.8227810859680176, "step": 50540 }, { "epoch": 2.3469056130739587, "grad_norm": 70.62212371826172, "learning_rate": 
1.5925251868703282e-07, "logits/chosen": -18.966426849365234, "logits/rejected": -17.287364959716797, "logps/chosen": -413.419677734375, "logps/rejected": -312.91546630859375, "loss": 1.0785, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5437419414520264, "rewards/margins": 0.762368381023407, "rewards/rejected": 2.7813735008239746, "step": 50550 }, { "epoch": 2.347369887181392, "grad_norm": 135.74209594726562, "learning_rate": 1.5922466224058683e-07, "logits/chosen": -17.948522567749023, "logits/rejected": -18.575653076171875, "logps/chosen": -322.641357421875, "logps/rejected": -395.5244140625, "loss": 1.0431, "rewards/accuracies": 0.5, "rewards/chosen": 2.619896173477173, "rewards/margins": 0.0751456469297409, "rewards/rejected": 2.544750213623047, "step": 50560 }, { "epoch": 2.3478341612888247, "grad_norm": 0.04499373584985733, "learning_rate": 1.5919680579414087e-07, "logits/chosen": -19.40009307861328, "logits/rejected": -17.496234893798828, "logps/chosen": -462.8170471191406, "logps/rejected": -190.3672332763672, "loss": 0.2002, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.901740312576294, "rewards/margins": 2.69128155708313, "rewards/rejected": 1.2104586362838745, "step": 50570 }, { "epoch": 2.348298435396258, "grad_norm": 88.52941131591797, "learning_rate": 1.5916894934769486e-07, "logits/chosen": -18.464595794677734, "logits/rejected": -17.621442794799805, "logps/chosen": -383.03143310546875, "logps/rejected": -323.6205749511719, "loss": 0.3978, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4629452228546143, "rewards/margins": 1.8951361179351807, "rewards/rejected": 1.5678093433380127, "step": 50580 }, { "epoch": 2.348762709503691, "grad_norm": 9.45872688293457, "learning_rate": 1.5914109290124888e-07, "logits/chosen": -19.70026397705078, "logits/rejected": -18.58538055419922, "logps/chosen": -543.4940185546875, "logps/rejected": -371.0071105957031, "loss": 0.8538, "rewards/accuracies": 0.5, "rewards/chosen": 
3.4511115550994873, "rewards/margins": 1.0319769382476807, "rewards/rejected": 2.4191346168518066, "step": 50590 }, { "epoch": 2.349226983611124, "grad_norm": 43.74530792236328, "learning_rate": 1.5911323645480292e-07, "logits/chosen": -18.692739486694336, "logits/rejected": -19.012798309326172, "logps/chosen": -394.17462158203125, "logps/rejected": -412.39117431640625, "loss": 1.1731, "rewards/accuracies": 0.5, "rewards/chosen": 2.506333827972412, "rewards/margins": -0.4490947127342224, "rewards/rejected": 2.9554286003112793, "step": 50600 }, { "epoch": 2.349691257718557, "grad_norm": 37.2510986328125, "learning_rate": 1.5908538000835693e-07, "logits/chosen": -19.714887619018555, "logits/rejected": -18.9355525970459, "logps/chosen": -344.7846374511719, "logps/rejected": -316.2711486816406, "loss": 0.5686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.372788906097412, "rewards/margins": 0.8576892614364624, "rewards/rejected": 2.5150997638702393, "step": 50610 }, { "epoch": 2.35015553182599, "grad_norm": 177.51473999023438, "learning_rate": 1.5905752356191092e-07, "logits/chosen": -19.845827102661133, "logits/rejected": -18.9919376373291, "logps/chosen": -413.9380798339844, "logps/rejected": -367.6326599121094, "loss": 0.5991, "rewards/accuracies": 0.5, "rewards/chosen": 3.5378291606903076, "rewards/margins": 0.6660305857658386, "rewards/rejected": 2.871798276901245, "step": 50620 }, { "epoch": 2.350619805933423, "grad_norm": 32.85700988769531, "learning_rate": 1.5902966711546496e-07, "logits/chosen": -20.768781661987305, "logits/rejected": -19.522727966308594, "logps/chosen": -525.1693725585938, "logps/rejected": -350.2921142578125, "loss": 0.2935, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.093293190002441, "rewards/margins": 1.90605890750885, "rewards/rejected": 2.187234401702881, "step": 50630 }, { "epoch": 2.351084080040856, "grad_norm": 166.77590942382812, "learning_rate": 1.5900181066901898e-07, "logits/chosen": 
-18.239131927490234, "logits/rejected": -17.5629940032959, "logps/chosen": -471.87969970703125, "logps/rejected": -406.6654968261719, "loss": 0.8434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.639646530151367, "rewards/margins": 1.2837976217269897, "rewards/rejected": 3.355848789215088, "step": 50640 }, { "epoch": 2.351548354148289, "grad_norm": 62.87008285522461, "learning_rate": 1.5897395422257302e-07, "logits/chosen": -18.29676628112793, "logits/rejected": -17.765777587890625, "logps/chosen": -379.59588623046875, "logps/rejected": -337.9076843261719, "loss": 1.7469, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.673206090927124, "rewards/margins": -0.634533703327179, "rewards/rejected": 3.3077399730682373, "step": 50650 }, { "epoch": 2.3520126282557223, "grad_norm": 39.394752502441406, "learning_rate": 1.58946097776127e-07, "logits/chosen": -18.385025024414062, "logits/rejected": -17.999584197998047, "logps/chosen": -297.9925842285156, "logps/rejected": -274.71221923828125, "loss": 1.072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.462334394454956, "rewards/margins": 0.1077260971069336, "rewards/rejected": 1.3546082973480225, "step": 50660 }, { "epoch": 2.352476902363155, "grad_norm": 167.9298858642578, "learning_rate": 1.5891824132968105e-07, "logits/chosen": -18.632524490356445, "logits/rejected": -18.638561248779297, "logps/chosen": -358.0423583984375, "logps/rejected": -314.7579040527344, "loss": 0.7758, "rewards/accuracies": 0.5, "rewards/chosen": 2.7695605754852295, "rewards/margins": 0.9577852487564087, "rewards/rejected": 1.8117752075195312, "step": 50670 }, { "epoch": 2.3529411764705883, "grad_norm": 11.117161750793457, "learning_rate": 1.5889038488323506e-07, "logits/chosen": -17.555500030517578, "logits/rejected": -17.182613372802734, "logps/chosen": -299.1391296386719, "logps/rejected": -286.4394226074219, "loss": 0.5034, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.807600498199463, 
"rewards/margins": 1.4400275945663452, "rewards/rejected": 1.367572546005249, "step": 50680 }, { "epoch": 2.353405450578021, "grad_norm": 37.2521858215332, "learning_rate": 1.588625284367891e-07, "logits/chosen": -20.220518112182617, "logits/rejected": -18.610933303833008, "logps/chosen": -551.9969482421875, "logps/rejected": -403.90338134765625, "loss": 0.337, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.890468597412109, "rewards/margins": 2.0013866424560547, "rewards/rejected": 2.889082193374634, "step": 50690 }, { "epoch": 2.3538697246854543, "grad_norm": 67.16207122802734, "learning_rate": 1.588346719903431e-07, "logits/chosen": -18.17731285095215, "logits/rejected": -18.19113540649414, "logps/chosen": -332.724365234375, "logps/rejected": -372.3261413574219, "loss": 1.025, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3825926780700684, "rewards/margins": 0.5530655980110168, "rewards/rejected": 1.8295272588729858, "step": 50700 }, { "epoch": 2.354333998792887, "grad_norm": 0.343769907951355, "learning_rate": 1.588068155438971e-07, "logits/chosen": -18.524694442749023, "logits/rejected": -17.38602066040039, "logps/chosen": -437.4776916503906, "logps/rejected": -271.645263671875, "loss": 0.391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.33882999420166, "rewards/margins": 2.752349376678467, "rewards/rejected": 1.586480975151062, "step": 50710 }, { "epoch": 2.3547982729003203, "grad_norm": 50.466514587402344, "learning_rate": 1.5877895909745115e-07, "logits/chosen": -20.606231689453125, "logits/rejected": -19.085407257080078, "logps/chosen": -276.3260192871094, "logps/rejected": -247.59201049804688, "loss": 0.6019, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.023902416229248, "rewards/margins": 1.0496113300323486, "rewards/rejected": 1.9742908477783203, "step": 50720 }, { "epoch": 2.3552625470077535, "grad_norm": 41.437843322753906, "learning_rate": 1.5875110265100513e-07, "logits/chosen": 
-19.14271354675293, "logits/rejected": -18.146413803100586, "logps/chosen": -412.73065185546875, "logps/rejected": -346.64862060546875, "loss": 0.4602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.397129535675049, "rewards/margins": 1.6638915538787842, "rewards/rejected": 2.7332377433776855, "step": 50730 }, { "epoch": 2.3557268211151863, "grad_norm": 131.20462036132812, "learning_rate": 1.5872324620455915e-07, "logits/chosen": -18.75772476196289, "logits/rejected": -18.04583168029785, "logps/chosen": -460.4707946777344, "logps/rejected": -397.3710632324219, "loss": 0.8534, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7365965843200684, "rewards/margins": 0.9002925753593445, "rewards/rejected": 2.836304187774658, "step": 50740 }, { "epoch": 2.3561910952226195, "grad_norm": 230.2840576171875, "learning_rate": 1.586953897581132e-07, "logits/chosen": -18.91798210144043, "logits/rejected": -18.37200927734375, "logps/chosen": -373.39947509765625, "logps/rejected": -273.85943603515625, "loss": 1.0229, "rewards/accuracies": 0.5, "rewards/chosen": 4.401178359985352, "rewards/margins": 1.58627450466156, "rewards/rejected": 2.814903736114502, "step": 50750 }, { "epoch": 2.3566553693300527, "grad_norm": 192.2322540283203, "learning_rate": 1.586675333116672e-07, "logits/chosen": -18.727453231811523, "logits/rejected": -18.505582809448242, "logps/chosen": -351.3011779785156, "logps/rejected": -359.43804931640625, "loss": 0.7617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.863974094390869, "rewards/margins": 0.3352193832397461, "rewards/rejected": 2.528754711151123, "step": 50760 }, { "epoch": 2.3571196434374855, "grad_norm": 160.22579956054688, "learning_rate": 1.5863967686522122e-07, "logits/chosen": -19.03335952758789, "logits/rejected": -18.351802825927734, "logps/chosen": -404.99871826171875, "logps/rejected": -369.06329345703125, "loss": 0.9541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.399669647216797, 
"rewards/margins": 1.0941128730773926, "rewards/rejected": 3.3055567741394043, "step": 50770 }, { "epoch": 2.3575839175449187, "grad_norm": 81.40995025634766, "learning_rate": 1.5861182041877523e-07, "logits/chosen": -18.649808883666992, "logits/rejected": -18.596141815185547, "logps/chosen": -365.80499267578125, "logps/rejected": -330.21722412109375, "loss": 1.152, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.298510789871216, "rewards/margins": -0.12656784057617188, "rewards/rejected": 2.4250781536102295, "step": 50780 }, { "epoch": 2.3580481916523515, "grad_norm": 15.279769897460938, "learning_rate": 1.5858396397232925e-07, "logits/chosen": -18.596817016601562, "logits/rejected": -18.214061737060547, "logps/chosen": -379.03564453125, "logps/rejected": -294.99969482421875, "loss": 0.5617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7901928424835205, "rewards/margins": 1.061059832572937, "rewards/rejected": 2.729133129119873, "step": 50790 }, { "epoch": 2.3585124657597847, "grad_norm": 74.39765167236328, "learning_rate": 1.585561075258833e-07, "logits/chosen": -18.64910125732422, "logits/rejected": -18.211183547973633, "logps/chosen": -396.177978515625, "logps/rejected": -425.1646423339844, "loss": 0.8855, "rewards/accuracies": 0.5, "rewards/chosen": 3.3615310192108154, "rewards/margins": -0.03373271971940994, "rewards/rejected": 3.395263671875, "step": 50800 }, { "epoch": 2.3589767398672175, "grad_norm": 9.90788745880127, "learning_rate": 1.5852825107943728e-07, "logits/chosen": -19.290285110473633, "logits/rejected": -18.150020599365234, "logps/chosen": -397.6984558105469, "logps/rejected": -277.45147705078125, "loss": 0.6105, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.278054237365723, "rewards/margins": 1.8788951635360718, "rewards/rejected": 2.3991591930389404, "step": 50810 }, { "epoch": 2.3594410139746507, "grad_norm": 6.990574359893799, "learning_rate": 1.5850039463299132e-07, "logits/chosen": 
-19.298023223876953, "logits/rejected": -18.46904945373535, "logps/chosen": -341.35699462890625, "logps/rejected": -288.742919921875, "loss": 0.4175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7261109352111816, "rewards/margins": 1.6616195440292358, "rewards/rejected": 2.064490795135498, "step": 50820 }, { "epoch": 2.359905288082084, "grad_norm": 0.5314731001853943, "learning_rate": 1.5847253818654533e-07, "logits/chosen": -18.60323715209961, "logits/rejected": -17.756935119628906, "logps/chosen": -406.94873046875, "logps/rejected": -344.35894775390625, "loss": 0.6741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9362387657165527, "rewards/margins": 1.1932131052017212, "rewards/rejected": 1.743025779724121, "step": 50830 }, { "epoch": 2.3603695621895167, "grad_norm": 59.37677764892578, "learning_rate": 1.5844468174009937e-07, "logits/chosen": -19.591827392578125, "logits/rejected": -19.33322525024414, "logps/chosen": -423.04095458984375, "logps/rejected": -387.5992431640625, "loss": 0.9473, "rewards/accuracies": 0.5, "rewards/chosen": 3.335536241531372, "rewards/margins": 0.15807215869426727, "rewards/rejected": 3.177464485168457, "step": 50840 }, { "epoch": 2.36083383629695, "grad_norm": 8.654829978942871, "learning_rate": 1.5841682529365336e-07, "logits/chosen": -19.681243896484375, "logits/rejected": -16.869308471679688, "logps/chosen": -382.61102294921875, "logps/rejected": -166.72415161132812, "loss": 0.1809, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.599971055984497, "rewards/margins": 2.576482057571411, "rewards/rejected": 1.0234891176223755, "step": 50850 }, { "epoch": 2.3612981104043826, "grad_norm": 17.86022186279297, "learning_rate": 1.5838896884720738e-07, "logits/chosen": -18.394412994384766, "logits/rejected": -17.989168167114258, "logps/chosen": -391.8075866699219, "logps/rejected": -344.51776123046875, "loss": 0.6248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4711239337921143, 
"rewards/margins": 0.5471392869949341, "rewards/rejected": 1.9239851236343384, "step": 50860 }, { "epoch": 2.361762384511816, "grad_norm": 52.76749801635742, "learning_rate": 1.5836111240076142e-07, "logits/chosen": -19.147289276123047, "logits/rejected": -19.089712142944336, "logps/chosen": -340.8020324707031, "logps/rejected": -347.82916259765625, "loss": 0.3469, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.507126569747925, "rewards/margins": 1.414513111114502, "rewards/rejected": 2.092613697052002, "step": 50870 }, { "epoch": 2.3622266586192486, "grad_norm": 40.86972427368164, "learning_rate": 1.5833325595431543e-07, "logits/chosen": -17.375446319580078, "logits/rejected": -17.327396392822266, "logps/chosen": -390.09613037109375, "logps/rejected": -335.4814147949219, "loss": 1.152, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3748061656951904, "rewards/margins": 0.3638904094696045, "rewards/rejected": 2.010915756225586, "step": 50880 }, { "epoch": 2.362690932726682, "grad_norm": 201.59262084960938, "learning_rate": 1.5830539950786942e-07, "logits/chosen": -19.033132553100586, "logits/rejected": -19.095487594604492, "logps/chosen": -406.1591796875, "logps/rejected": -431.4056701660156, "loss": 0.9573, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.375760078430176, "rewards/margins": 0.21315374970436096, "rewards/rejected": 4.162606716156006, "step": 50890 }, { "epoch": 2.363155206834115, "grad_norm": 59.93116760253906, "learning_rate": 1.5827754306142346e-07, "logits/chosen": -18.474468231201172, "logits/rejected": -18.027339935302734, "logps/chosen": -314.5852355957031, "logps/rejected": -308.6156005859375, "loss": 0.7017, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1034297943115234, "rewards/margins": 1.0871456861495972, "rewards/rejected": 2.016284465789795, "step": 50900 }, { "epoch": 2.363619480941548, "grad_norm": 254.30923461914062, "learning_rate": 1.5824968661497748e-07, "logits/chosen": 
-19.215145111083984, "logits/rejected": -18.770519256591797, "logps/chosen": -435.84222412109375, "logps/rejected": -368.11175537109375, "loss": 0.5388, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.2734785079956055, "rewards/margins": 1.2077839374542236, "rewards/rejected": 3.0656943321228027, "step": 50910 }, { "epoch": 2.364083755048981, "grad_norm": 69.4820785522461, "learning_rate": 1.582218301685315e-07, "logits/chosen": -19.258581161499023, "logits/rejected": -18.00261878967285, "logps/chosen": -401.70037841796875, "logps/rejected": -302.642578125, "loss": 0.5193, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.719510316848755, "rewards/margins": 1.2988859415054321, "rewards/rejected": 2.420624256134033, "step": 50920 }, { "epoch": 2.364548029156414, "grad_norm": 230.75375366210938, "learning_rate": 1.581939737220855e-07, "logits/chosen": -18.732707977294922, "logits/rejected": -18.51235580444336, "logps/chosen": -414.0220642089844, "logps/rejected": -388.51788330078125, "loss": 0.8616, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1112723350524902, "rewards/margins": 0.5512979626655579, "rewards/rejected": 2.5599746704101562, "step": 50930 }, { "epoch": 2.365012303263847, "grad_norm": 51.874820709228516, "learning_rate": 1.5816611727563955e-07, "logits/chosen": -18.25766372680664, "logits/rejected": -17.64600944519043, "logps/chosen": -393.9187927246094, "logps/rejected": -331.2581481933594, "loss": 0.5819, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0583431720733643, "rewards/margins": 1.3615057468414307, "rewards/rejected": 1.6968374252319336, "step": 50940 }, { "epoch": 2.36547657737128, "grad_norm": 31.998472213745117, "learning_rate": 1.5813826082919356e-07, "logits/chosen": -18.97522735595703, "logits/rejected": -18.837383270263672, "logps/chosen": -442.0556640625, "logps/rejected": -379.3299865722656, "loss": 0.3291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
3.9655003547668457, "rewards/margins": 1.8757317066192627, "rewards/rejected": 2.089768886566162, "step": 50950 }, { "epoch": 2.365940851478713, "grad_norm": 40.49138641357422, "learning_rate": 1.5811040438274755e-07, "logits/chosen": -19.37447166442871, "logits/rejected": -18.574268341064453, "logps/chosen": -314.013427734375, "logps/rejected": -236.12203979492188, "loss": 0.3359, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.569939374923706, "rewards/margins": 2.2306251525878906, "rewards/rejected": 1.3393144607543945, "step": 50960 }, { "epoch": 2.3664051255861462, "grad_norm": 60.37934112548828, "learning_rate": 1.580825479363016e-07, "logits/chosen": -20.002315521240234, "logits/rejected": -18.746057510375977, "logps/chosen": -330.25762939453125, "logps/rejected": -222.37094116210938, "loss": 0.5575, "rewards/accuracies": 0.5, "rewards/chosen": 2.547672986984253, "rewards/margins": 0.9940013885498047, "rewards/rejected": 1.5536717176437378, "step": 50970 }, { "epoch": 2.366869399693579, "grad_norm": 276.841796875, "learning_rate": 1.580546914898556e-07, "logits/chosen": -18.436370849609375, "logits/rejected": -18.261980056762695, "logps/chosen": -417.9476013183594, "logps/rejected": -380.37298583984375, "loss": 0.7543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.357958793640137, "rewards/margins": 1.5996036529541016, "rewards/rejected": 2.7583553791046143, "step": 50980 }, { "epoch": 2.3673336738010122, "grad_norm": 64.05914306640625, "learning_rate": 1.5802683504340964e-07, "logits/chosen": -18.796335220336914, "logits/rejected": -17.827831268310547, "logps/chosen": -317.6981201171875, "logps/rejected": -266.5017395019531, "loss": 0.5563, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6224255561828613, "rewards/margins": 1.1013298034667969, "rewards/rejected": 1.5210957527160645, "step": 50990 }, { "epoch": 2.367797947908445, "grad_norm": 79.55864715576172, "learning_rate": 1.5799897859696363e-07, 
"logits/chosen": -18.896018981933594, "logits/rejected": -19.60622215270996, "logps/chosen": -382.7682800292969, "logps/rejected": -343.68719482421875, "loss": 0.7665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8729214668273926, "rewards/margins": 0.9655828475952148, "rewards/rejected": 1.9073385000228882, "step": 51000 }, { "epoch": 2.368262222015878, "grad_norm": 45.901588439941406, "learning_rate": 1.5797112215051765e-07, "logits/chosen": -19.781076431274414, "logits/rejected": -19.070079803466797, "logps/chosen": -359.10479736328125, "logps/rejected": -383.4630432128906, "loss": 0.7895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7153961658477783, "rewards/margins": 0.7184383869171143, "rewards/rejected": 2.996957778930664, "step": 51010 }, { "epoch": 2.368726496123311, "grad_norm": 104.63080596923828, "learning_rate": 1.579432657040717e-07, "logits/chosen": -18.737018585205078, "logits/rejected": -17.806339263916016, "logps/chosen": -402.1170959472656, "logps/rejected": -380.52764892578125, "loss": 0.3711, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.552865505218506, "rewards/margins": 1.090585470199585, "rewards/rejected": 2.4622802734375, "step": 51020 }, { "epoch": 2.369190770230744, "grad_norm": 21.468486785888672, "learning_rate": 1.579154092576257e-07, "logits/chosen": -19.03260612487793, "logits/rejected": -18.293346405029297, "logps/chosen": -303.57904052734375, "logps/rejected": -308.38525390625, "loss": 0.5133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6293015480041504, "rewards/margins": 0.8154524564743042, "rewards/rejected": 1.8138492107391357, "step": 51030 }, { "epoch": 2.3696550443381774, "grad_norm": 103.31358337402344, "learning_rate": 1.578875528111797e-07, "logits/chosen": -19.523082733154297, "logits/rejected": -18.75772476196289, "logps/chosen": -358.73175048828125, "logps/rejected": -347.93316650390625, "loss": 1.3163, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.153543472290039, "rewards/margins": 0.273385614156723, "rewards/rejected": 2.880157947540283, "step": 51040 }, { "epoch": 2.37011931844561, "grad_norm": 8.169642448425293, "learning_rate": 1.5785969636473373e-07, "logits/chosen": -19.498085021972656, "logits/rejected": -18.199398040771484, "logps/chosen": -387.70391845703125, "logps/rejected": -273.02459716796875, "loss": 0.3384, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.910604000091553, "rewards/margins": 2.074096918106079, "rewards/rejected": 2.8365073204040527, "step": 51050 }, { "epoch": 2.3705835925530434, "grad_norm": 63.6712760925293, "learning_rate": 1.5783183991828775e-07, "logits/chosen": -18.782878875732422, "logits/rejected": -18.033720016479492, "logps/chosen": -354.46453857421875, "logps/rejected": -371.55194091796875, "loss": 0.9729, "rewards/accuracies": 0.5, "rewards/chosen": 2.4616215229034424, "rewards/margins": 0.37006598711013794, "rewards/rejected": 2.09155535697937, "step": 51060 }, { "epoch": 2.371047866660476, "grad_norm": 86.67292022705078, "learning_rate": 1.578039834718418e-07, "logits/chosen": -19.403379440307617, "logits/rejected": -18.05588722229004, "logps/chosen": -498.1617126464844, "logps/rejected": -353.66082763671875, "loss": 0.5562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.179073810577393, "rewards/margins": 1.5689762830734253, "rewards/rejected": 2.6100974082946777, "step": 51070 }, { "epoch": 2.3715121407679094, "grad_norm": 49.10537338256836, "learning_rate": 1.5777612702539578e-07, "logits/chosen": -19.680908203125, "logits/rejected": -18.068859100341797, "logps/chosen": -421.4432067871094, "logps/rejected": -298.76629638671875, "loss": 0.2284, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.187331676483154, "rewards/margins": 2.40362811088562, "rewards/rejected": 1.7837035655975342, "step": 51080 }, { "epoch": 2.371976414875342, "grad_norm": 3.5785953998565674, "learning_rate": 
1.5774827057894982e-07, "logits/chosen": -18.881732940673828, "logits/rejected": -17.81375503540039, "logps/chosen": -412.43328857421875, "logps/rejected": -294.10308837890625, "loss": 0.5056, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5499558448791504, "rewards/margins": 1.517896056175232, "rewards/rejected": 2.032059907913208, "step": 51090 }, { "epoch": 2.3724406889827754, "grad_norm": 154.76051330566406, "learning_rate": 1.5772041413250383e-07, "logits/chosen": -19.011568069458008, "logits/rejected": -18.142114639282227, "logps/chosen": -370.72296142578125, "logps/rejected": -314.2913818359375, "loss": 1.2974, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1250758171081543, "rewards/margins": -0.08637547492980957, "rewards/rejected": 3.211451292037964, "step": 51100 }, { "epoch": 2.3729049630902086, "grad_norm": 1.9303734302520752, "learning_rate": 1.5769255768605782e-07, "logits/chosen": -18.620075225830078, "logits/rejected": -16.744487762451172, "logps/chosen": -501.98681640625, "logps/rejected": -317.9485778808594, "loss": 0.3387, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.029027462005615, "rewards/margins": 3.253343105316162, "rewards/rejected": 1.7756843566894531, "step": 51110 }, { "epoch": 2.3733692371976414, "grad_norm": 78.25831604003906, "learning_rate": 1.5766470123961186e-07, "logits/chosen": -18.627960205078125, "logits/rejected": -17.587284088134766, "logps/chosen": -407.9888000488281, "logps/rejected": -262.5634460449219, "loss": 0.3542, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.36091947555542, "rewards/margins": 2.3158373832702637, "rewards/rejected": 2.0450823307037354, "step": 51120 }, { "epoch": 2.3738335113050746, "grad_norm": 12.09037971496582, "learning_rate": 1.5763684479316587e-07, "logits/chosen": -19.180910110473633, "logits/rejected": -18.571359634399414, "logps/chosen": -391.713134765625, "logps/rejected": -358.9219665527344, "loss": 0.5222, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 4.445542812347412, "rewards/margins": 1.685307264328003, "rewards/rejected": 2.7602357864379883, "step": 51130 }, { "epoch": 2.3742977854125074, "grad_norm": 13.263880729675293, "learning_rate": 1.5760898834671992e-07, "logits/chosen": -19.263286590576172, "logits/rejected": -18.13695526123047, "logps/chosen": -357.3327941894531, "logps/rejected": -300.19091796875, "loss": 0.5689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.085716485977173, "rewards/margins": 0.7900797128677368, "rewards/rejected": 2.2956366539001465, "step": 51140 }, { "epoch": 2.3747620595199406, "grad_norm": 0.839267373085022, "learning_rate": 1.575811319002739e-07, "logits/chosen": -18.38370704650879, "logits/rejected": -18.135852813720703, "logps/chosen": -350.20501708984375, "logps/rejected": -295.30133056640625, "loss": 1.0757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.780378818511963, "rewards/margins": 0.6615477800369263, "rewards/rejected": 2.118831157684326, "step": 51150 }, { "epoch": 2.375226333627374, "grad_norm": 5.053526878356934, "learning_rate": 1.5755327545382792e-07, "logits/chosen": -19.53402328491211, "logits/rejected": -19.202003479003906, "logps/chosen": -371.542236328125, "logps/rejected": -355.6012268066406, "loss": 0.7996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.642827272415161, "rewards/margins": 0.5177985429763794, "rewards/rejected": 3.1250288486480713, "step": 51160 }, { "epoch": 2.3756906077348066, "grad_norm": 227.006103515625, "learning_rate": 1.5752541900738196e-07, "logits/chosen": -18.99485969543457, "logits/rejected": -18.401851654052734, "logps/chosen": -492.5580139160156, "logps/rejected": -437.40771484375, "loss": 0.7312, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.377241134643555, "rewards/margins": 1.0189058780670166, "rewards/rejected": 3.358335018157959, "step": 51170 }, { "epoch": 2.3761548818422398, "grad_norm": 0.9882519841194153, 
"learning_rate": 1.5749756256093597e-07, "logits/chosen": -18.571693420410156, "logits/rejected": -18.22972297668457, "logps/chosen": -336.3825378417969, "logps/rejected": -308.24530029296875, "loss": 0.9781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.28767728805542, "rewards/margins": 0.887188732624054, "rewards/rejected": 2.4004883766174316, "step": 51180 }, { "epoch": 2.3766191559496725, "grad_norm": 0.03954398259520531, "learning_rate": 1.5746970611449e-07, "logits/chosen": -18.395177841186523, "logits/rejected": -17.74747657775879, "logps/chosen": -449.19598388671875, "logps/rejected": -366.04791259765625, "loss": 0.5473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.56712007522583, "rewards/margins": 1.5710887908935547, "rewards/rejected": 2.9960312843322754, "step": 51190 }, { "epoch": 2.3770834300571058, "grad_norm": 19.098691940307617, "learning_rate": 1.57441849668044e-07, "logits/chosen": -19.931026458740234, "logits/rejected": -18.46902847290039, "logps/chosen": -476.1864318847656, "logps/rejected": -374.8934020996094, "loss": 0.6512, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.072720527648926, "rewards/margins": 1.5092743635177612, "rewards/rejected": 2.563445568084717, "step": 51200 }, { "epoch": 2.377547704164539, "grad_norm": 241.20985412597656, "learning_rate": 1.5741399322159802e-07, "logits/chosen": -19.374794006347656, "logits/rejected": -18.076929092407227, "logps/chosen": -428.2884216308594, "logps/rejected": -288.80535888671875, "loss": 0.9603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7851815223693848, "rewards/margins": 0.7325716614723206, "rewards/rejected": 3.052609443664551, "step": 51210 }, { "epoch": 2.3780119782719717, "grad_norm": 19.125751495361328, "learning_rate": 1.5738613677515206e-07, "logits/chosen": -18.735349655151367, "logits/rejected": -16.940425872802734, "logps/chosen": -388.4999084472656, "logps/rejected": -217.53268432617188, "loss": 0.2443, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.830051422119141, "rewards/margins": 2.918520212173462, "rewards/rejected": 1.9115312099456787, "step": 51220 }, { "epoch": 2.378476252379405, "grad_norm": 5.8782782554626465, "learning_rate": 1.5735828032870605e-07, "logits/chosen": -19.264408111572266, "logits/rejected": -19.299959182739258, "logps/chosen": -387.7820739746094, "logps/rejected": -303.9815368652344, "loss": 0.6679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.470362901687622, "rewards/margins": 0.994350790977478, "rewards/rejected": 2.4760117530822754, "step": 51230 }, { "epoch": 2.3789405264868377, "grad_norm": 69.36280059814453, "learning_rate": 1.573304238822601e-07, "logits/chosen": -18.38985824584961, "logits/rejected": -18.43895149230957, "logps/chosen": -359.3125915527344, "logps/rejected": -401.3561096191406, "loss": 0.7397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.820211172103882, "rewards/margins": 0.22907212376594543, "rewards/rejected": 2.591139078140259, "step": 51240 }, { "epoch": 2.379404800594271, "grad_norm": 50.120567321777344, "learning_rate": 1.573025674358141e-07, "logits/chosen": -19.517406463623047, "logits/rejected": -18.163902282714844, "logps/chosen": -464.1378479003906, "logps/rejected": -314.1368713378906, "loss": 0.3329, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.689389705657959, "rewards/margins": 1.3642070293426514, "rewards/rejected": 2.3251824378967285, "step": 51250 }, { "epoch": 2.3798690747017037, "grad_norm": 11.805145263671875, "learning_rate": 1.5727471098936814e-07, "logits/chosen": -19.20765495300293, "logits/rejected": -19.199216842651367, "logps/chosen": -359.96875, "logps/rejected": -319.5314636230469, "loss": 0.9024, "rewards/accuracies": 0.5, "rewards/chosen": 2.7047791481018066, "rewards/margins": 0.14533765614032745, "rewards/rejected": 2.559441089630127, "step": 51260 }, { "epoch": 2.380333348809137, "grad_norm": 166.62603759765625, 
"learning_rate": 1.5724685454292213e-07, "logits/chosen": -18.58897590637207, "logits/rejected": -17.90366554260254, "logps/chosen": -542.7489624023438, "logps/rejected": -410.26593017578125, "loss": 0.4834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.451268196105957, "rewards/margins": 0.8829666376113892, "rewards/rejected": 3.5683014392852783, "step": 51270 }, { "epoch": 2.38079762291657, "grad_norm": 0.9670182466506958, "learning_rate": 1.5721899809647615e-07, "logits/chosen": -18.428497314453125, "logits/rejected": -17.417598724365234, "logps/chosen": -363.11505126953125, "logps/rejected": -278.3144226074219, "loss": 0.6602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.152390718460083, "rewards/margins": 1.5111528635025024, "rewards/rejected": 1.6412378549575806, "step": 51280 }, { "epoch": 2.381261897024003, "grad_norm": 63.29927444458008, "learning_rate": 1.5719114165003019e-07, "logits/chosen": -17.976024627685547, "logits/rejected": -17.050621032714844, "logps/chosen": -391.04205322265625, "logps/rejected": -253.35317993164062, "loss": 0.5507, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0715014934539795, "rewards/margins": 1.3576569557189941, "rewards/rejected": 1.713844656944275, "step": 51290 }, { "epoch": 2.381726171131436, "grad_norm": 157.29550170898438, "learning_rate": 1.5716328520358417e-07, "logits/chosen": -19.105911254882812, "logits/rejected": -19.26554298400879, "logps/chosen": -487.05902099609375, "logps/rejected": -413.9693908691406, "loss": 0.5822, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.361098289489746, "rewards/margins": 0.5738484859466553, "rewards/rejected": 3.78725004196167, "step": 51300 }, { "epoch": 2.382190445238869, "grad_norm": 19.87238121032715, "learning_rate": 1.571354287571382e-07, "logits/chosen": -18.902067184448242, "logits/rejected": -18.415468215942383, "logps/chosen": -318.9419860839844, "logps/rejected": -267.6864318847656, "loss": 0.9427, 
"rewards/accuracies": 0.5, "rewards/chosen": 2.9162259101867676, "rewards/margins": 1.0098321437835693, "rewards/rejected": 1.9063940048217773, "step": 51310 }, { "epoch": 2.382654719346302, "grad_norm": 80.03924560546875, "learning_rate": 1.5710757231069223e-07, "logits/chosen": -19.418930053710938, "logits/rejected": -19.144973754882812, "logps/chosen": -495.4405822753906, "logps/rejected": -441.3031311035156, "loss": 0.8889, "rewards/accuracies": 0.5, "rewards/chosen": 3.6863131523132324, "rewards/margins": 0.5023923516273499, "rewards/rejected": 3.1839210987091064, "step": 51320 }, { "epoch": 2.383118993453735, "grad_norm": 15.9688138961792, "learning_rate": 1.5707971586424624e-07, "logits/chosen": -18.56361961364746, "logits/rejected": -17.94314956665039, "logps/chosen": -293.44244384765625, "logps/rejected": -225.34463500976562, "loss": 0.7982, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4655253887176514, "rewards/margins": 0.9409937858581543, "rewards/rejected": 1.524531602859497, "step": 51330 }, { "epoch": 2.383583267561168, "grad_norm": 122.52438354492188, "learning_rate": 1.5705185941780026e-07, "logits/chosen": -18.686737060546875, "logits/rejected": -18.15955924987793, "logps/chosen": -406.80230712890625, "logps/rejected": -358.6610412597656, "loss": 0.5866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.479548931121826, "rewards/margins": 0.8996186256408691, "rewards/rejected": 2.579930067062378, "step": 51340 }, { "epoch": 2.3840475416686013, "grad_norm": 56.69169616699219, "learning_rate": 1.5702400297135427e-07, "logits/chosen": -18.95024299621582, "logits/rejected": -18.2183837890625, "logps/chosen": -381.06463623046875, "logps/rejected": -349.9742431640625, "loss": 0.6279, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.10972261428833, "rewards/margins": 0.8493145108222961, "rewards/rejected": 2.2604081630706787, "step": 51350 }, { "epoch": 2.384511815776034, "grad_norm": 6.911556243896484, 
"learning_rate": 1.5699614652490831e-07, "logits/chosen": -19.420543670654297, "logits/rejected": -18.03138542175293, "logps/chosen": -341.1811218261719, "logps/rejected": -211.04354858398438, "loss": 0.3488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7835445404052734, "rewards/margins": 1.2180126905441284, "rewards/rejected": 1.5655320882797241, "step": 51360 }, { "epoch": 2.3849760898834673, "grad_norm": 2.8139121532440186, "learning_rate": 1.5696829007846233e-07, "logits/chosen": -18.366661071777344, "logits/rejected": -18.460115432739258, "logps/chosen": -379.9358215332031, "logps/rejected": -347.0257568359375, "loss": 0.7641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2486190795898438, "rewards/margins": 0.9586722254753113, "rewards/rejected": 2.2899467945098877, "step": 51370 }, { "epoch": 2.3854403639909, "grad_norm": 99.15692901611328, "learning_rate": 1.5694043363201632e-07, "logits/chosen": -18.390005111694336, "logits/rejected": -17.456727981567383, "logps/chosen": -431.3595275878906, "logps/rejected": -329.6334533691406, "loss": 0.798, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.2171854972839355, "rewards/margins": 1.545373558998108, "rewards/rejected": 2.671811819076538, "step": 51380 }, { "epoch": 2.3859046380983333, "grad_norm": 97.92093658447266, "learning_rate": 1.5691257718557036e-07, "logits/chosen": -20.07461166381836, "logits/rejected": -19.866191864013672, "logps/chosen": -396.2065124511719, "logps/rejected": -391.1078186035156, "loss": 0.7674, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.165714979171753, "rewards/margins": 0.3373335897922516, "rewards/rejected": 2.828381061553955, "step": 51390 }, { "epoch": 2.386368912205766, "grad_norm": 212.42398071289062, "learning_rate": 1.5688472073912437e-07, "logits/chosen": -20.28377342224121, "logits/rejected": -20.050762176513672, "logps/chosen": -294.0899353027344, "logps/rejected": -291.7650146484375, "loss": 0.6427, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.259655714035034, "rewards/margins": 0.835608184337616, "rewards/rejected": 2.4240479469299316, "step": 51400 }, { "epoch": 2.3868331863131993, "grad_norm": 61.93568420410156, "learning_rate": 1.5685686429267841e-07, "logits/chosen": -18.693836212158203, "logits/rejected": -17.32012939453125, "logps/chosen": -359.5525207519531, "logps/rejected": -270.4967346191406, "loss": 1.0757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3982913494110107, "rewards/margins": 1.1855900287628174, "rewards/rejected": 2.2127017974853516, "step": 51410 }, { "epoch": 2.3872974604206325, "grad_norm": 210.3836669921875, "learning_rate": 1.568290078462324e-07, "logits/chosen": -19.49297523498535, "logits/rejected": -18.453989028930664, "logps/chosen": -383.19390869140625, "logps/rejected": -314.94525146484375, "loss": 0.95, "rewards/accuracies": 0.5, "rewards/chosen": 3.4558396339416504, "rewards/margins": 1.1718655824661255, "rewards/rejected": 2.283973217010498, "step": 51420 }, { "epoch": 2.3877617345280653, "grad_norm": 65.38206481933594, "learning_rate": 1.5680115139978642e-07, "logits/chosen": -19.080463409423828, "logits/rejected": -18.261301040649414, "logps/chosen": -411.8760681152344, "logps/rejected": -317.4175720214844, "loss": 0.4374, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5562660694122314, "rewards/margins": 1.6451938152313232, "rewards/rejected": 1.9110723733901978, "step": 51430 }, { "epoch": 2.3882260086354985, "grad_norm": 121.75740051269531, "learning_rate": 1.5677329495334046e-07, "logits/chosen": -19.83421516418457, "logits/rejected": -18.79813575744629, "logps/chosen": -375.63983154296875, "logps/rejected": -273.5047302246094, "loss": 0.6125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6473536491394043, "rewards/margins": 0.6781960725784302, "rewards/rejected": 2.9691576957702637, "step": 51440 }, { "epoch": 2.3886902827429313, "grad_norm": 
5.922598838806152, "learning_rate": 1.5674543850689447e-07, "logits/chosen": -18.99922752380371, "logits/rejected": -18.629974365234375, "logps/chosen": -449.8229064941406, "logps/rejected": -363.88800048828125, "loss": 0.5448, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.258315563201904, "rewards/margins": 1.3977590799331665, "rewards/rejected": 2.8605563640594482, "step": 51450 }, { "epoch": 2.3891545568503645, "grad_norm": 41.40970993041992, "learning_rate": 1.5671758206044846e-07, "logits/chosen": -18.401798248291016, "logits/rejected": -18.438133239746094, "logps/chosen": -373.1869201660156, "logps/rejected": -316.30133056640625, "loss": 0.5702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.195621967315674, "rewards/margins": 1.4317084550857544, "rewards/rejected": 2.76391339302063, "step": 51460 }, { "epoch": 2.3896188309577973, "grad_norm": 0.4776861369609833, "learning_rate": 1.566897256140025e-07, "logits/chosen": -19.225610733032227, "logits/rejected": -18.47917366027832, "logps/chosen": -379.3155822753906, "logps/rejected": -272.006591796875, "loss": 0.9835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.003693580627441, "rewards/margins": 1.3735218048095703, "rewards/rejected": 2.630171775817871, "step": 51470 }, { "epoch": 2.3900831050652305, "grad_norm": 205.48025512695312, "learning_rate": 1.5666186916755652e-07, "logits/chosen": -18.14273452758789, "logits/rejected": -17.231510162353516, "logps/chosen": -414.684814453125, "logps/rejected": -325.92437744140625, "loss": 0.6986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4381918907165527, "rewards/margins": 1.0489819049835205, "rewards/rejected": 2.3892099857330322, "step": 51480 }, { "epoch": 2.3905473791726637, "grad_norm": 172.27810668945312, "learning_rate": 1.5663401272111053e-07, "logits/chosen": -18.598247528076172, "logits/rejected": -17.935413360595703, "logps/chosen": -417.99114990234375, "logps/rejected": -353.1872253417969, 
"loss": 0.652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.486724376678467, "rewards/margins": 0.5016859173774719, "rewards/rejected": 3.9850382804870605, "step": 51490 }, { "epoch": 2.3910116532800965, "grad_norm": 87.8739242553711, "learning_rate": 1.5660615627466454e-07, "logits/chosen": -18.521286010742188, "logits/rejected": -17.69424057006836, "logps/chosen": -399.89031982421875, "logps/rejected": -321.91168212890625, "loss": 0.6573, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.147203207015991, "rewards/margins": 0.8390780687332153, "rewards/rejected": 2.3081257343292236, "step": 51500 }, { "epoch": 2.3914759273875297, "grad_norm": 33.56450271606445, "learning_rate": 1.5657829982821859e-07, "logits/chosen": -18.890438079833984, "logits/rejected": -18.87416648864746, "logps/chosen": -499.21533203125, "logps/rejected": -426.50299072265625, "loss": 0.5303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.815390586853027, "rewards/margins": 0.9550563097000122, "rewards/rejected": 3.8603339195251465, "step": 51510 }, { "epoch": 2.3919402014949624, "grad_norm": 6.048530578613281, "learning_rate": 1.565504433817726e-07, "logits/chosen": -19.086994171142578, "logits/rejected": -18.570789337158203, "logps/chosen": -260.3722229003906, "logps/rejected": -246.80844116210938, "loss": 0.7879, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5253796577453613, "rewards/margins": 0.7876097559928894, "rewards/rejected": 1.7377700805664062, "step": 51520 }, { "epoch": 2.3924044756023957, "grad_norm": 195.91897583007812, "learning_rate": 1.565225869353266e-07, "logits/chosen": -19.28069496154785, "logits/rejected": -18.628725051879883, "logps/chosen": -393.9575500488281, "logps/rejected": -381.0025939941406, "loss": 0.9106, "rewards/accuracies": 0.5, "rewards/chosen": 3.6717326641082764, "rewards/margins": 1.2412676811218262, "rewards/rejected": 2.43046498298645, "step": 51530 }, { "epoch": 2.3928687497098284, 
"grad_norm": 18.809553146362305, "learning_rate": 1.5649473048888063e-07, "logits/chosen": -18.999469757080078, "logits/rejected": -18.367488861083984, "logps/chosen": -516.2985229492188, "logps/rejected": -365.07135009765625, "loss": 0.5648, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4370181560516357, "rewards/margins": 0.7802441120147705, "rewards/rejected": 2.6567740440368652, "step": 51540 }, { "epoch": 2.3933330238172617, "grad_norm": 55.4985466003418, "learning_rate": 1.5646687404243464e-07, "logits/chosen": -18.871994018554688, "logits/rejected": -18.969898223876953, "logps/chosen": -359.69085693359375, "logps/rejected": -358.4396667480469, "loss": 0.8459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6082229614257812, "rewards/margins": 0.9449551701545715, "rewards/rejected": 2.6632676124572754, "step": 51550 }, { "epoch": 2.393797297924695, "grad_norm": 24.00994873046875, "learning_rate": 1.5643901759598868e-07, "logits/chosen": -18.785015106201172, "logits/rejected": -18.24801254272461, "logps/chosen": -402.01055908203125, "logps/rejected": -338.4836730957031, "loss": 1.0045, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.206794023513794, "rewards/margins": 0.5332484245300293, "rewards/rejected": 2.6735455989837646, "step": 51560 }, { "epoch": 2.3942615720321276, "grad_norm": 132.9391632080078, "learning_rate": 1.5641116114954267e-07, "logits/chosen": -18.33514404296875, "logits/rejected": -18.49302101135254, "logps/chosen": -353.716552734375, "logps/rejected": -344.5335388183594, "loss": 1.0301, "rewards/accuracies": 0.5, "rewards/chosen": 3.0602970123291016, "rewards/margins": 0.5433467626571655, "rewards/rejected": 2.5169503688812256, "step": 51570 }, { "epoch": 2.394725846139561, "grad_norm": 32.395408630371094, "learning_rate": 1.563833047030967e-07, "logits/chosen": -19.372360229492188, "logits/rejected": -18.741060256958008, "logps/chosen": -504.9996032714844, "logps/rejected": -371.2862548828125, 
"loss": 0.8936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7350387573242188, "rewards/margins": 0.7134780883789062, "rewards/rejected": 3.0215611457824707, "step": 51580 }, { "epoch": 2.395190120246994, "grad_norm": 52.32352828979492, "learning_rate": 1.5635544825665073e-07, "logits/chosen": -17.90475082397461, "logits/rejected": -17.67976951599121, "logps/chosen": -321.68084716796875, "logps/rejected": -394.7749328613281, "loss": 1.0392, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4106507301330566, "rewards/margins": -0.13187289237976074, "rewards/rejected": 2.5425233840942383, "step": 51590 }, { "epoch": 2.395654394354427, "grad_norm": 72.22542572021484, "learning_rate": 1.5632759181020474e-07, "logits/chosen": -18.887422561645508, "logits/rejected": -17.889629364013672, "logps/chosen": -429.5043029785156, "logps/rejected": -266.9339904785156, "loss": 0.475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.081094264984131, "rewards/margins": 1.4791953563690186, "rewards/rejected": 1.6018987894058228, "step": 51600 }, { "epoch": 2.39611866846186, "grad_norm": 16.67037582397461, "learning_rate": 1.5629973536375876e-07, "logits/chosen": -18.717998504638672, "logits/rejected": -17.488140106201172, "logps/chosen": -467.21099853515625, "logps/rejected": -261.27142333984375, "loss": 0.7447, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.486608505249023, "rewards/margins": 1.6285756826400757, "rewards/rejected": 2.858032464981079, "step": 51610 }, { "epoch": 2.396582942569293, "grad_norm": 1.1283154487609863, "learning_rate": 1.5627187891731277e-07, "logits/chosen": -19.10833168029785, "logits/rejected": -19.066843032836914, "logps/chosen": -386.7817687988281, "logps/rejected": -323.74945068359375, "loss": 0.6459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3115947246551514, "rewards/margins": 1.5036327838897705, "rewards/rejected": 1.8079620599746704, "step": 51620 }, { "epoch": 
2.397047216676726, "grad_norm": 72.54698944091797, "learning_rate": 1.5624402247086679e-07, "logits/chosen": -19.021728515625, "logits/rejected": -17.892276763916016, "logps/chosen": -299.9142761230469, "logps/rejected": -166.44068908691406, "loss": 0.548, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2878670692443848, "rewards/margins": 1.9570682048797607, "rewards/rejected": 0.3307989239692688, "step": 51630 }, { "epoch": 2.397511490784159, "grad_norm": 39.897377014160156, "learning_rate": 1.5621616602442083e-07, "logits/chosen": -18.30782699584961, "logits/rejected": -17.743947982788086, "logps/chosen": -360.8270263671875, "logps/rejected": -289.8424377441406, "loss": 0.4239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.179555654525757, "rewards/margins": 1.2351123094558716, "rewards/rejected": 1.9444433450698853, "step": 51640 }, { "epoch": 2.397975764891592, "grad_norm": 120.91665649414062, "learning_rate": 1.5618830957797482e-07, "logits/chosen": -18.8426570892334, "logits/rejected": -18.602619171142578, "logps/chosen": -329.60064697265625, "logps/rejected": -353.6528015136719, "loss": 1.1503, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9358160495758057, "rewards/margins": -0.1217733845114708, "rewards/rejected": 3.057589292526245, "step": 51650 }, { "epoch": 2.3984400389990252, "grad_norm": 16.8330078125, "learning_rate": 1.5616045313152886e-07, "logits/chosen": -18.58091926574707, "logits/rejected": -17.801860809326172, "logps/chosen": -287.060302734375, "logps/rejected": -211.07833862304688, "loss": 0.3897, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0208497047424316, "rewards/margins": 1.6298748254776, "rewards/rejected": 1.3909746408462524, "step": 51660 }, { "epoch": 2.398904313106458, "grad_norm": 25.316837310791016, "learning_rate": 1.5613259668508287e-07, "logits/chosen": -18.7650089263916, "logits/rejected": -18.1888370513916, "logps/chosen": -348.2430419921875, "logps/rejected": 
-300.8407287597656, "loss": 1.1051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1555166244506836, "rewards/margins": 0.8255147933959961, "rewards/rejected": 2.3300020694732666, "step": 51670 }, { "epoch": 2.3993685872138912, "grad_norm": 0.11321394890546799, "learning_rate": 1.561047402386369e-07, "logits/chosen": -19.38022232055664, "logits/rejected": -18.48145866394043, "logps/chosen": -496.2146911621094, "logps/rejected": -388.76397705078125, "loss": 0.583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.028210639953613, "rewards/margins": 1.744615912437439, "rewards/rejected": 2.2835946083068848, "step": 51680 }, { "epoch": 2.399832861321324, "grad_norm": 105.81696319580078, "learning_rate": 1.560768837921909e-07, "logits/chosen": -19.940481185913086, "logits/rejected": -18.75395965576172, "logps/chosen": -422.3224182128906, "logps/rejected": -270.29608154296875, "loss": 0.2713, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.783583641052246, "rewards/margins": 2.4803032875061035, "rewards/rejected": 2.3032805919647217, "step": 51690 }, { "epoch": 2.4002971354287572, "grad_norm": 11.579093933105469, "learning_rate": 1.5604902734574491e-07, "logits/chosen": -19.529996871948242, "logits/rejected": -18.44632911682129, "logps/chosen": -376.14410400390625, "logps/rejected": -290.8898010253906, "loss": 0.4829, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9433884620666504, "rewards/margins": 1.4350485801696777, "rewards/rejected": 1.5083398818969727, "step": 51700 }, { "epoch": 2.40076140953619, "grad_norm": 13.223981857299805, "learning_rate": 1.5602117089929896e-07, "logits/chosen": -18.714569091796875, "logits/rejected": -17.797115325927734, "logps/chosen": -356.92376708984375, "logps/rejected": -331.94683837890625, "loss": 0.7477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9076790809631348, "rewards/margins": 0.7346224784851074, "rewards/rejected": 2.1730566024780273, "step": 51710 }, { 
"epoch": 2.401225683643623, "grad_norm": 3.0036022663116455, "learning_rate": 1.5599331445285294e-07, "logits/chosen": -19.490612030029297, "logits/rejected": -19.017620086669922, "logps/chosen": -470.77734375, "logps/rejected": -397.38330078125, "loss": 0.3611, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.707899808883667, "rewards/margins": 1.590211272239685, "rewards/rejected": 2.1176888942718506, "step": 51720 }, { "epoch": 2.4016899577510564, "grad_norm": 212.7918243408203, "learning_rate": 1.5596545800640696e-07, "logits/chosen": -18.780559539794922, "logits/rejected": -17.6528377532959, "logps/chosen": -398.7198181152344, "logps/rejected": -292.11993408203125, "loss": 0.5207, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.172616004943848, "rewards/margins": 1.5552537441253662, "rewards/rejected": 2.6173624992370605, "step": 51730 }, { "epoch": 2.402154231858489, "grad_norm": 5.113818645477295, "learning_rate": 1.55937601559961e-07, "logits/chosen": -18.56534767150879, "logits/rejected": -18.12112808227539, "logps/chosen": -380.3385314941406, "logps/rejected": -305.02093505859375, "loss": 0.5192, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5292277336120605, "rewards/margins": 1.027984857559204, "rewards/rejected": 1.501242756843567, "step": 51740 }, { "epoch": 2.4026185059659224, "grad_norm": 153.09872436523438, "learning_rate": 1.5590974511351501e-07, "logits/chosen": -18.261714935302734, "logits/rejected": -16.7613525390625, "logps/chosen": -469.9210510253906, "logps/rejected": -339.71783447265625, "loss": 0.7084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.396604537963867, "rewards/margins": 1.8533751964569092, "rewards/rejected": 2.543229103088379, "step": 51750 }, { "epoch": 2.403082780073355, "grad_norm": 62.61149215698242, "learning_rate": 1.5588188866706903e-07, "logits/chosen": -19.346309661865234, "logits/rejected": -18.570388793945312, "logps/chosen": -479.76544189453125, 
"logps/rejected": -319.2760314941406, "loss": 0.3911, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8227429389953613, "rewards/margins": 1.9405815601348877, "rewards/rejected": 1.8821611404418945, "step": 51760 }, { "epoch": 2.4035470541807884, "grad_norm": 68.55574035644531, "learning_rate": 1.5585403222062304e-07, "logits/chosen": -19.41703224182129, "logits/rejected": -17.66546630859375, "logps/chosen": -439.1900939941406, "logps/rejected": -288.81964111328125, "loss": 0.4204, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4478237628936768, "rewards/margins": 1.6899588108062744, "rewards/rejected": 1.7578651905059814, "step": 51770 }, { "epoch": 2.404011328288221, "grad_norm": 2.6574814319610596, "learning_rate": 1.5582617577417708e-07, "logits/chosen": -18.465412139892578, "logits/rejected": -17.722904205322266, "logps/chosen": -433.04986572265625, "logps/rejected": -289.80419921875, "loss": 0.8776, "rewards/accuracies": 0.5, "rewards/chosen": 3.114424228668213, "rewards/margins": 0.8953161239624023, "rewards/rejected": 2.2191081047058105, "step": 51780 }, { "epoch": 2.4044756023956544, "grad_norm": 64.1995849609375, "learning_rate": 1.557983193277311e-07, "logits/chosen": -19.163471221923828, "logits/rejected": -18.058805465698242, "logps/chosen": -345.06903076171875, "logps/rejected": -211.79605102539062, "loss": 0.6835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.667353391647339, "rewards/margins": 1.5020768642425537, "rewards/rejected": 1.1652765274047852, "step": 51790 }, { "epoch": 2.4049398765030876, "grad_norm": 117.58909606933594, "learning_rate": 1.5577046288128509e-07, "logits/chosen": -19.651363372802734, "logits/rejected": -19.99063491821289, "logps/chosen": -413.5614318847656, "logps/rejected": -484.353515625, "loss": 0.9201, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4897472858428955, "rewards/margins": 0.07062634080648422, "rewards/rejected": 3.419121265411377, "step": 51800 }, { 
"epoch": 2.4054041506105204, "grad_norm": 4.050717353820801, "learning_rate": 1.5574260643483913e-07, "logits/chosen": -18.507892608642578, "logits/rejected": -18.62055778503418, "logps/chosen": -225.6289520263672, "logps/rejected": -224.2484893798828, "loss": 0.7124, "rewards/accuracies": 0.5, "rewards/chosen": 2.1352429389953613, "rewards/margins": 0.7333547472953796, "rewards/rejected": 1.4018882513046265, "step": 51810 }, { "epoch": 2.4058684247179536, "grad_norm": 59.409610748291016, "learning_rate": 1.5571474998839314e-07, "logits/chosen": -18.40212631225586, "logits/rejected": -17.320531845092773, "logps/chosen": -361.38824462890625, "logps/rejected": -246.1427764892578, "loss": 0.3566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.34356689453125, "rewards/margins": 1.7606592178344727, "rewards/rejected": 1.5829074382781982, "step": 51820 }, { "epoch": 2.4063326988253864, "grad_norm": 178.1857452392578, "learning_rate": 1.5568689354194718e-07, "logits/chosen": -20.30480194091797, "logits/rejected": -19.443126678466797, "logps/chosen": -526.5902099609375, "logps/rejected": -459.96795654296875, "loss": 0.5088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.839604377746582, "rewards/margins": 1.2602908611297607, "rewards/rejected": 3.579312801361084, "step": 51830 }, { "epoch": 2.4067969729328196, "grad_norm": 3.698531150817871, "learning_rate": 1.5565903709550117e-07, "logits/chosen": -18.48533058166504, "logits/rejected": -18.841083526611328, "logps/chosen": -421.2359313964844, "logps/rejected": -418.85601806640625, "loss": 0.8698, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7118048667907715, "rewards/margins": 0.70018470287323, "rewards/rejected": 3.011620044708252, "step": 51840 }, { "epoch": 2.4072612470402523, "grad_norm": 95.23959350585938, "learning_rate": 1.5563118064905519e-07, "logits/chosen": -17.810047149658203, "logits/rejected": -18.043567657470703, "logps/chosen": -352.6786804199219, 
"logps/rejected": -372.0653381347656, "loss": 1.0276, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9214320182800293, "rewards/margins": -0.1858375072479248, "rewards/rejected": 3.107269525527954, "step": 51850 }, { "epoch": 2.4077255211476856, "grad_norm": 75.99806213378906, "learning_rate": 1.5560332420260923e-07, "logits/chosen": -19.13688087463379, "logits/rejected": -18.80457305908203, "logps/chosen": -348.52301025390625, "logps/rejected": -287.87127685546875, "loss": 0.611, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0422115325927734, "rewards/margins": 1.149254560470581, "rewards/rejected": 1.8929567337036133, "step": 51860 }, { "epoch": 2.408189795255119, "grad_norm": 31.542343139648438, "learning_rate": 1.5557546775616324e-07, "logits/chosen": -18.882659912109375, "logits/rejected": -17.653018951416016, "logps/chosen": -484.77239990234375, "logps/rejected": -328.0207214355469, "loss": 0.6107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4208946228027344, "rewards/margins": 1.2494637966156006, "rewards/rejected": 2.1714305877685547, "step": 51870 }, { "epoch": 2.4086540693625516, "grad_norm": 11.653729438781738, "learning_rate": 1.5554761130971723e-07, "logits/chosen": -18.978748321533203, "logits/rejected": -18.529735565185547, "logps/chosen": -368.28411865234375, "logps/rejected": -306.1295471191406, "loss": 0.4516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7961082458496094, "rewards/margins": 0.759792685508728, "rewards/rejected": 3.036315441131592, "step": 51880 }, { "epoch": 2.4091183434699848, "grad_norm": 17.592021942138672, "learning_rate": 1.5551975486327127e-07, "logits/chosen": -19.6623477935791, "logits/rejected": -18.506786346435547, "logps/chosen": -429.33306884765625, "logps/rejected": -371.5094299316406, "loss": 0.3586, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.682596206665039, "rewards/margins": 1.4972889423370361, "rewards/rejected": 
2.1853067874908447, "step": 51890 }, { "epoch": 2.4095826175774175, "grad_norm": 5.151429653167725, "learning_rate": 1.5549189841682528e-07, "logits/chosen": -18.726343154907227, "logits/rejected": -17.532451629638672, "logps/chosen": -406.9999084472656, "logps/rejected": -280.1834716796875, "loss": 0.463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.134154796600342, "rewards/margins": 1.4578803777694702, "rewards/rejected": 1.6762745380401611, "step": 51900 }, { "epoch": 2.4100468916848508, "grad_norm": 65.7249526977539, "learning_rate": 1.554640419703793e-07, "logits/chosen": -19.24809455871582, "logits/rejected": -18.024545669555664, "logps/chosen": -376.7288818359375, "logps/rejected": -274.2190246582031, "loss": 0.5691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3343124389648438, "rewards/margins": 1.5476114749908447, "rewards/rejected": 1.786700963973999, "step": 51910 }, { "epoch": 2.4105111657922835, "grad_norm": 6.141857147216797, "learning_rate": 1.5543618552393331e-07, "logits/chosen": -18.401700973510742, "logits/rejected": -17.829212188720703, "logps/chosen": -356.47833251953125, "logps/rejected": -281.40655517578125, "loss": 0.8423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5936474800109863, "rewards/margins": 0.7210363149642944, "rewards/rejected": 1.87261164188385, "step": 51920 }, { "epoch": 2.4109754398997167, "grad_norm": 9.609607696533203, "learning_rate": 1.5540832907748735e-07, "logits/chosen": -18.92684555053711, "logits/rejected": -19.074785232543945, "logps/chosen": -423.910888671875, "logps/rejected": -371.2474060058594, "loss": 0.5874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.656337022781372, "rewards/margins": 0.6709283590316772, "rewards/rejected": 2.9854092597961426, "step": 51930 }, { "epoch": 2.41143971400715, "grad_norm": 192.9984893798828, "learning_rate": 1.5538047263104137e-07, "logits/chosen": -18.26507568359375, "logits/rejected": -17.69737434387207, 
"logps/chosen": -430.50848388671875, "logps/rejected": -345.843994140625, "loss": 0.4971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.647365093231201, "rewards/margins": 1.242375135421753, "rewards/rejected": 2.4049899578094482, "step": 51940 }, { "epoch": 2.4119039881145827, "grad_norm": 36.04907989501953, "learning_rate": 1.5535261618459536e-07, "logits/chosen": -18.31083869934082, "logits/rejected": -17.50911521911621, "logps/chosen": -453.57958984375, "logps/rejected": -302.35760498046875, "loss": 0.9329, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1691384315490723, "rewards/margins": 0.6129711270332336, "rewards/rejected": 2.5561671257019043, "step": 51950 }, { "epoch": 2.412368262222016, "grad_norm": 1.9074772596359253, "learning_rate": 1.553247597381494e-07, "logits/chosen": -18.708127975463867, "logits/rejected": -17.323060989379883, "logps/chosen": -435.52117919921875, "logps/rejected": -252.04116821289062, "loss": 0.4274, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.007927656173706, "rewards/margins": 1.6001907587051392, "rewards/rejected": 1.407736897468567, "step": 51960 }, { "epoch": 2.4128325363294487, "grad_norm": 61.329593658447266, "learning_rate": 1.552969032917034e-07, "logits/chosen": -18.513460159301758, "logits/rejected": -17.956632614135742, "logps/chosen": -388.7508544921875, "logps/rejected": -320.6076354980469, "loss": 0.8451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.266883373260498, "rewards/margins": 1.0540461540222168, "rewards/rejected": 2.2128374576568604, "step": 51970 }, { "epoch": 2.413296810436882, "grad_norm": 1.5855076313018799, "learning_rate": 1.5526904684525745e-07, "logits/chosen": -18.755752563476562, "logits/rejected": -19.28579330444336, "logps/chosen": -328.32611083984375, "logps/rejected": -357.1782531738281, "loss": 1.0441, "rewards/accuracies": 0.5, "rewards/chosen": 2.3744914531707764, "rewards/margins": 0.6123418211936951, "rewards/rejected": 
1.7621495723724365, "step": 51980 }, { "epoch": 2.413761084544315, "grad_norm": 89.85610961914062, "learning_rate": 1.5524119039881144e-07, "logits/chosen": -18.8668155670166, "logits/rejected": -17.414430618286133, "logps/chosen": -312.6844177246094, "logps/rejected": -261.5818176269531, "loss": 0.4107, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.991767406463623, "rewards/margins": 1.47021484375, "rewards/rejected": 1.5215528011322021, "step": 51990 }, { "epoch": 2.414225358651748, "grad_norm": 125.77983856201172, "learning_rate": 1.5521333395236546e-07, "logits/chosen": -18.753124237060547, "logits/rejected": -17.229265213012695, "logps/chosen": -442.89923095703125, "logps/rejected": -303.41387939453125, "loss": 0.3271, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8929152488708496, "rewards/margins": 2.185880184173584, "rewards/rejected": 1.7070350646972656, "step": 52000 }, { "epoch": 2.414689632759181, "grad_norm": 225.09469604492188, "learning_rate": 1.551854775059195e-07, "logits/chosen": -19.187755584716797, "logits/rejected": -18.781291961669922, "logps/chosen": -434.59814453125, "logps/rejected": -396.0185852050781, "loss": 0.7671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.513103485107422, "rewards/margins": 0.4839727282524109, "rewards/rejected": 3.029130458831787, "step": 52010 }, { "epoch": 2.415153906866614, "grad_norm": 59.05079650878906, "learning_rate": 1.551576210594735e-07, "logits/chosen": -19.35542869567871, "logits/rejected": -19.216575622558594, "logps/chosen": -355.7627868652344, "logps/rejected": -345.4905090332031, "loss": 0.7257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4636192321777344, "rewards/margins": 0.6124721169471741, "rewards/rejected": 2.851147174835205, "step": 52020 }, { "epoch": 2.415618180974047, "grad_norm": 14.18035888671875, "learning_rate": 1.5512976461302753e-07, "logits/chosen": -18.396671295166016, "logits/rejected": -18.238323211669922, 
"logps/chosen": -333.9384460449219, "logps/rejected": -360.0141906738281, "loss": 0.9635, "rewards/accuracies": 0.5, "rewards/chosen": 2.5557174682617188, "rewards/margins": 0.3963707983493805, "rewards/rejected": 2.159346580505371, "step": 52030 }, { "epoch": 2.4160824550814803, "grad_norm": 19.723642349243164, "learning_rate": 1.5510190816658154e-07, "logits/chosen": -18.413288116455078, "logits/rejected": -18.118709564208984, "logps/chosen": -387.82928466796875, "logps/rejected": -332.1501770019531, "loss": 0.8452, "rewards/accuracies": 0.5, "rewards/chosen": 3.0355420112609863, "rewards/margins": 0.8984387516975403, "rewards/rejected": 2.1371028423309326, "step": 52040 }, { "epoch": 2.416546729188913, "grad_norm": 60.605350494384766, "learning_rate": 1.5507405172013556e-07, "logits/chosen": -18.267541885375977, "logits/rejected": -16.978252410888672, "logps/chosen": -407.39215087890625, "logps/rejected": -282.2293395996094, "loss": 0.7761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.116916656494141, "rewards/margins": 1.6643364429473877, "rewards/rejected": 2.452580213546753, "step": 52050 }, { "epoch": 2.4170110032963463, "grad_norm": 157.69024658203125, "learning_rate": 1.550461952736896e-07, "logits/chosen": -18.936115264892578, "logits/rejected": -18.0546932220459, "logps/chosen": -475.69451904296875, "logps/rejected": -332.7651672363281, "loss": 0.5324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.494844913482666, "rewards/margins": 1.4252064228057861, "rewards/rejected": 3.069638967514038, "step": 52060 }, { "epoch": 2.417475277403779, "grad_norm": 209.59463500976562, "learning_rate": 1.5501833882724358e-07, "logits/chosen": -18.819324493408203, "logits/rejected": -18.980859756469727, "logps/chosen": -430.2840881347656, "logps/rejected": -381.1849060058594, "loss": 1.0121, "rewards/accuracies": 0.5, "rewards/chosen": 3.9781367778778076, "rewards/margins": 0.42560452222824097, "rewards/rejected": 3.5525317192077637, "step": 
52070 }, { "epoch": 2.4179395515112123, "grad_norm": 8.48843002319336, "learning_rate": 1.5499048238079763e-07, "logits/chosen": -18.475605010986328, "logits/rejected": -17.352170944213867, "logps/chosen": -415.2696228027344, "logps/rejected": -246.2833251953125, "loss": 0.2998, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.282101154327393, "rewards/margins": 2.7909579277038574, "rewards/rejected": 1.4911431074142456, "step": 52080 }, { "epoch": 2.418403825618645, "grad_norm": 192.6295166015625, "learning_rate": 1.5496262593435164e-07, "logits/chosen": -19.017024993896484, "logits/rejected": -18.08097267150879, "logps/chosen": -371.2187194824219, "logps/rejected": -254.1614990234375, "loss": 0.4303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1809277534484863, "rewards/margins": 1.7035808563232422, "rewards/rejected": 1.477346420288086, "step": 52090 }, { "epoch": 2.4188680997260783, "grad_norm": 196.10971069335938, "learning_rate": 1.5493476948790563e-07, "logits/chosen": -19.877920150756836, "logits/rejected": -18.347583770751953, "logps/chosen": -437.056640625, "logps/rejected": -351.41021728515625, "loss": 0.9012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.019166946411133, "rewards/margins": 0.5755730867385864, "rewards/rejected": 3.443593978881836, "step": 52100 }, { "epoch": 2.4193323738335115, "grad_norm": 66.9885025024414, "learning_rate": 1.5490691304145967e-07, "logits/chosen": -19.248348236083984, "logits/rejected": -18.73495864868164, "logps/chosen": -396.9222106933594, "logps/rejected": -324.3853759765625, "loss": 0.9366, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7371304035186768, "rewards/margins": 0.4157407879829407, "rewards/rejected": 3.321389675140381, "step": 52110 }, { "epoch": 2.4197966479409443, "grad_norm": 5.0826334953308105, "learning_rate": 1.5487905659501368e-07, "logits/chosen": -18.13918685913086, "logits/rejected": -17.09124183654785, "logps/chosen": 
-398.6355285644531, "logps/rejected": -315.1822509765625, "loss": 0.4057, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.489550828933716, "rewards/margins": 1.6628398895263672, "rewards/rejected": 1.8267109394073486, "step": 52120 }, { "epoch": 2.4202609220483775, "grad_norm": 15.435962677001953, "learning_rate": 1.5485120014856773e-07, "logits/chosen": -19.61126136779785, "logits/rejected": -19.540102005004883, "logps/chosen": -451.6072692871094, "logps/rejected": -381.95526123046875, "loss": 0.8662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1274733543395996, "rewards/margins": 0.3753572404384613, "rewards/rejected": 2.7521159648895264, "step": 52130 }, { "epoch": 2.4207251961558103, "grad_norm": 73.18717956542969, "learning_rate": 1.548233437021217e-07, "logits/chosen": -18.606388092041016, "logits/rejected": -18.944561004638672, "logps/chosen": -377.36431884765625, "logps/rejected": -318.2039489746094, "loss": 0.534, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.175530433654785, "rewards/margins": 1.2697241306304932, "rewards/rejected": 2.905805826187134, "step": 52140 }, { "epoch": 2.4211894702632435, "grad_norm": 6.524217128753662, "learning_rate": 1.5479548725567573e-07, "logits/chosen": -19.21392059326172, "logits/rejected": -18.006187438964844, "logps/chosen": -363.014404296875, "logps/rejected": -278.35931396484375, "loss": 0.9172, "rewards/accuracies": 0.5, "rewards/chosen": 2.9086976051330566, "rewards/margins": 0.879522442817688, "rewards/rejected": 2.029175043106079, "step": 52150 }, { "epoch": 2.4216537443706763, "grad_norm": 81.25633239746094, "learning_rate": 1.5476763080922977e-07, "logits/chosen": -19.28605079650879, "logits/rejected": -18.421579360961914, "logps/chosen": -384.96771240234375, "logps/rejected": -338.4394836425781, "loss": 0.3135, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9621810913085938, "rewards/margins": 1.6675831079483032, "rewards/rejected": 2.29459810256958, 
"step": 52160 }, { "epoch": 2.4221180184781095, "grad_norm": 65.94570922851562, "learning_rate": 1.5473977436278378e-07, "logits/chosen": -18.04442596435547, "logits/rejected": -17.505687713623047, "logps/chosen": -362.49005126953125, "logps/rejected": -206.4272918701172, "loss": 0.5418, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1766357421875, "rewards/margins": 0.8506065607070923, "rewards/rejected": 1.3260290622711182, "step": 52170 }, { "epoch": 2.4225822925855427, "grad_norm": 5.447660446166992, "learning_rate": 1.547119179163378e-07, "logits/chosen": -18.56693458557129, "logits/rejected": -19.30197525024414, "logps/chosen": -233.71981811523438, "logps/rejected": -270.015625, "loss": 1.4718, "rewards/accuracies": 0.5, "rewards/chosen": 2.2134156227111816, "rewards/margins": -0.4992331564426422, "rewards/rejected": 2.712648868560791, "step": 52180 }, { "epoch": 2.4230465666929755, "grad_norm": 323.79498291015625, "learning_rate": 1.546840614698918e-07, "logits/chosen": -18.80748748779297, "logits/rejected": -17.833911895751953, "logps/chosen": -448.53485107421875, "logps/rejected": -382.0146484375, "loss": 0.7079, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4516232013702393, "rewards/margins": 1.6338112354278564, "rewards/rejected": 1.817811369895935, "step": 52190 }, { "epoch": 2.4235108408004087, "grad_norm": 55.42479705810547, "learning_rate": 1.5465620502344585e-07, "logits/chosen": -19.272357940673828, "logits/rejected": -18.63267707824707, "logps/chosen": -486.92974853515625, "logps/rejected": -448.11651611328125, "loss": 0.8107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.232596397399902, "rewards/margins": 0.3760971128940582, "rewards/rejected": 3.856499195098877, "step": 52200 }, { "epoch": 2.4239751149078415, "grad_norm": 251.68209838867188, "learning_rate": 1.5462834857699987e-07, "logits/chosen": -19.02312469482422, "logits/rejected": -18.51045799255371, "logps/chosen": -456.00128173828125, 
"logps/rejected": -423.54547119140625, "loss": 0.7595, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.001765727996826, "rewards/margins": 1.032833218574524, "rewards/rejected": 2.968932628631592, "step": 52210 }, { "epoch": 2.4244393890152747, "grad_norm": 71.7366714477539, "learning_rate": 1.5460049213055386e-07, "logits/chosen": -19.455652236938477, "logits/rejected": -19.008930206298828, "logps/chosen": -411.7872009277344, "logps/rejected": -371.06243896484375, "loss": 0.8098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.461281776428223, "rewards/margins": 1.1195085048675537, "rewards/rejected": 3.341773509979248, "step": 52220 }, { "epoch": 2.4249036631227074, "grad_norm": 98.91507720947266, "learning_rate": 1.545726356841079e-07, "logits/chosen": -20.194238662719727, "logits/rejected": -19.216297149658203, "logps/chosen": -432.74322509765625, "logps/rejected": -420.263916015625, "loss": 0.7617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.715092182159424, "rewards/margins": 0.4144515097141266, "rewards/rejected": 3.3006412982940674, "step": 52230 }, { "epoch": 2.4253679372301407, "grad_norm": 51.37993621826172, "learning_rate": 1.545447792376619e-07, "logits/chosen": -18.94570541381836, "logits/rejected": -18.5338191986084, "logps/chosen": -500.5592346191406, "logps/rejected": -418.7018127441406, "loss": 0.5101, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.539339065551758, "rewards/margins": 0.783469021320343, "rewards/rejected": 2.7558698654174805, "step": 52240 }, { "epoch": 2.425832211337574, "grad_norm": 24.927526473999023, "learning_rate": 1.5451692279121595e-07, "logits/chosen": -19.9262638092041, "logits/rejected": -18.25281524658203, "logps/chosen": -384.5993957519531, "logps/rejected": -313.12994384765625, "loss": 0.4234, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.466463804244995, "rewards/margins": 1.181074619293213, "rewards/rejected": 1.2853893041610718, "step": 52250 
}, { "epoch": 2.4262964854450066, "grad_norm": 272.7013244628906, "learning_rate": 1.5448906634476994e-07, "logits/chosen": -19.431903839111328, "logits/rejected": -19.57750701904297, "logps/chosen": -404.3009033203125, "logps/rejected": -377.167236328125, "loss": 0.9815, "rewards/accuracies": 0.5, "rewards/chosen": 3.305633544921875, "rewards/margins": 0.6178286075592041, "rewards/rejected": 2.687804698944092, "step": 52260 }, { "epoch": 2.42676075955244, "grad_norm": 90.83125305175781, "learning_rate": 1.5446120989832396e-07, "logits/chosen": -18.32948875427246, "logits/rejected": -17.066852569580078, "logps/chosen": -343.27142333984375, "logps/rejected": -207.61111450195312, "loss": 0.312, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2124767303466797, "rewards/margins": 2.406975746154785, "rewards/rejected": 0.805500864982605, "step": 52270 }, { "epoch": 2.4272250336598726, "grad_norm": 24.076778411865234, "learning_rate": 1.54433353451878e-07, "logits/chosen": -18.861957550048828, "logits/rejected": -18.142118453979492, "logps/chosen": -535.5298461914062, "logps/rejected": -443.17083740234375, "loss": 0.7025, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.127203941345215, "rewards/margins": 1.423633337020874, "rewards/rejected": 2.70357084274292, "step": 52280 }, { "epoch": 2.427689307767306, "grad_norm": 214.30010986328125, "learning_rate": 1.5440549700543198e-07, "logits/chosen": -18.168407440185547, "logits/rejected": -17.644641876220703, "logps/chosen": -398.79254150390625, "logps/rejected": -451.75335693359375, "loss": 1.0516, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3362209796905518, "rewards/margins": -0.06999737024307251, "rewards/rejected": 3.4062182903289795, "step": 52290 }, { "epoch": 2.4281535818747386, "grad_norm": 259.0141296386719, "learning_rate": 1.54377640558986e-07, "logits/chosen": -18.93486213684082, "logits/rejected": -19.169424057006836, "logps/chosen": -367.84136962890625, 
"logps/rejected": -360.910400390625, "loss": 1.3355, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4091274738311768, "rewards/margins": -0.13802257180213928, "rewards/rejected": 3.5471503734588623, "step": 52300 }, { "epoch": 2.428617855982172, "grad_norm": 1.2842767238616943, "learning_rate": 1.5434978411254004e-07, "logits/chosen": -19.338483810424805, "logits/rejected": -17.69796371459961, "logps/chosen": -362.7227783203125, "logps/rejected": -235.0254364013672, "loss": 0.2495, "rewards/accuracies": 1.0, "rewards/chosen": 4.21519660949707, "rewards/margins": 2.1769204139709473, "rewards/rejected": 2.038276195526123, "step": 52310 }, { "epoch": 2.429082130089605, "grad_norm": 13.710769653320312, "learning_rate": 1.5432192766609405e-07, "logits/chosen": -18.809160232543945, "logits/rejected": -17.482177734375, "logps/chosen": -404.3230895996094, "logps/rejected": -250.67349243164062, "loss": 0.3408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.842207431793213, "rewards/margins": 2.1770482063293457, "rewards/rejected": 1.6651592254638672, "step": 52320 }, { "epoch": 2.429546404197038, "grad_norm": 13.845491409301758, "learning_rate": 1.5429407121964807e-07, "logits/chosen": -18.72447967529297, "logits/rejected": -17.669965744018555, "logps/chosen": -266.8048095703125, "logps/rejected": -217.941162109375, "loss": 0.502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2218451499938965, "rewards/margins": 1.5770299434661865, "rewards/rejected": 0.6448155045509338, "step": 52330 }, { "epoch": 2.430010678304471, "grad_norm": 58.28616714477539, "learning_rate": 1.5426621477320208e-07, "logits/chosen": -19.348358154296875, "logits/rejected": -18.288803100585938, "logps/chosen": -383.3180236816406, "logps/rejected": -374.0240173339844, "loss": 0.4082, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6296920776367188, "rewards/margins": 1.4791589975357056, "rewards/rejected": 2.1505331993103027, "step": 52340 }, { 
"epoch": 2.430474952411904, "grad_norm": 6.1254377365112305, "learning_rate": 1.5423835832675612e-07, "logits/chosen": -19.292097091674805, "logits/rejected": -18.311634063720703, "logps/chosen": -325.54217529296875, "logps/rejected": -233.3027801513672, "loss": 0.407, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0235037803649902, "rewards/margins": 1.389167070388794, "rewards/rejected": 1.6343364715576172, "step": 52350 }, { "epoch": 2.430939226519337, "grad_norm": 50.98072052001953, "learning_rate": 1.5421050188031014e-07, "logits/chosen": -18.3941650390625, "logits/rejected": -17.84178352355957, "logps/chosen": -369.64794921875, "logps/rejected": -317.3180236816406, "loss": 0.9569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.136584520339966, "rewards/margins": 0.4420955777168274, "rewards/rejected": 2.694489002227783, "step": 52360 }, { "epoch": 2.43140350062677, "grad_norm": 4.740877628326416, "learning_rate": 1.5418264543386413e-07, "logits/chosen": -19.88174819946289, "logits/rejected": -18.81058692932129, "logps/chosen": -392.18231201171875, "logps/rejected": -334.1051330566406, "loss": 0.5125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.206611633300781, "rewards/margins": 1.719940423965454, "rewards/rejected": 2.486671209335327, "step": 52370 }, { "epoch": 2.431867774734203, "grad_norm": 66.05014038085938, "learning_rate": 1.5415478898741817e-07, "logits/chosen": -18.563512802124023, "logits/rejected": -18.16373062133789, "logps/chosen": -399.87664794921875, "logps/rejected": -322.9303283691406, "loss": 0.7463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2709763050079346, "rewards/margins": 0.8131658434867859, "rewards/rejected": 2.457810163497925, "step": 52380 }, { "epoch": 2.4323320488416362, "grad_norm": 34.91372299194336, "learning_rate": 1.5412693254097218e-07, "logits/chosen": -19.36277198791504, "logits/rejected": -17.79551124572754, "logps/chosen": -472.2594299316406, 
"logps/rejected": -294.6164245605469, "loss": 0.6172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8521666526794434, "rewards/margins": 1.5415141582489014, "rewards/rejected": 2.310652256011963, "step": 52390 }, { "epoch": 2.432796322949069, "grad_norm": 1.8394269943237305, "learning_rate": 1.5409907609452622e-07, "logits/chosen": -18.90438461303711, "logits/rejected": -18.358867645263672, "logps/chosen": -470.3541564941406, "logps/rejected": -489.3980407714844, "loss": 0.753, "rewards/accuracies": 0.5, "rewards/chosen": 4.443826675415039, "rewards/margins": 0.9800138473510742, "rewards/rejected": 3.4638125896453857, "step": 52400 }, { "epoch": 2.433260597056502, "grad_norm": 53.92784881591797, "learning_rate": 1.540712196480802e-07, "logits/chosen": -18.735334396362305, "logits/rejected": -17.778959274291992, "logps/chosen": -385.021484375, "logps/rejected": -297.0096130371094, "loss": 0.9687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4834907054901123, "rewards/margins": 0.5532680749893188, "rewards/rejected": 2.930222988128662, "step": 52410 }, { "epoch": 2.4337248711639354, "grad_norm": 22.212135314941406, "learning_rate": 1.5404336320163423e-07, "logits/chosen": -19.240015029907227, "logits/rejected": -17.709312438964844, "logps/chosen": -441.8147888183594, "logps/rejected": -276.81842041015625, "loss": 0.4145, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.382537841796875, "rewards/margins": 1.5594074726104736, "rewards/rejected": 1.8231303691864014, "step": 52420 }, { "epoch": 2.434189145271368, "grad_norm": 11.117981910705566, "learning_rate": 1.5401550675518827e-07, "logits/chosen": -18.910625457763672, "logits/rejected": -19.47915267944336, "logps/chosen": -407.647705078125, "logps/rejected": -379.63995361328125, "loss": 1.0864, "rewards/accuracies": 0.5, "rewards/chosen": 3.301926851272583, "rewards/margins": 0.17421841621398926, "rewards/rejected": 3.127708673477173, "step": 52430 }, { "epoch": 
2.4346534193788014, "grad_norm": 63.19638442993164, "learning_rate": 1.5398765030874228e-07, "logits/chosen": -18.407451629638672, "logits/rejected": -17.814729690551758, "logps/chosen": -410.76507568359375, "logps/rejected": -357.2228088378906, "loss": 0.6269, "rewards/accuracies": 0.5, "rewards/chosen": 2.839855670928955, "rewards/margins": 1.211218237876892, "rewards/rejected": 1.628637671470642, "step": 52440 }, { "epoch": 2.435117693486234, "grad_norm": 80.64283752441406, "learning_rate": 1.539597938622963e-07, "logits/chosen": -18.744972229003906, "logits/rejected": -18.94941520690918, "logps/chosen": -441.1548767089844, "logps/rejected": -427.53887939453125, "loss": 0.7194, "rewards/accuracies": 0.5, "rewards/chosen": 2.873157024383545, "rewards/margins": 0.24572964012622833, "rewards/rejected": 2.627427339553833, "step": 52450 }, { "epoch": 2.4355819675936674, "grad_norm": 21.126615524291992, "learning_rate": 1.539319374158503e-07, "logits/chosen": -18.488239288330078, "logits/rejected": -17.7900390625, "logps/chosen": -342.3638610839844, "logps/rejected": -289.9810485839844, "loss": 1.0895, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5748302936553955, "rewards/margins": 0.8314003944396973, "rewards/rejected": 1.7434298992156982, "step": 52460 }, { "epoch": 2.4360462417011, "grad_norm": 55.47982406616211, "learning_rate": 1.5390408096940433e-07, "logits/chosen": -18.687118530273438, "logits/rejected": -18.0006160736084, "logps/chosen": -412.60028076171875, "logps/rejected": -360.14483642578125, "loss": 0.3653, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9424076080322266, "rewards/margins": 1.2110531330108643, "rewards/rejected": 2.7313544750213623, "step": 52470 }, { "epoch": 2.4365105158085334, "grad_norm": 230.6354522705078, "learning_rate": 1.5387622452295834e-07, "logits/chosen": -19.698898315429688, "logits/rejected": -18.711498260498047, "logps/chosen": -375.90167236328125, "logps/rejected": -318.361083984375, 
"loss": 0.8581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.895920753479004, "rewards/margins": 0.5921662449836731, "rewards/rejected": 2.3037548065185547, "step": 52480 }, { "epoch": 2.4369747899159666, "grad_norm": 156.79788208007812, "learning_rate": 1.5384836807651235e-07, "logits/chosen": -18.201988220214844, "logits/rejected": -17.880041122436523, "logps/chosen": -310.43450927734375, "logps/rejected": -275.65130615234375, "loss": 0.7326, "rewards/accuracies": 0.5, "rewards/chosen": 2.744779109954834, "rewards/margins": 0.9217388033866882, "rewards/rejected": 1.8230403661727905, "step": 52490 }, { "epoch": 2.4374390640233994, "grad_norm": 79.185546875, "learning_rate": 1.538205116300664e-07, "logits/chosen": -17.8370361328125, "logits/rejected": -18.437458038330078, "logps/chosen": -380.8116149902344, "logps/rejected": -445.7484436035156, "loss": 0.9523, "rewards/accuracies": 0.5, "rewards/chosen": 2.940582752227783, "rewards/margins": -0.13864389061927795, "rewards/rejected": 3.079226493835449, "step": 52500 }, { "epoch": 2.4379033381308326, "grad_norm": 45.13273239135742, "learning_rate": 1.53795440828265e-07, "logits/chosen": -19.321731567382812, "logits/rejected": -18.24490737915039, "logps/chosen": -475.34503173828125, "logps/rejected": -331.4073486328125, "loss": 0.5323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.279645919799805, "rewards/margins": 1.6790075302124023, "rewards/rejected": 2.6006386280059814, "step": 52510 }, { "epoch": 2.4383676122382654, "grad_norm": 30.052492141723633, "learning_rate": 1.5376758438181903e-07, "logits/chosen": -18.538454055786133, "logits/rejected": -18.731863021850586, "logps/chosen": -347.76422119140625, "logps/rejected": -351.1366271972656, "loss": 1.1281, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.43668270111084, "rewards/margins": 0.17991571128368378, "rewards/rejected": 4.256767272949219, "step": 52520 }, { "epoch": 2.4388318863456986, "grad_norm": 
241.24301147460938, "learning_rate": 1.5373972793537305e-07, "logits/chosen": -19.900772094726562, "logits/rejected": -18.89101791381836, "logps/chosen": -326.1449890136719, "logps/rejected": -320.35369873046875, "loss": 0.5927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2133240699768066, "rewards/margins": 0.7079474925994873, "rewards/rejected": 2.5053765773773193, "step": 52530 }, { "epoch": 2.4392961604531314, "grad_norm": 36.98577117919922, "learning_rate": 1.5371187148892706e-07, "logits/chosen": -19.35567283630371, "logits/rejected": -18.307758331298828, "logps/chosen": -406.5379638671875, "logps/rejected": -290.5667724609375, "loss": 0.4057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1995654106140137, "rewards/margins": 1.3750859498977661, "rewards/rejected": 1.8244794607162476, "step": 52540 }, { "epoch": 2.4397604345605646, "grad_norm": 17.47770118713379, "learning_rate": 1.5368401504248107e-07, "logits/chosen": -18.398563385009766, "logits/rejected": -18.10322380065918, "logps/chosen": -506.43695068359375, "logps/rejected": -404.65020751953125, "loss": 0.3758, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.0833024978637695, "rewards/margins": 1.508949637413025, "rewards/rejected": 3.574352741241455, "step": 52550 }, { "epoch": 2.440224708667998, "grad_norm": 122.54739379882812, "learning_rate": 1.536561585960351e-07, "logits/chosen": -19.839523315429688, "logits/rejected": -18.847454071044922, "logps/chosen": -406.9482421875, "logps/rejected": -332.47479248046875, "loss": 0.5517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4556617736816406, "rewards/margins": 0.954878032207489, "rewards/rejected": 2.500784158706665, "step": 52560 }, { "epoch": 2.4406889827754306, "grad_norm": 10.698472023010254, "learning_rate": 1.536283021495891e-07, "logits/chosen": -18.5467472076416, "logits/rejected": -18.195114135742188, "logps/chosen": -272.289794921875, "logps/rejected": -270.6100158691406, "loss": 
0.7533, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.902837038040161, "rewards/margins": 0.5968375205993652, "rewards/rejected": 2.305999517440796, "step": 52570 }, { "epoch": 2.4411532568828638, "grad_norm": 0.12239547073841095, "learning_rate": 1.5360044570314312e-07, "logits/chosen": -18.85623550415039, "logits/rejected": -17.151514053344727, "logps/chosen": -392.6638488769531, "logps/rejected": -235.0405731201172, "loss": 0.3256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.449139595031738, "rewards/margins": 2.4089365005493164, "rewards/rejected": 2.0402028560638428, "step": 52580 }, { "epoch": 2.4416175309902965, "grad_norm": 96.74617004394531, "learning_rate": 1.5357258925669716e-07, "logits/chosen": -18.958454132080078, "logits/rejected": -18.647737503051758, "logps/chosen": -381.13385009765625, "logps/rejected": -364.2287902832031, "loss": 1.2031, "rewards/accuracies": 0.5, "rewards/chosen": 2.67214298248291, "rewards/margins": -0.27122658491134644, "rewards/rejected": 2.9433693885803223, "step": 52590 }, { "epoch": 2.4420818050977298, "grad_norm": 53.544254302978516, "learning_rate": 1.5354473281025117e-07, "logits/chosen": -19.237001419067383, "logits/rejected": -19.354522705078125, "logps/chosen": -451.363037109375, "logps/rejected": -428.06427001953125, "loss": 0.747, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6363143920898438, "rewards/margins": 0.18834049999713898, "rewards/rejected": 3.4479739665985107, "step": 52600 }, { "epoch": 2.4425460792051625, "grad_norm": 160.04574584960938, "learning_rate": 1.5351687636380516e-07, "logits/chosen": -18.61923599243164, "logits/rejected": -18.200885772705078, "logps/chosen": -378.8976135253906, "logps/rejected": -327.7674865722656, "loss": 1.0564, "rewards/accuracies": 0.5, "rewards/chosen": 4.311285972595215, "rewards/margins": 0.6309016346931458, "rewards/rejected": 3.6803836822509766, "step": 52610 }, { "epoch": 2.4430103533125958, "grad_norm": 
96.00594329833984, "learning_rate": 1.534890199173592e-07, "logits/chosen": -18.911163330078125, "logits/rejected": -18.812114715576172, "logps/chosen": -400.1741638183594, "logps/rejected": -375.1615295410156, "loss": 0.8996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.594433307647705, "rewards/margins": 0.22216124832630157, "rewards/rejected": 2.37227201461792, "step": 52620 }, { "epoch": 2.443474627420029, "grad_norm": 47.047462463378906, "learning_rate": 1.5346116347091322e-07, "logits/chosen": -18.703960418701172, "logits/rejected": -17.887325286865234, "logps/chosen": -525.2533569335938, "logps/rejected": -417.71905517578125, "loss": 0.9323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.805794715881348, "rewards/margins": 1.615282416343689, "rewards/rejected": 3.1905126571655273, "step": 52630 }, { "epoch": 2.4439389015274617, "grad_norm": 133.52784729003906, "learning_rate": 1.5343330702446726e-07, "logits/chosen": -18.2224063873291, "logits/rejected": -17.598533630371094, "logps/chosen": -360.70465087890625, "logps/rejected": -282.0909118652344, "loss": 0.573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0433034896850586, "rewards/margins": 1.0776466131210327, "rewards/rejected": 1.9656568765640259, "step": 52640 }, { "epoch": 2.444403175634895, "grad_norm": 27.857364654541016, "learning_rate": 1.5340545057802125e-07, "logits/chosen": -18.453439712524414, "logits/rejected": -17.867238998413086, "logps/chosen": -482.1963806152344, "logps/rejected": -450.0603942871094, "loss": 0.683, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.063554763793945, "rewards/margins": 1.0025255680084229, "rewards/rejected": 3.0610291957855225, "step": 52650 }, { "epoch": 2.4448674497423277, "grad_norm": 62.877288818359375, "learning_rate": 1.5337759413157526e-07, "logits/chosen": -18.724401473999023, "logits/rejected": -18.30324363708496, "logps/chosen": -394.90362548828125, "logps/rejected": -338.27532958984375, 
"loss": 0.762, "rewards/accuracies": 0.5, "rewards/chosen": 2.730281352996826, "rewards/margins": 0.6417984962463379, "rewards/rejected": 2.0884828567504883, "step": 52660 }, { "epoch": 2.445331723849761, "grad_norm": 2.882622718811035, "learning_rate": 1.533497376851293e-07, "logits/chosen": -18.514421463012695, "logits/rejected": -18.35893440246582, "logps/chosen": -247.74716186523438, "logps/rejected": -274.15240478515625, "loss": 1.1442, "rewards/accuracies": 0.5, "rewards/chosen": 2.161987543106079, "rewards/margins": 0.4625827670097351, "rewards/rejected": 1.6994049549102783, "step": 52670 }, { "epoch": 2.4457959979571937, "grad_norm": 86.5567626953125, "learning_rate": 1.5332188123868332e-07, "logits/chosen": -19.282306671142578, "logits/rejected": -18.599590301513672, "logps/chosen": -435.34014892578125, "logps/rejected": -408.9538269042969, "loss": 0.5236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.138615608215332, "rewards/margins": 1.2958399057388306, "rewards/rejected": 2.84277606010437, "step": 52680 }, { "epoch": 2.446260272064627, "grad_norm": 0.2896636724472046, "learning_rate": 1.5329402479223733e-07, "logits/chosen": -18.548555374145508, "logits/rejected": -17.237224578857422, "logps/chosen": -357.56536865234375, "logps/rejected": -223.5631866455078, "loss": 0.5168, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.497147798538208, "rewards/margins": 1.8325906991958618, "rewards/rejected": 1.6645572185516357, "step": 52690 }, { "epoch": 2.44672454617206, "grad_norm": 8.375426292419434, "learning_rate": 1.5326616834579135e-07, "logits/chosen": -19.683452606201172, "logits/rejected": -19.35930824279785, "logps/chosen": -405.79559326171875, "logps/rejected": -293.2606201171875, "loss": 0.6822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1828932762145996, "rewards/margins": 1.2352540493011475, "rewards/rejected": 1.9476392269134521, "step": 52700 }, { "epoch": 2.447188820279493, "grad_norm": 
41.17783737182617, "learning_rate": 1.5323831189934539e-07, "logits/chosen": -18.04738998413086, "logits/rejected": -18.470272064208984, "logps/chosen": -339.78936767578125, "logps/rejected": -374.51287841796875, "loss": 0.9157, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5423614978790283, "rewards/margins": 0.048345260322093964, "rewards/rejected": 2.494016170501709, "step": 52710 }, { "epoch": 2.447653094386926, "grad_norm": 47.82102966308594, "learning_rate": 1.532104554528994e-07, "logits/chosen": -20.493545532226562, "logits/rejected": -18.73556137084961, "logps/chosen": -579.02294921875, "logps/rejected": -361.0633850097656, "loss": 0.2543, "rewards/accuracies": 1.0, "rewards/chosen": 5.303746700286865, "rewards/margins": 2.421639919281006, "rewards/rejected": 2.8821070194244385, "step": 52720 }, { "epoch": 2.448117368494359, "grad_norm": 28.44483757019043, "learning_rate": 1.531825990064534e-07, "logits/chosen": -18.247142791748047, "logits/rejected": -17.948745727539062, "logps/chosen": -402.7361145019531, "logps/rejected": -361.04083251953125, "loss": 0.8704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1458942890167236, "rewards/margins": 0.37094175815582275, "rewards/rejected": 2.7749524116516113, "step": 52730 }, { "epoch": 2.448581642601792, "grad_norm": 108.90760803222656, "learning_rate": 1.5315474256000743e-07, "logits/chosen": -18.82939338684082, "logits/rejected": -19.429981231689453, "logps/chosen": -392.20477294921875, "logps/rejected": -396.1156005859375, "loss": 0.7001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9863414764404297, "rewards/margins": 0.2838068902492523, "rewards/rejected": 3.7025341987609863, "step": 52740 }, { "epoch": 2.449045916709225, "grad_norm": 90.47371673583984, "learning_rate": 1.5312688611356144e-07, "logits/chosen": -19.727794647216797, "logits/rejected": -19.760906219482422, "logps/chosen": -471.7347717285156, "logps/rejected": -379.4471740722656, "loss": 0.6743, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.846400737762451, "rewards/margins": 0.7150611877441406, "rewards/rejected": 3.1313397884368896, "step": 52750 }, { "epoch": 2.449510190816658, "grad_norm": 111.20680236816406, "learning_rate": 1.5309902966711543e-07, "logits/chosen": -18.537044525146484, "logits/rejected": -18.051767349243164, "logps/chosen": -393.3750915527344, "logps/rejected": -317.88800048828125, "loss": 0.6064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7905631065368652, "rewards/margins": 1.3371036052703857, "rewards/rejected": 2.4534599781036377, "step": 52760 }, { "epoch": 2.4499744649240913, "grad_norm": 75.62590026855469, "learning_rate": 1.5307117322066947e-07, "logits/chosen": -18.797405242919922, "logits/rejected": -18.531475067138672, "logps/chosen": -438.45892333984375, "logps/rejected": -397.9231262207031, "loss": 0.6714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3872859477996826, "rewards/margins": 0.5705360770225525, "rewards/rejected": 2.8167500495910645, "step": 52770 }, { "epoch": 2.450438739031524, "grad_norm": 50.01184844970703, "learning_rate": 1.530433167742235e-07, "logits/chosen": -18.017332077026367, "logits/rejected": -17.386404037475586, "logps/chosen": -374.20263671875, "logps/rejected": -288.06500244140625, "loss": 0.7163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.254767656326294, "rewards/margins": 1.084354281425476, "rewards/rejected": 2.170413017272949, "step": 52780 }, { "epoch": 2.4509030131389573, "grad_norm": 20.309741973876953, "learning_rate": 1.5301546032777753e-07, "logits/chosen": -19.285593032836914, "logits/rejected": -18.564266204833984, "logps/chosen": -473.06256103515625, "logps/rejected": -439.44891357421875, "loss": 0.7689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.626114368438721, "rewards/margins": 1.3457542657852173, "rewards/rejected": 3.280359983444214, "step": 52790 }, { "epoch": 2.45136728724639, 
"grad_norm": 24.964574813842773, "learning_rate": 1.5298760388133152e-07, "logits/chosen": -19.46308708190918, "logits/rejected": -18.806869506835938, "logps/chosen": -412.47735595703125, "logps/rejected": -423.820556640625, "loss": 0.684, "rewards/accuracies": 0.5, "rewards/chosen": 3.6204025745391846, "rewards/margins": 0.7512421607971191, "rewards/rejected": 2.8691601753234863, "step": 52800 }, { "epoch": 2.4518315613538233, "grad_norm": 91.59869384765625, "learning_rate": 1.5295974743488553e-07, "logits/chosen": -18.780960083007812, "logits/rejected": -18.31259536743164, "logps/chosen": -379.24774169921875, "logps/rejected": -280.0138854980469, "loss": 0.4417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.94635009765625, "rewards/margins": 1.8869565725326538, "rewards/rejected": 2.0593936443328857, "step": 52810 }, { "epoch": 2.4522958354612565, "grad_norm": 18.809967041015625, "learning_rate": 1.5293189098843957e-07, "logits/chosen": -18.810928344726562, "logits/rejected": -17.805179595947266, "logps/chosen": -384.27783203125, "logps/rejected": -295.45111083984375, "loss": 0.5785, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9689674377441406, "rewards/margins": 1.5429761409759521, "rewards/rejected": 2.4259915351867676, "step": 52820 }, { "epoch": 2.4527601095686893, "grad_norm": 109.81309509277344, "learning_rate": 1.529040345419936e-07, "logits/chosen": -18.522241592407227, "logits/rejected": -19.06538200378418, "logps/chosen": -356.4695129394531, "logps/rejected": -398.4049377441406, "loss": 0.6633, "rewards/accuracies": 0.5, "rewards/chosen": 3.126410484313965, "rewards/margins": 0.21124053001403809, "rewards/rejected": 2.9151697158813477, "step": 52830 }, { "epoch": 2.4532243836761225, "grad_norm": 2.027993679046631, "learning_rate": 1.528761780955476e-07, "logits/chosen": -20.28399085998535, "logits/rejected": -17.828203201293945, "logps/chosen": -521.99755859375, "logps/rejected": -317.705322265625, "loss": 0.3526, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.3171281814575195, "rewards/margins": 2.756812334060669, "rewards/rejected": 2.5603160858154297, "step": 52840 }, { "epoch": 2.4536886577835553, "grad_norm": 99.71749114990234, "learning_rate": 1.5284832164910162e-07, "logits/chosen": -18.761884689331055, "logits/rejected": -18.178911209106445, "logps/chosen": -353.0662536621094, "logps/rejected": -422.6612243652344, "loss": 0.8433, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8267922401428223, "rewards/margins": 0.4052836298942566, "rewards/rejected": 3.421508312225342, "step": 52850 }, { "epoch": 2.4541529318909885, "grad_norm": 1.3042361736297607, "learning_rate": 1.5282046520265566e-07, "logits/chosen": -18.714157104492188, "logits/rejected": -18.154918670654297, "logps/chosen": -465.314453125, "logps/rejected": -334.77386474609375, "loss": 0.4943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1868369579315186, "rewards/margins": 1.295332670211792, "rewards/rejected": 1.8915040493011475, "step": 52860 }, { "epoch": 2.4546172059984217, "grad_norm": 156.22921752929688, "learning_rate": 1.5279260875620967e-07, "logits/chosen": -19.021154403686523, "logits/rejected": -17.482040405273438, "logps/chosen": -434.0990295410156, "logps/rejected": -289.4286804199219, "loss": 0.8411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7180843353271484, "rewards/margins": 1.7332645654678345, "rewards/rejected": 1.984820008277893, "step": 52870 }, { "epoch": 2.4550814801058545, "grad_norm": 200.79220581054688, "learning_rate": 1.5276475230976366e-07, "logits/chosen": -19.09995460510254, "logits/rejected": -18.176815032958984, "logps/chosen": -405.1393127441406, "logps/rejected": -313.9993591308594, "loss": 0.5785, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4428234100341797, "rewards/margins": 1.7159541845321655, "rewards/rejected": 1.7268693447113037, "step": 52880 }, { "epoch": 2.4555457542132877, 
"grad_norm": 32.53290557861328, "learning_rate": 1.527368958633177e-07, "logits/chosen": -19.29952049255371, "logits/rejected": -17.437692642211914, "logps/chosen": -451.46441650390625, "logps/rejected": -259.30706787109375, "loss": 0.4408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7188713550567627, "rewards/margins": 1.73073410987854, "rewards/rejected": 1.9881374835968018, "step": 52890 }, { "epoch": 2.4560100283207205, "grad_norm": 5.243346214294434, "learning_rate": 1.5270903941687172e-07, "logits/chosen": -18.598878860473633, "logits/rejected": -17.69761085510254, "logps/chosen": -342.70733642578125, "logps/rejected": -275.8945617675781, "loss": 0.7126, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.373247146606445, "rewards/margins": 1.2907531261444092, "rewards/rejected": 3.082494020462036, "step": 52900 }, { "epoch": 2.4564743024281537, "grad_norm": 62.089256286621094, "learning_rate": 1.5268118297042576e-07, "logits/chosen": -19.002246856689453, "logits/rejected": -18.26821517944336, "logps/chosen": -372.65167236328125, "logps/rejected": -320.4031677246094, "loss": 1.1012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.222604274749756, "rewards/margins": 0.67537522315979, "rewards/rejected": 2.547229051589966, "step": 52910 }, { "epoch": 2.4569385765355864, "grad_norm": 68.79812622070312, "learning_rate": 1.5265332652397974e-07, "logits/chosen": -18.512285232543945, "logits/rejected": -18.822978973388672, "logps/chosen": -346.9849548339844, "logps/rejected": -334.6480407714844, "loss": 1.3028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.028242826461792, "rewards/margins": 0.27852320671081543, "rewards/rejected": 2.7497196197509766, "step": 52920 }, { "epoch": 2.4574028506430197, "grad_norm": 5.1003193855285645, "learning_rate": 1.5262547007753376e-07, "logits/chosen": -18.54053497314453, "logits/rejected": -17.690391540527344, "logps/chosen": -312.39678955078125, "logps/rejected": 
-185.31471252441406, "loss": 0.3059, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.740591049194336, "rewards/margins": 2.3809237480163574, "rewards/rejected": 1.359667420387268, "step": 52930 }, { "epoch": 2.457867124750453, "grad_norm": 59.409366607666016, "learning_rate": 1.525976136310878e-07, "logits/chosen": -18.73593521118164, "logits/rejected": -19.308956146240234, "logps/chosen": -368.8781433105469, "logps/rejected": -386.9950866699219, "loss": 1.8585, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.5430848598480225, "rewards/margins": -0.8971344828605652, "rewards/rejected": 3.4402191638946533, "step": 52940 }, { "epoch": 2.4583313988578857, "grad_norm": 2.7601277828216553, "learning_rate": 1.525697571846418e-07, "logits/chosen": -19.436582565307617, "logits/rejected": -18.965890884399414, "logps/chosen": -306.67572021484375, "logps/rejected": -298.7400817871094, "loss": 1.1878, "rewards/accuracies": 0.5, "rewards/chosen": 4.218566417694092, "rewards/margins": 0.687540590763092, "rewards/rejected": 3.5310256481170654, "step": 52950 }, { "epoch": 2.458795672965319, "grad_norm": 4.3458991050720215, "learning_rate": 1.5254190073819583e-07, "logits/chosen": -19.080886840820312, "logits/rejected": -19.296072006225586, "logps/chosen": -488.63720703125, "logps/rejected": -491.7525939941406, "loss": 0.6543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.125566005706787, "rewards/margins": 0.889388382434845, "rewards/rejected": 3.236177921295166, "step": 52960 }, { "epoch": 2.4592599470727516, "grad_norm": 9.652195930480957, "learning_rate": 1.5251404429174984e-07, "logits/chosen": -18.10638999938965, "logits/rejected": -17.671213150024414, "logps/chosen": -283.3946838378906, "logps/rejected": -246.8407440185547, "loss": 1.0638, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1399660110473633, "rewards/margins": 0.5976107716560364, "rewards/rejected": 1.5423551797866821, "step": 52970 }, { "epoch": 
2.459724221180185, "grad_norm": 170.4635772705078, "learning_rate": 1.5248618784530386e-07, "logits/chosen": -19.51816749572754, "logits/rejected": -19.249082565307617, "logps/chosen": -458.987060546875, "logps/rejected": -322.73199462890625, "loss": 0.4739, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8675758838653564, "rewards/margins": 1.7533893585205078, "rewards/rejected": 2.1141865253448486, "step": 52980 }, { "epoch": 2.4601884952876176, "grad_norm": 33.96393966674805, "learning_rate": 1.5245833139885787e-07, "logits/chosen": -19.89284896850586, "logits/rejected": -18.911890029907227, "logps/chosen": -414.52099609375, "logps/rejected": -340.1491394042969, "loss": 0.4113, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1975836753845215, "rewards/margins": 1.4338195323944092, "rewards/rejected": 1.7637643814086914, "step": 52990 }, { "epoch": 2.460652769395051, "grad_norm": 1.5785975456237793, "learning_rate": 1.524304749524119e-07, "logits/chosen": -20.33722496032715, "logits/rejected": -18.82451057434082, "logps/chosen": -374.85015869140625, "logps/rejected": -247.63619995117188, "loss": 0.4258, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4343104362487793, "rewards/margins": 1.0607149600982666, "rewards/rejected": 1.3735958337783813, "step": 53000 }, { "epoch": 2.461117043502484, "grad_norm": 3.2675559520721436, "learning_rate": 1.5240261850596593e-07, "logits/chosen": -19.841903686523438, "logits/rejected": -19.009445190429688, "logps/chosen": -313.42327880859375, "logps/rejected": -223.9126434326172, "loss": 0.5128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0554232597351074, "rewards/margins": 1.5564677715301514, "rewards/rejected": 1.4989551305770874, "step": 53010 }, { "epoch": 2.461581317609917, "grad_norm": 91.21602630615234, "learning_rate": 1.5237476205951994e-07, "logits/chosen": -17.796335220336914, "logits/rejected": -17.972267150878906, "logps/chosen": -421.8002014160156, 
"logps/rejected": -311.14532470703125, "loss": 1.1713, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.420983076095581, "rewards/margins": -0.4206262230873108, "rewards/rejected": 2.841609239578247, "step": 53020 }, { "epoch": 2.46204559171735, "grad_norm": 24.524385452270508, "learning_rate": 1.5234690561307393e-07, "logits/chosen": -19.05816650390625, "logits/rejected": -18.84406089782715, "logps/chosen": -338.69476318359375, "logps/rejected": -325.76690673828125, "loss": 0.5592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.771409273147583, "rewards/margins": 0.8668280839920044, "rewards/rejected": 1.904581069946289, "step": 53030 }, { "epoch": 2.462509865824783, "grad_norm": 118.60974884033203, "learning_rate": 1.5231904916662797e-07, "logits/chosen": -18.49324607849121, "logits/rejected": -17.778982162475586, "logps/chosen": -423.5546875, "logps/rejected": -338.5749816894531, "loss": 0.5908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5604069232940674, "rewards/margins": 1.352645754814148, "rewards/rejected": 2.20776104927063, "step": 53040 }, { "epoch": 2.462974139932216, "grad_norm": 68.48793029785156, "learning_rate": 1.5229119272018199e-07, "logits/chosen": -18.60799789428711, "logits/rejected": -18.560441970825195, "logps/chosen": -361.5080871582031, "logps/rejected": -320.5355529785156, "loss": 0.7213, "rewards/accuracies": 0.5, "rewards/chosen": 3.1445555686950684, "rewards/margins": 0.8804010152816772, "rewards/rejected": 2.2641544342041016, "step": 53050 }, { "epoch": 2.463438414039649, "grad_norm": 21.363378524780273, "learning_rate": 1.5226333627373603e-07, "logits/chosen": -19.48700523376465, "logits/rejected": -18.658035278320312, "logps/chosen": -383.213134765625, "logps/rejected": -340.8616943359375, "loss": 1.0195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.822653293609619, "rewards/margins": 0.15646040439605713, "rewards/rejected": 3.6661930084228516, "step": 53060 }, { "epoch": 
2.463902688147082, "grad_norm": 19.437332153320312, "learning_rate": 1.5223547982729002e-07, "logits/chosen": -19.01873207092285, "logits/rejected": -18.563053131103516, "logps/chosen": -375.3715515136719, "logps/rejected": -358.8817138671875, "loss": 0.557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8319907188415527, "rewards/margins": 0.3928110599517822, "rewards/rejected": 2.4391794204711914, "step": 53070 }, { "epoch": 2.4643669622545152, "grad_norm": 0.05393039435148239, "learning_rate": 1.5220762338084403e-07, "logits/chosen": -18.485471725463867, "logits/rejected": -17.716983795166016, "logps/chosen": -359.09674072265625, "logps/rejected": -206.71456909179688, "loss": 0.2941, "rewards/accuracies": 1.0, "rewards/chosen": 4.4988203048706055, "rewards/margins": 2.655359983444214, "rewards/rejected": 1.8434604406356812, "step": 53080 }, { "epoch": 2.464831236361948, "grad_norm": 27.230865478515625, "learning_rate": 1.5217976693439807e-07, "logits/chosen": -19.336198806762695, "logits/rejected": -18.142929077148438, "logps/chosen": -339.5447692871094, "logps/rejected": -280.6222839355469, "loss": 0.6348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.154191493988037, "rewards/margins": 0.9791128039360046, "rewards/rejected": 2.175078868865967, "step": 53090 }, { "epoch": 2.4652955104693812, "grad_norm": 7.751309394836426, "learning_rate": 1.5215191048795209e-07, "logits/chosen": -18.803190231323242, "logits/rejected": -18.036090850830078, "logps/chosen": -339.02935791015625, "logps/rejected": -290.2982482910156, "loss": 0.5543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.89493727684021, "rewards/margins": 1.2339547872543335, "rewards/rejected": 1.6609824895858765, "step": 53100 }, { "epoch": 2.465759784576814, "grad_norm": 49.90041732788086, "learning_rate": 1.521240540415061e-07, "logits/chosen": -18.49759864807129, "logits/rejected": -18.262439727783203, "logps/chosen": -489.3728942871094, "logps/rejected": 
-407.63043212890625, "loss": 0.8586, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6581196784973145, "rewards/margins": 0.20165696740150452, "rewards/rejected": 3.456462860107422, "step": 53110 }, { "epoch": 2.466224058684247, "grad_norm": 3.2822470664978027, "learning_rate": 1.5209619759506011e-07, "logits/chosen": -18.346302032470703, "logits/rejected": -17.47702980041504, "logps/chosen": -441.7188415527344, "logps/rejected": -343.7308044433594, "loss": 0.5532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0907809734344482, "rewards/margins": 1.2596296072006226, "rewards/rejected": 1.8311512470245361, "step": 53120 }, { "epoch": 2.46668833279168, "grad_norm": 17.747478485107422, "learning_rate": 1.5206834114861416e-07, "logits/chosen": -18.954429626464844, "logits/rejected": -17.750789642333984, "logps/chosen": -382.74957275390625, "logps/rejected": -271.5684509277344, "loss": 0.4627, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.05587100982666, "rewards/margins": 1.7290124893188477, "rewards/rejected": 2.3268589973449707, "step": 53130 }, { "epoch": 2.467152606899113, "grad_norm": 271.3600158691406, "learning_rate": 1.5204048470216814e-07, "logits/chosen": -19.01087760925293, "logits/rejected": -18.50437355041504, "logps/chosen": -294.8229064941406, "logps/rejected": -264.5226745605469, "loss": 0.614, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7240819931030273, "rewards/margins": 0.3756498396396637, "rewards/rejected": 2.3484318256378174, "step": 53140 }, { "epoch": 2.4676168810065464, "grad_norm": 23.959321975708008, "learning_rate": 1.5201262825572216e-07, "logits/chosen": -18.232515335083008, "logits/rejected": -18.454227447509766, "logps/chosen": -386.91558837890625, "logps/rejected": -340.16461181640625, "loss": 1.0678, "rewards/accuracies": 0.5, "rewards/chosen": 2.620936155319214, "rewards/margins": 0.39454373717308044, "rewards/rejected": 2.2263922691345215, "step": 53150 }, { "epoch": 
2.468081155113979, "grad_norm": 114.9000473022461, "learning_rate": 1.519847718092762e-07, "logits/chosen": -18.1705379486084, "logits/rejected": -16.950233459472656, "logps/chosen": -309.58221435546875, "logps/rejected": -187.06576538085938, "loss": 0.3385, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7327024936676025, "rewards/margins": 1.791184425354004, "rewards/rejected": 0.9415181875228882, "step": 53160 }, { "epoch": 2.4685454292214124, "grad_norm": 59.98787307739258, "learning_rate": 1.5195691536283021e-07, "logits/chosen": -18.648395538330078, "logits/rejected": -18.31624984741211, "logps/chosen": -364.13323974609375, "logps/rejected": -371.28521728515625, "loss": 0.8562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.379162549972534, "rewards/margins": 0.5320199131965637, "rewards/rejected": 2.8471426963806152, "step": 53170 }, { "epoch": 2.469009703328845, "grad_norm": 157.03709411621094, "learning_rate": 1.519290589163842e-07, "logits/chosen": -20.241260528564453, "logits/rejected": -19.23177146911621, "logps/chosen": -317.3421630859375, "logps/rejected": -262.449462890625, "loss": 1.0615, "rewards/accuracies": 0.5, "rewards/chosen": 2.475494146347046, "rewards/margins": 0.004118537995964289, "rewards/rejected": 2.4713757038116455, "step": 53180 }, { "epoch": 2.4694739774362784, "grad_norm": 137.802734375, "learning_rate": 1.5190120246993824e-07, "logits/chosen": -18.19420623779297, "logits/rejected": -18.245573043823242, "logps/chosen": -386.58514404296875, "logps/rejected": -292.14764404296875, "loss": 0.8381, "rewards/accuracies": 0.5, "rewards/chosen": 3.7442145347595215, "rewards/margins": 0.310010701417923, "rewards/rejected": 3.4342041015625, "step": 53190 }, { "epoch": 2.469938251543711, "grad_norm": 23.271169662475586, "learning_rate": 1.5187334602349226e-07, "logits/chosen": -19.268354415893555, "logits/rejected": -18.912870407104492, "logps/chosen": -409.56341552734375, "logps/rejected": -382.77508544921875, 
"loss": 0.7155, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.862351179122925, "rewards/margins": 0.5390551090240479, "rewards/rejected": 3.323296308517456, "step": 53200 }, { "epoch": 2.4704025256511444, "grad_norm": 283.57183837890625, "learning_rate": 1.518454895770463e-07, "logits/chosen": -19.780437469482422, "logits/rejected": -19.432056427001953, "logps/chosen": -354.0049743652344, "logps/rejected": -344.3140869140625, "loss": 0.9466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2338192462921143, "rewards/margins": 0.22670695185661316, "rewards/rejected": 3.0071122646331787, "step": 53210 }, { "epoch": 2.4708667997585776, "grad_norm": 80.3846435546875, "learning_rate": 1.5181763313060029e-07, "logits/chosen": -19.088542938232422, "logits/rejected": -18.43424415588379, "logps/chosen": -483.80712890625, "logps/rejected": -329.8394470214844, "loss": 0.3237, "rewards/accuracies": 1.0, "rewards/chosen": 4.059691429138184, "rewards/margins": 1.7014967203140259, "rewards/rejected": 2.3581948280334473, "step": 53220 }, { "epoch": 2.4713310738660104, "grad_norm": 26.587261199951172, "learning_rate": 1.517897766841543e-07, "logits/chosen": -19.387537002563477, "logits/rejected": -18.722349166870117, "logps/chosen": -426.0630798339844, "logps/rejected": -373.1173095703125, "loss": 0.6362, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.390768051147461, "rewards/margins": 0.8751047253608704, "rewards/rejected": 2.5156631469726562, "step": 53230 }, { "epoch": 2.4717953479734436, "grad_norm": 3.1613786220550537, "learning_rate": 1.5176192023770834e-07, "logits/chosen": -18.96152114868164, "logits/rejected": -19.098819732666016, "logps/chosen": -378.5482177734375, "logps/rejected": -364.9604187011719, "loss": 0.7777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8154330253601074, "rewards/margins": 1.1488354206085205, "rewards/rejected": 2.6665971279144287, "step": 53240 }, { "epoch": 2.472259622080877, 
"grad_norm": 14.168865203857422, "learning_rate": 1.5173406379126236e-07, "logits/chosen": -20.095556259155273, "logits/rejected": -19.126922607421875, "logps/chosen": -460.15283203125, "logps/rejected": -345.11920166015625, "loss": 0.4903, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.803921222686768, "rewards/margins": 1.370313048362732, "rewards/rejected": 3.4336085319519043, "step": 53250 }, { "epoch": 2.4727238961883096, "grad_norm": 6.501919269561768, "learning_rate": 1.5170620734481637e-07, "logits/chosen": -17.984384536743164, "logits/rejected": -17.295217514038086, "logps/chosen": -348.76300048828125, "logps/rejected": -304.1932373046875, "loss": 0.6784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1305110454559326, "rewards/margins": 1.6167300939559937, "rewards/rejected": 1.513781189918518, "step": 53260 }, { "epoch": 2.473188170295743, "grad_norm": 243.9580078125, "learning_rate": 1.5167835089837039e-07, "logits/chosen": -17.140230178833008, "logits/rejected": -17.822895050048828, "logps/chosen": -345.28607177734375, "logps/rejected": -363.5699462890625, "loss": 1.0123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.466212749481201, "rewards/margins": 0.26543116569519043, "rewards/rejected": 2.2007815837860107, "step": 53270 }, { "epoch": 2.4736524444031756, "grad_norm": 90.91515350341797, "learning_rate": 1.5165049445192443e-07, "logits/chosen": -19.325021743774414, "logits/rejected": -18.72970962524414, "logps/chosen": -433.0101013183594, "logps/rejected": -388.9977111816406, "loss": 0.7395, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8359718322753906, "rewards/margins": 0.4127058982849121, "rewards/rejected": 3.4232661724090576, "step": 53280 }, { "epoch": 2.4741167185106088, "grad_norm": 76.48554992675781, "learning_rate": 1.5162263800547844e-07, "logits/chosen": -18.992446899414062, "logits/rejected": -18.257137298583984, "logps/chosen": -429.4844665527344, "logps/rejected": 
-425.03912353515625, "loss": 0.5219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6461288928985596, "rewards/margins": 1.2218105792999268, "rewards/rejected": 2.4243178367614746, "step": 53290 }, { "epoch": 2.4745809926180415, "grad_norm": 40.43666076660156, "learning_rate": 1.5159478155903243e-07, "logits/chosen": -19.616405487060547, "logits/rejected": -19.227764129638672, "logps/chosen": -551.7899169921875, "logps/rejected": -429.71240234375, "loss": 0.564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.260770320892334, "rewards/margins": 0.6749175786972046, "rewards/rejected": 3.5858535766601562, "step": 53300 }, { "epoch": 2.4750452667254748, "grad_norm": 66.29694366455078, "learning_rate": 1.5156692511258647e-07, "logits/chosen": -18.095674514770508, "logits/rejected": -18.369888305664062, "logps/chosen": -422.36932373046875, "logps/rejected": -397.7784423828125, "loss": 0.815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.163573741912842, "rewards/margins": 0.06455887854099274, "rewards/rejected": 3.0990147590637207, "step": 53310 }, { "epoch": 2.475509540832908, "grad_norm": 43.593292236328125, "learning_rate": 1.5153906866614048e-07, "logits/chosen": -18.512514114379883, "logits/rejected": -17.230253219604492, "logps/chosen": -298.7462158203125, "logps/rejected": -215.10775756835938, "loss": 0.5164, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4123823642730713, "rewards/margins": 0.8513935804367065, "rewards/rejected": 1.5609887838363647, "step": 53320 }, { "epoch": 2.4759738149403407, "grad_norm": 93.74784088134766, "learning_rate": 1.5151121221969447e-07, "logits/chosen": -18.93421745300293, "logits/rejected": -18.595487594604492, "logps/chosen": -334.59576416015625, "logps/rejected": -282.84112548828125, "loss": 0.4879, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8331737518310547, "rewards/margins": 1.7184215784072876, "rewards/rejected": 2.1147520542144775, "step": 53330 
}, { "epoch": 2.476438089047774, "grad_norm": 245.9062042236328, "learning_rate": 1.5148335577324851e-07, "logits/chosen": -19.308849334716797, "logits/rejected": -18.917709350585938, "logps/chosen": -380.49859619140625, "logps/rejected": -322.7213439941406, "loss": 0.918, "rewards/accuracies": 0.5, "rewards/chosen": 2.648956060409546, "rewards/margins": 0.4937383532524109, "rewards/rejected": 2.1552176475524902, "step": 53340 }, { "epoch": 2.4769023631552067, "grad_norm": 4.91728401184082, "learning_rate": 1.5145549932680253e-07, "logits/chosen": -18.458341598510742, "logits/rejected": -17.845659255981445, "logps/chosen": -458.669677734375, "logps/rejected": -326.323486328125, "loss": 0.4617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.075366020202637, "rewards/margins": 1.8544670343399048, "rewards/rejected": 2.220898389816284, "step": 53350 }, { "epoch": 2.47736663726264, "grad_norm": 65.44906616210938, "learning_rate": 1.5142764288035657e-07, "logits/chosen": -19.00712013244629, "logits/rejected": -18.841224670410156, "logps/chosen": -466.3072814941406, "logps/rejected": -413.3201599121094, "loss": 0.6051, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.29638934135437, "rewards/margins": 0.7891203165054321, "rewards/rejected": 2.5072693824768066, "step": 53360 }, { "epoch": 2.4778309113700727, "grad_norm": 91.09921264648438, "learning_rate": 1.5139978643391056e-07, "logits/chosen": -18.983667373657227, "logits/rejected": -18.57818603515625, "logps/chosen": -459.20843505859375, "logps/rejected": -402.07763671875, "loss": 0.7656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.230193138122559, "rewards/margins": 1.1321991682052612, "rewards/rejected": 3.097993850708008, "step": 53370 }, { "epoch": 2.478295185477506, "grad_norm": 81.16191864013672, "learning_rate": 1.513719299874646e-07, "logits/chosen": -18.536590576171875, "logits/rejected": -17.384952545166016, "logps/chosen": -436.9659729003906, "logps/rejected": 
-289.8564453125, "loss": 0.2831, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5424728393554688, "rewards/margins": 1.6899192333221436, "rewards/rejected": 1.852553367614746, "step": 53380 }, { "epoch": 2.478759459584939, "grad_norm": 3.873324155807495, "learning_rate": 1.513440735410186e-07, "logits/chosen": -19.243343353271484, "logits/rejected": -18.567218780517578, "logps/chosen": -416.13037109375, "logps/rejected": -325.8196105957031, "loss": 0.5902, "rewards/accuracies": 0.5, "rewards/chosen": 4.0438761711120605, "rewards/margins": 1.6665281057357788, "rewards/rejected": 2.377347946166992, "step": 53390 }, { "epoch": 2.479223733692372, "grad_norm": 136.1221923828125, "learning_rate": 1.5131621709457263e-07, "logits/chosen": -19.418643951416016, "logits/rejected": -19.74275779724121, "logps/chosen": -320.44964599609375, "logps/rejected": -322.13616943359375, "loss": 0.7746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2109763622283936, "rewards/margins": 0.3696301579475403, "rewards/rejected": 2.841346025466919, "step": 53400 }, { "epoch": 2.479688007799805, "grad_norm": 9.613317489624023, "learning_rate": 1.5128836064812664e-07, "logits/chosen": -18.83125877380371, "logits/rejected": -17.91550064086914, "logps/chosen": -403.9993591308594, "logps/rejected": -338.37628173828125, "loss": 0.42, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4810471534729004, "rewards/margins": 1.0003368854522705, "rewards/rejected": 2.480710029602051, "step": 53410 }, { "epoch": 2.480152281907238, "grad_norm": 48.99256134033203, "learning_rate": 1.5126050420168066e-07, "logits/chosen": -18.19478988647461, "logits/rejected": -17.329586029052734, "logps/chosen": -256.97308349609375, "logps/rejected": -168.1443634033203, "loss": 0.4152, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.426302671432495, "rewards/margins": 1.5799782276153564, "rewards/rejected": 0.8463243246078491, "step": 53420 }, { "epoch": 
2.480616556014671, "grad_norm": 173.44874572753906, "learning_rate": 1.512326477552347e-07, "logits/chosen": -20.031295776367188, "logits/rejected": -19.404232025146484, "logps/chosen": -406.6309814453125, "logps/rejected": -334.64886474609375, "loss": 0.711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5552024841308594, "rewards/margins": 0.5575827956199646, "rewards/rejected": 2.997619390487671, "step": 53430 }, { "epoch": 2.481080830122104, "grad_norm": 209.2871856689453, "learning_rate": 1.512047913087887e-07, "logits/chosen": -19.02389144897461, "logits/rejected": -18.675304412841797, "logps/chosen": -335.86846923828125, "logps/rejected": -334.6323547363281, "loss": 1.0523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4697539806365967, "rewards/margins": 0.5343652963638306, "rewards/rejected": 2.9353885650634766, "step": 53440 }, { "epoch": 2.481545104229537, "grad_norm": 102.79752349853516, "learning_rate": 1.511769348623427e-07, "logits/chosen": -18.693737030029297, "logits/rejected": -18.642446517944336, "logps/chosen": -323.52099609375, "logps/rejected": -336.8939514160156, "loss": 0.6796, "rewards/accuracies": 0.5, "rewards/chosen": 2.9487977027893066, "rewards/margins": 0.3407440483570099, "rewards/rejected": 2.608053684234619, "step": 53450 }, { "epoch": 2.4820093783369703, "grad_norm": 0.34038835763931274, "learning_rate": 1.5114907841589674e-07, "logits/chosen": -19.89442253112793, "logits/rejected": -18.09457778930664, "logps/chosen": -450.072265625, "logps/rejected": -340.65576171875, "loss": 0.6271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.5977888107299805, "rewards/margins": 1.6897268295288086, "rewards/rejected": 2.908062219619751, "step": 53460 }, { "epoch": 2.482473652444403, "grad_norm": 22.391830444335938, "learning_rate": 1.5112122196945076e-07, "logits/chosen": -19.0581111907959, "logits/rejected": -18.546070098876953, "logps/chosen": -312.61395263671875, "logps/rejected": 
-211.9324493408203, "loss": 0.6143, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.828965425491333, "rewards/margins": 1.4976680278778076, "rewards/rejected": 1.3312973976135254, "step": 53470 }, { "epoch": 2.4829379265518363, "grad_norm": 173.96363830566406, "learning_rate": 1.510933655230048e-07, "logits/chosen": -18.89065933227539, "logits/rejected": -18.789424896240234, "logps/chosen": -428.1337890625, "logps/rejected": -423.1568298339844, "loss": 0.5809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.248145580291748, "rewards/margins": 0.4695637822151184, "rewards/rejected": 2.7785820960998535, "step": 53480 }, { "epoch": 2.483402200659269, "grad_norm": 52.51253128051758, "learning_rate": 1.5106550907655878e-07, "logits/chosen": -18.558176040649414, "logits/rejected": -18.144319534301758, "logps/chosen": -300.22479248046875, "logps/rejected": -313.2921447753906, "loss": 1.1802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1504764556884766, "rewards/margins": 0.32504838705062866, "rewards/rejected": 1.8254282474517822, "step": 53490 }, { "epoch": 2.4838664747667023, "grad_norm": 103.9254150390625, "learning_rate": 1.510376526301128e-07, "logits/chosen": -19.14849090576172, "logits/rejected": -18.085126876831055, "logps/chosen": -476.231689453125, "logps/rejected": -363.49505615234375, "loss": 0.7046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3559391498565674, "rewards/margins": 0.5323630571365356, "rewards/rejected": 2.823575973510742, "step": 53500 }, { "epoch": 2.484330748874135, "grad_norm": 2.6985626220703125, "learning_rate": 1.5100979618366684e-07, "logits/chosen": -19.519411087036133, "logits/rejected": -18.39429473876953, "logps/chosen": -397.70379638671875, "logps/rejected": -241.304443359375, "loss": 0.7338, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2905261516571045, "rewards/margins": 1.1906859874725342, "rewards/rejected": 2.0998404026031494, "step": 53510 }, { 
"epoch": 2.4847950229815683, "grad_norm": 113.08335876464844, "learning_rate": 1.5098193973722086e-07, "logits/chosen": -19.65835952758789, "logits/rejected": -19.590059280395508, "logps/chosen": -356.1983642578125, "logps/rejected": -306.76312255859375, "loss": 0.7005, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.199644088745117, "rewards/margins": 1.1971477270126343, "rewards/rejected": 3.0024964809417725, "step": 53520 }, { "epoch": 2.4852592970890015, "grad_norm": 206.80960083007812, "learning_rate": 1.5095408329077487e-07, "logits/chosen": -18.776119232177734, "logits/rejected": -18.80817413330078, "logps/chosen": -378.0620422363281, "logps/rejected": -379.3570251464844, "loss": 0.9349, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.662904977798462, "rewards/margins": 0.24368087947368622, "rewards/rejected": 2.4192240238189697, "step": 53530 }, { "epoch": 2.4857235711964343, "grad_norm": 15.9241304397583, "learning_rate": 1.5092622684432888e-07, "logits/chosen": -18.97978401184082, "logits/rejected": -18.62192153930664, "logps/chosen": -337.76983642578125, "logps/rejected": -296.4058837890625, "loss": 0.7966, "rewards/accuracies": 0.5, "rewards/chosen": 2.9169037342071533, "rewards/margins": 0.620952844619751, "rewards/rejected": 2.2959511280059814, "step": 53540 }, { "epoch": 2.4861878453038675, "grad_norm": 75.42704772949219, "learning_rate": 1.5089837039788293e-07, "logits/chosen": -20.008115768432617, "logits/rejected": -19.363943099975586, "logps/chosen": -408.9835205078125, "logps/rejected": -356.58831787109375, "loss": 0.6089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8817286491394043, "rewards/margins": 0.9479169845581055, "rewards/rejected": 2.933811902999878, "step": 53550 }, { "epoch": 2.4866521194113003, "grad_norm": 270.9024658203125, "learning_rate": 1.508705139514369e-07, "logits/chosen": -19.205516815185547, "logits/rejected": -17.697477340698242, "logps/chosen": -398.82623291015625, 
"logps/rejected": -273.19049072265625, "loss": 0.4565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.427532196044922, "rewards/margins": 2.5485897064208984, "rewards/rejected": 1.8789424896240234, "step": 53560 }, { "epoch": 2.4871163935187335, "grad_norm": 97.94759368896484, "learning_rate": 1.5084265750499093e-07, "logits/chosen": -18.164976119995117, "logits/rejected": -17.887876510620117, "logps/chosen": -246.41397094726562, "logps/rejected": -174.9466094970703, "loss": 0.5346, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.186814785003662, "rewards/margins": 0.8270017504692078, "rewards/rejected": 1.3598132133483887, "step": 53570 }, { "epoch": 2.4875806676261663, "grad_norm": 70.07247924804688, "learning_rate": 1.5081480105854497e-07, "logits/chosen": -18.848102569580078, "logits/rejected": -18.527042388916016, "logps/chosen": -314.2196960449219, "logps/rejected": -265.31011962890625, "loss": 0.4972, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.447781801223755, "rewards/margins": 0.9613000750541687, "rewards/rejected": 2.4864816665649414, "step": 53580 }, { "epoch": 2.4880449417335995, "grad_norm": 347.547607421875, "learning_rate": 1.5078694461209898e-07, "logits/chosen": -19.16912269592285, "logits/rejected": -17.702770233154297, "logps/chosen": -353.0296325683594, "logps/rejected": -244.7626190185547, "loss": 0.4785, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.866243839263916, "rewards/margins": 1.4135633707046509, "rewards/rejected": 1.4526804685592651, "step": 53590 }, { "epoch": 2.4885092158410327, "grad_norm": 109.603759765625, "learning_rate": 1.5075908816565297e-07, "logits/chosen": -17.725290298461914, "logits/rejected": -18.60300064086914, "logps/chosen": -301.75048828125, "logps/rejected": -361.55743408203125, "loss": 1.2139, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2971699237823486, "rewards/margins": -0.3565920293331146, "rewards/rejected": 3.653761386871338, 
"step": 53600 }, { "epoch": 2.4889734899484655, "grad_norm": 71.98108673095703, "learning_rate": 1.50731231719207e-07, "logits/chosen": -19.60439682006836, "logits/rejected": -18.60032081604004, "logps/chosen": -408.7611389160156, "logps/rejected": -261.67095947265625, "loss": 0.5202, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1597836017608643, "rewards/margins": 1.5076074600219727, "rewards/rejected": 1.6521761417388916, "step": 53610 }, { "epoch": 2.4894377640558987, "grad_norm": 49.14566421508789, "learning_rate": 1.5070337527276103e-07, "logits/chosen": -18.77524185180664, "logits/rejected": -18.405977249145508, "logps/chosen": -205.0816192626953, "logps/rejected": -184.04808044433594, "loss": 1.1917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3136374950408936, "rewards/margins": 0.27273085713386536, "rewards/rejected": 1.0409067869186401, "step": 53620 }, { "epoch": 2.4899020381633314, "grad_norm": 43.78213882446289, "learning_rate": 1.5067551882631507e-07, "logits/chosen": -19.42641258239746, "logits/rejected": -17.773902893066406, "logps/chosen": -355.23577880859375, "logps/rejected": -261.9610290527344, "loss": 0.4144, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2535552978515625, "rewards/margins": 1.6596416234970093, "rewards/rejected": 1.5939136743545532, "step": 53630 }, { "epoch": 2.4903663122707647, "grad_norm": 26.55052947998047, "learning_rate": 1.5064766237986906e-07, "logits/chosen": -18.336383819580078, "logits/rejected": -18.01003646850586, "logps/chosen": -340.64886474609375, "logps/rejected": -266.50787353515625, "loss": 0.777, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9291744232177734, "rewards/margins": 0.4459080100059509, "rewards/rejected": 1.4832665920257568, "step": 53640 }, { "epoch": 2.490830586378198, "grad_norm": 114.4173812866211, "learning_rate": 1.5061980593342307e-07, "logits/chosen": -18.809520721435547, "logits/rejected": -17.443044662475586, "logps/chosen": 
-370.3187255859375, "logps/rejected": -248.0553436279297, "loss": 0.4064, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6167426109313965, "rewards/margins": 2.0415329933166504, "rewards/rejected": 1.575209617614746, "step": 53650 }, { "epoch": 2.4912948604856306, "grad_norm": 35.57258224487305, "learning_rate": 1.505919494869771e-07, "logits/chosen": -18.98088264465332, "logits/rejected": -18.04336166381836, "logps/chosen": -335.5199890136719, "logps/rejected": -214.71456909179688, "loss": 0.4408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9908251762390137, "rewards/margins": 1.9581515789031982, "rewards/rejected": 1.0326734781265259, "step": 53660 }, { "epoch": 2.491759134593064, "grad_norm": 194.9925994873047, "learning_rate": 1.5056409304053113e-07, "logits/chosen": -18.918376922607422, "logits/rejected": -17.79018211364746, "logps/chosen": -528.908203125, "logps/rejected": -308.13677978515625, "loss": 0.5084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.225207328796387, "rewards/margins": 2.2232561111450195, "rewards/rejected": 2.001950740814209, "step": 53670 }, { "epoch": 2.4922234087004966, "grad_norm": 18.24061393737793, "learning_rate": 1.5053623659408514e-07, "logits/chosen": -19.75051498413086, "logits/rejected": -18.26961898803711, "logps/chosen": -356.8913879394531, "logps/rejected": -273.35162353515625, "loss": 0.4273, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.127206802368164, "rewards/margins": 2.1026859283447266, "rewards/rejected": 2.0245206356048584, "step": 53680 }, { "epoch": 2.49268768280793, "grad_norm": 2.1041784286499023, "learning_rate": 1.5050838014763916e-07, "logits/chosen": -19.391496658325195, "logits/rejected": -18.62272834777832, "logps/chosen": -410.2438049316406, "logps/rejected": -387.1341857910156, "loss": 0.9842, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1864571571350098, "rewards/margins": 0.6941107511520386, "rewards/rejected": 
2.4923465251922607, "step": 53690 }, { "epoch": 2.493151956915363, "grad_norm": 16.024120330810547, "learning_rate": 1.504805237011932e-07, "logits/chosen": -19.644886016845703, "logits/rejected": -19.42866325378418, "logps/chosen": -350.8018798828125, "logps/rejected": -379.5585632324219, "loss": 0.6662, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.03314733505249, "rewards/margins": 0.7005119919776917, "rewards/rejected": 3.3326354026794434, "step": 53700 }, { "epoch": 2.493616231022796, "grad_norm": 250.717041015625, "learning_rate": 1.504526672547472e-07, "logits/chosen": -19.024059295654297, "logits/rejected": -18.88973617553711, "logps/chosen": -309.87591552734375, "logps/rejected": -323.7796630859375, "loss": 1.0093, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0100584030151367, "rewards/margins": 0.3820304870605469, "rewards/rejected": 2.6280276775360107, "step": 53710 }, { "epoch": 2.494080505130229, "grad_norm": 52.66262435913086, "learning_rate": 1.504248108083012e-07, "logits/chosen": -19.307783126831055, "logits/rejected": -19.031841278076172, "logps/chosen": -402.0431213378906, "logps/rejected": -366.5565490722656, "loss": 0.6829, "rewards/accuracies": 0.5, "rewards/chosen": 3.8796744346618652, "rewards/margins": 0.4414243698120117, "rewards/rejected": 3.4382503032684326, "step": 53720 }, { "epoch": 2.494544779237662, "grad_norm": 112.26301574707031, "learning_rate": 1.5039695436185524e-07, "logits/chosen": -19.23517608642578, "logits/rejected": -18.724681854248047, "logps/chosen": -365.7464904785156, "logps/rejected": -299.16900634765625, "loss": 0.5777, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.03320050239563, "rewards/margins": 0.3461152911186218, "rewards/rejected": 2.6870851516723633, "step": 53730 }, { "epoch": 2.495009053345095, "grad_norm": 8.580418586730957, "learning_rate": 1.5036909791540925e-07, "logits/chosen": -20.076440811157227, "logits/rejected": -18.133249282836914, "logps/chosen": 
-440.3191833496094, "logps/rejected": -211.0631103515625, "loss": 0.3115, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8800055980682373, "rewards/margins": 2.185532808303833, "rewards/rejected": 1.6944730281829834, "step": 53740 }, { "epoch": 2.495473327452528, "grad_norm": 113.4063949584961, "learning_rate": 1.5034124146896324e-07, "logits/chosen": -19.03194808959961, "logits/rejected": -18.33558464050293, "logps/chosen": -436.20111083984375, "logps/rejected": -413.18121337890625, "loss": 0.8175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.537142515182495, "rewards/margins": 1.0156891345977783, "rewards/rejected": 2.521453380584717, "step": 53750 }, { "epoch": 2.495937601559961, "grad_norm": 144.65394592285156, "learning_rate": 1.5031338502251728e-07, "logits/chosen": -18.015178680419922, "logits/rejected": -18.100011825561523, "logps/chosen": -423.5059509277344, "logps/rejected": -379.15863037109375, "loss": 1.2654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.045858860015869, "rewards/margins": 0.20647554099559784, "rewards/rejected": 3.839383602142334, "step": 53760 }, { "epoch": 2.4964018756673942, "grad_norm": 28.42469596862793, "learning_rate": 1.502855285760713e-07, "logits/chosen": -19.21200942993164, "logits/rejected": -18.304487228393555, "logps/chosen": -460.5523376464844, "logps/rejected": -341.4406433105469, "loss": 0.5053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3251500129699707, "rewards/margins": 1.3508234024047852, "rewards/rejected": 1.974326491355896, "step": 53770 }, { "epoch": 2.496866149774827, "grad_norm": 29.611114501953125, "learning_rate": 1.5025767212962534e-07, "logits/chosen": -18.771081924438477, "logits/rejected": -18.711580276489258, "logps/chosen": -397.75543212890625, "logps/rejected": -346.19561767578125, "loss": 0.651, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.096616744995117, "rewards/margins": 0.5809457302093506, "rewards/rejected": 
2.5156710147857666, "step": 53780 }, { "epoch": 2.4973304238822602, "grad_norm": 92.53409576416016, "learning_rate": 1.5022981568317933e-07, "logits/chosen": -18.788557052612305, "logits/rejected": -18.265117645263672, "logps/chosen": -403.9627380371094, "logps/rejected": -339.7233581542969, "loss": 0.5503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8692562580108643, "rewards/margins": 0.8628090023994446, "rewards/rejected": 2.0064470767974854, "step": 53790 }, { "epoch": 2.497794697989693, "grad_norm": 207.59471130371094, "learning_rate": 1.5020195923673337e-07, "logits/chosen": -19.48158836364746, "logits/rejected": -18.72591781616211, "logps/chosen": -371.3055725097656, "logps/rejected": -306.41949462890625, "loss": 0.9237, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.263554096221924, "rewards/margins": 0.749496340751648, "rewards/rejected": 3.5140578746795654, "step": 53800 }, { "epoch": 2.498258972097126, "grad_norm": 125.04472351074219, "learning_rate": 1.5017410279028738e-07, "logits/chosen": -18.948486328125, "logits/rejected": -18.422412872314453, "logps/chosen": -387.09326171875, "logps/rejected": -292.91119384765625, "loss": 0.7845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7182788848876953, "rewards/margins": 1.6757030487060547, "rewards/rejected": 2.042576313018799, "step": 53810 }, { "epoch": 2.498723246204559, "grad_norm": 36.89535140991211, "learning_rate": 1.501462463438414e-07, "logits/chosen": -19.017536163330078, "logits/rejected": -18.4063663482666, "logps/chosen": -357.51715087890625, "logps/rejected": -369.83282470703125, "loss": 0.672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.502167224884033, "rewards/margins": 1.0616487264633179, "rewards/rejected": 2.440518617630005, "step": 53820 }, { "epoch": 2.499187520311992, "grad_norm": 254.72586059570312, "learning_rate": 1.501183898973954e-07, "logits/chosen": -18.82058334350586, "logits/rejected": -18.989774703979492, 
"logps/chosen": -359.00860595703125, "logps/rejected": -323.4208679199219, "loss": 1.0441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4459850788116455, "rewards/margins": -0.09779369831085205, "rewards/rejected": 3.543778896331787, "step": 53830 }, { "epoch": 2.4996517944194254, "grad_norm": 51.918479919433594, "learning_rate": 1.5009053345094943e-07, "logits/chosen": -19.562850952148438, "logits/rejected": -19.315914154052734, "logps/chosen": -337.1866149902344, "logps/rejected": -340.30218505859375, "loss": 1.0743, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.796506881713867, "rewards/margins": -0.30088767409324646, "rewards/rejected": 4.0973944664001465, "step": 53840 }, { "epoch": 2.500116068526858, "grad_norm": 326.90814208984375, "learning_rate": 1.5006267700450347e-07, "logits/chosen": -18.300241470336914, "logits/rejected": -17.58873176574707, "logps/chosen": -400.2995300292969, "logps/rejected": -314.4311218261719, "loss": 0.8047, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.63006329536438, "rewards/margins": 0.6754180788993835, "rewards/rejected": 2.9546449184417725, "step": 53850 }, { "epoch": 2.5005803426342914, "grad_norm": 121.09236145019531, "learning_rate": 1.5003482055805748e-07, "logits/chosen": -20.011825561523438, "logits/rejected": -19.803470611572266, "logps/chosen": -444.5550842285156, "logps/rejected": -427.4664611816406, "loss": 0.4645, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.648501396179199, "rewards/margins": 0.9294945597648621, "rewards/rejected": 3.7190070152282715, "step": 53860 }, { "epoch": 2.501044616741724, "grad_norm": 104.07079315185547, "learning_rate": 1.5000696411161147e-07, "logits/chosen": -19.312824249267578, "logits/rejected": -18.796802520751953, "logps/chosen": -377.9786071777344, "logps/rejected": -323.4903869628906, "loss": 0.5216, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.700188398361206, "rewards/margins": 1.299226999282837, 
"rewards/rejected": 2.400961399078369, "step": 53870 }, { "epoch": 2.5015088908491574, "grad_norm": 97.568115234375, "learning_rate": 1.499791076651655e-07, "logits/chosen": -18.09994125366211, "logits/rejected": -18.1093692779541, "logps/chosen": -360.9840393066406, "logps/rejected": -376.1943359375, "loss": 0.8428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.789210081100464, "rewards/margins": 0.7232701778411865, "rewards/rejected": 3.0659401416778564, "step": 53880 }, { "epoch": 2.50197316495659, "grad_norm": 60.53214645385742, "learning_rate": 1.4995125121871953e-07, "logits/chosen": -18.626399993896484, "logits/rejected": -18.239124298095703, "logps/chosen": -274.5327453613281, "logps/rejected": -307.5655212402344, "loss": 0.8508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1429591178894043, "rewards/margins": 0.20462436974048615, "rewards/rejected": 1.9383347034454346, "step": 53890 }, { "epoch": 2.5024374390640234, "grad_norm": 4.704360485076904, "learning_rate": 1.4992339477227354e-07, "logits/chosen": -19.378694534301758, "logits/rejected": -17.783893585205078, "logps/chosen": -432.3756408691406, "logps/rejected": -313.3398132324219, "loss": 0.441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.842341661453247, "rewards/margins": 1.9606469869613647, "rewards/rejected": 1.8816944360733032, "step": 53900 }, { "epoch": 2.5029017131714566, "grad_norm": 45.35842514038086, "learning_rate": 1.4989553832582758e-07, "logits/chosen": -17.97463607788086, "logits/rejected": -17.809816360473633, "logps/chosen": -406.4759216308594, "logps/rejected": -348.1817626953125, "loss": 0.9378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9100637435913086, "rewards/margins": 0.42787665128707886, "rewards/rejected": 2.482187271118164, "step": 53910 }, { "epoch": 2.5033659872788894, "grad_norm": 8.679061889648438, "learning_rate": 1.4986768187938157e-07, "logits/chosen": -18.942546844482422, "logits/rejected": 
-17.882946014404297, "logps/chosen": -375.91021728515625, "logps/rejected": -260.2882080078125, "loss": 0.3356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.056779384613037, "rewards/margins": 1.5153124332427979, "rewards/rejected": 1.5414669513702393, "step": 53920 }, { "epoch": 2.5038302613863226, "grad_norm": 167.2323760986328, "learning_rate": 1.4983982543293558e-07, "logits/chosen": -18.212051391601562, "logits/rejected": -18.463607788085938, "logps/chosen": -413.03045654296875, "logps/rejected": -347.24737548828125, "loss": 0.8908, "rewards/accuracies": 0.5, "rewards/chosen": 2.857534885406494, "rewards/margins": 0.17507095634937286, "rewards/rejected": 2.682464122772217, "step": 53930 }, { "epoch": 2.5042945354937554, "grad_norm": 21.10970687866211, "learning_rate": 1.4981196898648962e-07, "logits/chosen": -18.824016571044922, "logits/rejected": -17.53530502319336, "logps/chosen": -479.8282775878906, "logps/rejected": -371.31671142578125, "loss": 0.5841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.986411094665527, "rewards/margins": 2.27327036857605, "rewards/rejected": 2.7131407260894775, "step": 53940 }, { "epoch": 2.5047588096011886, "grad_norm": 120.88113403320312, "learning_rate": 1.4978411254004364e-07, "logits/chosen": -19.300554275512695, "logits/rejected": -18.24433135986328, "logps/chosen": -403.64208984375, "logps/rejected": -289.6347351074219, "loss": 0.3884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.087048053741455, "rewards/margins": 1.7343940734863281, "rewards/rejected": 2.352653980255127, "step": 53950 }, { "epoch": 2.5052230837086213, "grad_norm": 306.516357421875, "learning_rate": 1.4975625609359765e-07, "logits/chosen": -17.867963790893555, "logits/rejected": -18.48162078857422, "logps/chosen": -265.01031494140625, "logps/rejected": -314.04803466796875, "loss": 1.0372, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.371570348739624, "rewards/margins": -0.07674215734004974, 
"rewards/rejected": 2.448312520980835, "step": 53960 }, { "epoch": 2.5056873578160546, "grad_norm": 130.30445861816406, "learning_rate": 1.4972839964715167e-07, "logits/chosen": -18.956396102905273, "logits/rejected": -17.962854385375977, "logps/chosen": -331.0318603515625, "logps/rejected": -211.50332641601562, "loss": 0.4329, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1397104263305664, "rewards/margins": 1.9837175607681274, "rewards/rejected": 1.1559925079345703, "step": 53970 }, { "epoch": 2.5061516319234878, "grad_norm": 99.01117706298828, "learning_rate": 1.4970054320070568e-07, "logits/chosen": -19.273883819580078, "logits/rejected": -18.366220474243164, "logps/chosen": -426.59906005859375, "logps/rejected": -323.6329650878906, "loss": 0.4665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.83721923828125, "rewards/margins": 1.0910632610321045, "rewards/rejected": 2.7461562156677246, "step": 53980 }, { "epoch": 2.5066159060309205, "grad_norm": 68.33365631103516, "learning_rate": 1.496726867542597e-07, "logits/chosen": -18.86447525024414, "logits/rejected": -18.73703384399414, "logps/chosen": -353.147216796875, "logps/rejected": -314.8617248535156, "loss": 0.9942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3674423694610596, "rewards/margins": 0.790867805480957, "rewards/rejected": 2.5765748023986816, "step": 53990 }, { "epoch": 2.5070801801383538, "grad_norm": 105.62544250488281, "learning_rate": 1.4964483030781374e-07, "logits/chosen": -19.40496253967285, "logits/rejected": -19.486122131347656, "logps/chosen": -351.34100341796875, "logps/rejected": -438.5125427246094, "loss": 1.0777, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6557700634002686, "rewards/margins": -0.29161161184310913, "rewards/rejected": 3.9473819732666016, "step": 54000 }, { "epoch": 2.507544454245787, "grad_norm": 77.3538818359375, "learning_rate": 1.4961697386136775e-07, "logits/chosen": -18.695396423339844, 
"logits/rejected": -17.944515228271484, "logps/chosen": -315.6026306152344, "logps/rejected": -223.98486328125, "loss": 0.4045, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.232616901397705, "rewards/margins": 1.2822431325912476, "rewards/rejected": 0.9503735303878784, "step": 54010 }, { "epoch": 2.5080087283532198, "grad_norm": 193.02537536621094, "learning_rate": 1.4958911741492177e-07, "logits/chosen": -19.98114585876465, "logits/rejected": -20.030031204223633, "logps/chosen": -421.802490234375, "logps/rejected": -414.33929443359375, "loss": 1.0188, "rewards/accuracies": 0.5, "rewards/chosen": 4.272820472717285, "rewards/margins": 0.8354585766792297, "rewards/rejected": 3.4373619556427, "step": 54020 }, { "epoch": 2.5084730024606525, "grad_norm": 38.95923614501953, "learning_rate": 1.4956126096847578e-07, "logits/chosen": -19.28605079650879, "logits/rejected": -18.999889373779297, "logps/chosen": -426.05645751953125, "logps/rejected": -334.4682312011719, "loss": 0.9055, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4332377910614014, "rewards/margins": 0.40412282943725586, "rewards/rejected": 3.0291147232055664, "step": 54030 }, { "epoch": 2.5089372765680857, "grad_norm": 46.31911849975586, "learning_rate": 1.495334045220298e-07, "logits/chosen": -20.006755828857422, "logits/rejected": -18.993743896484375, "logps/chosen": -492.95819091796875, "logps/rejected": -414.18621826171875, "loss": 0.9851, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5007872581481934, "rewards/margins": 0.24663110077381134, "rewards/rejected": 3.2541561126708984, "step": 54040 }, { "epoch": 2.509401550675519, "grad_norm": 4.4397501945495605, "learning_rate": 1.495055480755838e-07, "logits/chosen": -19.501724243164062, "logits/rejected": -18.679969787597656, "logps/chosen": -386.97552490234375, "logps/rejected": -354.57525634765625, "loss": 0.5123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5973167419433594, "rewards/margins": 
1.2069042921066284, "rewards/rejected": 2.3904123306274414, "step": 54050 }, { "epoch": 2.5098658247829517, "grad_norm": 64.05890655517578, "learning_rate": 1.4947769162913785e-07, "logits/chosen": -18.756898880004883, "logits/rejected": -17.349740982055664, "logps/chosen": -450.88323974609375, "logps/rejected": -292.7302551269531, "loss": 0.51, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.750967025756836, "rewards/margins": 1.6573066711425781, "rewards/rejected": 2.0936601161956787, "step": 54060 }, { "epoch": 2.510330098890385, "grad_norm": 21.89982795715332, "learning_rate": 1.4944983518269184e-07, "logits/chosen": -18.18926239013672, "logits/rejected": -18.132402420043945, "logps/chosen": -381.4098205566406, "logps/rejected": -409.64459228515625, "loss": 1.1395, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.301305055618286, "rewards/margins": 0.3746611177921295, "rewards/rejected": 2.9266438484191895, "step": 54070 }, { "epoch": 2.510794372997818, "grad_norm": 160.81761169433594, "learning_rate": 1.4942197873624588e-07, "logits/chosen": -19.975875854492188, "logits/rejected": -19.15445899963379, "logps/chosen": -447.2479553222656, "logps/rejected": -427.07672119140625, "loss": 0.878, "rewards/accuracies": 0.5, "rewards/chosen": 3.8763344287872314, "rewards/margins": 0.9304499626159668, "rewards/rejected": 2.9458847045898438, "step": 54080 }, { "epoch": 2.511258647105251, "grad_norm": 161.26048278808594, "learning_rate": 1.493941222897999e-07, "logits/chosen": -18.736963272094727, "logits/rejected": -17.878833770751953, "logps/chosen": -376.25042724609375, "logps/rejected": -306.3352966308594, "loss": 0.6135, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2744956016540527, "rewards/margins": 0.8815266489982605, "rewards/rejected": 2.3929691314697266, "step": 54090 }, { "epoch": 2.5117229212126837, "grad_norm": 33.035377502441406, "learning_rate": 1.493662658433539e-07, "logits/chosen": -19.032203674316406, 
"logits/rejected": -17.730968475341797, "logps/chosen": -420.99627685546875, "logps/rejected": -291.0389099121094, "loss": 0.5054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.675006866455078, "rewards/margins": 1.8626453876495361, "rewards/rejected": 1.8123613595962524, "step": 54100 }, { "epoch": 2.512187195320117, "grad_norm": 2.0146961212158203, "learning_rate": 1.4933840939690792e-07, "logits/chosen": -19.661588668823242, "logits/rejected": -18.526098251342773, "logps/chosen": -366.85064697265625, "logps/rejected": -255.7930450439453, "loss": 0.5255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.790478467941284, "rewards/margins": 1.9045183658599854, "rewards/rejected": 1.8859602212905884, "step": 54110 }, { "epoch": 2.51265146942755, "grad_norm": 59.28238296508789, "learning_rate": 1.4931055295046194e-07, "logits/chosen": -18.94841766357422, "logits/rejected": -18.771770477294922, "logps/chosen": -343.42626953125, "logps/rejected": -334.410400390625, "loss": 0.6384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.691948413848877, "rewards/margins": 0.8137845993041992, "rewards/rejected": 2.8781638145446777, "step": 54120 }, { "epoch": 2.513115743534983, "grad_norm": 11.224679946899414, "learning_rate": 1.4928269650401595e-07, "logits/chosen": -19.583988189697266, "logits/rejected": -18.991863250732422, "logps/chosen": -375.98809814453125, "logps/rejected": -354.56329345703125, "loss": 0.667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0252153873443604, "rewards/margins": 0.9696822166442871, "rewards/rejected": 2.055532932281494, "step": 54130 }, { "epoch": 2.513580017642416, "grad_norm": 170.73187255859375, "learning_rate": 1.4925484005756997e-07, "logits/chosen": -18.93069076538086, "logits/rejected": -19.098438262939453, "logps/chosen": -377.7881774902344, "logps/rejected": -385.7791748046875, "loss": 1.2107, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2373099327087402, 
"rewards/margins": -0.4960891604423523, "rewards/rejected": 3.7333991527557373, "step": 54140 }, { "epoch": 2.5140442917498493, "grad_norm": 120.1151123046875, "learning_rate": 1.49226983611124e-07, "logits/chosen": -18.717357635498047, "logits/rejected": -17.72564125061035, "logps/chosen": -435.88214111328125, "logps/rejected": -363.6648864746094, "loss": 0.5956, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5274882316589355, "rewards/margins": 1.3092310428619385, "rewards/rejected": 2.218257427215576, "step": 54150 }, { "epoch": 2.514508565857282, "grad_norm": 49.666812896728516, "learning_rate": 1.4919912716467802e-07, "logits/chosen": -18.85878562927246, "logits/rejected": -18.513044357299805, "logps/chosen": -294.7864685058594, "logps/rejected": -262.29461669921875, "loss": 1.1434, "rewards/accuracies": 0.5, "rewards/chosen": 2.885007381439209, "rewards/margins": 0.1392524540424347, "rewards/rejected": 2.745755434036255, "step": 54160 }, { "epoch": 2.5149728399647153, "grad_norm": 3.716369390487671, "learning_rate": 1.4917127071823204e-07, "logits/chosen": -18.84762954711914, "logits/rejected": -18.20458221435547, "logps/chosen": -357.5262451171875, "logps/rejected": -310.3700866699219, "loss": 0.4624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.214428424835205, "rewards/margins": 1.2118918895721436, "rewards/rejected": 2.0025365352630615, "step": 54170 }, { "epoch": 2.515437114072148, "grad_norm": 0.8089411854743958, "learning_rate": 1.4914341427178605e-07, "logits/chosen": -18.950368881225586, "logits/rejected": -17.399442672729492, "logps/chosen": -479.99285888671875, "logps/rejected": -296.365966796875, "loss": 0.288, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.288753509521484, "rewards/margins": 2.471622943878174, "rewards/rejected": 1.8171310424804688, "step": 54180 }, { "epoch": 2.5159013881795813, "grad_norm": 29.308340072631836, "learning_rate": 1.4911555782534007e-07, "logits/chosen": 
-18.99004364013672, "logits/rejected": -17.606204986572266, "logps/chosen": -388.0741271972656, "logps/rejected": -235.24874877929688, "loss": 0.4328, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.069201469421387, "rewards/margins": 2.609896183013916, "rewards/rejected": 1.4593050479888916, "step": 54190 }, { "epoch": 2.516365662287014, "grad_norm": 67.75393676757812, "learning_rate": 1.4908770137889408e-07, "logits/chosen": -18.83429527282715, "logits/rejected": -17.800867080688477, "logps/chosen": -422.181396484375, "logps/rejected": -289.5284423828125, "loss": 1.2631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6314194202423096, "rewards/margins": 1.0209141969680786, "rewards/rejected": 2.6105051040649414, "step": 54200 }, { "epoch": 2.5168299363944473, "grad_norm": 4.001238822937012, "learning_rate": 1.4905984493244812e-07, "logits/chosen": -18.453922271728516, "logits/rejected": -17.44846534729004, "logps/chosen": -325.1104736328125, "logps/rejected": -215.09249877929688, "loss": 0.3971, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.819964647293091, "rewards/margins": 1.466464638710022, "rewards/rejected": 1.3534998893737793, "step": 54210 }, { "epoch": 2.5172942105018805, "grad_norm": 15.628573417663574, "learning_rate": 1.4903198848600214e-07, "logits/chosen": -18.93416976928711, "logits/rejected": -18.36708641052246, "logps/chosen": -392.3772888183594, "logps/rejected": -347.37225341796875, "loss": 0.748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6469831466674805, "rewards/margins": 0.8341512680053711, "rewards/rejected": 1.8128316402435303, "step": 54220 }, { "epoch": 2.5177584846093133, "grad_norm": 45.07060623168945, "learning_rate": 1.4900413203955615e-07, "logits/chosen": -19.706642150878906, "logits/rejected": -18.943166732788086, "logps/chosen": -411.07147216796875, "logps/rejected": -301.19830322265625, "loss": 0.3137, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
4.622861862182617, "rewards/margins": 2.375276803970337, "rewards/rejected": 2.2475852966308594, "step": 54230 }, { "epoch": 2.5182227587167465, "grad_norm": 37.69023513793945, "learning_rate": 1.4897627559311017e-07, "logits/chosen": -18.565378189086914, "logits/rejected": -18.124094009399414, "logps/chosen": -365.3462829589844, "logps/rejected": -306.02056884765625, "loss": 0.6542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.746641159057617, "rewards/margins": 1.6867849826812744, "rewards/rejected": 2.059856414794922, "step": 54240 }, { "epoch": 2.5186870328241793, "grad_norm": 49.9832878112793, "learning_rate": 1.4894841914666418e-07, "logits/chosen": -18.588695526123047, "logits/rejected": -18.986927032470703, "logps/chosen": -275.0871276855469, "logps/rejected": -329.90667724609375, "loss": 1.049, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7226413488388062, "rewards/margins": -0.3029601275920868, "rewards/rejected": 2.025601625442505, "step": 54250 }, { "epoch": 2.5191513069316125, "grad_norm": 152.5928192138672, "learning_rate": 1.489205627002182e-07, "logits/chosen": -19.71653175354004, "logits/rejected": -18.60415267944336, "logps/chosen": -427.21649169921875, "logps/rejected": -291.08990478515625, "loss": 0.5748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.687098503112793, "rewards/margins": 1.382946491241455, "rewards/rejected": 3.304152250289917, "step": 54260 }, { "epoch": 2.5196155810390453, "grad_norm": 11.68189525604248, "learning_rate": 1.4889270625377224e-07, "logits/chosen": -17.928218841552734, "logits/rejected": -16.84642791748047, "logps/chosen": -380.1686706542969, "logps/rejected": -226.11465454101562, "loss": 0.2517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0571160316467285, "rewards/margins": 2.0576252937316895, "rewards/rejected": 0.9994907379150391, "step": 54270 }, { "epoch": 2.5200798551464785, "grad_norm": 222.8442840576172, "learning_rate": 
1.4886484980732622e-07, "logits/chosen": -19.624391555786133, "logits/rejected": -18.724811553955078, "logps/chosen": -452.4087829589844, "logps/rejected": -330.4404296875, "loss": 0.8755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.552365779876709, "rewards/margins": 1.2906824350357056, "rewards/rejected": 3.261683225631714, "step": 54280 }, { "epoch": 2.5205441292539117, "grad_norm": 0.5931979417800903, "learning_rate": 1.4883699336088027e-07, "logits/chosen": -19.128679275512695, "logits/rejected": -19.141923904418945, "logps/chosen": -444.0375061035156, "logps/rejected": -375.689453125, "loss": 0.6533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.387192726135254, "rewards/margins": 1.4493391513824463, "rewards/rejected": 2.9378533363342285, "step": 54290 }, { "epoch": 2.5210084033613445, "grad_norm": 55.479183197021484, "learning_rate": 1.4880913691443428e-07, "logits/chosen": -17.994983673095703, "logits/rejected": -16.532930374145508, "logps/chosen": -346.6787109375, "logps/rejected": -202.02725219726562, "loss": 0.4785, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.859891891479492, "rewards/margins": 1.5890759229660034, "rewards/rejected": 1.2708159685134888, "step": 54300 }, { "epoch": 2.5214726774687777, "grad_norm": 12.523630142211914, "learning_rate": 1.487812804679883e-07, "logits/chosen": -18.410850524902344, "logits/rejected": -18.0247802734375, "logps/chosen": -514.4359130859375, "logps/rejected": -483.682861328125, "loss": 0.9147, "rewards/accuracies": 0.5, "rewards/chosen": 4.261288642883301, "rewards/margins": 0.1441819816827774, "rewards/rejected": 4.1171064376831055, "step": 54310 }, { "epoch": 2.5219369515762105, "grad_norm": 75.10826873779297, "learning_rate": 1.487534240215423e-07, "logits/chosen": -19.52301788330078, "logits/rejected": -18.174917221069336, "logps/chosen": -317.79034423828125, "logps/rejected": -250.7188720703125, "loss": 0.3515, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 2.9344356060028076, "rewards/margins": 1.5440824031829834, "rewards/rejected": 1.3903532028198242, "step": 54320 }, { "epoch": 2.5224012256836437, "grad_norm": 0.9674138426780701, "learning_rate": 1.4872556757509632e-07, "logits/chosen": -19.59147834777832, "logits/rejected": -18.4884033203125, "logps/chosen": -371.1761779785156, "logps/rejected": -294.55621337890625, "loss": 0.8585, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5026512145996094, "rewards/margins": 1.1001797914505005, "rewards/rejected": 2.4024715423583984, "step": 54330 }, { "epoch": 2.5228654997910764, "grad_norm": 4.594205379486084, "learning_rate": 1.4869771112865034e-07, "logits/chosen": -19.38726234436035, "logits/rejected": -19.2917423248291, "logps/chosen": -401.21490478515625, "logps/rejected": -355.23370361328125, "loss": 0.6358, "rewards/accuracies": 0.5, "rewards/chosen": 3.583787441253662, "rewards/margins": 0.9274199604988098, "rewards/rejected": 2.656367778778076, "step": 54340 }, { "epoch": 2.5233297738985097, "grad_norm": 49.48571014404297, "learning_rate": 1.4866985468220435e-07, "logits/chosen": -17.757553100585938, "logits/rejected": -16.974111557006836, "logps/chosen": -392.74371337890625, "logps/rejected": -265.99169921875, "loss": 0.6924, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.221590518951416, "rewards/margins": 1.3585752248764038, "rewards/rejected": 1.8630154132843018, "step": 54350 }, { "epoch": 2.523794048005943, "grad_norm": 1.1015713214874268, "learning_rate": 1.486419982357584e-07, "logits/chosen": -18.8503360748291, "logits/rejected": -17.73600196838379, "logps/chosen": -388.1017761230469, "logps/rejected": -315.0089111328125, "loss": 0.3141, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.083849906921387, "rewards/margins": 1.7492825984954834, "rewards/rejected": 2.3345675468444824, "step": 54360 }, { "epoch": 2.5242583221133756, "grad_norm": 127.55522155761719, "learning_rate": 
1.486141417893124e-07, "logits/chosen": -19.243959426879883, "logits/rejected": -18.879196166992188, "logps/chosen": -426.8987731933594, "logps/rejected": -392.00885009765625, "loss": 0.9369, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.003016471862793, "rewards/margins": 0.250338613986969, "rewards/rejected": 3.7526779174804688, "step": 54370 }, { "epoch": 2.524722596220809, "grad_norm": 106.41446685791016, "learning_rate": 1.4858628534286642e-07, "logits/chosen": -19.84769058227539, "logits/rejected": -19.352807998657227, "logps/chosen": -309.33721923828125, "logps/rejected": -303.2013244628906, "loss": 1.1103, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.623077630996704, "rewards/margins": -0.304741770029068, "rewards/rejected": 2.9278194904327393, "step": 54380 }, { "epoch": 2.5251868703282416, "grad_norm": 34.45991897583008, "learning_rate": 1.4855842889642044e-07, "logits/chosen": -19.60063934326172, "logits/rejected": -18.872732162475586, "logps/chosen": -401.38226318359375, "logps/rejected": -362.3844299316406, "loss": 0.7988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4968528747558594, "rewards/margins": 0.8043487668037415, "rewards/rejected": 2.6925039291381836, "step": 54390 }, { "epoch": 2.525651144435675, "grad_norm": 51.71083068847656, "learning_rate": 1.4853057244997445e-07, "logits/chosen": -20.009023666381836, "logits/rejected": -19.715261459350586, "logps/chosen": -388.07562255859375, "logps/rejected": -355.28057861328125, "loss": 0.6317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3139755725860596, "rewards/margins": 0.5627159476280212, "rewards/rejected": 2.7512593269348145, "step": 54400 }, { "epoch": 2.5261154185431076, "grad_norm": 42.00261306762695, "learning_rate": 1.4850271600352847e-07, "logits/chosen": -19.523832321166992, "logits/rejected": -18.802379608154297, "logps/chosen": -413.9052734375, "logps/rejected": -368.5461120605469, "loss": 0.6693, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.0944910049438477, "rewards/margins": 0.5495911836624146, "rewards/rejected": 2.5448994636535645, "step": 54410 }, { "epoch": 2.526579692650541, "grad_norm": 92.10810852050781, "learning_rate": 1.484748595570825e-07, "logits/chosen": -18.27344512939453, "logits/rejected": -17.8685245513916, "logps/chosen": -434.5807189941406, "logps/rejected": -490.9327697753906, "loss": 0.8245, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3246617317199707, "rewards/margins": 0.08471076190471649, "rewards/rejected": 3.2399508953094482, "step": 54420 }, { "epoch": 2.527043966757974, "grad_norm": 31.19053077697754, "learning_rate": 1.4844700311063652e-07, "logits/chosen": -19.48036766052246, "logits/rejected": -18.636842727661133, "logps/chosen": -448.07550048828125, "logps/rejected": -333.92681884765625, "loss": 0.6189, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.753236770629883, "rewards/margins": 1.2896562814712524, "rewards/rejected": 2.46358060836792, "step": 54430 }, { "epoch": 2.527508240865407, "grad_norm": 92.91464233398438, "learning_rate": 1.4841914666419054e-07, "logits/chosen": -19.827083587646484, "logits/rejected": -18.56058692932129, "logps/chosen": -359.93701171875, "logps/rejected": -262.5145263671875, "loss": 0.2076, "rewards/accuracies": 1.0, "rewards/chosen": 3.6628432273864746, "rewards/margins": 2.01625394821167, "rewards/rejected": 1.6465895175933838, "step": 54440 }, { "epoch": 2.52797251497284, "grad_norm": 45.993934631347656, "learning_rate": 1.4839129021774455e-07, "logits/chosen": -17.875713348388672, "logits/rejected": -16.995235443115234, "logps/chosen": -393.6036682128906, "logps/rejected": -335.83343505859375, "loss": 0.3401, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.397942304611206, "rewards/margins": 1.7113949060440063, "rewards/rejected": 1.686547040939331, "step": 54450 }, { "epoch": 2.5284367890802733, "grad_norm": 33.29474639892578, "learning_rate": 
1.4836343377129857e-07, "logits/chosen": -18.55677032470703, "logits/rejected": -17.812992095947266, "logps/chosen": -352.8486328125, "logps/rejected": -266.4326171875, "loss": 1.036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.150393009185791, "rewards/margins": 0.823459267616272, "rewards/rejected": 2.3269338607788086, "step": 54460 }, { "epoch": 2.528901063187706, "grad_norm": 74.6198501586914, "learning_rate": 1.4833557732485258e-07, "logits/chosen": -18.781665802001953, "logits/rejected": -18.389638900756836, "logps/chosen": -386.71856689453125, "logps/rejected": -319.8545227050781, "loss": 0.8449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5659146308898926, "rewards/margins": 1.127881646156311, "rewards/rejected": 2.43803334236145, "step": 54470 }, { "epoch": 2.529365337295139, "grad_norm": 42.05866241455078, "learning_rate": 1.4830772087840662e-07, "logits/chosen": -19.257579803466797, "logits/rejected": -19.168193817138672, "logps/chosen": -397.29302978515625, "logps/rejected": -360.2471008300781, "loss": 0.8384, "rewards/accuracies": 0.5, "rewards/chosen": 3.11942458152771, "rewards/margins": 0.027411621063947678, "rewards/rejected": 3.092013120651245, "step": 54480 }, { "epoch": 2.529829611402572, "grad_norm": 231.54586791992188, "learning_rate": 1.482798644319606e-07, "logits/chosen": -19.266704559326172, "logits/rejected": -18.272754669189453, "logps/chosen": -397.4145202636719, "logps/rejected": -317.8357238769531, "loss": 0.4996, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.299883842468262, "rewards/margins": 2.187537670135498, "rewards/rejected": 2.1123461723327637, "step": 54490 }, { "epoch": 2.5302938855100052, "grad_norm": 7.746692180633545, "learning_rate": 1.4825200798551465e-07, "logits/chosen": -18.638103485107422, "logits/rejected": -17.436779022216797, "logps/chosen": -344.54443359375, "logps/rejected": -255.672119140625, "loss": 0.55, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 2.643322467803955, "rewards/margins": 1.650383710861206, "rewards/rejected": 0.9929388165473938, "step": 54500 }, { "epoch": 2.530758159617438, "grad_norm": 34.214508056640625, "learning_rate": 1.4822415153906866e-07, "logits/chosen": -19.207950592041016, "logits/rejected": -17.317480087280273, "logps/chosen": -488.9979553222656, "logps/rejected": -298.0838928222656, "loss": 0.4212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.671616315841675, "rewards/margins": 2.0515224933624268, "rewards/rejected": 1.6200939416885376, "step": 54510 }, { "epoch": 2.531222433724871, "grad_norm": 221.28768920898438, "learning_rate": 1.4819629509262268e-07, "logits/chosen": -18.860164642333984, "logits/rejected": -18.693614959716797, "logps/chosen": -286.174560546875, "logps/rejected": -225.5840301513672, "loss": 1.1203, "rewards/accuracies": 0.5, "rewards/chosen": 2.039686441421509, "rewards/margins": 0.15249498188495636, "rewards/rejected": 1.8871911764144897, "step": 54520 }, { "epoch": 2.5316867078323044, "grad_norm": 7.397354602813721, "learning_rate": 1.481684386461767e-07, "logits/chosen": -18.663026809692383, "logits/rejected": -16.919635772705078, "logps/chosen": -408.6808776855469, "logps/rejected": -248.5507354736328, "loss": 0.2174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7212181091308594, "rewards/margins": 2.216353416442871, "rewards/rejected": 1.5048646926879883, "step": 54530 }, { "epoch": 2.532150981939737, "grad_norm": 30.13546371459961, "learning_rate": 1.481405821997307e-07, "logits/chosen": -19.495561599731445, "logits/rejected": -19.24325942993164, "logps/chosen": -452.2721252441406, "logps/rejected": -427.80267333984375, "loss": 0.5447, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.25287389755249, "rewards/margins": 0.6374632120132446, "rewards/rejected": 3.6154110431671143, "step": 54540 }, { "epoch": 2.5326152560471704, "grad_norm": 29.00448226928711, "learning_rate": 
1.4811272575328472e-07, "logits/chosen": -19.232059478759766, "logits/rejected": -18.785438537597656, "logps/chosen": -461.32470703125, "logps/rejected": -379.73101806640625, "loss": 0.4569, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.920490264892578, "rewards/margins": 1.2705644369125366, "rewards/rejected": 2.649925947189331, "step": 54550 }, { "epoch": 2.533079530154603, "grad_norm": 4.890122890472412, "learning_rate": 1.4808486930683874e-07, "logits/chosen": -18.86170768737793, "logits/rejected": -17.691844940185547, "logps/chosen": -239.21286010742188, "logps/rejected": -163.25802612304688, "loss": 0.3145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.744299292564392, "rewards/margins": 1.5373297929763794, "rewards/rejected": 0.20696942508220673, "step": 54560 }, { "epoch": 2.5335438042620364, "grad_norm": 1.8935436010360718, "learning_rate": 1.4805979850503738e-07, "logits/chosen": -19.678537368774414, "logits/rejected": -18.839420318603516, "logps/chosen": -360.07073974609375, "logps/rejected": -285.7843933105469, "loss": 0.9008, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5580153465270996, "rewards/margins": 0.2793195843696594, "rewards/rejected": 2.278695583343506, "step": 54570 }, { "epoch": 2.534008078369469, "grad_norm": 136.09371948242188, "learning_rate": 1.4803194205859137e-07, "logits/chosen": -19.868995666503906, "logits/rejected": -19.584856033325195, "logps/chosen": -419.6578063964844, "logps/rejected": -337.9726867675781, "loss": 1.0561, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.363016128540039, "rewards/margins": 0.4105490744113922, "rewards/rejected": 2.952467203140259, "step": 54580 }, { "epoch": 2.5344723524769024, "grad_norm": 109.11697387695312, "learning_rate": 1.480040856121454e-07, "logits/chosen": -18.89443016052246, "logits/rejected": -18.48813819885254, "logps/chosen": -357.5894775390625, "logps/rejected": -273.378173828125, "loss": 0.8548, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.952763319015503, "rewards/margins": 0.4665554165840149, "rewards/rejected": 2.486208200454712, "step": 54590 }, { "epoch": 2.5349366265843356, "grad_norm": 35.47782516479492, "learning_rate": 1.4797622916569943e-07, "logits/chosen": -19.27580451965332, "logits/rejected": -18.350187301635742, "logps/chosen": -504.1202697753906, "logps/rejected": -407.41705322265625, "loss": 0.519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.834205627441406, "rewards/margins": 1.4897630214691162, "rewards/rejected": 3.344442367553711, "step": 54600 }, { "epoch": 2.5354009006917684, "grad_norm": 29.20677947998047, "learning_rate": 1.4794837271925344e-07, "logits/chosen": -18.913188934326172, "logits/rejected": -18.153034210205078, "logps/chosen": -407.9961853027344, "logps/rejected": -327.46990966796875, "loss": 0.484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.410327911376953, "rewards/margins": 0.8305956721305847, "rewards/rejected": 2.5797317028045654, "step": 54610 }, { "epoch": 2.5358651747992016, "grad_norm": 49.2175407409668, "learning_rate": 1.4792051627280746e-07, "logits/chosen": -19.29823112487793, "logits/rejected": -17.830076217651367, "logps/chosen": -418.8619689941406, "logps/rejected": -222.4983367919922, "loss": 0.2438, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.199338436126709, "rewards/margins": 3.121209144592285, "rewards/rejected": 1.0781296491622925, "step": 54620 }, { "epoch": 2.5363294489066344, "grad_norm": 68.4618911743164, "learning_rate": 1.4789265982636147e-07, "logits/chosen": -18.452749252319336, "logits/rejected": -18.044307708740234, "logps/chosen": -262.8677062988281, "logps/rejected": -215.61941528320312, "loss": 0.9561, "rewards/accuracies": 0.5, "rewards/chosen": 2.4528121948242188, "rewards/margins": 0.5176671743392944, "rewards/rejected": 1.9351450204849243, "step": 54630 }, { "epoch": 2.5367937230140676, "grad_norm": 9.187594413757324, "learning_rate": 
1.4786480337991549e-07, "logits/chosen": -19.23465347290039, "logits/rejected": -17.502151489257812, "logps/chosen": -396.51971435546875, "logps/rejected": -330.3426818847656, "loss": 0.4268, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.520878791809082, "rewards/margins": 2.391723871231079, "rewards/rejected": 2.129154920578003, "step": 54640 }, { "epoch": 2.5372579971215004, "grad_norm": 54.606082916259766, "learning_rate": 1.478369469334695e-07, "logits/chosen": -18.530010223388672, "logits/rejected": -18.189573287963867, "logps/chosen": -391.88739013671875, "logps/rejected": -363.9010314941406, "loss": 1.1537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2529170513153076, "rewards/margins": 0.5859318971633911, "rewards/rejected": 2.666985034942627, "step": 54650 }, { "epoch": 2.5377222712289336, "grad_norm": 77.4901351928711, "learning_rate": 1.4780909048702354e-07, "logits/chosen": -19.173511505126953, "logits/rejected": -19.216922760009766, "logps/chosen": -309.4910583496094, "logps/rejected": -325.0692443847656, "loss": 0.713, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.156830310821533, "rewards/margins": 0.3415278494358063, "rewards/rejected": 1.8153021335601807, "step": 54660 }, { "epoch": 2.538186545336367, "grad_norm": 133.77713012695312, "learning_rate": 1.4778123404057756e-07, "logits/chosen": -18.118770599365234, "logits/rejected": -18.204458236694336, "logps/chosen": -413.8946228027344, "logps/rejected": -390.8335266113281, "loss": 0.9359, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.11789608001709, "rewards/margins": 0.22483256459236145, "rewards/rejected": 2.893063545227051, "step": 54670 }, { "epoch": 2.5386508194437996, "grad_norm": 163.6404571533203, "learning_rate": 1.4775337759413157e-07, "logits/chosen": -18.50281524658203, "logits/rejected": -17.930431365966797, "logps/chosen": -423.571533203125, "logps/rejected": -302.5047302246094, "loss": 0.8292, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.079986095428467, "rewards/margins": 0.8072684407234192, "rewards/rejected": 2.2727174758911133, "step": 54680 }, { "epoch": 2.5391150935512328, "grad_norm": 50.698707580566406, "learning_rate": 1.4772552114768559e-07, "logits/chosen": -18.612689971923828, "logits/rejected": -17.986913681030273, "logps/chosen": -408.31878662109375, "logps/rejected": -291.5523376464844, "loss": 0.4386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.577815294265747, "rewards/margins": 1.3544586896896362, "rewards/rejected": 2.223356008529663, "step": 54690 }, { "epoch": 2.5395793676586655, "grad_norm": 22.54799461364746, "learning_rate": 1.476976647012396e-07, "logits/chosen": -19.42978286743164, "logits/rejected": -18.407283782958984, "logps/chosen": -376.98687744140625, "logps/rejected": -306.41949462890625, "loss": 0.2946, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.235574245452881, "rewards/margins": 1.555044412612915, "rewards/rejected": 1.6805299520492554, "step": 54700 }, { "epoch": 2.5400436417660988, "grad_norm": 34.72673797607422, "learning_rate": 1.4766980825479361e-07, "logits/chosen": -18.92400360107422, "logits/rejected": -18.311988830566406, "logps/chosen": -373.5992431640625, "logps/rejected": -265.8324279785156, "loss": 0.391, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9274063110351562, "rewards/margins": 1.7110198736190796, "rewards/rejected": 2.216386079788208, "step": 54710 }, { "epoch": 2.5405079158735315, "grad_norm": 31.31131362915039, "learning_rate": 1.4764195180834766e-07, "logits/chosen": -20.251737594604492, "logits/rejected": -18.56825828552246, "logps/chosen": -433.50311279296875, "logps/rejected": -274.4441223144531, "loss": 0.3342, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.241242408752441, "rewards/margins": 2.8452255725860596, "rewards/rejected": 1.3960168361663818, "step": 54720 }, { "epoch": 2.5409721899809647, "grad_norm": 4.252617835998535, 
"learning_rate": 1.4761409536190167e-07, "logits/chosen": -19.2564640045166, "logits/rejected": -19.525699615478516, "logps/chosen": -341.71624755859375, "logps/rejected": -395.80438232421875, "loss": 0.7636, "rewards/accuracies": 0.5, "rewards/chosen": 3.9733901023864746, "rewards/margins": 0.7585005760192871, "rewards/rejected": 3.2148895263671875, "step": 54730 }, { "epoch": 2.541436464088398, "grad_norm": 10.388701438903809, "learning_rate": 1.4758623891545569e-07, "logits/chosen": -18.710763931274414, "logits/rejected": -17.636280059814453, "logps/chosen": -398.39837646484375, "logps/rejected": -225.451171875, "loss": 0.5715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.601964235305786, "rewards/margins": 2.141418933868408, "rewards/rejected": 1.460545539855957, "step": 54740 }, { "epoch": 2.5419007381958307, "grad_norm": 114.18793487548828, "learning_rate": 1.475583824690097e-07, "logits/chosen": -18.85991859436035, "logits/rejected": -17.786884307861328, "logps/chosen": -531.5448608398438, "logps/rejected": -376.3349609375, "loss": 0.4929, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.434582710266113, "rewards/margins": 1.7579987049102783, "rewards/rejected": 2.676584243774414, "step": 54750 }, { "epoch": 2.542365012303264, "grad_norm": 13.616558074951172, "learning_rate": 1.4753052602256371e-07, "logits/chosen": -18.886390686035156, "logits/rejected": -17.690095901489258, "logps/chosen": -505.09259033203125, "logps/rejected": -373.61834716796875, "loss": 0.3077, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.400528907775879, "rewards/margins": 1.8884403705596924, "rewards/rejected": 2.5120890140533447, "step": 54760 }, { "epoch": 2.5428292864106967, "grad_norm": 22.119354248046875, "learning_rate": 1.4750266957611773e-07, "logits/chosen": -19.371654510498047, "logits/rejected": -18.8621768951416, "logps/chosen": -393.79718017578125, "logps/rejected": -426.0682678222656, "loss": 1.0049, "rewards/accuracies": 0.5, 
"rewards/chosen": 3.6994805335998535, "rewards/margins": 0.01206362247467041, "rewards/rejected": 3.6874167919158936, "step": 54770 }, { "epoch": 2.54329356051813, "grad_norm": 127.51248168945312, "learning_rate": 1.4747481312967174e-07, "logits/chosen": -18.951162338256836, "logits/rejected": -18.72475242614746, "logps/chosen": -351.1305236816406, "logps/rejected": -371.6292724609375, "loss": 1.168, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.737774133682251, "rewards/margins": -0.3186076283454895, "rewards/rejected": 3.0563814640045166, "step": 54780 }, { "epoch": 2.5437578346255627, "grad_norm": 48.10403060913086, "learning_rate": 1.4744695668322576e-07, "logits/chosen": -19.472620010375977, "logits/rejected": -18.790874481201172, "logps/chosen": -496.2731018066406, "logps/rejected": -420.5165100097656, "loss": 0.598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.221652030944824, "rewards/margins": 0.4336835741996765, "rewards/rejected": 3.787968397140503, "step": 54790 }, { "epoch": 2.544222108732996, "grad_norm": 37.53449249267578, "learning_rate": 1.4741910023677977e-07, "logits/chosen": -18.214147567749023, "logits/rejected": -18.22406578063965, "logps/chosen": -311.34991455078125, "logps/rejected": -323.10833740234375, "loss": 1.1566, "rewards/accuracies": 0.5, "rewards/chosen": 3.5017788410186768, "rewards/margins": 0.4234875738620758, "rewards/rejected": 3.078291416168213, "step": 54800 }, { "epoch": 2.544686382840429, "grad_norm": 86.77432250976562, "learning_rate": 1.4739124379033381e-07, "logits/chosen": -18.86612319946289, "logits/rejected": -18.00021743774414, "logps/chosen": -362.8489685058594, "logps/rejected": -296.0986328125, "loss": 0.4439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.402789354324341, "rewards/margins": 1.0707261562347412, "rewards/rejected": 2.3320631980895996, "step": 54810 }, { "epoch": 2.545150656947862, "grad_norm": 45.080772399902344, "learning_rate": 
1.4736338734388783e-07, "logits/chosen": -18.83566665649414, "logits/rejected": -17.981769561767578, "logps/chosen": -390.2305603027344, "logps/rejected": -278.03955078125, "loss": 0.3579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8671391010284424, "rewards/margins": 1.632994294166565, "rewards/rejected": 2.2341442108154297, "step": 54820 }, { "epoch": 2.545614931055295, "grad_norm": 15.649236679077148, "learning_rate": 1.4733553089744184e-07, "logits/chosen": -18.544288635253906, "logits/rejected": -17.893796920776367, "logps/chosen": -250.31539916992188, "logps/rejected": -217.64816284179688, "loss": 0.6742, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6767516136169434, "rewards/margins": 1.0948798656463623, "rewards/rejected": 1.581871747970581, "step": 54830 }, { "epoch": 2.5460792051627283, "grad_norm": 28.272769927978516, "learning_rate": 1.4730767445099586e-07, "logits/chosen": -19.30075454711914, "logits/rejected": -18.411664962768555, "logps/chosen": -415.1068420410156, "logps/rejected": -349.5378112792969, "loss": 0.5155, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.106792449951172, "rewards/margins": 1.3996822834014893, "rewards/rejected": 2.7071099281311035, "step": 54840 }, { "epoch": 2.546543479270161, "grad_norm": 67.4146728515625, "learning_rate": 1.4727981800454987e-07, "logits/chosen": -19.163219451904297, "logits/rejected": -17.663494110107422, "logps/chosen": -403.0707092285156, "logps/rejected": -273.31060791015625, "loss": 0.3586, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.056633472442627, "rewards/margins": 2.010610580444336, "rewards/rejected": 2.046022653579712, "step": 54850 }, { "epoch": 2.547007753377594, "grad_norm": 45.479122161865234, "learning_rate": 1.4725196155810389e-07, "logits/chosen": -18.74393653869629, "logits/rejected": -17.829822540283203, "logps/chosen": -472.1748046875, "logps/rejected": -424.072265625, "loss": 0.5966, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.774383068084717, "rewards/margins": 0.586550235748291, "rewards/rejected": 3.187833309173584, "step": 54860 }, { "epoch": 2.547472027485027, "grad_norm": 2.4296844005584717, "learning_rate": 1.4722410511165793e-07, "logits/chosen": -18.698591232299805, "logits/rejected": -17.79112434387207, "logps/chosen": -341.4749755859375, "logps/rejected": -246.4760284423828, "loss": 0.3441, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.443848133087158, "rewards/margins": 1.753308892250061, "rewards/rejected": 1.6905393600463867, "step": 54870 }, { "epoch": 2.5479363015924603, "grad_norm": 0.2244853526353836, "learning_rate": 1.4719624866521194e-07, "logits/chosen": -18.548511505126953, "logits/rejected": -17.026073455810547, "logps/chosen": -397.31683349609375, "logps/rejected": -221.25479125976562, "loss": 0.368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8852293491363525, "rewards/margins": 1.7332115173339844, "rewards/rejected": 1.1520180702209473, "step": 54880 }, { "epoch": 2.548400575699893, "grad_norm": 81.24250793457031, "learning_rate": 1.4716839221876596e-07, "logits/chosen": -18.200458526611328, "logits/rejected": -17.56984519958496, "logps/chosen": -432.378662109375, "logps/rejected": -302.27899169921875, "loss": 0.7685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.746164321899414, "rewards/margins": 1.210986852645874, "rewards/rejected": 2.535177707672119, "step": 54890 }, { "epoch": 2.5488648498073263, "grad_norm": 111.12193298339844, "learning_rate": 1.4714053577231997e-07, "logits/chosen": -18.735469818115234, "logits/rejected": -17.431987762451172, "logps/chosen": -468.354736328125, "logps/rejected": -344.1695861816406, "loss": 0.327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7531368732452393, "rewards/margins": 1.814039945602417, "rewards/rejected": 1.939096450805664, "step": 54900 }, { "epoch": 2.5493291239147595, "grad_norm": 83.53047943115234, 
"learning_rate": 1.4711267932587399e-07, "logits/chosen": -19.135555267333984, "logits/rejected": -18.528057098388672, "logps/chosen": -464.938232421875, "logps/rejected": -368.8035888671875, "loss": 0.5285, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9530558586120605, "rewards/margins": 1.292364478111267, "rewards/rejected": 2.660691261291504, "step": 54910 }, { "epoch": 2.5497933980221923, "grad_norm": 41.59426498413086, "learning_rate": 1.47084822879428e-07, "logits/chosen": -19.312501907348633, "logits/rejected": -19.05582618713379, "logps/chosen": -353.4728088378906, "logps/rejected": -361.0267639160156, "loss": 0.7854, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9179444313049316, "rewards/margins": 0.5090649724006653, "rewards/rejected": 2.408879041671753, "step": 54920 }, { "epoch": 2.550257672129625, "grad_norm": 2.5803842544555664, "learning_rate": 1.4705696643298204e-07, "logits/chosen": -19.844022750854492, "logits/rejected": -18.293848037719727, "logps/chosen": -419.8953552246094, "logps/rejected": -372.3382263183594, "loss": 0.4315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.051050186157227, "rewards/margins": 1.5989149808883667, "rewards/rejected": 2.4521355628967285, "step": 54930 }, { "epoch": 2.5507219462370583, "grad_norm": 97.248291015625, "learning_rate": 1.4703189563118064e-07, "logits/chosen": -19.92377471923828, "logits/rejected": -18.522533416748047, "logps/chosen": -420.81256103515625, "logps/rejected": -375.27386474609375, "loss": 1.514, "rewards/accuracies": 0.5, "rewards/chosen": 2.9462170600891113, "rewards/margins": -0.27813178300857544, "rewards/rejected": 3.224348783493042, "step": 54940 }, { "epoch": 2.5511862203444915, "grad_norm": 66.20712280273438, "learning_rate": 1.4700403918473465e-07, "logits/chosen": -18.53181266784668, "logits/rejected": -18.603384017944336, "logps/chosen": -453.0665588378906, "logps/rejected": -358.90753173828125, "loss": 0.5152, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.605374574661255, "rewards/margins": 1.26186203956604, "rewards/rejected": 2.3435120582580566, "step": 54950 }, { "epoch": 2.5516504944519243, "grad_norm": 48.51811218261719, "learning_rate": 1.469761827382887e-07, "logits/chosen": -19.575414657592773, "logits/rejected": -19.386255264282227, "logps/chosen": -275.02215576171875, "logps/rejected": -236.63461303710938, "loss": 0.4191, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.477477550506592, "rewards/margins": 1.1525620222091675, "rewards/rejected": 1.3249156475067139, "step": 54960 }, { "epoch": 2.5521147685593575, "grad_norm": 23.150537490844727, "learning_rate": 1.469483262918427e-07, "logits/chosen": -20.397878646850586, "logits/rejected": -19.529460906982422, "logps/chosen": -392.6285705566406, "logps/rejected": -337.3710021972656, "loss": 1.2865, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5274899005889893, "rewards/margins": 0.20264141261577606, "rewards/rejected": 3.324848175048828, "step": 54970 }, { "epoch": 2.5525790426667907, "grad_norm": 35.961830139160156, "learning_rate": 1.4692046984539672e-07, "logits/chosen": -18.382877349853516, "logits/rejected": -17.63222885131836, "logps/chosen": -384.1370849609375, "logps/rejected": -288.19525146484375, "loss": 0.5566, "rewards/accuracies": 0.5, "rewards/chosen": 3.1320784091949463, "rewards/margins": 0.7858799695968628, "rewards/rejected": 2.346198081970215, "step": 54980 }, { "epoch": 2.5530433167742235, "grad_norm": 55.25058364868164, "learning_rate": 1.4689261339895073e-07, "logits/chosen": -19.04145622253418, "logits/rejected": -19.022247314453125, "logps/chosen": -310.2272644042969, "logps/rejected": -238.20413208007812, "loss": 1.0661, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7451424598693848, "rewards/margins": 0.32076510787010193, "rewards/rejected": 2.42437744140625, "step": 54990 }, { "epoch": 2.5535075908816567, "grad_norm": 94.26124572753906, 
"learning_rate": 1.4686475695250475e-07, "logits/chosen": -20.537338256835938, "logits/rejected": -19.118183135986328, "logps/chosen": -426.597900390625, "logps/rejected": -334.8335266113281, "loss": 0.3565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.124786376953125, "rewards/margins": 1.6155321598052979, "rewards/rejected": 2.509253978729248, "step": 55000 }, { "epoch": 2.5539718649890895, "grad_norm": 104.73685455322266, "learning_rate": 1.4683690050605876e-07, "logits/chosen": -19.738204956054688, "logits/rejected": -19.340282440185547, "logps/chosen": -385.5555725097656, "logps/rejected": -337.3741760253906, "loss": 0.7953, "rewards/accuracies": 0.5, "rewards/chosen": 3.154226064682007, "rewards/margins": 0.7539057731628418, "rewards/rejected": 2.400320053100586, "step": 55010 }, { "epoch": 2.5544361390965227, "grad_norm": 33.34807205200195, "learning_rate": 1.468090440596128e-07, "logits/chosen": -18.84372901916504, "logits/rejected": -18.511211395263672, "logps/chosen": -426.1278381347656, "logps/rejected": -418.3382873535156, "loss": 1.3261, "rewards/accuracies": 0.5, "rewards/chosen": 3.0688869953155518, "rewards/margins": -0.01988346502184868, "rewards/rejected": 3.0887703895568848, "step": 55020 }, { "epoch": 2.5549004132039554, "grad_norm": 2.886481285095215, "learning_rate": 1.4678118761316682e-07, "logits/chosen": -19.058513641357422, "logits/rejected": -19.117429733276367, "logps/chosen": -408.2234802246094, "logps/rejected": -430.41546630859375, "loss": 1.5605, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0492680072784424, "rewards/margins": -0.4077354371547699, "rewards/rejected": 3.457003116607666, "step": 55030 }, { "epoch": 2.5553646873113887, "grad_norm": 57.989322662353516, "learning_rate": 1.467533311667208e-07, "logits/chosen": -18.65175437927246, "logits/rejected": -18.781719207763672, "logps/chosen": -386.95416259765625, "logps/rejected": -387.0751953125, "loss": 0.7743, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.2492191791534424, "rewards/margins": 0.6085883378982544, "rewards/rejected": 2.6406304836273193, "step": 55040 }, { "epoch": 2.555828961418822, "grad_norm": 73.01202392578125, "learning_rate": 1.4672547472027485e-07, "logits/chosen": -18.60088539123535, "logits/rejected": -18.652294158935547, "logps/chosen": -262.1835021972656, "logps/rejected": -212.8894500732422, "loss": 0.7236, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.128382444381714, "rewards/margins": 0.6926336884498596, "rewards/rejected": 1.4357491731643677, "step": 55050 }, { "epoch": 2.5562932355262546, "grad_norm": 166.2315216064453, "learning_rate": 1.4669761827382886e-07, "logits/chosen": -18.767818450927734, "logits/rejected": -18.022111892700195, "logps/chosen": -501.2096252441406, "logps/rejected": -435.52001953125, "loss": 0.3149, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.739332675933838, "rewards/margins": 2.205066204071045, "rewards/rejected": 2.534266710281372, "step": 55060 }, { "epoch": 2.556757509633688, "grad_norm": 57.95866775512695, "learning_rate": 1.4666976182738288e-07, "logits/chosen": -20.348350524902344, "logits/rejected": -19.179697036743164, "logps/chosen": -449.8401794433594, "logps/rejected": -349.53076171875, "loss": 0.3812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.860987663269043, "rewards/margins": 1.7678743600845337, "rewards/rejected": 3.0931131839752197, "step": 55070 }, { "epoch": 2.5572217837411206, "grad_norm": 72.20748901367188, "learning_rate": 1.466419053809369e-07, "logits/chosen": -19.475360870361328, "logits/rejected": -18.720439910888672, "logps/chosen": -401.04595947265625, "logps/rejected": -339.3066711425781, "loss": 0.7534, "rewards/accuracies": 0.5, "rewards/chosen": 3.687905788421631, "rewards/margins": 0.5445660948753357, "rewards/rejected": 3.1433398723602295, "step": 55080 }, { "epoch": 2.557686057848554, "grad_norm": 30.924745559692383, "learning_rate": 
1.466140489344909e-07, "logits/chosen": -18.387710571289062, "logits/rejected": -17.766796112060547, "logps/chosen": -371.9634704589844, "logps/rejected": -263.14410400390625, "loss": 0.5738, "rewards/accuracies": 0.5, "rewards/chosen": 2.694396495819092, "rewards/margins": 0.9144784808158875, "rewards/rejected": 1.7799180746078491, "step": 55090 }, { "epoch": 2.5581503319559866, "grad_norm": 77.88851928710938, "learning_rate": 1.4658619248804492e-07, "logits/chosen": -18.896400451660156, "logits/rejected": -18.97048568725586, "logps/chosen": -344.7579650878906, "logps/rejected": -361.51434326171875, "loss": 0.598, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.228689670562744, "rewards/margins": 0.48028892278671265, "rewards/rejected": 2.748401165008545, "step": 55100 }, { "epoch": 2.55861460606342, "grad_norm": 39.701881408691406, "learning_rate": 1.4655833604159896e-07, "logits/chosen": -18.06144142150879, "logits/rejected": -17.569011688232422, "logps/chosen": -289.69903564453125, "logps/rejected": -280.65142822265625, "loss": 0.8259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3569800853729248, "rewards/margins": 0.3293226361274719, "rewards/rejected": 1.0276575088500977, "step": 55110 }, { "epoch": 2.559078880170853, "grad_norm": 165.8187255859375, "learning_rate": 1.4653047959515298e-07, "logits/chosen": -18.161169052124023, "logits/rejected": -18.510095596313477, "logps/chosen": -324.4275817871094, "logps/rejected": -383.736083984375, "loss": 1.4341, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.597074031829834, "rewards/margins": -0.558705747127533, "rewards/rejected": 3.1557798385620117, "step": 55120 }, { "epoch": 2.559543154278286, "grad_norm": 1.697323203086853, "learning_rate": 1.46502623148707e-07, "logits/chosen": -18.175561904907227, "logits/rejected": -17.2353572845459, "logps/chosen": -419.4908752441406, "logps/rejected": -269.1680908203125, "loss": 0.4503, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 3.4430408477783203, "rewards/margins": 1.4332451820373535, "rewards/rejected": 2.009795665740967, "step": 55130 }, { "epoch": 2.560007428385719, "grad_norm": 58.01288986206055, "learning_rate": 1.46474766702261e-07, "logits/chosen": -19.183067321777344, "logits/rejected": -19.022279739379883, "logps/chosen": -408.4957580566406, "logps/rejected": -380.2794494628906, "loss": 0.2511, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.115607261657715, "rewards/margins": 2.1105902194976807, "rewards/rejected": 2.005017042160034, "step": 55140 }, { "epoch": 2.560471702493152, "grad_norm": 31.209657669067383, "learning_rate": 1.4644691025581502e-07, "logits/chosen": -19.337831497192383, "logits/rejected": -18.30396842956543, "logps/chosen": -337.53778076171875, "logps/rejected": -236.0876922607422, "loss": 0.4741, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.417778491973877, "rewards/margins": 0.9068790674209595, "rewards/rejected": 1.5108997821807861, "step": 55150 }, { "epoch": 2.560935976600585, "grad_norm": 124.39652252197266, "learning_rate": 1.4641905380936903e-07, "logits/chosen": -17.439533233642578, "logits/rejected": -17.943721771240234, "logps/chosen": -353.5765380859375, "logps/rejected": -366.1914978027344, "loss": 1.1108, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6901729106903076, "rewards/margins": 0.1942909061908722, "rewards/rejected": 2.495882034301758, "step": 55160 }, { "epoch": 2.561400250708018, "grad_norm": 39.556217193603516, "learning_rate": 1.4639119736292308e-07, "logits/chosen": -18.22671890258789, "logits/rejected": -17.921772003173828, "logps/chosen": -387.1251220703125, "logps/rejected": -270.76220703125, "loss": 0.4543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8636579513549805, "rewards/margins": 1.3176820278167725, "rewards/rejected": 1.5459758043289185, "step": 55170 }, { "epoch": 2.561864524815451, "grad_norm": 39.69279479980469, "learning_rate": 
1.463633409164771e-07, "logits/chosen": -19.626724243164062, "logits/rejected": -18.39426612854004, "logps/chosen": -348.633544921875, "logps/rejected": -206.7547149658203, "loss": 0.5237, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4823527336120605, "rewards/margins": 1.3326388597488403, "rewards/rejected": 1.1497141122817993, "step": 55180 }, { "epoch": 2.5623287989228842, "grad_norm": 60.47482681274414, "learning_rate": 1.463354844700311e-07, "logits/chosen": -20.3472900390625, "logits/rejected": -19.905000686645508, "logps/chosen": -397.1462097167969, "logps/rejected": -384.3780212402344, "loss": 0.9747, "rewards/accuracies": 0.5, "rewards/chosen": 2.791132688522339, "rewards/margins": 0.028875375166535378, "rewards/rejected": 2.7622573375701904, "step": 55190 }, { "epoch": 2.562793073030317, "grad_norm": 54.75709533691406, "learning_rate": 1.4630762802358512e-07, "logits/chosen": -18.542055130004883, "logits/rejected": -17.554819107055664, "logps/chosen": -329.6703796386719, "logps/rejected": -237.43319702148438, "loss": 0.5124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3629767894744873, "rewards/margins": 1.0481884479522705, "rewards/rejected": 1.3147884607315063, "step": 55200 }, { "epoch": 2.56325734713775, "grad_norm": 104.86279296875, "learning_rate": 1.4627977157713913e-07, "logits/chosen": -18.546688079833984, "logits/rejected": -19.230600357055664, "logps/chosen": -305.8004150390625, "logps/rejected": -379.442626953125, "loss": 1.0995, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6107280254364014, "rewards/margins": -0.42673221230506897, "rewards/rejected": 3.0374598503112793, "step": 55210 }, { "epoch": 2.563721621245183, "grad_norm": 46.80586242675781, "learning_rate": 1.4625191513069315e-07, "logits/chosen": -19.254552841186523, "logits/rejected": -17.260324478149414, "logps/chosen": -463.14276123046875, "logps/rejected": -250.01724243164062, "loss": 0.2492, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 4.640746593475342, "rewards/margins": 2.96726393699646, "rewards/rejected": 1.6734826564788818, "step": 55220 }, { "epoch": 2.564185895352616, "grad_norm": 46.84164810180664, "learning_rate": 1.4622405868424716e-07, "logits/chosen": -19.998746871948242, "logits/rejected": -19.660085678100586, "logps/chosen": -340.6866760253906, "logps/rejected": -334.11614990234375, "loss": 0.6434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1991755962371826, "rewards/margins": 0.7554382085800171, "rewards/rejected": 2.443737506866455, "step": 55230 }, { "epoch": 2.564650169460049, "grad_norm": 2.0123908519744873, "learning_rate": 1.461962022378012e-07, "logits/chosen": -18.502073287963867, "logits/rejected": -18.61346435546875, "logps/chosen": -332.17669677734375, "logps/rejected": -257.37408447265625, "loss": 1.0393, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2706029415130615, "rewards/margins": 0.3839855194091797, "rewards/rejected": 1.8866173028945923, "step": 55240 }, { "epoch": 2.565114443567482, "grad_norm": 168.3221893310547, "learning_rate": 1.461683457913552e-07, "logits/chosen": -19.549814224243164, "logits/rejected": -19.179920196533203, "logps/chosen": -371.50946044921875, "logps/rejected": -328.4700012207031, "loss": 0.7728, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.190293073654175, "rewards/margins": 0.510692298412323, "rewards/rejected": 2.679600954055786, "step": 55250 }, { "epoch": 2.5655787176749154, "grad_norm": 162.03489685058594, "learning_rate": 1.4614048934490923e-07, "logits/chosen": -18.476158142089844, "logits/rejected": -17.365612030029297, "logps/chosen": -359.64703369140625, "logps/rejected": -225.12728881835938, "loss": 0.7893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5259430408477783, "rewards/margins": 0.9747869372367859, "rewards/rejected": 1.5511561632156372, "step": 55260 }, { "epoch": 2.566042991782348, "grad_norm": 
199.45956420898438, "learning_rate": 1.4611263289846325e-07, "logits/chosen": -19.381038665771484, "logits/rejected": -17.768041610717773, "logps/chosen": -356.8829345703125, "logps/rejected": -302.5458679199219, "loss": 0.5339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.013550758361816, "rewards/margins": 2.3158881664276123, "rewards/rejected": 2.697661876678467, "step": 55270 }, { "epoch": 2.5665072658897814, "grad_norm": 0.7465479969978333, "learning_rate": 1.4608477645201726e-07, "logits/chosen": -18.34304428100586, "logits/rejected": -18.86073112487793, "logps/chosen": -442.5626525878906, "logps/rejected": -373.9156188964844, "loss": 0.79, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3313682079315186, "rewards/margins": 1.0589735507965088, "rewards/rejected": 2.2723946571350098, "step": 55280 }, { "epoch": 2.5669715399972146, "grad_norm": 14.827669143676758, "learning_rate": 1.4605692000557128e-07, "logits/chosen": -19.35201644897461, "logits/rejected": -18.916202545166016, "logps/chosen": -492.1509704589844, "logps/rejected": -370.89630126953125, "loss": 0.6646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.986243486404419, "rewards/margins": 1.018524408340454, "rewards/rejected": 2.967719316482544, "step": 55290 }, { "epoch": 2.5674358141046474, "grad_norm": 84.76417541503906, "learning_rate": 1.460290635591253e-07, "logits/chosen": -17.746511459350586, "logits/rejected": -18.324438095092773, "logps/chosen": -272.99188232421875, "logps/rejected": -295.44744873046875, "loss": 1.4391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.754911422729492, "rewards/margins": -0.054609037935733795, "rewards/rejected": 2.8095204830169678, "step": 55300 }, { "epoch": 2.56790008821208, "grad_norm": 174.67477416992188, "learning_rate": 1.460012071126793e-07, "logits/chosen": -20.36829948425293, "logits/rejected": -18.536174774169922, "logps/chosen": -466.72137451171875, "logps/rejected": -293.77313232421875, 
"loss": 0.6224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.190459728240967, "rewards/margins": 1.8081703186035156, "rewards/rejected": 2.3822898864746094, "step": 55310 }, { "epoch": 2.5683643623195134, "grad_norm": 88.30501556396484, "learning_rate": 1.4597335066623335e-07, "logits/chosen": -18.136913299560547, "logits/rejected": -18.05219268798828, "logps/chosen": -265.03619384765625, "logps/rejected": -219.68093872070312, "loss": 1.0006, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.280956745147705, "rewards/margins": 0.5596283674240112, "rewards/rejected": 1.7213283777236938, "step": 55320 }, { "epoch": 2.5688286364269466, "grad_norm": 184.8839111328125, "learning_rate": 1.4594549421978736e-07, "logits/chosen": -18.822063446044922, "logits/rejected": -18.35765266418457, "logps/chosen": -335.9690856933594, "logps/rejected": -242.60885620117188, "loss": 0.8647, "rewards/accuracies": 0.5, "rewards/chosen": 2.9503424167633057, "rewards/margins": 0.5149165987968445, "rewards/rejected": 2.4354259967803955, "step": 55330 }, { "epoch": 2.5692929105343794, "grad_norm": 1.6654870510101318, "learning_rate": 1.4591763777334138e-07, "logits/chosen": -18.16663932800293, "logits/rejected": -17.218963623046875, "logps/chosen": -390.2887268066406, "logps/rejected": -260.08062744140625, "loss": 0.4628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6369614601135254, "rewards/margins": 2.216665744781494, "rewards/rejected": 1.4202957153320312, "step": 55340 }, { "epoch": 2.5697571846418126, "grad_norm": 21.457677841186523, "learning_rate": 1.458897813268954e-07, "logits/chosen": -19.444013595581055, "logits/rejected": -19.364931106567383, "logps/chosen": -292.1448669433594, "logps/rejected": -307.23468017578125, "loss": 0.9205, "rewards/accuracies": 0.5, "rewards/chosen": 2.7574167251586914, "rewards/margins": 0.15223096311092377, "rewards/rejected": 2.6051857471466064, "step": 55350 }, { "epoch": 2.570221458749246, "grad_norm": 
0.3012205958366394, "learning_rate": 1.458619248804494e-07, "logits/chosen": -19.83843421936035, "logits/rejected": -19.310760498046875, "logps/chosen": -508.94989013671875, "logps/rejected": -381.71417236328125, "loss": 0.2634, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8470661640167236, "rewards/margins": 2.0812203884124756, "rewards/rejected": 1.7658456563949585, "step": 55360 }, { "epoch": 2.5706857328566786, "grad_norm": 120.98992919921875, "learning_rate": 1.4583406843400342e-07, "logits/chosen": -18.405187606811523, "logits/rejected": -17.087963104248047, "logps/chosen": -453.3858337402344, "logps/rejected": -315.1285095214844, "loss": 0.5008, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9999046325683594, "rewards/margins": 2.1229677200317383, "rewards/rejected": 1.8769372701644897, "step": 55370 }, { "epoch": 2.5711500069641113, "grad_norm": 11.239458084106445, "learning_rate": 1.4580621198755746e-07, "logits/chosen": -19.153553009033203, "logits/rejected": -18.061542510986328, "logps/chosen": -342.0863342285156, "logps/rejected": -295.80877685546875, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": 3.602513074874878, "rewards/margins": 1.0363689661026, "rewards/rejected": 2.566143751144409, "step": 55380 }, { "epoch": 2.5716142810715445, "grad_norm": 93.52924346923828, "learning_rate": 1.4577835554111147e-07, "logits/chosen": -19.660476684570312, "logits/rejected": -18.731761932373047, "logps/chosen": -404.3187561035156, "logps/rejected": -341.69720458984375, "loss": 0.6453, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.845460891723633, "rewards/margins": 1.2596170902252197, "rewards/rejected": 2.585843801498413, "step": 55390 }, { "epoch": 2.5720785551789778, "grad_norm": 38.90214920043945, "learning_rate": 1.457504990946655e-07, "logits/chosen": -19.683263778686523, "logits/rejected": -18.20207405090332, "logps/chosen": -366.99346923828125, "logps/rejected": -315.28277587890625, "loss": 
0.7987, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3800621032714844, "rewards/margins": 1.7026875019073486, "rewards/rejected": 1.6773748397827148, "step": 55400 }, { "epoch": 2.5725428292864105, "grad_norm": 1.6976350545883179, "learning_rate": 1.457226426482195e-07, "logits/chosen": -19.4626522064209, "logits/rejected": -18.446561813354492, "logps/chosen": -433.63653564453125, "logps/rejected": -364.53729248046875, "loss": 0.7243, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3587117195129395, "rewards/margins": 0.9257699251174927, "rewards/rejected": 3.432941436767578, "step": 55410 }, { "epoch": 2.5730071033938438, "grad_norm": 48.606056213378906, "learning_rate": 1.4569478620177352e-07, "logits/chosen": -18.723072052001953, "logits/rejected": -18.534870147705078, "logps/chosen": -466.82781982421875, "logps/rejected": -377.60870361328125, "loss": 0.8364, "rewards/accuracies": 0.5, "rewards/chosen": 3.5049374103546143, "rewards/margins": 0.5298231840133667, "rewards/rejected": 2.975114107131958, "step": 55420 }, { "epoch": 2.573471377501277, "grad_norm": 5.9598774909973145, "learning_rate": 1.4566692975532753e-07, "logits/chosen": -18.864864349365234, "logits/rejected": -18.914718627929688, "logps/chosen": -440.90509033203125, "logps/rejected": -441.74981689453125, "loss": 0.8913, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.575099468231201, "rewards/margins": 0.34105008840560913, "rewards/rejected": 4.2340497970581055, "step": 55430 }, { "epoch": 2.5739356516087097, "grad_norm": 44.29386520385742, "learning_rate": 1.4563907330888155e-07, "logits/chosen": -18.94374656677246, "logits/rejected": -18.248849868774414, "logps/chosen": -392.62554931640625, "logps/rejected": -318.79473876953125, "loss": 0.9106, "rewards/accuracies": 0.5, "rewards/chosen": 2.6038310527801514, "rewards/margins": 0.25695157051086426, "rewards/rejected": 2.346879482269287, "step": 55440 }, { "epoch": 2.574399925716143, "grad_norm": 
0.10993313044309616, "learning_rate": 1.456112168624356e-07, "logits/chosen": -19.174053192138672, "logits/rejected": -19.183469772338867, "logps/chosen": -293.45367431640625, "logps/rejected": -261.49395751953125, "loss": 0.5862, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8688249588012695, "rewards/margins": 1.473533272743225, "rewards/rejected": 1.3952915668487549, "step": 55450 }, { "epoch": 2.5748641998235757, "grad_norm": 56.2006950378418, "learning_rate": 1.4558336041598958e-07, "logits/chosen": -19.235301971435547, "logits/rejected": -18.912168502807617, "logps/chosen": -517.984375, "logps/rejected": -465.446533203125, "loss": 0.4069, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4614930152893066, "rewards/margins": 1.0938780307769775, "rewards/rejected": 2.36761474609375, "step": 55460 }, { "epoch": 2.575328473931009, "grad_norm": 32.55391311645508, "learning_rate": 1.4555550396954362e-07, "logits/chosen": -18.837940216064453, "logits/rejected": -17.719097137451172, "logps/chosen": -400.16461181640625, "logps/rejected": -279.23974609375, "loss": 0.3623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.963670015335083, "rewards/margins": 1.3890669345855713, "rewards/rejected": 1.5746030807495117, "step": 55470 }, { "epoch": 2.5757927480384417, "grad_norm": 11.501219749450684, "learning_rate": 1.4552764752309763e-07, "logits/chosen": -18.66325569152832, "logits/rejected": -17.78041648864746, "logps/chosen": -453.8063049316406, "logps/rejected": -350.57470703125, "loss": 0.4292, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.238894939422607, "rewards/margins": 1.6892181634902954, "rewards/rejected": 2.5496766567230225, "step": 55480 }, { "epoch": 2.576257022145875, "grad_norm": 94.05462646484375, "learning_rate": 1.4549979107665165e-07, "logits/chosen": -18.344289779663086, "logits/rejected": -18.663230895996094, "logps/chosen": -273.739990234375, "logps/rejected": -292.4551086425781, "loss": 1.1752, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5725958347320557, "rewards/margins": -0.3137633800506592, "rewards/rejected": 1.8863592147827148, "step": 55490 }, { "epoch": 2.576721296253308, "grad_norm": 29.25762367248535, "learning_rate": 1.4547193463020566e-07, "logits/chosen": -19.116262435913086, "logits/rejected": -18.155447006225586, "logps/chosen": -345.2691955566406, "logps/rejected": -268.29949951171875, "loss": 0.779, "rewards/accuracies": 0.5, "rewards/chosen": 3.3397583961486816, "rewards/margins": 0.9447635412216187, "rewards/rejected": 2.3949949741363525, "step": 55500 }, { "epoch": 2.577185570360741, "grad_norm": 0.1053897887468338, "learning_rate": 1.4544407818375968e-07, "logits/chosen": -18.872570037841797, "logits/rejected": -17.15757179260254, "logps/chosen": -480.54022216796875, "logps/rejected": -248.8990478515625, "loss": 0.207, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.8897881507873535, "rewards/margins": 3.4493680000305176, "rewards/rejected": 1.4404202699661255, "step": 55510 }, { "epoch": 2.577649844468174, "grad_norm": 87.5992660522461, "learning_rate": 1.454162217373137e-07, "logits/chosen": -18.852588653564453, "logits/rejected": -18.75370979309082, "logps/chosen": -375.07904052734375, "logps/rejected": -370.539794921875, "loss": 1.1727, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3273239135742188, "rewards/margins": -0.4807473123073578, "rewards/rejected": 3.8080711364746094, "step": 55520 }, { "epoch": 2.578114118575607, "grad_norm": 46.14735412597656, "learning_rate": 1.4538836529086773e-07, "logits/chosen": -18.68562889099121, "logits/rejected": -18.096881866455078, "logps/chosen": -434.89263916015625, "logps/rejected": -373.011474609375, "loss": 0.8679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.878391981124878, "rewards/margins": 0.49295830726623535, "rewards/rejected": 3.3854339122772217, "step": 55530 }, { "epoch": 2.57857839268304, "grad_norm": 
3.3024492263793945, "learning_rate": 1.4536050884442175e-07, "logits/chosen": -18.7668399810791, "logits/rejected": -17.944793701171875, "logps/chosen": -284.5279235839844, "logps/rejected": -242.61264038085938, "loss": 0.6862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.477167844772339, "rewards/margins": 0.9687294960021973, "rewards/rejected": 1.5084384679794312, "step": 55540 }, { "epoch": 2.579042666790473, "grad_norm": 17.401823043823242, "learning_rate": 1.4533265239797576e-07, "logits/chosen": -19.44486427307129, "logits/rejected": -19.447444915771484, "logps/chosen": -385.5745544433594, "logps/rejected": -372.22882080078125, "loss": 0.7623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9110703468322754, "rewards/margins": 0.6759457588195801, "rewards/rejected": 2.2351245880126953, "step": 55550 }, { "epoch": 2.579506940897906, "grad_norm": 49.99180603027344, "learning_rate": 1.4530479595152977e-07, "logits/chosen": -18.83388328552246, "logits/rejected": -18.703916549682617, "logps/chosen": -365.239501953125, "logps/rejected": -336.9896545410156, "loss": 0.652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.683856964111328, "rewards/margins": 0.43040022253990173, "rewards/rejected": 3.2534565925598145, "step": 55560 }, { "epoch": 2.5799712150053393, "grad_norm": 86.4771957397461, "learning_rate": 1.452769395050838e-07, "logits/chosen": -18.636425018310547, "logits/rejected": -18.32896614074707, "logps/chosen": -298.0069885253906, "logps/rejected": -314.10186767578125, "loss": 0.8263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7955596446990967, "rewards/margins": 0.6801648139953613, "rewards/rejected": 2.1153950691223145, "step": 55570 }, { "epoch": 2.580435489112772, "grad_norm": 88.48968505859375, "learning_rate": 1.452490830586378e-07, "logits/chosen": -19.695098876953125, "logits/rejected": -19.280277252197266, "logps/chosen": -330.6254577636719, "logps/rejected": -302.807861328125, "loss": 
0.3646, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3923592567443848, "rewards/margins": 1.2319170236587524, "rewards/rejected": 2.1604418754577637, "step": 55580 }, { "epoch": 2.5808997632202053, "grad_norm": 32.605831146240234, "learning_rate": 1.4522122661219184e-07, "logits/chosen": -18.516727447509766, "logits/rejected": -17.534622192382812, "logps/chosen": -305.40484619140625, "logps/rejected": -317.9655456542969, "loss": 1.0838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5931217670440674, "rewards/margins": 0.7120312452316284, "rewards/rejected": 2.8810904026031494, "step": 55590 }, { "epoch": 2.581364037327638, "grad_norm": 220.72659301757812, "learning_rate": 1.4519337016574586e-07, "logits/chosen": -18.964664459228516, "logits/rejected": -18.402616500854492, "logps/chosen": -340.33160400390625, "logps/rejected": -385.6778869628906, "loss": 1.6432, "rewards/accuracies": 0.5, "rewards/chosen": 3.2659332752227783, "rewards/margins": -0.40579527616500854, "rewards/rejected": 3.6717281341552734, "step": 55600 }, { "epoch": 2.5818283114350713, "grad_norm": 1.0543403625488281, "learning_rate": 1.4516551371929987e-07, "logits/chosen": -19.237939834594727, "logits/rejected": -18.3045711517334, "logps/chosen": -365.4173278808594, "logps/rejected": -254.2435760498047, "loss": 0.484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.167382001876831, "rewards/margins": 1.3664582967758179, "rewards/rejected": 1.8009235858917236, "step": 55610 }, { "epoch": 2.582292585542504, "grad_norm": 41.911102294921875, "learning_rate": 1.451376572728539e-07, "logits/chosen": -19.394702911376953, "logits/rejected": -18.319339752197266, "logps/chosen": -355.36749267578125, "logps/rejected": -292.10589599609375, "loss": 0.3784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8184332847595215, "rewards/margins": 1.745998740196228, "rewards/rejected": 2.072434902191162, "step": 55620 }, { "epoch": 2.5827568596499373, 
"grad_norm": 93.20834350585938, "learning_rate": 1.451098008264079e-07, "logits/chosen": -18.700347900390625, "logits/rejected": -17.96456527709961, "logps/chosen": -447.59820556640625, "logps/rejected": -371.18841552734375, "loss": 0.4238, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.075495719909668, "rewards/margins": 1.0982062816619873, "rewards/rejected": 2.9772891998291016, "step": 55630 }, { "epoch": 2.5832211337573705, "grad_norm": 29.191282272338867, "learning_rate": 1.4508194437996192e-07, "logits/chosen": -19.256683349609375, "logits/rejected": -18.155601501464844, "logps/chosen": -440.34710693359375, "logps/rejected": -387.3663635253906, "loss": 0.6764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9021573066711426, "rewards/margins": 1.1707109212875366, "rewards/rejected": 2.7314462661743164, "step": 55640 }, { "epoch": 2.5836854078648033, "grad_norm": 29.56975746154785, "learning_rate": 1.4505408793351593e-07, "logits/chosen": -19.705158233642578, "logits/rejected": -19.15768814086914, "logps/chosen": -417.08966064453125, "logps/rejected": -363.439697265625, "loss": 0.3705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.865509033203125, "rewards/margins": 1.302303671836853, "rewards/rejected": 2.5632052421569824, "step": 55650 }, { "epoch": 2.5841496819722365, "grad_norm": 148.8583221435547, "learning_rate": 1.4502623148706997e-07, "logits/chosen": -18.856224060058594, "logits/rejected": -17.95876693725586, "logps/chosen": -469.45465087890625, "logps/rejected": -367.7762145996094, "loss": 0.853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.4763875007629395, "rewards/margins": 1.4406886100769043, "rewards/rejected": 3.035698890686035, "step": 55660 }, { "epoch": 2.5846139560796697, "grad_norm": 5.44141960144043, "learning_rate": 1.4499837504062396e-07, "logits/chosen": -19.182344436645508, "logits/rejected": -18.87388801574707, "logps/chosen": -397.42132568359375, "logps/rejected": 
-346.87396240234375, "loss": 1.0196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7604289054870605, "rewards/margins": 0.4210754334926605, "rewards/rejected": 3.3393540382385254, "step": 55670 }, { "epoch": 2.5850782301871025, "grad_norm": 37.94744110107422, "learning_rate": 1.44970518594178e-07, "logits/chosen": -18.572322845458984, "logits/rejected": -17.964481353759766, "logps/chosen": -437.798583984375, "logps/rejected": -381.6943359375, "loss": 0.575, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.672879457473755, "rewards/margins": 0.707715630531311, "rewards/rejected": 2.9651637077331543, "step": 55680 }, { "epoch": 2.5855425042945352, "grad_norm": 284.741943359375, "learning_rate": 1.4494266214773202e-07, "logits/chosen": -18.87575340270996, "logits/rejected": -18.570350646972656, "logps/chosen": -374.7630920410156, "logps/rejected": -307.7965087890625, "loss": 0.7937, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9701061248779297, "rewards/margins": 1.2411909103393555, "rewards/rejected": 2.728915214538574, "step": 55690 }, { "epoch": 2.5860067784019685, "grad_norm": 238.4281005859375, "learning_rate": 1.4491480570128603e-07, "logits/chosen": -20.037208557128906, "logits/rejected": -18.62850570678711, "logps/chosen": -533.8483276367188, "logps/rejected": -381.816162109375, "loss": 0.44, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.964917182922363, "rewards/margins": 1.5374348163604736, "rewards/rejected": 3.4274818897247314, "step": 55700 }, { "epoch": 2.5864710525094017, "grad_norm": 5.005692481994629, "learning_rate": 1.4488694925484005e-07, "logits/chosen": -18.55333709716797, "logits/rejected": -18.144943237304688, "logps/chosen": -359.21392822265625, "logps/rejected": -306.25982666015625, "loss": 1.0165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7544054985046387, "rewards/margins": 1.152246356010437, "rewards/rejected": 2.6021595001220703, "step": 55710 }, { "epoch": 
2.5869353266168345, "grad_norm": 2.3730955123901367, "learning_rate": 1.4485909280839406e-07, "logits/chosen": -19.717941284179688, "logits/rejected": -19.18539810180664, "logps/chosen": -483.68359375, "logps/rejected": -431.61578369140625, "loss": 0.6091, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.369273662567139, "rewards/margins": 1.6392018795013428, "rewards/rejected": 3.730071544647217, "step": 55720 }, { "epoch": 2.5873996007242677, "grad_norm": 241.25059509277344, "learning_rate": 1.4483123636194807e-07, "logits/chosen": -18.075456619262695, "logits/rejected": -17.692668914794922, "logps/chosen": -375.661376953125, "logps/rejected": -349.11572265625, "loss": 1.3366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.839536190032959, "rewards/margins": 0.31430017948150635, "rewards/rejected": 3.525236129760742, "step": 55730 }, { "epoch": 2.587863874831701, "grad_norm": 170.345458984375, "learning_rate": 1.4480337991550212e-07, "logits/chosen": -19.019880294799805, "logits/rejected": -19.19797134399414, "logps/chosen": -464.62762451171875, "logps/rejected": -451.19549560546875, "loss": 1.0585, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.2718000411987305, "rewards/margins": 0.021450567990541458, "rewards/rejected": 4.250349521636963, "step": 55740 }, { "epoch": 2.5883281489391337, "grad_norm": 38.92760467529297, "learning_rate": 1.4477552346905613e-07, "logits/chosen": -18.334623336791992, "logits/rejected": -17.353492736816406, "logps/chosen": -441.01422119140625, "logps/rejected": -322.1240234375, "loss": 0.7715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.73100209236145, "rewards/margins": 1.4977635145187378, "rewards/rejected": 2.233238697052002, "step": 55750 }, { "epoch": 2.5887924230465664, "grad_norm": 53.53664779663086, "learning_rate": 1.4474766702261014e-07, "logits/chosen": -19.256610870361328, "logits/rejected": -19.288976669311523, "logps/chosen": -328.17108154296875, 
"logps/rejected": -318.337890625, "loss": 0.8943, "rewards/accuracies": 0.5, "rewards/chosen": 2.9058310985565186, "rewards/margins": -0.04817866161465645, "rewards/rejected": 2.954009771347046, "step": 55760 }, { "epoch": 2.5892566971539996, "grad_norm": 97.64500427246094, "learning_rate": 1.4471981057616416e-07, "logits/chosen": -18.781774520874023, "logits/rejected": -18.494308471679688, "logps/chosen": -434.4510192871094, "logps/rejected": -410.305419921875, "loss": 1.102, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.339934825897217, "rewards/margins": -0.1127217561006546, "rewards/rejected": 4.4526567459106445, "step": 55770 }, { "epoch": 2.589720971261433, "grad_norm": 24.643959045410156, "learning_rate": 1.4469195412971817e-07, "logits/chosen": -18.933481216430664, "logits/rejected": -18.068172454833984, "logps/chosen": -406.0789794921875, "logps/rejected": -377.4317321777344, "loss": 1.4714, "rewards/accuracies": 0.5, "rewards/chosen": 3.6925883293151855, "rewards/margins": 0.6063998937606812, "rewards/rejected": 3.086188316345215, "step": 55780 }, { "epoch": 2.5901852453688656, "grad_norm": 156.64088439941406, "learning_rate": 1.446640976832722e-07, "logits/chosen": -19.058805465698242, "logits/rejected": -17.530475616455078, "logps/chosen": -462.53253173828125, "logps/rejected": -342.09796142578125, "loss": 0.3478, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3792572021484375, "rewards/margins": 2.644714593887329, "rewards/rejected": 1.7345426082611084, "step": 55790 }, { "epoch": 2.590649519476299, "grad_norm": 16.5191593170166, "learning_rate": 1.4463624123682623e-07, "logits/chosen": -20.514862060546875, "logits/rejected": -19.392292022705078, "logps/chosen": -457.6915588378906, "logps/rejected": -436.92529296875, "loss": 0.8411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.198068141937256, "rewards/margins": 0.2240426242351532, "rewards/rejected": 3.9740257263183594, "step": 55800 }, { "epoch": 
2.591113793583732, "grad_norm": 208.1575164794922, "learning_rate": 1.4460838479038024e-07, "logits/chosen": -18.23023223876953, "logits/rejected": -17.857816696166992, "logps/chosen": -379.0248718261719, "logps/rejected": -300.483154296875, "loss": 1.1254, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.821307420730591, "rewards/margins": 0.6578641533851624, "rewards/rejected": 3.163443088531494, "step": 55810 }, { "epoch": 2.591578067691165, "grad_norm": 9.59465217590332, "learning_rate": 1.4458052834393426e-07, "logits/chosen": -19.221698760986328, "logits/rejected": -17.99593734741211, "logps/chosen": -519.0027465820312, "logps/rejected": -425.2493591308594, "loss": 1.0548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.840810298919678, "rewards/margins": 0.8197891116142273, "rewards/rejected": 4.021021842956543, "step": 55820 }, { "epoch": 2.592042341798598, "grad_norm": 17.028715133666992, "learning_rate": 1.4455267189748827e-07, "logits/chosen": -18.48772621154785, "logits/rejected": -17.093780517578125, "logps/chosen": -356.42779541015625, "logps/rejected": -156.49722290039062, "loss": 0.4121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.344834804534912, "rewards/margins": 2.2004740238189697, "rewards/rejected": 0.14436078071594238, "step": 55830 }, { "epoch": 2.592506615906031, "grad_norm": 8.185729026794434, "learning_rate": 1.445248154510423e-07, "logits/chosen": -18.543930053710938, "logits/rejected": -17.840911865234375, "logps/chosen": -378.98321533203125, "logps/rejected": -286.65301513671875, "loss": 0.8631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2892794609069824, "rewards/margins": 1.2402839660644531, "rewards/rejected": 2.0489954948425293, "step": 55840 }, { "epoch": 2.592970890013464, "grad_norm": 39.5125617980957, "learning_rate": 1.444969590045963e-07, "logits/chosen": -17.96403694152832, "logits/rejected": -17.688426971435547, "logps/chosen": -406.8833923339844, 
"logps/rejected": -345.3338623046875, "loss": 0.5685, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0826597213745117, "rewards/margins": 1.1152746677398682, "rewards/rejected": 1.9673852920532227, "step": 55850 }, { "epoch": 2.593435164120897, "grad_norm": 117.59324645996094, "learning_rate": 1.4446910255815032e-07, "logits/chosen": -18.29697036743164, "logits/rejected": -17.683326721191406, "logps/chosen": -424.39300537109375, "logps/rejected": -318.62249755859375, "loss": 0.5088, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7762138843536377, "rewards/margins": 1.6772327423095703, "rewards/rejected": 2.0989811420440674, "step": 55860 }, { "epoch": 2.59389943822833, "grad_norm": 0.09699656069278717, "learning_rate": 1.4444124611170436e-07, "logits/chosen": -18.99178123474121, "logits/rejected": -18.169300079345703, "logps/chosen": -301.40875244140625, "logps/rejected": -280.63983154296875, "loss": 0.4839, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.391127824783325, "rewards/margins": 1.5310062170028687, "rewards/rejected": 1.860121488571167, "step": 55870 }, { "epoch": 2.5943637123357632, "grad_norm": 45.95064163208008, "learning_rate": 1.4441338966525835e-07, "logits/chosen": -19.172271728515625, "logits/rejected": -19.241207122802734, "logps/chosen": -338.393310546875, "logps/rejected": -398.0455017089844, "loss": 0.8242, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6834237575531006, "rewards/margins": 0.25365525484085083, "rewards/rejected": 2.4297685623168945, "step": 55880 }, { "epoch": 2.594827986443196, "grad_norm": 3.6961629390716553, "learning_rate": 1.443855332188124e-07, "logits/chosen": -18.098743438720703, "logits/rejected": -17.711956024169922, "logps/chosen": -389.4662170410156, "logps/rejected": -283.2613830566406, "loss": 0.5058, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3050448894500732, "rewards/margins": 1.2959939241409302, "rewards/rejected": 2.0090513229370117, 
"step": 55890 }, { "epoch": 2.5952922605506292, "grad_norm": 220.49618530273438, "learning_rate": 1.443576767723664e-07, "logits/chosen": -19.664386749267578, "logits/rejected": -19.75723648071289, "logps/chosen": -388.056640625, "logps/rejected": -352.5543212890625, "loss": 0.8266, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3668665885925293, "rewards/margins": 0.5633569955825806, "rewards/rejected": 2.803509473800659, "step": 55900 }, { "epoch": 2.595756534658062, "grad_norm": 89.6560287475586, "learning_rate": 1.4432982032592042e-07, "logits/chosen": -18.58120346069336, "logits/rejected": -18.939929962158203, "logps/chosen": -396.3453674316406, "logps/rejected": -441.53564453125, "loss": 0.8995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.747004747390747, "rewards/margins": 0.5120353698730469, "rewards/rejected": 3.2349696159362793, "step": 55910 }, { "epoch": 2.596220808765495, "grad_norm": 20.176908493041992, "learning_rate": 1.4430196387947443e-07, "logits/chosen": -19.012853622436523, "logits/rejected": -18.889022827148438, "logps/chosen": -384.6630859375, "logps/rejected": -379.83062744140625, "loss": 0.8838, "rewards/accuracies": 0.5, "rewards/chosen": 3.311063051223755, "rewards/margins": -0.037203095853328705, "rewards/rejected": 3.348266124725342, "step": 55920 }, { "epoch": 2.596685082872928, "grad_norm": 143.32305908203125, "learning_rate": 1.4427410743302844e-07, "logits/chosen": -19.282291412353516, "logits/rejected": -18.868011474609375, "logps/chosen": -304.9592590332031, "logps/rejected": -297.5752868652344, "loss": 0.7733, "rewards/accuracies": 0.5, "rewards/chosen": 2.669221878051758, "rewards/margins": 0.23168854415416718, "rewards/rejected": 2.437533378601074, "step": 55930 }, { "epoch": 2.597149356980361, "grad_norm": 19.71779441833496, "learning_rate": 1.4424625098658246e-07, "logits/chosen": -18.974939346313477, "logits/rejected": -18.20587730407715, "logps/chosen": -436.62579345703125, "logps/rejected": 
-294.2762145996094, "loss": 0.6502, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1985647678375244, "rewards/margins": 0.8087776303291321, "rewards/rejected": 2.389787197113037, "step": 55940 }, { "epoch": 2.5976136310877944, "grad_norm": 295.8445739746094, "learning_rate": 1.442183945401365e-07, "logits/chosen": -19.814823150634766, "logits/rejected": -18.547563552856445, "logps/chosen": -517.4486083984375, "logps/rejected": -317.66033935546875, "loss": 0.3852, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.165007591247559, "rewards/margins": 2.506476402282715, "rewards/rejected": 2.6585307121276855, "step": 55950 }, { "epoch": 2.598077905195227, "grad_norm": 17.40603256225586, "learning_rate": 1.4419053809369051e-07, "logits/chosen": -19.963085174560547, "logits/rejected": -19.064741134643555, "logps/chosen": -506.27276611328125, "logps/rejected": -323.75274658203125, "loss": 0.1706, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.924566268920898, "rewards/margins": 2.645104169845581, "rewards/rejected": 2.279461622238159, "step": 55960 }, { "epoch": 2.5985421793026604, "grad_norm": 6.109076023101807, "learning_rate": 1.4416268164724453e-07, "logits/chosen": -19.428586959838867, "logits/rejected": -18.675521850585938, "logps/chosen": -366.94561767578125, "logps/rejected": -261.4835510253906, "loss": 0.6471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1171505451202393, "rewards/margins": 1.0514674186706543, "rewards/rejected": 2.065683126449585, "step": 55970 }, { "epoch": 2.599006453410093, "grad_norm": 62.23514938354492, "learning_rate": 1.4413482520079854e-07, "logits/chosen": -18.131145477294922, "logits/rejected": -17.899911880493164, "logps/chosen": -306.23028564453125, "logps/rejected": -289.55694580078125, "loss": 1.071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5541703701019287, "rewards/margins": -0.035765696316957474, "rewards/rejected": 2.5899364948272705, "step": 55980 }, 
{ "epoch": 2.5994707275175264, "grad_norm": 67.84686279296875, "learning_rate": 1.4410696875435256e-07, "logits/chosen": -17.929338455200195, "logits/rejected": -18.58627700805664, "logps/chosen": -288.00274658203125, "logps/rejected": -385.8528747558594, "loss": 1.2384, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.729322671890259, "rewards/margins": -0.2040122002363205, "rewards/rejected": 2.9333345890045166, "step": 55990 }, { "epoch": 2.599935001624959, "grad_norm": 49.94200134277344, "learning_rate": 1.4407911230790657e-07, "logits/chosen": -19.628469467163086, "logits/rejected": -19.18443489074707, "logps/chosen": -452.8145446777344, "logps/rejected": -344.77960205078125, "loss": 0.751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.186182975769043, "rewards/margins": 1.2896528244018555, "rewards/rejected": 3.8965306282043457, "step": 56000 }, { "epoch": 2.6003992757323924, "grad_norm": 192.71072387695312, "learning_rate": 1.4405125586146061e-07, "logits/chosen": -18.157867431640625, "logits/rejected": -16.95775032043457, "logps/chosen": -506.3857421875, "logps/rejected": -362.2652282714844, "loss": 0.6646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.692824602127075, "rewards/margins": 1.7852544784545898, "rewards/rejected": 1.9075701236724854, "step": 56010 }, { "epoch": 2.6008635498398256, "grad_norm": 47.81389236450195, "learning_rate": 1.4402339941501463e-07, "logits/chosen": -18.47592544555664, "logits/rejected": -18.46908950805664, "logps/chosen": -399.16192626953125, "logps/rejected": -370.1763610839844, "loss": 0.9233, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4390029907226562, "rewards/margins": 0.32024312019348145, "rewards/rejected": 3.118759870529175, "step": 56020 }, { "epoch": 2.6013278239472584, "grad_norm": 42.732330322265625, "learning_rate": 1.4399554296856862e-07, "logits/chosen": -19.726163864135742, "logits/rejected": -19.6796875, "logps/chosen": -346.2851257324219, 
"logps/rejected": -367.59844970703125, "loss": 1.0123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4607608318328857, "rewards/margins": 0.49695247411727905, "rewards/rejected": 2.963808298110962, "step": 56030 }, { "epoch": 2.6017920980546916, "grad_norm": 68.6395034790039, "learning_rate": 1.4396768652212266e-07, "logits/chosen": -19.366962432861328, "logits/rejected": -17.742977142333984, "logps/chosen": -483.6446228027344, "logps/rejected": -412.242431640625, "loss": 0.4618, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.347247123718262, "rewards/margins": 2.0317132472991943, "rewards/rejected": 2.3155336380004883, "step": 56040 }, { "epoch": 2.6022563721621244, "grad_norm": 3.125093460083008, "learning_rate": 1.4393983007567667e-07, "logits/chosen": -18.730754852294922, "logits/rejected": -17.704612731933594, "logps/chosen": -399.45819091796875, "logps/rejected": -325.735107421875, "loss": 0.4844, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.955686569213867, "rewards/margins": 1.329272747039795, "rewards/rejected": 2.626413583755493, "step": 56050 }, { "epoch": 2.6027206462695576, "grad_norm": 88.80097198486328, "learning_rate": 1.439119736292307e-07, "logits/chosen": -18.729887008666992, "logits/rejected": -18.429882049560547, "logps/chosen": -400.90863037109375, "logps/rejected": -356.8868713378906, "loss": 0.4875, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.875394344329834, "rewards/margins": 1.319272518157959, "rewards/rejected": 2.556121826171875, "step": 56060 }, { "epoch": 2.6031849203769903, "grad_norm": 17.83395767211914, "learning_rate": 1.438841171827847e-07, "logits/chosen": -19.498348236083984, "logits/rejected": -18.17365074157715, "logps/chosen": -377.9895324707031, "logps/rejected": -299.5392150878906, "loss": 0.7351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.510329008102417, "rewards/margins": 1.0072147846221924, "rewards/rejected": 2.5031142234802246, "step": 
56070 }, { "epoch": 2.6036491944844236, "grad_norm": 174.60264587402344, "learning_rate": 1.4385626073633874e-07, "logits/chosen": -19.80588722229004, "logits/rejected": -18.61435317993164, "logps/chosen": -427.1011657714844, "logps/rejected": -345.47698974609375, "loss": 0.7326, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.900421619415283, "rewards/margins": 1.172410249710083, "rewards/rejected": 2.7280116081237793, "step": 56080 }, { "epoch": 2.6041134685918568, "grad_norm": 259.419677734375, "learning_rate": 1.4382840428989273e-07, "logits/chosen": -19.069355010986328, "logits/rejected": -17.497314453125, "logps/chosen": -432.3614807128906, "logps/rejected": -289.5834045410156, "loss": 0.5503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.225259304046631, "rewards/margins": 2.7621500492095947, "rewards/rejected": 1.4631094932556152, "step": 56090 }, { "epoch": 2.6045777426992895, "grad_norm": 269.9162902832031, "learning_rate": 1.4380054784344677e-07, "logits/chosen": -19.227434158325195, "logits/rejected": -18.48729705810547, "logps/chosen": -461.9935607910156, "logps/rejected": -382.787841796875, "loss": 0.7888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.760889768600464, "rewards/margins": 1.285435438156128, "rewards/rejected": 2.475454330444336, "step": 56100 }, { "epoch": 2.6050420168067228, "grad_norm": 136.04617309570312, "learning_rate": 1.4377269139700079e-07, "logits/chosen": -17.77830696105957, "logits/rejected": -17.742063522338867, "logps/chosen": -310.77166748046875, "logps/rejected": -283.99798583984375, "loss": 1.0082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1882708072662354, "rewards/margins": 0.5253034830093384, "rewards/rejected": 1.662967324256897, "step": 56110 }, { "epoch": 2.605506290914156, "grad_norm": 95.91866302490234, "learning_rate": 1.437448349505548e-07, "logits/chosen": -18.674121856689453, "logits/rejected": -17.787460327148438, "logps/chosen": 
-432.1268005371094, "logps/rejected": -310.52935791015625, "loss": 0.6116, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8106391429901123, "rewards/margins": 1.4153943061828613, "rewards/rejected": 2.39524507522583, "step": 56120 }, { "epoch": 2.6059705650215887, "grad_norm": 17.171138763427734, "learning_rate": 1.4371697850410882e-07, "logits/chosen": -18.924663543701172, "logits/rejected": -20.018611907958984, "logps/chosen": -385.0345458984375, "logps/rejected": -370.68182373046875, "loss": 0.9967, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0685324668884277, "rewards/margins": 0.2242775410413742, "rewards/rejected": 2.844254732131958, "step": 56130 }, { "epoch": 2.6064348391290215, "grad_norm": 133.3858642578125, "learning_rate": 1.4368912205766283e-07, "logits/chosen": -19.051860809326172, "logits/rejected": -18.982297897338867, "logps/chosen": -343.62835693359375, "logps/rejected": -290.251953125, "loss": 0.6708, "rewards/accuracies": 0.5, "rewards/chosen": 2.742058277130127, "rewards/margins": 0.6108086705207825, "rewards/rejected": 2.13124942779541, "step": 56140 }, { "epoch": 2.6068991132364547, "grad_norm": 82.70958709716797, "learning_rate": 1.4366126561121684e-07, "logits/chosen": -18.473224639892578, "logits/rejected": -17.783245086669922, "logps/chosen": -371.6112365722656, "logps/rejected": -290.3824768066406, "loss": 1.0127, "rewards/accuracies": 0.5, "rewards/chosen": 3.398804187774658, "rewards/margins": 0.8330001831054688, "rewards/rejected": 2.5658037662506104, "step": 56150 }, { "epoch": 2.607363387343888, "grad_norm": 73.19205474853516, "learning_rate": 1.4363340916477089e-07, "logits/chosen": -19.907711029052734, "logits/rejected": -18.38207244873047, "logps/chosen": -494.44195556640625, "logps/rejected": -360.87786865234375, "loss": 0.223, "rewards/accuracies": 1.0, "rewards/chosen": 4.007161617279053, "rewards/margins": 2.4597764015197754, "rewards/rejected": 1.5473854541778564, "step": 56160 }, { "epoch": 
2.6078276614513207, "grad_norm": 0.09477752447128296, "learning_rate": 1.436055527183249e-07, "logits/chosen": -20.083799362182617, "logits/rejected": -19.201135635375977, "logps/chosen": -421.7726135253906, "logps/rejected": -254.36184692382812, "loss": 0.3723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.891140937805176, "rewards/margins": 2.3784594535827637, "rewards/rejected": 2.512681722640991, "step": 56170 }, { "epoch": 2.608291935558754, "grad_norm": 0.37278619408607483, "learning_rate": 1.4357769627187891e-07, "logits/chosen": -18.8914737701416, "logits/rejected": -17.806367874145508, "logps/chosen": -330.885498046875, "logps/rejected": -186.95443725585938, "loss": 0.667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2016830444335938, "rewards/margins": 1.8095033168792725, "rewards/rejected": 1.3921797275543213, "step": 56180 }, { "epoch": 2.608756209666187, "grad_norm": 43.26930618286133, "learning_rate": 1.4354983982543293e-07, "logits/chosen": -19.147113800048828, "logits/rejected": -19.336944580078125, "logps/chosen": -338.59454345703125, "logps/rejected": -373.21319580078125, "loss": 1.2589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.121206760406494, "rewards/margins": 0.18740394711494446, "rewards/rejected": 2.933802604675293, "step": 56190 }, { "epoch": 2.60922048377362, "grad_norm": 46.65106964111328, "learning_rate": 1.4352198337898694e-07, "logits/chosen": -19.543315887451172, "logits/rejected": -18.466724395751953, "logps/chosen": -402.3883361816406, "logps/rejected": -264.30511474609375, "loss": 0.5966, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.181889772415161, "rewards/margins": 1.2664661407470703, "rewards/rejected": 1.9154236316680908, "step": 56200 }, { "epoch": 2.6096847578810527, "grad_norm": 36.350303649902344, "learning_rate": 1.4349412693254096e-07, "logits/chosen": -18.217975616455078, "logits/rejected": -17.54574203491211, "logps/chosen": -442.78570556640625, 
"logps/rejected": -429.0748596191406, "loss": 1.1451, "rewards/accuracies": 0.5, "rewards/chosen": 2.8402528762817383, "rewards/margins": -0.10381679236888885, "rewards/rejected": 2.9440696239471436, "step": 56210 }, { "epoch": 2.610149031988486, "grad_norm": 9.910083770751953, "learning_rate": 1.4346627048609497e-07, "logits/chosen": -18.499971389770508, "logits/rejected": -17.32083511352539, "logps/chosen": -407.2809143066406, "logps/rejected": -351.6349182128906, "loss": 0.4903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6837172508239746, "rewards/margins": 1.2583322525024414, "rewards/rejected": 2.425384998321533, "step": 56220 }, { "epoch": 2.610613306095919, "grad_norm": 32.27172088623047, "learning_rate": 1.4343841403964901e-07, "logits/chosen": -19.788326263427734, "logits/rejected": -18.060104370117188, "logps/chosen": -345.20294189453125, "logps/rejected": -273.8114929199219, "loss": 0.3667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4172844886779785, "rewards/margins": 1.3083093166351318, "rewards/rejected": 1.1089751720428467, "step": 56230 }, { "epoch": 2.611077580203352, "grad_norm": 0.1532449722290039, "learning_rate": 1.43410557593203e-07, "logits/chosen": -19.221803665161133, "logits/rejected": -18.620479583740234, "logps/chosen": -389.71307373046875, "logps/rejected": -315.855224609375, "loss": 0.7956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9712212085723877, "rewards/margins": 1.3738816976547241, "rewards/rejected": 2.597339630126953, "step": 56240 }, { "epoch": 2.611541854310785, "grad_norm": 8.866183280944824, "learning_rate": 1.4338270114675704e-07, "logits/chosen": -20.21554183959961, "logits/rejected": -18.808765411376953, "logps/chosen": -467.0533752441406, "logps/rejected": -350.38275146484375, "loss": 0.4222, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.599024534225464, "rewards/margins": 1.1817877292633057, "rewards/rejected": 2.417236566543579, "step": 56250 }, { 
"epoch": 2.6120061284182183, "grad_norm": 87.62213897705078, "learning_rate": 1.4335484470031106e-07, "logits/chosen": -21.057662963867188, "logits/rejected": -19.124778747558594, "logps/chosen": -435.56158447265625, "logps/rejected": -315.9355773925781, "loss": 0.6569, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.973636627197266, "rewards/margins": 2.121910810470581, "rewards/rejected": 2.8517255783081055, "step": 56260 }, { "epoch": 2.612470402525651, "grad_norm": 44.54783248901367, "learning_rate": 1.4332698825386507e-07, "logits/chosen": -19.032129287719727, "logits/rejected": -18.470104217529297, "logps/chosen": -434.81304931640625, "logps/rejected": -402.8609924316406, "loss": 0.7799, "rewards/accuracies": 0.5, "rewards/chosen": 3.581195116043091, "rewards/margins": 0.892652690410614, "rewards/rejected": 2.688542604446411, "step": 56270 }, { "epoch": 2.6129346766330843, "grad_norm": 81.35882568359375, "learning_rate": 1.4329913180741909e-07, "logits/chosen": -18.562007904052734, "logits/rejected": -17.54290199279785, "logps/chosen": -386.2972106933594, "logps/rejected": -360.93060302734375, "loss": 0.3945, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.720301866531372, "rewards/margins": 1.1777331829071045, "rewards/rejected": 2.5425684452056885, "step": 56280 }, { "epoch": 2.613398950740517, "grad_norm": 246.20079040527344, "learning_rate": 1.4327127536097313e-07, "logits/chosen": -18.67104721069336, "logits/rejected": -17.784181594848633, "logps/chosen": -356.51214599609375, "logps/rejected": -303.1880798339844, "loss": 0.9266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1100049018859863, "rewards/margins": 0.7195293307304382, "rewards/rejected": 2.3904757499694824, "step": 56290 }, { "epoch": 2.6138632248479503, "grad_norm": 22.406009674072266, "learning_rate": 1.4324341891452712e-07, "logits/chosen": -19.4343204498291, "logits/rejected": -17.944072723388672, "logps/chosen": -506.517578125, "logps/rejected": 
-347.90594482421875, "loss": 0.4322, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.3816633224487305, "rewards/margins": 1.876859426498413, "rewards/rejected": 2.504803419113159, "step": 56300 }, { "epoch": 2.614327498955383, "grad_norm": 132.80633544921875, "learning_rate": 1.4321556246808116e-07, "logits/chosen": -18.84316062927246, "logits/rejected": -17.901060104370117, "logps/chosen": -377.66192626953125, "logps/rejected": -269.0623474121094, "loss": 0.5534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.135413408279419, "rewards/margins": 1.2541906833648682, "rewards/rejected": 1.8812227249145508, "step": 56310 }, { "epoch": 2.6147917730628163, "grad_norm": 19.516752243041992, "learning_rate": 1.4318770602163517e-07, "logits/chosen": -18.456323623657227, "logits/rejected": -17.894399642944336, "logps/chosen": -485.542724609375, "logps/rejected": -350.4432373046875, "loss": 0.3268, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.425159931182861, "rewards/margins": 1.8746144771575928, "rewards/rejected": 2.5505452156066895, "step": 56320 }, { "epoch": 2.6152560471702495, "grad_norm": 131.3446044921875, "learning_rate": 1.4315984957518919e-07, "logits/chosen": -18.798864364624023, "logits/rejected": -18.336008071899414, "logps/chosen": -318.7547607421875, "logps/rejected": -322.39581298828125, "loss": 1.325, "rewards/accuracies": 0.5, "rewards/chosen": 1.5772039890289307, "rewards/margins": -0.2327069789171219, "rewards/rejected": 1.8099111318588257, "step": 56330 }, { "epoch": 2.6157203212776823, "grad_norm": 20.12257194519043, "learning_rate": 1.431319931287432e-07, "logits/chosen": -18.455102920532227, "logits/rejected": -17.865970611572266, "logps/chosen": -394.8061828613281, "logps/rejected": -309.6219177246094, "loss": 0.9112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.800560235977173, "rewards/margins": 0.7410179376602173, "rewards/rejected": 2.059542179107666, "step": 56340 }, { "epoch": 
2.6161845953851155, "grad_norm": 152.99842834472656, "learning_rate": 1.4310413668229721e-07, "logits/chosen": -18.966533660888672, "logits/rejected": -18.4468936920166, "logps/chosen": -428.46820068359375, "logps/rejected": -322.2409973144531, "loss": 0.7506, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.008657932281494, "rewards/margins": 0.7933131456375122, "rewards/rejected": 2.2153451442718506, "step": 56350 }, { "epoch": 2.6166488694925483, "grad_norm": 196.9791717529297, "learning_rate": 1.4307628023585123e-07, "logits/chosen": -18.77037811279297, "logits/rejected": -18.552438735961914, "logps/chosen": -418.350830078125, "logps/rejected": -336.33575439453125, "loss": 0.8984, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2817776203155518, "rewards/margins": 0.561366856098175, "rewards/rejected": 2.7204108238220215, "step": 56360 }, { "epoch": 2.6171131435999815, "grad_norm": 86.41241455078125, "learning_rate": 1.4304842378940527e-07, "logits/chosen": -20.265708923339844, "logits/rejected": -19.04409408569336, "logps/chosen": -418.92596435546875, "logps/rejected": -328.42950439453125, "loss": 0.3792, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.515740871429443, "rewards/margins": 1.909409761428833, "rewards/rejected": 2.6063313484191895, "step": 56370 }, { "epoch": 2.6175774177074143, "grad_norm": 10.492280960083008, "learning_rate": 1.4302056734295928e-07, "logits/chosen": -18.724327087402344, "logits/rejected": -17.917003631591797, "logps/chosen": -335.60784912109375, "logps/rejected": -234.41433715820312, "loss": 0.6539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5644278526306152, "rewards/margins": 1.2116979360580444, "rewards/rejected": 1.3527300357818604, "step": 56380 }, { "epoch": 2.6180416918148475, "grad_norm": 53.509159088134766, "learning_rate": 1.429927108965133e-07, "logits/chosen": -19.959575653076172, "logits/rejected": -19.949295043945312, "logps/chosen": -412.93414306640625, 
"logps/rejected": -330.98919677734375, "loss": 0.3971, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.006945610046387, "rewards/margins": 1.3010492324829102, "rewards/rejected": 2.7058966159820557, "step": 56390 }, { "epoch": 2.6185059659222807, "grad_norm": 8.825533866882324, "learning_rate": 1.4296485445006731e-07, "logits/chosen": -18.136104583740234, "logits/rejected": -18.97821044921875, "logps/chosen": -308.88873291015625, "logps/rejected": -356.34625244140625, "loss": 1.0733, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2086501121520996, "rewards/margins": 0.04970378801226616, "rewards/rejected": 2.1589457988739014, "step": 56400 }, { "epoch": 2.6189702400297135, "grad_norm": 27.782100677490234, "learning_rate": 1.4293699800362133e-07, "logits/chosen": -20.230632781982422, "logits/rejected": -19.001964569091797, "logps/chosen": -486.592529296875, "logps/rejected": -301.8730773925781, "loss": 0.3705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.564972400665283, "rewards/margins": 2.1916298866271973, "rewards/rejected": 2.373342275619507, "step": 56410 }, { "epoch": 2.6194345141371467, "grad_norm": 1.0135499238967896, "learning_rate": 1.4290914155717534e-07, "logits/chosen": -19.827571868896484, "logits/rejected": -18.84174919128418, "logps/chosen": -490.6319885253906, "logps/rejected": -427.8377990722656, "loss": 0.7135, "rewards/accuracies": 0.5, "rewards/chosen": 4.220532417297363, "rewards/margins": 0.9503142237663269, "rewards/rejected": 3.2702178955078125, "step": 56420 }, { "epoch": 2.6198987882445794, "grad_norm": 26.311962127685547, "learning_rate": 1.4288128511072936e-07, "logits/chosen": -19.178678512573242, "logits/rejected": -18.33566665649414, "logps/chosen": -464.17059326171875, "logps/rejected": -400.85992431640625, "loss": 0.6059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.571527481079102, "rewards/margins": 1.027147650718689, "rewards/rejected": 3.544379472732544, "step": 56430 
}, { "epoch": 2.6203630623520127, "grad_norm": 1.54728364944458, "learning_rate": 1.428534286642834e-07, "logits/chosen": -19.19844627380371, "logits/rejected": -17.944883346557617, "logps/chosen": -415.06781005859375, "logps/rejected": -267.31903076171875, "loss": 0.6117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3369147777557373, "rewards/margins": 1.350969910621643, "rewards/rejected": 1.9859449863433838, "step": 56440 }, { "epoch": 2.6208273364594454, "grad_norm": 58.33738708496094, "learning_rate": 1.4282557221783739e-07, "logits/chosen": -18.105289459228516, "logits/rejected": -18.023401260375977, "logps/chosen": -415.7127990722656, "logps/rejected": -389.21063232421875, "loss": 1.2747, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9554855823516846, "rewards/margins": -0.30505096912384033, "rewards/rejected": 3.2605366706848145, "step": 56450 }, { "epoch": 2.6212916105668786, "grad_norm": 30.1340389251709, "learning_rate": 1.4279771577139143e-07, "logits/chosen": -19.228591918945312, "logits/rejected": -18.22933578491211, "logps/chosen": -370.1988525390625, "logps/rejected": -323.83734130859375, "loss": 0.4334, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.897089958190918, "rewards/margins": 1.2275381088256836, "rewards/rejected": 1.6695518493652344, "step": 56460 }, { "epoch": 2.621755884674312, "grad_norm": 12.973344802856445, "learning_rate": 1.4276985932494544e-07, "logits/chosen": -18.503238677978516, "logits/rejected": -17.451444625854492, "logps/chosen": -456.56170654296875, "logps/rejected": -320.3511047363281, "loss": 0.5263, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.257702350616455, "rewards/margins": 1.8160957098007202, "rewards/rejected": 2.4416065216064453, "step": 56470 }, { "epoch": 2.6222201587817446, "grad_norm": 1.8564403057098389, "learning_rate": 1.4274200287849946e-07, "logits/chosen": -19.31814956665039, "logits/rejected": -17.60857391357422, "logps/chosen": 
-311.1449890136719, "logps/rejected": -202.66319274902344, "loss": 0.552, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1401591300964355, "rewards/margins": 1.8910398483276367, "rewards/rejected": 1.2491190433502197, "step": 56480 }, { "epoch": 2.622684432889178, "grad_norm": 321.65484619140625, "learning_rate": 1.4271414643205347e-07, "logits/chosen": -19.0052433013916, "logits/rejected": -18.18753433227539, "logps/chosen": -356.0067443847656, "logps/rejected": -322.0979919433594, "loss": 0.6356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.383965015411377, "rewards/margins": 1.057523250579834, "rewards/rejected": 1.3264416456222534, "step": 56490 }, { "epoch": 2.623148706996611, "grad_norm": 153.44140625, "learning_rate": 1.426862899856075e-07, "logits/chosen": -19.043575286865234, "logits/rejected": -19.015291213989258, "logps/chosen": -462.9498596191406, "logps/rejected": -366.7543029785156, "loss": 0.7397, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.621470928192139, "rewards/margins": 1.2967876195907593, "rewards/rejected": 3.324683427810669, "step": 56500 }, { "epoch": 2.623612981104044, "grad_norm": 168.8323974609375, "learning_rate": 1.426584335391615e-07, "logits/chosen": -19.23997688293457, "logits/rejected": -18.863576889038086, "logps/chosen": -312.453369140625, "logps/rejected": -296.68310546875, "loss": 0.5302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.116238832473755, "rewards/margins": 0.6609703302383423, "rewards/rejected": 2.455268621444702, "step": 56510 }, { "epoch": 2.6240772552114766, "grad_norm": 46.04225540161133, "learning_rate": 1.4263057709271554e-07, "logits/chosen": -18.23098373413086, "logits/rejected": -18.136104583740234, "logps/chosen": -324.184814453125, "logps/rejected": -302.95849609375, "loss": 0.6018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9229202270507812, "rewards/margins": 0.43810024857521057, "rewards/rejected": 2.4848196506500244, 
"step": 56520 }, { "epoch": 2.62454152931891, "grad_norm": 17.63821029663086, "learning_rate": 1.4260272064626956e-07, "logits/chosen": -17.766361236572266, "logits/rejected": -17.64535903930664, "logps/chosen": -344.3671875, "logps/rejected": -357.40411376953125, "loss": 0.9032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.973397731781006, "rewards/margins": 0.16911546885967255, "rewards/rejected": 2.8042826652526855, "step": 56530 }, { "epoch": 2.625005803426343, "grad_norm": 200.8017578125, "learning_rate": 1.4257486419982357e-07, "logits/chosen": -18.003803253173828, "logits/rejected": -17.680782318115234, "logps/chosen": -354.1653137207031, "logps/rejected": -338.8851318359375, "loss": 1.0399, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3061676025390625, "rewards/margins": 0.5991300344467163, "rewards/rejected": 2.707037925720215, "step": 56540 }, { "epoch": 2.625470077533776, "grad_norm": 27.496023178100586, "learning_rate": 1.4254700775337758e-07, "logits/chosen": -18.123605728149414, "logits/rejected": -18.339637756347656, "logps/chosen": -374.84857177734375, "logps/rejected": -337.51141357421875, "loss": 1.4474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6617541313171387, "rewards/margins": 0.8446398973464966, "rewards/rejected": 2.8171138763427734, "step": 56550 }, { "epoch": 2.625934351641209, "grad_norm": 214.39540100097656, "learning_rate": 1.425191513069316e-07, "logits/chosen": -18.782411575317383, "logits/rejected": -18.26462173461914, "logps/chosen": -347.7376403808594, "logps/rejected": -272.6795959472656, "loss": 0.8744, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8267040252685547, "rewards/margins": 0.4779396057128906, "rewards/rejected": 3.3487648963928223, "step": 56560 }, { "epoch": 2.6263986257486422, "grad_norm": 56.72160720825195, "learning_rate": 1.4249129486048561e-07, "logits/chosen": -18.547895431518555, "logits/rejected": -18.04186248779297, "logps/chosen": 
-396.8932189941406, "logps/rejected": -332.397216796875, "loss": 0.5394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8239688873291016, "rewards/margins": 1.5379726886749268, "rewards/rejected": 2.285996437072754, "step": 56570 }, { "epoch": 2.626862899856075, "grad_norm": 94.15465545654297, "learning_rate": 1.4246343841403965e-07, "logits/chosen": -19.993480682373047, "logits/rejected": -18.642433166503906, "logps/chosen": -484.42828369140625, "logps/rejected": -370.75457763671875, "loss": 0.5898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.295358657836914, "rewards/margins": 1.3982566595077515, "rewards/rejected": 2.897101879119873, "step": 56580 }, { "epoch": 2.627327173963508, "grad_norm": 0.27288126945495605, "learning_rate": 1.4243558196759367e-07, "logits/chosen": -19.463024139404297, "logits/rejected": -18.90988540649414, "logps/chosen": -452.4237365722656, "logps/rejected": -413.5907287597656, "loss": 0.5372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.701374053955078, "rewards/margins": 1.6623108386993408, "rewards/rejected": 3.0390636920928955, "step": 56590 }, { "epoch": 2.627791448070941, "grad_norm": 74.69575500488281, "learning_rate": 1.4240772552114768e-07, "logits/chosen": -19.892847061157227, "logits/rejected": -19.061342239379883, "logps/chosen": -411.75421142578125, "logps/rejected": -364.30474853515625, "loss": 0.3534, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7004005908966064, "rewards/margins": 1.7328999042510986, "rewards/rejected": 1.9675004482269287, "step": 56600 }, { "epoch": 2.628255722178374, "grad_norm": 165.3255615234375, "learning_rate": 1.423798690747017e-07, "logits/chosen": -18.792259216308594, "logits/rejected": -17.81638526916504, "logps/chosen": -417.3675842285156, "logps/rejected": -291.428466796875, "loss": 0.514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5318470001220703, "rewards/margins": 1.6773344278335571, "rewards/rejected": 
1.8545124530792236, "step": 56610 }, { "epoch": 2.628719996285807, "grad_norm": 61.65585708618164, "learning_rate": 1.423520126282557e-07, "logits/chosen": -19.558000564575195, "logits/rejected": -18.228281021118164, "logps/chosen": -401.9662170410156, "logps/rejected": -288.87921142578125, "loss": 0.5198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1181962490081787, "rewards/margins": 0.6724725365638733, "rewards/rejected": 2.4457240104675293, "step": 56620 }, { "epoch": 2.62918427039324, "grad_norm": 29.14359474182129, "learning_rate": 1.4232415618180973e-07, "logits/chosen": -18.47555923461914, "logits/rejected": -18.441730499267578, "logps/chosen": -353.2415466308594, "logps/rejected": -320.0327453613281, "loss": 0.8752, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6802704334259033, "rewards/margins": 0.7804375290870667, "rewards/rejected": 2.8998327255249023, "step": 56630 }, { "epoch": 2.6296485445006734, "grad_norm": 61.83304214477539, "learning_rate": 1.4229629973536374e-07, "logits/chosen": -19.60886001586914, "logits/rejected": -18.05935287475586, "logps/chosen": -287.3503723144531, "logps/rejected": -217.0243682861328, "loss": 0.4593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1377639770507812, "rewards/margins": 1.5143436193466187, "rewards/rejected": 1.6234201192855835, "step": 56640 }, { "epoch": 2.630112818608106, "grad_norm": 215.25128173828125, "learning_rate": 1.4226844328891778e-07, "logits/chosen": -19.265308380126953, "logits/rejected": -18.903608322143555, "logps/chosen": -426.78546142578125, "logps/rejected": -305.0674743652344, "loss": 0.7976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.500926494598389, "rewards/margins": 1.6434358358383179, "rewards/rejected": 2.8574905395507812, "step": 56650 }, { "epoch": 2.6305770927155394, "grad_norm": 124.8985366821289, "learning_rate": 1.4224058684247177e-07, "logits/chosen": -18.911727905273438, "logits/rejected": -17.56375503540039, 
"logps/chosen": -406.02337646484375, "logps/rejected": -251.7563934326172, "loss": 0.5561, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8930091857910156, "rewards/margins": 2.0700721740722656, "rewards/rejected": 1.8229373693466187, "step": 56660 }, { "epoch": 2.631041366822972, "grad_norm": 1.8541674613952637, "learning_rate": 1.422127303960258e-07, "logits/chosen": -17.989381790161133, "logits/rejected": -18.60104751586914, "logps/chosen": -287.3648986816406, "logps/rejected": -297.49932861328125, "loss": 0.9554, "rewards/accuracies": 0.5, "rewards/chosen": 2.853639602661133, "rewards/margins": 0.3769391179084778, "rewards/rejected": 2.4767003059387207, "step": 56670 }, { "epoch": 2.6315056409304054, "grad_norm": 68.05005645751953, "learning_rate": 1.4218487394957983e-07, "logits/chosen": -19.33945083618164, "logits/rejected": -18.78838348388672, "logps/chosen": -336.644775390625, "logps/rejected": -307.3932800292969, "loss": 0.8316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.68593430519104, "rewards/margins": 0.4389670789241791, "rewards/rejected": 2.246967315673828, "step": 56680 }, { "epoch": 2.631969915037838, "grad_norm": 49.03050231933594, "learning_rate": 1.4215701750313384e-07, "logits/chosen": -18.79240608215332, "logits/rejected": -18.815502166748047, "logps/chosen": -385.04296875, "logps/rejected": -433.875, "loss": 1.6741, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2319393157958984, "rewards/margins": -0.7286826372146606, "rewards/rejected": 3.9606223106384277, "step": 56690 }, { "epoch": 2.6324341891452714, "grad_norm": 53.942928314208984, "learning_rate": 1.4212916105668786e-07, "logits/chosen": -18.898853302001953, "logits/rejected": -18.982572555541992, "logps/chosen": -362.71087646484375, "logps/rejected": -361.9931335449219, "loss": 0.714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.308912992477417, "rewards/margins": 0.534360408782959, "rewards/rejected": 2.774552583694458, 
"step": 56700 }, { "epoch": 2.6328984632527046, "grad_norm": 32.266258239746094, "learning_rate": 1.421013046102419e-07, "logits/chosen": -18.685134887695312, "logits/rejected": -18.611379623413086, "logps/chosen": -337.76141357421875, "logps/rejected": -435.4656677246094, "loss": 1.2632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.498920440673828, "rewards/margins": -0.1841956079006195, "rewards/rejected": 2.6831161975860596, "step": 56710 }, { "epoch": 2.6333627373601374, "grad_norm": 72.1561050415039, "learning_rate": 1.4207344816379588e-07, "logits/chosen": -18.8571834564209, "logits/rejected": -18.529197692871094, "logps/chosen": -353.07647705078125, "logps/rejected": -319.46282958984375, "loss": 0.5802, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1032605171203613, "rewards/margins": 0.909663200378418, "rewards/rejected": 2.1935973167419434, "step": 56720 }, { "epoch": 2.6338270114675706, "grad_norm": 15.184019088745117, "learning_rate": 1.4204559171734993e-07, "logits/chosen": -19.27390480041504, "logits/rejected": -18.655181884765625, "logps/chosen": -381.1890869140625, "logps/rejected": -343.73529052734375, "loss": 0.5883, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.794538974761963, "rewards/margins": 1.393862247467041, "rewards/rejected": 2.4006764888763428, "step": 56730 }, { "epoch": 2.6342912855750034, "grad_norm": 0.023859210312366486, "learning_rate": 1.4201773527090394e-07, "logits/chosen": -19.116031646728516, "logits/rejected": -18.284175872802734, "logps/chosen": -507.2200622558594, "logps/rejected": -413.556884765625, "loss": 0.89, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 5.011625289916992, "rewards/margins": 0.7659000158309937, "rewards/rejected": 4.245725154876709, "step": 56740 }, { "epoch": 2.6347555596824366, "grad_norm": 71.74439239501953, "learning_rate": 1.4198987882445795e-07, "logits/chosen": -19.40807342529297, "logits/rejected": -19.219905853271484, 
"logps/chosen": -423.99456787109375, "logps/rejected": -443.114501953125, "loss": 0.4898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9763312339782715, "rewards/margins": 0.8871108889579773, "rewards/rejected": 3.0892205238342285, "step": 56750 }, { "epoch": 2.6352198337898693, "grad_norm": 102.99481201171875, "learning_rate": 1.4196202237801197e-07, "logits/chosen": -19.10112953186035, "logits/rejected": -19.662429809570312, "logps/chosen": -341.52008056640625, "logps/rejected": -409.14898681640625, "loss": 1.1051, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.235185146331787, "rewards/margins": -0.4431215822696686, "rewards/rejected": 3.6783065795898438, "step": 56760 }, { "epoch": 2.6356841078973026, "grad_norm": 38.19758224487305, "learning_rate": 1.4193416593156598e-07, "logits/chosen": -18.28818130493164, "logits/rejected": -18.367340087890625, "logps/chosen": -316.4789733886719, "logps/rejected": -288.8318176269531, "loss": 1.0341, "rewards/accuracies": 0.5, "rewards/chosen": 1.4749436378479004, "rewards/margins": 0.13790300488471985, "rewards/rejected": 1.337040662765503, "step": 56770 }, { "epoch": 2.636148382004736, "grad_norm": 11.613061904907227, "learning_rate": 1.4190630948512e-07, "logits/chosen": -19.764408111572266, "logits/rejected": -18.578458786010742, "logps/chosen": -393.75762939453125, "logps/rejected": -295.64215087890625, "loss": 0.4942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6447949409484863, "rewards/margins": 1.7459125518798828, "rewards/rejected": 1.8988821506500244, "step": 56780 }, { "epoch": 2.6366126561121686, "grad_norm": 4.164614677429199, "learning_rate": 1.4187845303867404e-07, "logits/chosen": -18.676898956298828, "logits/rejected": -18.092571258544922, "logps/chosen": -306.6797180175781, "logps/rejected": -242.7735137939453, "loss": 0.9604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8599228858947754, "rewards/margins": 1.3070703744888306, 
"rewards/rejected": 1.5528523921966553, "step": 56790 }, { "epoch": 2.6370769302196018, "grad_norm": 32.77748489379883, "learning_rate": 1.4185059659222805e-07, "logits/chosen": -19.794876098632812, "logits/rejected": -19.633268356323242, "logps/chosen": -466.481201171875, "logps/rejected": -398.80963134765625, "loss": 0.8081, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.741637706756592, "rewards/margins": 1.0227893590927124, "rewards/rejected": 2.718848466873169, "step": 56800 }, { "epoch": 2.6375412043270345, "grad_norm": 35.76245880126953, "learning_rate": 1.4182274014578207e-07, "logits/chosen": -19.296049118041992, "logits/rejected": -18.255495071411133, "logps/chosen": -427.70989990234375, "logps/rejected": -328.6031799316406, "loss": 0.6234, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3929243087768555, "rewards/margins": 1.2914477586746216, "rewards/rejected": 3.1014761924743652, "step": 56810 }, { "epoch": 2.6380054784344678, "grad_norm": 124.08304595947266, "learning_rate": 1.4179488369933608e-07, "logits/chosen": -19.484495162963867, "logits/rejected": -18.452007293701172, "logps/chosen": -521.1375122070312, "logps/rejected": -434.47686767578125, "loss": 0.4571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.5343427658081055, "rewards/margins": 2.121964931488037, "rewards/rejected": 3.4123783111572266, "step": 56820 }, { "epoch": 2.6384697525419005, "grad_norm": 15.106879234313965, "learning_rate": 1.417670272528901e-07, "logits/chosen": -18.858427047729492, "logits/rejected": -18.835390090942383, "logps/chosen": -327.49542236328125, "logps/rejected": -326.85552978515625, "loss": 1.4488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.97200870513916, "rewards/margins": -0.10856528580188751, "rewards/rejected": 3.080573797225952, "step": 56830 }, { "epoch": 2.6389340266493337, "grad_norm": 65.22691345214844, "learning_rate": 1.417391708064441e-07, "logits/chosen": -20.562896728515625, 
"logits/rejected": -19.413606643676758, "logps/chosen": -455.86053466796875, "logps/rejected": -358.7945251464844, "loss": 0.9063, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.591379642486572, "rewards/margins": 1.1441742181777954, "rewards/rejected": 3.4472053050994873, "step": 56840 }, { "epoch": 2.639398300756767, "grad_norm": 220.5723419189453, "learning_rate": 1.4171131435999813e-07, "logits/chosen": -19.671268463134766, "logits/rejected": -19.671934127807617, "logps/chosen": -482.3926696777344, "logps/rejected": -420.3143615722656, "loss": 0.4922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.673112154006958, "rewards/margins": 0.9538698196411133, "rewards/rejected": 2.719242572784424, "step": 56850 }, { "epoch": 2.6398625748641997, "grad_norm": 0.30290302634239197, "learning_rate": 1.4168345791355217e-07, "logits/chosen": -17.73903465270996, "logits/rejected": -17.191484451293945, "logps/chosen": -433.1400451660156, "logps/rejected": -348.6824951171875, "loss": 1.0015, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.973051071166992, "rewards/margins": 1.4486756324768066, "rewards/rejected": 2.5243756771087646, "step": 56860 }, { "epoch": 2.640326848971633, "grad_norm": 40.31406784057617, "learning_rate": 1.4165560146710616e-07, "logits/chosen": -19.705860137939453, "logits/rejected": -18.6590633392334, "logps/chosen": -430.1101989746094, "logps/rejected": -418.18927001953125, "loss": 0.9126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7121129035949707, "rewards/margins": 0.4439105987548828, "rewards/rejected": 2.268202304840088, "step": 56870 }, { "epoch": 2.6407911230790657, "grad_norm": 25.58711051940918, "learning_rate": 1.416277450206602e-07, "logits/chosen": -19.00100326538086, "logits/rejected": -17.71062660217285, "logps/chosen": -482.59014892578125, "logps/rejected": -365.5501708984375, "loss": 0.6296, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6539313793182373, 
"rewards/margins": 0.90324467420578, "rewards/rejected": 2.7506868839263916, "step": 56880 }, { "epoch": 2.641255397186499, "grad_norm": 15.71096134185791, "learning_rate": 1.415998885742142e-07, "logits/chosen": -18.845935821533203, "logits/rejected": -17.272672653198242, "logps/chosen": -422.9810485839844, "logps/rejected": -223.89395141601562, "loss": 0.698, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.001056671142578, "rewards/margins": 2.128786563873291, "rewards/rejected": 1.8722699880599976, "step": 56890 }, { "epoch": 2.6417196712939317, "grad_norm": 37.71321105957031, "learning_rate": 1.4157203212776823e-07, "logits/chosen": -18.396120071411133, "logits/rejected": -18.28214454650879, "logps/chosen": -370.7333984375, "logps/rejected": -285.460693359375, "loss": 0.823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3706214427948, "rewards/margins": 0.865203857421875, "rewards/rejected": 1.5054175853729248, "step": 56900 }, { "epoch": 2.642183945401365, "grad_norm": 80.01006317138672, "learning_rate": 1.4154417568132224e-07, "logits/chosen": -18.608131408691406, "logits/rejected": -18.983266830444336, "logps/chosen": -342.5215148925781, "logps/rejected": -337.0180358886719, "loss": 0.9813, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.603360176086426, "rewards/margins": -0.003842294216156006, "rewards/rejected": 2.6072025299072266, "step": 56910 }, { "epoch": 2.642648219508798, "grad_norm": 22.091886520385742, "learning_rate": 1.4151631923487628e-07, "logits/chosen": -19.31665802001953, "logits/rejected": -17.71689224243164, "logps/chosen": -404.53253173828125, "logps/rejected": -292.9681396484375, "loss": 0.7047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.730125904083252, "rewards/margins": 1.4416799545288086, "rewards/rejected": 2.2884457111358643, "step": 56920 }, { "epoch": 2.643112493616231, "grad_norm": 79.08560180664062, "learning_rate": 1.4148846278843027e-07, "logits/chosen": 
-18.88799285888672, "logits/rejected": -18.636329650878906, "logps/chosen": -469.967041015625, "logps/rejected": -428.71240234375, "loss": 1.2833, "rewards/accuracies": 0.5, "rewards/chosen": 4.533596992492676, "rewards/margins": 0.3036758601665497, "rewards/rejected": 4.229921340942383, "step": 56930 }, { "epoch": 2.643576767723664, "grad_norm": 137.8601837158203, "learning_rate": 1.414606063419843e-07, "logits/chosen": -18.623647689819336, "logits/rejected": -17.993173599243164, "logps/chosen": -360.1565856933594, "logps/rejected": -311.3908996582031, "loss": 0.5374, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0684380531311035, "rewards/margins": 0.9615405797958374, "rewards/rejected": 2.1068971157073975, "step": 56940 }, { "epoch": 2.6440410418310973, "grad_norm": 78.51077270507812, "learning_rate": 1.4143274989553832e-07, "logits/chosen": -18.345130920410156, "logits/rejected": -17.393033981323242, "logps/chosen": -417.3013610839844, "logps/rejected": -296.5645446777344, "loss": 0.5705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.597492218017578, "rewards/margins": 1.5302056074142456, "rewards/rejected": 2.067286729812622, "step": 56950 }, { "epoch": 2.64450531593853, "grad_norm": 208.314453125, "learning_rate": 1.4140489344909234e-07, "logits/chosen": -18.760915756225586, "logits/rejected": -18.19415855407715, "logps/chosen": -476.8818359375, "logps/rejected": -375.1452941894531, "loss": 0.9146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7060108184814453, "rewards/margins": 0.8238981366157532, "rewards/rejected": 2.882112741470337, "step": 56960 }, { "epoch": 2.644969590045963, "grad_norm": 3.290100574493408, "learning_rate": 1.4137703700264635e-07, "logits/chosen": -19.827320098876953, "logits/rejected": -19.423030853271484, "logps/chosen": -394.9832458496094, "logps/rejected": -336.24359130859375, "loss": 0.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.1588544845581055, 
"rewards/margins": 1.2859172821044922, "rewards/rejected": 2.8729374408721924, "step": 56970 }, { "epoch": 2.645433864153396, "grad_norm": 34.866939544677734, "learning_rate": 1.4134918055620037e-07, "logits/chosen": -18.3405704498291, "logits/rejected": -18.317956924438477, "logps/chosen": -512.0953369140625, "logps/rejected": -450.14501953125, "loss": 0.818, "rewards/accuracies": 0.5, "rewards/chosen": 4.7886152267456055, "rewards/margins": 0.7610198855400085, "rewards/rejected": 4.027595043182373, "step": 56980 }, { "epoch": 2.6458981382608293, "grad_norm": 195.661865234375, "learning_rate": 1.4132132410975438e-07, "logits/chosen": -18.39582633972168, "logits/rejected": -17.570514678955078, "logps/chosen": -410.8815002441406, "logps/rejected": -271.4491882324219, "loss": 0.3996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4171040058135986, "rewards/margins": 1.7964941263198853, "rewards/rejected": 1.620609998703003, "step": 56990 }, { "epoch": 2.646362412368262, "grad_norm": 7.871420383453369, "learning_rate": 1.4129346766330842e-07, "logits/chosen": -19.21324920654297, "logits/rejected": -17.93600845336914, "logps/chosen": -427.1336975097656, "logps/rejected": -294.64312744140625, "loss": 0.263, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.729666233062744, "rewards/margins": 2.187572956085205, "rewards/rejected": 1.5420928001403809, "step": 57000 }, { "epoch": 2.6468266864756953, "grad_norm": 151.30882263183594, "learning_rate": 1.4126561121686244e-07, "logits/chosen": -19.150279998779297, "logits/rejected": -18.427928924560547, "logps/chosen": -375.94927978515625, "logps/rejected": -333.89556884765625, "loss": 1.0987, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.108985185623169, "rewards/margins": 0.6289867758750916, "rewards/rejected": 2.4799983501434326, "step": 57010 }, { "epoch": 2.6472909605831285, "grad_norm": 176.378662109375, "learning_rate": 1.4123775477041643e-07, "logits/chosen": -19.32695198059082, 
"logits/rejected": -18.5729923248291, "logps/chosen": -416.950439453125, "logps/rejected": -292.337158203125, "loss": 0.7958, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0099925994873047, "rewards/margins": 1.2318227291107178, "rewards/rejected": 1.7781696319580078, "step": 57020 }, { "epoch": 2.6477552346905613, "grad_norm": 83.0306167602539, "learning_rate": 1.4120989832397047e-07, "logits/chosen": -19.92441749572754, "logits/rejected": -19.377620697021484, "logps/chosen": -398.61865234375, "logps/rejected": -320.5578308105469, "loss": 0.6028, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5217678546905518, "rewards/margins": 0.7288568615913391, "rewards/rejected": 2.7929110527038574, "step": 57030 }, { "epoch": 2.648219508797994, "grad_norm": 8.755629539489746, "learning_rate": 1.4118204187752448e-07, "logits/chosen": -18.661714553833008, "logits/rejected": -18.245861053466797, "logps/chosen": -361.34088134765625, "logps/rejected": -274.75592041015625, "loss": 0.8045, "rewards/accuracies": 0.5, "rewards/chosen": 3.166569232940674, "rewards/margins": 1.278814673423767, "rewards/rejected": 1.8877546787261963, "step": 57040 }, { "epoch": 2.6486837829054273, "grad_norm": 12.86487865447998, "learning_rate": 1.411541854310785e-07, "logits/chosen": -19.542329788208008, "logits/rejected": -19.54438018798828, "logps/chosen": -296.63812255859375, "logps/rejected": -308.1880798339844, "loss": 0.7918, "rewards/accuracies": 0.5, "rewards/chosen": 2.820511817932129, "rewards/margins": 0.3620699942111969, "rewards/rejected": 2.458441734313965, "step": 57050 }, { "epoch": 2.6491480570128605, "grad_norm": 42.73139572143555, "learning_rate": 1.411263289846325e-07, "logits/chosen": -19.842866897583008, "logits/rejected": -18.421401977539062, "logps/chosen": -423.1288146972656, "logps/rejected": -295.53802490234375, "loss": 0.3492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3925087451934814, "rewards/margins": 1.7659803628921509, 
"rewards/rejected": 1.626528024673462, "step": 57060 }, { "epoch": 2.6496123311202933, "grad_norm": 0.921381413936615, "learning_rate": 1.4109847253818655e-07, "logits/chosen": -19.34109115600586, "logits/rejected": -17.80742645263672, "logps/chosen": -360.41522216796875, "logps/rejected": -281.08209228515625, "loss": 0.4012, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.112495422363281, "rewards/margins": 2.2208571434020996, "rewards/rejected": 1.891638159751892, "step": 57070 }, { "epoch": 2.6500766052277265, "grad_norm": 132.96546936035156, "learning_rate": 1.4107061609174054e-07, "logits/chosen": -18.891767501831055, "logits/rejected": -17.52009391784668, "logps/chosen": -434.9388732910156, "logps/rejected": -272.9900817871094, "loss": 1.0264, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.272226333618164, "rewards/margins": 1.4880287647247314, "rewards/rejected": 1.784197211265564, "step": 57080 }, { "epoch": 2.6505408793351597, "grad_norm": 46.7591438293457, "learning_rate": 1.4104275964529458e-07, "logits/chosen": -19.334375381469727, "logits/rejected": -18.639263153076172, "logps/chosen": -437.55047607421875, "logps/rejected": -331.10693359375, "loss": 0.4881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1507058143615723, "rewards/margins": 0.911036491394043, "rewards/rejected": 2.2396693229675293, "step": 57090 }, { "epoch": 2.6510051534425925, "grad_norm": 25.6140079498291, "learning_rate": 1.410149031988486e-07, "logits/chosen": -18.89577865600586, "logits/rejected": -18.543054580688477, "logps/chosen": -379.6099548339844, "logps/rejected": -332.33111572265625, "loss": 0.8028, "rewards/accuracies": 0.5, "rewards/chosen": 2.8114800453186035, "rewards/margins": 0.16788974404335022, "rewards/rejected": 2.643589973449707, "step": 57100 }, { "epoch": 2.6514694275500257, "grad_norm": 3.7355382442474365, "learning_rate": 1.409870467524026e-07, "logits/chosen": -20.04802131652832, "logits/rejected": 
-18.669292449951172, "logps/chosen": -437.529052734375, "logps/rejected": -283.5562744140625, "loss": 0.3267, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.862816572189331, "rewards/margins": 1.8267288208007812, "rewards/rejected": 2.03608775138855, "step": 57110 }, { "epoch": 2.6519337016574585, "grad_norm": 67.41323852539062, "learning_rate": 1.4095919030595662e-07, "logits/chosen": -19.656572341918945, "logits/rejected": -18.603742599487305, "logps/chosen": -368.23455810546875, "logps/rejected": -282.46435546875, "loss": 0.8982, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2274861335754395, "rewards/margins": 0.643367350101471, "rewards/rejected": 2.584118604660034, "step": 57120 }, { "epoch": 2.6523979757648917, "grad_norm": 36.81914520263672, "learning_rate": 1.4093133385951067e-07, "logits/chosen": -19.709110260009766, "logits/rejected": -18.660255432128906, "logps/chosen": -433.66552734375, "logps/rejected": -355.7886962890625, "loss": 0.762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4222335815429688, "rewards/margins": 0.7622026205062866, "rewards/rejected": 2.6600306034088135, "step": 57130 }, { "epoch": 2.6528622498723244, "grad_norm": 30.26968002319336, "learning_rate": 1.4090347741306465e-07, "logits/chosen": -18.760021209716797, "logits/rejected": -18.12143898010254, "logps/chosen": -438.73175048828125, "logps/rejected": -392.6187438964844, "loss": 0.6103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2895121574401855, "rewards/margins": 1.2463809251785278, "rewards/rejected": 3.0431313514709473, "step": 57140 }, { "epoch": 2.6533265239797577, "grad_norm": 27.498289108276367, "learning_rate": 1.408756209666187e-07, "logits/chosen": -20.195240020751953, "logits/rejected": -19.44786834716797, "logps/chosen": -353.46649169921875, "logps/rejected": -325.20159912109375, "loss": 0.6457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5400023460388184, "rewards/margins": 
1.046452283859253, "rewards/rejected": 2.4935498237609863, "step": 57150 }, { "epoch": 2.653790798087191, "grad_norm": 82.05796813964844, "learning_rate": 1.408477645201727e-07, "logits/chosen": -19.48358154296875, "logits/rejected": -19.7344970703125, "logps/chosen": -384.46728515625, "logps/rejected": -342.0694274902344, "loss": 1.1577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3570456504821777, "rewards/margins": 0.5279108285903931, "rewards/rejected": 2.829134941101074, "step": 57160 }, { "epoch": 2.6542550721946236, "grad_norm": 153.15013122558594, "learning_rate": 1.4081990807372672e-07, "logits/chosen": -18.007761001586914, "logits/rejected": -17.959959030151367, "logps/chosen": -351.97015380859375, "logps/rejected": -412.6358947753906, "loss": 1.6631, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4887208938598633, "rewards/margins": -0.05868129804730415, "rewards/rejected": 2.5474021434783936, "step": 57170 }, { "epoch": 2.654719346302057, "grad_norm": 32.80311584472656, "learning_rate": 1.4079205162728074e-07, "logits/chosen": -19.972137451171875, "logits/rejected": -18.699918746948242, "logps/chosen": -352.43804931640625, "logps/rejected": -273.98406982421875, "loss": 0.4603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.418740749359131, "rewards/margins": 1.2095153331756592, "rewards/rejected": 2.2092254161834717, "step": 57180 }, { "epoch": 2.6551836204094896, "grad_norm": 135.4984130859375, "learning_rate": 1.4076419518083475e-07, "logits/chosen": -19.385692596435547, "logits/rejected": -18.172922134399414, "logps/chosen": -437.68389892578125, "logps/rejected": -315.033447265625, "loss": 0.5611, "rewards/accuracies": 0.5, "rewards/chosen": 4.150883197784424, "rewards/margins": 1.4587481021881104, "rewards/rejected": 2.6921353340148926, "step": 57190 }, { "epoch": 2.655647894516923, "grad_norm": 0.8140756487846375, "learning_rate": 1.4073633873438877e-07, "logits/chosen": -19.295085906982422, 
"logits/rejected": -18.362361907958984, "logps/chosen": -392.25653076171875, "logps/rejected": -290.77947998046875, "loss": 0.8639, "rewards/accuracies": 0.5, "rewards/chosen": 3.6770973205566406, "rewards/margins": 0.7601105570793152, "rewards/rejected": 2.9169869422912598, "step": 57200 }, { "epoch": 2.6561121686243556, "grad_norm": 24.680524826049805, "learning_rate": 1.4070848228794278e-07, "logits/chosen": -18.62368392944336, "logits/rejected": -17.39792251586914, "logps/chosen": -453.99285888671875, "logps/rejected": -306.51763916015625, "loss": 0.4452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.537761211395264, "rewards/margins": 2.5027196407318115, "rewards/rejected": 2.035041570663452, "step": 57210 }, { "epoch": 2.656576442731789, "grad_norm": 6.107827663421631, "learning_rate": 1.4068062584149682e-07, "logits/chosen": -19.360698699951172, "logits/rejected": -18.292644500732422, "logps/chosen": -491.78448486328125, "logps/rejected": -398.0067443847656, "loss": 0.3352, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9891438484191895, "rewards/margins": 1.626487135887146, "rewards/rejected": 2.362656831741333, "step": 57220 }, { "epoch": 2.657040716839222, "grad_norm": 7.849083423614502, "learning_rate": 1.406527693950508e-07, "logits/chosen": -19.759593963623047, "logits/rejected": -18.862262725830078, "logps/chosen": -348.54541015625, "logps/rejected": -325.0445251464844, "loss": 0.7778, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5699477195739746, "rewards/margins": 0.8331869840621948, "rewards/rejected": 1.7367607355117798, "step": 57230 }, { "epoch": 2.657504990946655, "grad_norm": 129.9779510498047, "learning_rate": 1.4062491294860485e-07, "logits/chosen": -18.38168716430664, "logits/rejected": -18.48746109008789, "logps/chosen": -277.1268615722656, "logps/rejected": -246.39102172851562, "loss": 0.6431, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2149720191955566, "rewards/margins": 
0.9685270190238953, "rewards/rejected": 2.2464444637298584, "step": 57240 }, { "epoch": 2.657969265054088, "grad_norm": 72.36058807373047, "learning_rate": 1.4059705650215887e-07, "logits/chosen": -18.854490280151367, "logits/rejected": -18.33829116821289, "logps/chosen": -419.755615234375, "logps/rejected": -355.546875, "loss": 0.7553, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.842787027359009, "rewards/margins": 0.20931772887706757, "rewards/rejected": 2.633469343185425, "step": 57250 }, { "epoch": 2.658433539161521, "grad_norm": 215.2576446533203, "learning_rate": 1.4056920005571288e-07, "logits/chosen": -18.339813232421875, "logits/rejected": -17.882667541503906, "logps/chosen": -401.51678466796875, "logps/rejected": -427.36541748046875, "loss": 1.0263, "rewards/accuracies": 0.5, "rewards/chosen": 3.362706422805786, "rewards/margins": 0.046343445777893066, "rewards/rejected": 3.3163630962371826, "step": 57260 }, { "epoch": 2.658897813268954, "grad_norm": 46.482479095458984, "learning_rate": 1.405413436092669e-07, "logits/chosen": -19.18316650390625, "logits/rejected": -19.042299270629883, "logps/chosen": -373.30499267578125, "logps/rejected": -363.76068115234375, "loss": 0.4637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.697706699371338, "rewards/margins": 1.248285174369812, "rewards/rejected": 2.4494214057922363, "step": 57270 }, { "epoch": 2.659362087376387, "grad_norm": 139.7864532470703, "learning_rate": 1.4051348716282094e-07, "logits/chosen": -18.730966567993164, "logits/rejected": -18.680461883544922, "logps/chosen": -414.00775146484375, "logps/rejected": -339.6462707519531, "loss": 1.3324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5705056190490723, "rewards/margins": -0.19997557997703552, "rewards/rejected": 2.770481586456299, "step": 57280 }, { "epoch": 2.65982636148382, "grad_norm": 8.501187324523926, "learning_rate": 1.4048563071637492e-07, "logits/chosen": -19.015398025512695, "logits/rejected": 
-17.64545249938965, "logps/chosen": -452.588623046875, "logps/rejected": -309.15557861328125, "loss": 0.7807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2753844261169434, "rewards/margins": 1.4538609981536865, "rewards/rejected": 1.8215240240097046, "step": 57290 }, { "epoch": 2.6602906355912532, "grad_norm": 86.3838119506836, "learning_rate": 1.4045777426992897e-07, "logits/chosen": -18.537397384643555, "logits/rejected": -18.491357803344727, "logps/chosen": -409.488037109375, "logps/rejected": -332.25555419921875, "loss": 1.1249, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6472485065460205, "rewards/margins": 0.5233186483383179, "rewards/rejected": 3.123929738998413, "step": 57300 }, { "epoch": 2.660754909698686, "grad_norm": 1.001440405845642, "learning_rate": 1.4042991782348298e-07, "logits/chosen": -19.070297241210938, "logits/rejected": -18.2089900970459, "logps/chosen": -337.9419860839844, "logps/rejected": -268.1981506347656, "loss": 0.9557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8183703422546387, "rewards/margins": 1.5199044942855835, "rewards/rejected": 2.2984659671783447, "step": 57310 }, { "epoch": 2.661219183806119, "grad_norm": 134.90602111816406, "learning_rate": 1.40402061377037e-07, "logits/chosen": -18.963211059570312, "logits/rejected": -18.677806854248047, "logps/chosen": -356.67681884765625, "logps/rejected": -338.33966064453125, "loss": 0.8961, "rewards/accuracies": 0.5, "rewards/chosen": 2.746387481689453, "rewards/margins": 0.7209596633911133, "rewards/rejected": 2.0254275798797607, "step": 57320 }, { "epoch": 2.6616834579135524, "grad_norm": 21.83169937133789, "learning_rate": 1.40374204930591e-07, "logits/chosen": -18.69171142578125, "logits/rejected": -18.70445442199707, "logps/chosen": -379.49322509765625, "logps/rejected": -392.4662170410156, "loss": 0.9339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.334907054901123, "rewards/margins": 0.5212447047233582, 
"rewards/rejected": 2.813662528991699, "step": 57330 }, { "epoch": 2.662147732020985, "grad_norm": 83.59081268310547, "learning_rate": 1.4034634848414505e-07, "logits/chosen": -19.578861236572266, "logits/rejected": -18.24400520324707, "logps/chosen": -502.76275634765625, "logps/rejected": -318.9350280761719, "loss": 0.4287, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.674370288848877, "rewards/margins": 1.6582748889923096, "rewards/rejected": 2.0160956382751465, "step": 57340 }, { "epoch": 2.662612006128418, "grad_norm": 3.160677194595337, "learning_rate": 1.4031849203769904e-07, "logits/chosen": -18.24077033996582, "logits/rejected": -16.955331802368164, "logps/chosen": -386.6607360839844, "logps/rejected": -240.1950225830078, "loss": 0.6649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0397701263427734, "rewards/margins": 1.6136773824691772, "rewards/rejected": 1.4260928630828857, "step": 57350 }, { "epoch": 2.663076280235851, "grad_norm": 6.3462090492248535, "learning_rate": 1.4029063559125308e-07, "logits/chosen": -19.65145492553711, "logits/rejected": -19.09712028503418, "logps/chosen": -437.40655517578125, "logps/rejected": -387.611083984375, "loss": 0.5262, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.750412464141846, "rewards/margins": 1.4695546627044678, "rewards/rejected": 3.280858278274536, "step": 57360 }, { "epoch": 2.6635405543432844, "grad_norm": 17.119503021240234, "learning_rate": 1.402627791448071e-07, "logits/chosen": -18.289451599121094, "logits/rejected": -18.574926376342773, "logps/chosen": -480.45648193359375, "logps/rejected": -447.038330078125, "loss": 0.6071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4173824787139893, "rewards/margins": 0.7153914570808411, "rewards/rejected": 2.701991081237793, "step": 57370 }, { "epoch": 2.664004828450717, "grad_norm": 123.82877349853516, "learning_rate": 1.402349226983611e-07, "logits/chosen": -18.884113311767578, "logits/rejected": 
-18.419330596923828, "logps/chosen": -431.7920837402344, "logps/rejected": -302.1512145996094, "loss": 0.5783, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.987614870071411, "rewards/margins": 1.3937716484069824, "rewards/rejected": 2.593843460083008, "step": 57380 }, { "epoch": 2.6644691025581504, "grad_norm": 138.3296356201172, "learning_rate": 1.4020706625191512e-07, "logits/chosen": -20.40897560119629, "logits/rejected": -18.733407974243164, "logps/chosen": -393.7779235839844, "logps/rejected": -293.6657409667969, "loss": 0.2455, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.397000312805176, "rewards/margins": 2.3491029739379883, "rewards/rejected": 2.0478975772857666, "step": 57390 }, { "epoch": 2.6649333766655836, "grad_norm": 105.95091247558594, "learning_rate": 1.4017920980546914e-07, "logits/chosen": -18.423351287841797, "logits/rejected": -18.141399383544922, "logps/chosen": -305.3990783691406, "logps/rejected": -217.2034454345703, "loss": 0.6785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6571531295776367, "rewards/margins": 1.5331437587738037, "rewards/rejected": 1.124009370803833, "step": 57400 }, { "epoch": 2.6653976507730164, "grad_norm": 115.29129028320312, "learning_rate": 1.4015135335902315e-07, "logits/chosen": -18.76138687133789, "logits/rejected": -18.58963394165039, "logps/chosen": -270.4507141113281, "logps/rejected": -316.77691650390625, "loss": 1.7277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0809130668640137, "rewards/margins": -0.22435025870800018, "rewards/rejected": 3.3052635192871094, "step": 57410 }, { "epoch": 2.665861924880449, "grad_norm": 65.42992401123047, "learning_rate": 1.4012349691257717e-07, "logits/chosen": -19.486162185668945, "logits/rejected": -18.853981018066406, "logps/chosen": -395.8074035644531, "logps/rejected": -285.4455871582031, "loss": 0.6966, "rewards/accuracies": 0.5, "rewards/chosen": 3.7226550579071045, "rewards/margins": 
1.2996217012405396, "rewards/rejected": 2.4230332374572754, "step": 57420 }, { "epoch": 2.6663261989878824, "grad_norm": 47.42691421508789, "learning_rate": 1.400956404661312e-07, "logits/chosen": -19.663583755493164, "logits/rejected": -19.148618698120117, "logps/chosen": -344.8232727050781, "logps/rejected": -311.50970458984375, "loss": 0.5461, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4170289039611816, "rewards/margins": 0.7597588300704956, "rewards/rejected": 2.6572699546813965, "step": 57430 }, { "epoch": 2.6667904730953156, "grad_norm": 94.38877868652344, "learning_rate": 1.400677840196852e-07, "logits/chosen": -18.427398681640625, "logits/rejected": -17.339101791381836, "logps/chosen": -459.2164611816406, "logps/rejected": -310.3347473144531, "loss": 0.2314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7985832691192627, "rewards/margins": 2.7550387382507324, "rewards/rejected": 1.0435444116592407, "step": 57440 }, { "epoch": 2.6672547472027484, "grad_norm": 159.36805725097656, "learning_rate": 1.4003992757323924e-07, "logits/chosen": -17.990253448486328, "logits/rejected": -17.6456298828125, "logps/chosen": -349.3793640136719, "logps/rejected": -287.43341064453125, "loss": 0.7644, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5097298622131348, "rewards/margins": 0.22675271332263947, "rewards/rejected": 2.282977342605591, "step": 57450 }, { "epoch": 2.6677190213101816, "grad_norm": 80.99398040771484, "learning_rate": 1.4001207112679325e-07, "logits/chosen": -17.59200668334961, "logits/rejected": -17.080970764160156, "logps/chosen": -328.76348876953125, "logps/rejected": -289.01324462890625, "loss": 0.4777, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1487298011779785, "rewards/margins": 0.7414175271987915, "rewards/rejected": 1.407312035560608, "step": 57460 }, { "epoch": 2.668183295417615, "grad_norm": 8.107210159301758, "learning_rate": 1.3998421468034727e-07, "logits/chosen": 
-17.998281478881836, "logits/rejected": -17.69137954711914, "logps/chosen": -316.8327331542969, "logps/rejected": -330.97491455078125, "loss": 1.3365, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8561773300170898, "rewards/margins": -0.4925565719604492, "rewards/rejected": 2.348733901977539, "step": 57470 }, { "epoch": 2.6686475695250476, "grad_norm": 168.15179443359375, "learning_rate": 1.3995635823390128e-07, "logits/chosen": -19.593772888183594, "logits/rejected": -17.6708984375, "logps/chosen": -506.78936767578125, "logps/rejected": -414.6007385253906, "loss": 0.6737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.081024169921875, "rewards/margins": 0.7639453411102295, "rewards/rejected": 3.3170788288116455, "step": 57480 }, { "epoch": 2.6691118436324808, "grad_norm": 78.14828491210938, "learning_rate": 1.3992850178745532e-07, "logits/chosen": -19.63172149658203, "logits/rejected": -18.721200942993164, "logps/chosen": -442.4546813964844, "logps/rejected": -318.08544921875, "loss": 0.2848, "rewards/accuracies": 1.0, "rewards/chosen": 4.959977626800537, "rewards/margins": 2.2900891304016113, "rewards/rejected": 2.669888734817505, "step": 57490 }, { "epoch": 2.6695761177399135, "grad_norm": 74.28643035888672, "learning_rate": 1.399006453410093e-07, "logits/chosen": -18.850074768066406, "logits/rejected": -18.14389419555664, "logps/chosen": -419.4396057128906, "logps/rejected": -326.10137939453125, "loss": 0.603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.083336353302002, "rewards/margins": 0.6715168356895447, "rewards/rejected": 2.4118192195892334, "step": 57500 }, { "epoch": 2.6700403918473468, "grad_norm": 51.212799072265625, "learning_rate": 1.3987278889456335e-07, "logits/chosen": -19.18303871154785, "logits/rejected": -18.203033447265625, "logps/chosen": -417.4662170410156, "logps/rejected": -339.25897216796875, "loss": 0.4461, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.260764122009277, 
"rewards/margins": 1.6841394901275635, "rewards/rejected": 2.5766243934631348, "step": 57510 }, { "epoch": 2.6705046659547795, "grad_norm": 2.6310064792633057, "learning_rate": 1.3984493244811737e-07, "logits/chosen": -19.38714599609375, "logits/rejected": -18.701396942138672, "logps/chosen": -391.58990478515625, "logps/rejected": -317.9158935546875, "loss": 0.565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.543597459793091, "rewards/margins": 0.9895423650741577, "rewards/rejected": 2.5540547370910645, "step": 57520 }, { "epoch": 2.6709689400622127, "grad_norm": 19.10670280456543, "learning_rate": 1.3981707600167138e-07, "logits/chosen": -19.025005340576172, "logits/rejected": -17.260589599609375, "logps/chosen": -451.1819763183594, "logps/rejected": -266.38568115234375, "loss": 0.2904, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2360775470733643, "rewards/margins": 1.8785327672958374, "rewards/rejected": 1.3575447797775269, "step": 57530 }, { "epoch": 2.671433214169646, "grad_norm": 136.6952667236328, "learning_rate": 1.397892195552254e-07, "logits/chosen": -18.18728256225586, "logits/rejected": -18.374515533447266, "logps/chosen": -265.3270263671875, "logps/rejected": -255.7365264892578, "loss": 0.788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0975773334503174, "rewards/margins": 0.3039700388908386, "rewards/rejected": 1.7936069965362549, "step": 57540 }, { "epoch": 2.6718974882770787, "grad_norm": 80.15097045898438, "learning_rate": 1.3976136310877944e-07, "logits/chosen": -19.152124404907227, "logits/rejected": -18.517732620239258, "logps/chosen": -359.94451904296875, "logps/rejected": -323.6478576660156, "loss": 0.861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6858112812042236, "rewards/margins": 0.36086735129356384, "rewards/rejected": 3.324944019317627, "step": 57550 }, { "epoch": 2.672361762384512, "grad_norm": 0.8975622653961182, "learning_rate": 1.3973350666233342e-07, 
"logits/chosen": -18.459850311279297, "logits/rejected": -17.75004005432129, "logps/chosen": -420.423828125, "logps/rejected": -351.19146728515625, "loss": 0.5148, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6859593391418457, "rewards/margins": 1.2947361469268799, "rewards/rejected": 2.3912227153778076, "step": 57560 }, { "epoch": 2.6728260364919447, "grad_norm": 2.8952126502990723, "learning_rate": 1.3970565021588746e-07, "logits/chosen": -19.385421752929688, "logits/rejected": -18.180269241333008, "logps/chosen": -389.00531005859375, "logps/rejected": -355.4598083496094, "loss": 0.43, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.814992904663086, "rewards/margins": 1.5057907104492188, "rewards/rejected": 2.309201717376709, "step": 57570 }, { "epoch": 2.673290310599378, "grad_norm": 4.3805012702941895, "learning_rate": 1.3967779376944148e-07, "logits/chosen": -19.04469871520996, "logits/rejected": -17.82351303100586, "logps/chosen": -373.87884521484375, "logps/rejected": -279.8534240722656, "loss": 0.3677, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3044328689575195, "rewards/margins": 2.0800766944885254, "rewards/rejected": 2.2243568897247314, "step": 57580 }, { "epoch": 2.6737545847068107, "grad_norm": 0.06686897575855255, "learning_rate": 1.396499373229955e-07, "logits/chosen": -18.6485538482666, "logits/rejected": -18.311626434326172, "logps/chosen": -251.3078155517578, "logps/rejected": -171.42332458496094, "loss": 0.5978, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6905946731567383, "rewards/margins": 1.8852565288543701, "rewards/rejected": 0.8053382039070129, "step": 57590 }, { "epoch": 2.674218858814244, "grad_norm": 7.85420560836792, "learning_rate": 1.396220808765495e-07, "logits/chosen": -20.15067481994629, "logits/rejected": -18.844711303710938, "logps/chosen": -406.08209228515625, "logps/rejected": -304.54425048828125, "loss": 0.3862, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 4.059655666351318, "rewards/margins": 1.2496023178100586, "rewards/rejected": 2.810053586959839, "step": 57600 }, { "epoch": 2.674683132921677, "grad_norm": 0.008862734772264957, "learning_rate": 1.3959422443010352e-07, "logits/chosen": -18.327260971069336, "logits/rejected": -18.601648330688477, "logps/chosen": -265.0727233886719, "logps/rejected": -284.96295166015625, "loss": 1.9213, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4958362579345703, "rewards/margins": 0.21859295666217804, "rewards/rejected": 2.277243137359619, "step": 57610 }, { "epoch": 2.67514740702911, "grad_norm": 3.004775285720825, "learning_rate": 1.3956636798365754e-07, "logits/chosen": -18.195533752441406, "logits/rejected": -18.322431564331055, "logps/chosen": -343.91766357421875, "logps/rejected": -361.59783935546875, "loss": 0.999, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3603992462158203, "rewards/margins": 0.4171610474586487, "rewards/rejected": 2.9432382583618164, "step": 57620 }, { "epoch": 2.675611681136543, "grad_norm": 3.9668655395507812, "learning_rate": 1.3953851153721155e-07, "logits/chosen": -19.031124114990234, "logits/rejected": -18.317996978759766, "logps/chosen": -345.13641357421875, "logps/rejected": -264.8808898925781, "loss": 0.5647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.36425518989563, "rewards/margins": 0.8492604494094849, "rewards/rejected": 1.514994502067566, "step": 57630 }, { "epoch": 2.676075955243976, "grad_norm": 31.077774047851562, "learning_rate": 1.395106550907656e-07, "logits/chosen": -19.284961700439453, "logits/rejected": -18.400110244750977, "logps/chosen": -393.0887756347656, "logps/rejected": -275.8123779296875, "loss": 0.5452, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.20928955078125, "rewards/margins": 1.6888692378997803, "rewards/rejected": 2.5204203128814697, "step": 57640 }, { "epoch": 2.676540229351409, "grad_norm": 149.49386596679688, "learning_rate": 
1.3948279864431958e-07, "logits/chosen": -19.055734634399414, "logits/rejected": -19.399805068969727, "logps/chosen": -405.55133056640625, "logps/rejected": -440.4967346191406, "loss": 1.3098, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4545483589172363, "rewards/margins": -0.05477433279156685, "rewards/rejected": 3.50932240486145, "step": 57650 }, { "epoch": 2.677004503458842, "grad_norm": 24.283592224121094, "learning_rate": 1.3945494219787362e-07, "logits/chosen": -18.12864875793457, "logits/rejected": -17.806365966796875, "logps/chosen": -334.06561279296875, "logps/rejected": -296.9915771484375, "loss": 0.6393, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4107699394226074, "rewards/margins": 0.9695609211921692, "rewards/rejected": 1.4412089586257935, "step": 57660 }, { "epoch": 2.677468777566275, "grad_norm": 61.77714538574219, "learning_rate": 1.3942708575142764e-07, "logits/chosen": -19.651321411132812, "logits/rejected": -18.38820457458496, "logps/chosen": -362.7637023925781, "logps/rejected": -259.69219970703125, "loss": 1.1879, "rewards/accuracies": 0.5, "rewards/chosen": 2.9744133949279785, "rewards/margins": 0.5037838816642761, "rewards/rejected": 2.470629930496216, "step": 57670 }, { "epoch": 2.6779330516737083, "grad_norm": 128.83367919921875, "learning_rate": 1.3939922930498165e-07, "logits/chosen": -19.05495834350586, "logits/rejected": -18.71025276184082, "logps/chosen": -376.8742370605469, "logps/rejected": -392.76971435546875, "loss": 0.9752, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4956557750701904, "rewards/margins": 0.7503083944320679, "rewards/rejected": 2.745347261428833, "step": 57680 }, { "epoch": 2.678397325781141, "grad_norm": 81.2220687866211, "learning_rate": 1.3937137285853567e-07, "logits/chosen": -19.207412719726562, "logits/rejected": -17.79134750366211, "logps/chosen": -380.70538330078125, "logps/rejected": -253.4672393798828, "loss": 0.6095, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.5494816303253174, "rewards/margins": 2.0109195709228516, "rewards/rejected": 1.538562297821045, "step": 57690 }, { "epoch": 2.6788615998885743, "grad_norm": 23.328378677368164, "learning_rate": 1.393435164120897e-07, "logits/chosen": -18.503782272338867, "logits/rejected": -17.45159339904785, "logps/chosen": -361.6175842285156, "logps/rejected": -263.911376953125, "loss": 0.4751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.666889190673828, "rewards/margins": 2.1829543113708496, "rewards/rejected": 1.4839345216751099, "step": 57700 }, { "epoch": 2.679325873996007, "grad_norm": 186.47537231445312, "learning_rate": 1.393156599656437e-07, "logits/chosen": -18.811328887939453, "logits/rejected": -18.013233184814453, "logps/chosen": -359.5837097167969, "logps/rejected": -317.0024719238281, "loss": 0.3356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.386663913726807, "rewards/margins": 1.993870735168457, "rewards/rejected": 2.3927924633026123, "step": 57710 }, { "epoch": 2.6797901481034403, "grad_norm": 154.03623962402344, "learning_rate": 1.3928780351919774e-07, "logits/chosen": -18.941946029663086, "logits/rejected": -18.036365509033203, "logps/chosen": -466.159423828125, "logps/rejected": -316.4770202636719, "loss": 0.4758, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8852953910827637, "rewards/margins": 1.9244407415390015, "rewards/rejected": 1.9608547687530518, "step": 57720 }, { "epoch": 2.680254422210873, "grad_norm": 94.35064697265625, "learning_rate": 1.3925994707275175e-07, "logits/chosen": -18.211130142211914, "logits/rejected": -18.10859489440918, "logps/chosen": -364.5030822753906, "logps/rejected": -365.056396484375, "loss": 1.1398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.28175687789917, "rewards/margins": 0.32405537366867065, "rewards/rejected": 2.9577019214630127, "step": 57730 }, { "epoch": 2.6807186963183063, "grad_norm": 49.45332336425781, 
"learning_rate": 1.3923209062630576e-07, "logits/chosen": -19.220277786254883, "logits/rejected": -18.76902961730957, "logps/chosen": -425.9713439941406, "logps/rejected": -384.4710388183594, "loss": 0.5841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.459487199783325, "rewards/margins": 0.4117097854614258, "rewards/rejected": 3.0477774143218994, "step": 57740 }, { "epoch": 2.6811829704257395, "grad_norm": 12.044124603271484, "learning_rate": 1.3920423417985978e-07, "logits/chosen": -19.43666648864746, "logits/rejected": -18.269365310668945, "logps/chosen": -444.91827392578125, "logps/rejected": -344.5044860839844, "loss": 0.2774, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.243460178375244, "rewards/margins": 1.485271692276001, "rewards/rejected": 2.758188247680664, "step": 57750 }, { "epoch": 2.6816472445331723, "grad_norm": 39.44330978393555, "learning_rate": 1.3917637773341382e-07, "logits/chosen": -18.96841812133789, "logits/rejected": -18.199764251708984, "logps/chosen": -276.0552673339844, "logps/rejected": -270.12872314453125, "loss": 0.732, "rewards/accuracies": 0.5, "rewards/chosen": 1.647686243057251, "rewards/margins": 0.1486823409795761, "rewards/rejected": 1.4990038871765137, "step": 57760 }, { "epoch": 2.6821115186406055, "grad_norm": 10.172536849975586, "learning_rate": 1.391485212869678e-07, "logits/chosen": -19.759033203125, "logits/rejected": -19.243684768676758, "logps/chosen": -432.09722900390625, "logps/rejected": -329.16961669921875, "loss": 0.4174, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7311439514160156, "rewards/margins": 1.4632084369659424, "rewards/rejected": 2.267935276031494, "step": 57770 }, { "epoch": 2.6825757927480387, "grad_norm": 86.65852355957031, "learning_rate": 1.3912066484052185e-07, "logits/chosen": -19.01303482055664, "logits/rejected": -18.248973846435547, "logps/chosen": -401.5732421875, "logps/rejected": -368.1260681152344, "loss": 0.8868, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.788928270339966, "rewards/margins": 1.0405070781707764, "rewards/rejected": 2.7484211921691895, "step": 57780 }, { "epoch": 2.6830400668554715, "grad_norm": 7.582171440124512, "learning_rate": 1.3909280839407586e-07, "logits/chosen": -19.57477569580078, "logits/rejected": -17.527530670166016, "logps/chosen": -523.9212036132812, "logps/rejected": -322.11614990234375, "loss": 0.3294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.4530439376831055, "rewards/margins": 3.2693183422088623, "rewards/rejected": 2.183725595474243, "step": 57790 }, { "epoch": 2.6835043409629042, "grad_norm": 129.6648712158203, "learning_rate": 1.3906495194762988e-07, "logits/chosen": -19.43801498413086, "logits/rejected": -18.372547149658203, "logps/chosen": -431.96978759765625, "logps/rejected": -324.4870910644531, "loss": 0.7947, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.6887435913085938, "rewards/margins": 0.8914797902107239, "rewards/rejected": 2.7972640991210938, "step": 57800 }, { "epoch": 2.6839686150703375, "grad_norm": 0.9561700224876404, "learning_rate": 1.390370955011839e-07, "logits/chosen": -19.354475021362305, "logits/rejected": -18.307735443115234, "logps/chosen": -347.3623352050781, "logps/rejected": -245.1562957763672, "loss": 0.9965, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6599934101104736, "rewards/margins": 0.9413707852363586, "rewards/rejected": 1.7186228036880493, "step": 57810 }, { "epoch": 2.6844328891777707, "grad_norm": 34.866424560546875, "learning_rate": 1.390092390547379e-07, "logits/chosen": -18.827442169189453, "logits/rejected": -18.820106506347656, "logps/chosen": -450.71923828125, "logps/rejected": -505.6031799316406, "loss": 1.2186, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.132439374923706, "rewards/margins": -0.448723167181015, "rewards/rejected": 3.581162691116333, "step": 57820 }, { "epoch": 2.6848971632852034, "grad_norm": 
16.745201110839844, "learning_rate": 1.3898138260829192e-07, "logits/chosen": -19.43408966064453, "logits/rejected": -17.644733428955078, "logps/chosen": -417.384521484375, "logps/rejected": -236.0959930419922, "loss": 0.8401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.799020290374756, "rewards/margins": 2.6116843223571777, "rewards/rejected": 2.187335729598999, "step": 57830 }, { "epoch": 2.6853614373926367, "grad_norm": 15.845930099487305, "learning_rate": 1.3895352616184594e-07, "logits/chosen": -18.459291458129883, "logits/rejected": -18.548768997192383, "logps/chosen": -278.79364013671875, "logps/rejected": -332.2694396972656, "loss": 1.2434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8896591663360596, "rewards/margins": -0.12904486060142517, "rewards/rejected": 2.0187039375305176, "step": 57840 }, { "epoch": 2.68582571150007, "grad_norm": 0.054345518350601196, "learning_rate": 1.3892566971539998e-07, "logits/chosen": -18.75095558166504, "logits/rejected": -17.521419525146484, "logps/chosen": -488.6399841308594, "logps/rejected": -317.7773742675781, "loss": 0.6222, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.5221381187438965, "rewards/margins": 2.288642168045044, "rewards/rejected": 2.2334959506988525, "step": 57850 }, { "epoch": 2.6862899856075027, "grad_norm": 38.35072326660156, "learning_rate": 1.3889781326895397e-07, "logits/chosen": -18.400609970092773, "logits/rejected": -18.30984878540039, "logps/chosen": -341.2156677246094, "logps/rejected": -314.5231018066406, "loss": 0.8128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3207552433013916, "rewards/margins": 0.49569544196128845, "rewards/rejected": 1.8250596523284912, "step": 57860 }, { "epoch": 2.6867542597149354, "grad_norm": 1.0371308326721191, "learning_rate": 1.38869956822508e-07, "logits/chosen": -19.271697998046875, "logits/rejected": -19.02233123779297, "logps/chosen": -434.71002197265625, "logps/rejected": -379.37579345703125, 
"loss": 1.3088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.720471143722534, "rewards/margins": 0.3036887049674988, "rewards/rejected": 3.416782855987549, "step": 57870 }, { "epoch": 2.6872185338223686, "grad_norm": 32.33778762817383, "learning_rate": 1.3884210037606202e-07, "logits/chosen": -19.00336456298828, "logits/rejected": -18.321441650390625, "logps/chosen": -361.2853088378906, "logps/rejected": -307.96734619140625, "loss": 0.3438, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5906710624694824, "rewards/margins": 1.3672001361846924, "rewards/rejected": 2.223471164703369, "step": 57880 }, { "epoch": 2.687682807929802, "grad_norm": 104.47368621826172, "learning_rate": 1.3881424392961604e-07, "logits/chosen": -18.693729400634766, "logits/rejected": -18.705657958984375, "logps/chosen": -330.89813232421875, "logps/rejected": -329.1203308105469, "loss": 0.9978, "rewards/accuracies": 0.5, "rewards/chosen": 2.8806777000427246, "rewards/margins": 0.1021004468202591, "rewards/rejected": 2.7785773277282715, "step": 57890 }, { "epoch": 2.6881470820372346, "grad_norm": 19.2529354095459, "learning_rate": 1.3878638748317005e-07, "logits/chosen": -19.000499725341797, "logits/rejected": -17.976572036743164, "logps/chosen": -397.50213623046875, "logps/rejected": -315.7304992675781, "loss": 0.7608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7499186992645264, "rewards/margins": 0.7384149432182312, "rewards/rejected": 3.0115036964416504, "step": 57900 }, { "epoch": 2.688611356144668, "grad_norm": 6.724636077880859, "learning_rate": 1.387585310367241e-07, "logits/chosen": -19.419841766357422, "logits/rejected": -18.23651695251465, "logps/chosen": -439.9052734375, "logps/rejected": -321.71661376953125, "loss": 0.2141, "rewards/accuracies": 1.0, "rewards/chosen": 4.798789978027344, "rewards/margins": 2.09624981880188, "rewards/rejected": 2.7025399208068848, "step": 57910 }, { "epoch": 2.689075630252101, "grad_norm": 
42.79267120361328, "learning_rate": 1.3873067459027808e-07, "logits/chosen": -18.564016342163086, "logits/rejected": -18.32284927368164, "logps/chosen": -408.017333984375, "logps/rejected": -312.2924499511719, "loss": 0.4712, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.534792900085449, "rewards/margins": 1.9873453378677368, "rewards/rejected": 2.5474469661712646, "step": 57920 }, { "epoch": 2.689539904359534, "grad_norm": 87.53233337402344, "learning_rate": 1.3870281814383212e-07, "logits/chosen": -18.517358779907227, "logits/rejected": -18.440824508666992, "logps/chosen": -408.7525329589844, "logps/rejected": -369.36798095703125, "loss": 1.2533, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8005356788635254, "rewards/margins": 0.5518794655799866, "rewards/rejected": 3.2486565113067627, "step": 57930 }, { "epoch": 2.690004178466967, "grad_norm": 132.4814453125, "learning_rate": 1.3867496169738613e-07, "logits/chosen": -17.639678955078125, "logits/rejected": -18.355152130126953, "logps/chosen": -254.98422241210938, "logps/rejected": -351.34075927734375, "loss": 1.6535, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.1811587810516357, "rewards/margins": -1.1051304340362549, "rewards/rejected": 2.2862887382507324, "step": 57940 }, { "epoch": 2.6904684525744, "grad_norm": 33.62331771850586, "learning_rate": 1.3864710525094015e-07, "logits/chosen": -18.530519485473633, "logits/rejected": -19.161298751831055, "logps/chosen": -400.4112854003906, "logps/rejected": -387.15545654296875, "loss": 1.2105, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6106553077697754, "rewards/margins": 0.5747756958007812, "rewards/rejected": 3.0358798503875732, "step": 57950 }, { "epoch": 2.690932726681833, "grad_norm": 68.59600067138672, "learning_rate": 1.3861924880449416e-07, "logits/chosen": -19.032150268554688, "logits/rejected": -18.28396224975586, "logps/chosen": -463.32403564453125, "logps/rejected": -430.72747802734375, 
"loss": 0.6515, "rewards/accuracies": 0.5, "rewards/chosen": 3.4765453338623047, "rewards/margins": 0.6795867681503296, "rewards/rejected": 2.7969586849212646, "step": 57960 }, { "epoch": 2.691397000789266, "grad_norm": 139.67005920410156, "learning_rate": 1.385913923580482e-07, "logits/chosen": -19.62480354309082, "logits/rejected": -18.323822021484375, "logps/chosen": -473.8165588378906, "logps/rejected": -333.14727783203125, "loss": 0.3624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.834495544433594, "rewards/margins": 2.212585926055908, "rewards/rejected": 2.6219098567962646, "step": 57970 }, { "epoch": 2.691861274896699, "grad_norm": 5.781530857086182, "learning_rate": 1.385635359116022e-07, "logits/chosen": -19.191144943237305, "logits/rejected": -17.68277359008789, "logps/chosen": -463.37322998046875, "logps/rejected": -307.42626953125, "loss": 0.1584, "rewards/accuracies": 1.0, "rewards/chosen": 5.148402690887451, "rewards/margins": 2.74033784866333, "rewards/rejected": 2.4080650806427, "step": 57980 }, { "epoch": 2.6923255490041322, "grad_norm": 129.82640075683594, "learning_rate": 1.3853567946515623e-07, "logits/chosen": -19.408435821533203, "logits/rejected": -18.379995346069336, "logps/chosen": -351.61444091796875, "logps/rejected": -298.1789245605469, "loss": 0.6545, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.856119155883789, "rewards/margins": 0.5266774296760559, "rewards/rejected": 2.329441547393799, "step": 57990 }, { "epoch": 2.692789823111565, "grad_norm": 5.622568130493164, "learning_rate": 1.3850782301871025e-07, "logits/chosen": -18.87624740600586, "logits/rejected": -18.544246673583984, "logps/chosen": -323.884521484375, "logps/rejected": -270.85809326171875, "loss": 0.67, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3164000511169434, "rewards/margins": 0.5620330572128296, "rewards/rejected": 1.7543668746948242, "step": 58000 }, { "epoch": 2.693254097218998, "grad_norm": 0.8453588485717773, 
"learning_rate": 1.3847996657226426e-07, "logits/chosen": -20.033817291259766, "logits/rejected": -17.52960205078125, "logps/chosen": -467.7095642089844, "logps/rejected": -237.13565063476562, "loss": 0.2828, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7024638652801514, "rewards/margins": 2.2977898120880127, "rewards/rejected": 1.4046738147735596, "step": 58010 }, { "epoch": 2.693718371326431, "grad_norm": 3.3047478199005127, "learning_rate": 1.3845211012581828e-07, "logits/chosen": -19.33324432373047, "logits/rejected": -17.972768783569336, "logps/chosen": -433.2730407714844, "logps/rejected": -311.04718017578125, "loss": 0.2946, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.768346071243286, "rewards/margins": 1.6846511363983154, "rewards/rejected": 2.0836949348449707, "step": 58020 }, { "epoch": 2.694182645433864, "grad_norm": 114.91360473632812, "learning_rate": 1.384242536793723e-07, "logits/chosen": -19.156904220581055, "logits/rejected": -18.221446990966797, "logps/chosen": -417.543701171875, "logps/rejected": -348.5065612792969, "loss": 0.6047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6234536170959473, "rewards/margins": 1.672472357749939, "rewards/rejected": 1.9509811401367188, "step": 58030 }, { "epoch": 2.694646919541297, "grad_norm": 60.36510467529297, "learning_rate": 1.383963972329263e-07, "logits/chosen": -18.725858688354492, "logits/rejected": -18.347558975219727, "logps/chosen": -361.8890075683594, "logps/rejected": -278.2566833496094, "loss": 0.7445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6928417682647705, "rewards/margins": 1.3852274417877197, "rewards/rejected": 2.30761456489563, "step": 58040 }, { "epoch": 2.69511119364873, "grad_norm": 2.405194044113159, "learning_rate": 1.3836854078648032e-07, "logits/chosen": -18.372644424438477, "logits/rejected": -18.075862884521484, "logps/chosen": -320.4385986328125, "logps/rejected": -277.8589782714844, "loss": 0.962, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.189530611038208, "rewards/margins": 1.3258339166641235, "rewards/rejected": 1.8636966943740845, "step": 58050 }, { "epoch": 2.6955754677561634, "grad_norm": 68.95328521728516, "learning_rate": 1.3834068434003436e-07, "logits/chosen": -19.52588653564453, "logits/rejected": -18.92112159729004, "logps/chosen": -380.15911865234375, "logps/rejected": -331.3692321777344, "loss": 0.9072, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4620418548583984, "rewards/margins": 0.7782109975814819, "rewards/rejected": 2.683830976486206, "step": 58060 }, { "epoch": 2.696039741863596, "grad_norm": 93.26473236083984, "learning_rate": 1.3831282789358835e-07, "logits/chosen": -18.453479766845703, "logits/rejected": -17.700496673583984, "logps/chosen": -405.3120422363281, "logps/rejected": -327.2096252441406, "loss": 0.5046, "rewards/accuracies": 0.5, "rewards/chosen": 3.353170394897461, "rewards/margins": 1.309436559677124, "rewards/rejected": 2.043734073638916, "step": 58070 }, { "epoch": 2.6965040159710294, "grad_norm": 27.363094329833984, "learning_rate": 1.382849714471424e-07, "logits/chosen": -18.96521759033203, "logits/rejected": -18.399044036865234, "logps/chosen": -344.911376953125, "logps/rejected": -290.2044982910156, "loss": 0.4465, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6700358390808105, "rewards/margins": 2.109740734100342, "rewards/rejected": 1.5602948665618896, "step": 58080 }, { "epoch": 2.696968290078462, "grad_norm": 20.1481990814209, "learning_rate": 1.382571150006964e-07, "logits/chosen": -17.722536087036133, "logits/rejected": -17.379840850830078, "logps/chosen": -373.90081787109375, "logps/rejected": -367.99334716796875, "loss": 0.9701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.903944492340088, "rewards/margins": 0.3961756229400635, "rewards/rejected": 2.5077688694000244, "step": 58090 }, { "epoch": 2.6974325641858954, "grad_norm": 
102.36080932617188, "learning_rate": 1.3822925855425042e-07, "logits/chosen": -19.857967376708984, "logits/rejected": -19.77408218383789, "logps/chosen": -415.07452392578125, "logps/rejected": -399.48077392578125, "loss": 1.3829, "rewards/accuracies": 0.5, "rewards/chosen": 3.3389651775360107, "rewards/margins": 0.517208456993103, "rewards/rejected": 2.8217568397521973, "step": 58100 }, { "epoch": 2.697896838293328, "grad_norm": 191.59317016601562, "learning_rate": 1.3820140210780443e-07, "logits/chosen": -18.774852752685547, "logits/rejected": -18.98130226135254, "logps/chosen": -465.0699157714844, "logps/rejected": -442.6590881347656, "loss": 1.5001, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.945507526397705, "rewards/margins": -0.08617416769266129, "rewards/rejected": 4.031682014465332, "step": 58110 }, { "epoch": 2.6983611124007614, "grad_norm": 195.1905517578125, "learning_rate": 1.3817354566135848e-07, "logits/chosen": -19.838605880737305, "logits/rejected": -19.480205535888672, "logps/chosen": -383.8355712890625, "logps/rejected": -381.1724853515625, "loss": 0.4925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.008296489715576, "rewards/margins": 0.7327381372451782, "rewards/rejected": 2.2755579948425293, "step": 58120 }, { "epoch": 2.6988253865081946, "grad_norm": 1.981747031211853, "learning_rate": 1.3814568921491246e-07, "logits/chosen": -19.291902542114258, "logits/rejected": -17.598886489868164, "logps/chosen": -293.8247375488281, "logps/rejected": -222.3667449951172, "loss": 0.6622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9228439331054688, "rewards/margins": 1.5490696430206299, "rewards/rejected": 1.3737742900848389, "step": 58130 }, { "epoch": 2.6992896606156274, "grad_norm": 4.808364391326904, "learning_rate": 1.381178327684665e-07, "logits/chosen": -19.53043556213379, "logits/rejected": -18.79798698425293, "logps/chosen": -414.6388244628906, "logps/rejected": -252.6892852783203, "loss": 0.4602, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.020162105560303, "rewards/margins": 1.2096540927886963, "rewards/rejected": 2.8105080127716064, "step": 58140 }, { "epoch": 2.6997539347230606, "grad_norm": 110.80204010009766, "learning_rate": 1.3808997632202052e-07, "logits/chosen": -18.651384353637695, "logits/rejected": -18.052961349487305, "logps/chosen": -293.6439208984375, "logps/rejected": -279.0516357421875, "loss": 0.8761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7911906242370605, "rewards/margins": 1.8425804376602173, "rewards/rejected": 1.9486100673675537, "step": 58150 }, { "epoch": 2.700218208830494, "grad_norm": 0.18186356127262115, "learning_rate": 1.3806211987557453e-07, "logits/chosen": -20.47869110107422, "logits/rejected": -18.330059051513672, "logps/chosen": -438.67132568359375, "logps/rejected": -291.85211181640625, "loss": 0.4923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9143974781036377, "rewards/margins": 1.940590262413025, "rewards/rejected": 1.9738073348999023, "step": 58160 }, { "epoch": 2.7006824829379266, "grad_norm": 79.38690948486328, "learning_rate": 1.3803426342912855e-07, "logits/chosen": -18.603580474853516, "logits/rejected": -17.701005935668945, "logps/chosen": -481.39794921875, "logps/rejected": -380.77178955078125, "loss": 0.5935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.606208086013794, "rewards/margins": 1.3022191524505615, "rewards/rejected": 2.3039889335632324, "step": 58170 }, { "epoch": 2.7011467570453593, "grad_norm": 32.641841888427734, "learning_rate": 1.380064069826826e-07, "logits/chosen": -19.356203079223633, "logits/rejected": -18.587121963500977, "logps/chosen": -392.47052001953125, "logps/rejected": -299.63311767578125, "loss": 0.274, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.734633922576904, "rewards/margins": 1.9611310958862305, "rewards/rejected": 2.773503303527832, "step": 58180 }, { "epoch": 2.7016110311527926, 
"grad_norm": 128.9665069580078, "learning_rate": 1.3797855053623658e-07, "logits/chosen": -18.471025466918945, "logits/rejected": -18.053361892700195, "logps/chosen": -359.97076416015625, "logps/rejected": -296.5394592285156, "loss": 0.6248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.934204578399658, "rewards/margins": 1.499915599822998, "rewards/rejected": 2.4342894554138184, "step": 58190 }, { "epoch": 2.7020753052602258, "grad_norm": 27.611553192138672, "learning_rate": 1.379506940897906e-07, "logits/chosen": -19.466678619384766, "logits/rejected": -18.250516891479492, "logps/chosen": -359.7906799316406, "logps/rejected": -222.50747680664062, "loss": 0.3794, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.872260570526123, "rewards/margins": 1.8936645984649658, "rewards/rejected": 1.9785959720611572, "step": 58200 }, { "epoch": 2.7025395793676585, "grad_norm": 50.76456832885742, "learning_rate": 1.3792283764334463e-07, "logits/chosen": -20.216655731201172, "logits/rejected": -19.742395401000977, "logps/chosen": -330.14288330078125, "logps/rejected": -278.5754089355469, "loss": 0.5484, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.153599977493286, "rewards/margins": 0.7563880681991577, "rewards/rejected": 2.397211790084839, "step": 58210 }, { "epoch": 2.7030038534750918, "grad_norm": 122.79668426513672, "learning_rate": 1.3789498119689865e-07, "logits/chosen": -18.948898315429688, "logits/rejected": -17.163145065307617, "logps/chosen": -496.9513244628906, "logps/rejected": -270.8125915527344, "loss": 0.2767, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.781023979187012, "rewards/margins": 2.383852243423462, "rewards/rejected": 2.397171974182129, "step": 58220 }, { "epoch": 2.703468127582525, "grad_norm": 51.66176986694336, "learning_rate": 1.3786712475045266e-07, "logits/chosen": -18.304325103759766, "logits/rejected": -17.38616943359375, "logps/chosen": -425.40618896484375, "logps/rejected": 
-341.07049560546875, "loss": 0.5146, "rewards/accuracies": 0.5, "rewards/chosen": 3.7738430500030518, "rewards/margins": 1.6602017879486084, "rewards/rejected": 2.1136410236358643, "step": 58230 }, { "epoch": 2.7039324016899577, "grad_norm": 69.30125427246094, "learning_rate": 1.3783926830400668e-07, "logits/chosen": -19.460891723632812, "logits/rejected": -19.25653648376465, "logps/chosen": -329.7804260253906, "logps/rejected": -333.5242919921875, "loss": 0.868, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.237060546875, "rewards/margins": 0.10208089649677277, "rewards/rejected": 3.134979724884033, "step": 58240 }, { "epoch": 2.7043966757973905, "grad_norm": 27.20536231994629, "learning_rate": 1.378114118575607e-07, "logits/chosen": -18.41769027709961, "logits/rejected": -17.372425079345703, "logps/chosen": -471.3478088378906, "logps/rejected": -320.22113037109375, "loss": 0.3604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.643797874450684, "rewards/margins": 2.5642011165618896, "rewards/rejected": 2.079596757888794, "step": 58250 }, { "epoch": 2.7048609499048237, "grad_norm": 111.41088104248047, "learning_rate": 1.377835554111147e-07, "logits/chosen": -19.177915573120117, "logits/rejected": -19.75990867614746, "logps/chosen": -385.52752685546875, "logps/rejected": -334.21661376953125, "loss": 0.7251, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1527228355407715, "rewards/margins": 0.6170968413352966, "rewards/rejected": 2.535626173019409, "step": 58260 }, { "epoch": 2.705325224012257, "grad_norm": 179.78775024414062, "learning_rate": 1.3775569896466875e-07, "logits/chosen": -18.178911209106445, "logits/rejected": -17.799495697021484, "logps/chosen": -399.7672424316406, "logps/rejected": -299.79180908203125, "loss": 0.4839, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.00478458404541, "rewards/margins": 1.9639962911605835, "rewards/rejected": 2.040787935256958, "step": 58270 }, { "epoch": 
2.7057894981196897, "grad_norm": 92.63237762451172, "learning_rate": 1.3772784251822273e-07, "logits/chosen": -18.758750915527344, "logits/rejected": -19.006412506103516, "logps/chosen": -364.2237243652344, "logps/rejected": -363.2013244628906, "loss": 0.5566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2276337146759033, "rewards/margins": 0.9000711441040039, "rewards/rejected": 2.3275625705718994, "step": 58280 }, { "epoch": 2.706253772227123, "grad_norm": 9.66018295288086, "learning_rate": 1.3769998607177678e-07, "logits/chosen": -19.616008758544922, "logits/rejected": -18.55021858215332, "logps/chosen": -387.8440246582031, "logps/rejected": -264.0615234375, "loss": 0.4366, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8015332221984863, "rewards/margins": 1.722070336341858, "rewards/rejected": 2.0794622898101807, "step": 58290 }, { "epoch": 2.706718046334556, "grad_norm": 11.03674602508545, "learning_rate": 1.376721296253308e-07, "logits/chosen": -18.857318878173828, "logits/rejected": -17.721540451049805, "logps/chosen": -397.0827941894531, "logps/rejected": -251.06494140625, "loss": 0.2708, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3441829681396484, "rewards/margins": 2.137216091156006, "rewards/rejected": 1.2069671154022217, "step": 58300 }, { "epoch": 2.707182320441989, "grad_norm": 241.12159729003906, "learning_rate": 1.376442731788848e-07, "logits/chosen": -18.394636154174805, "logits/rejected": -18.35055160522461, "logps/chosen": -330.7954406738281, "logps/rejected": -323.015380859375, "loss": 1.3673, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.865682601928711, "rewards/margins": -0.13779780268669128, "rewards/rejected": 3.0034801959991455, "step": 58310 }, { "epoch": 2.707646594549422, "grad_norm": 47.48638153076172, "learning_rate": 1.3761641673243882e-07, "logits/chosen": -18.832258224487305, "logits/rejected": -18.56230354309082, "logps/chosen": -448.8706970214844, "logps/rejected": 
-341.53302001953125, "loss": 0.6449, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.275231122970581, "rewards/margins": 0.8538389205932617, "rewards/rejected": 2.4213926792144775, "step": 58320 }, { "epoch": 2.708110868656855, "grad_norm": 66.98896026611328, "learning_rate": 1.3758856028599286e-07, "logits/chosen": -20.559778213500977, "logits/rejected": -20.51490020751953, "logps/chosen": -437.51605224609375, "logps/rejected": -374.5011291503906, "loss": 0.8429, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.175362586975098, "rewards/margins": 0.8087641000747681, "rewards/rejected": 3.366598129272461, "step": 58330 }, { "epoch": 2.708575142764288, "grad_norm": 41.1423454284668, "learning_rate": 1.3756070383954685e-07, "logits/chosen": -19.865036010742188, "logits/rejected": -18.793893814086914, "logps/chosen": -414.63873291015625, "logps/rejected": -320.87420654296875, "loss": 0.5063, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8269524574279785, "rewards/margins": 1.5476694107055664, "rewards/rejected": 2.279282808303833, "step": 58340 }, { "epoch": 2.709039416871721, "grad_norm": 174.7556610107422, "learning_rate": 1.375328473931009e-07, "logits/chosen": -19.87959861755371, "logits/rejected": -18.745235443115234, "logps/chosen": -428.61541748046875, "logps/rejected": -368.6553039550781, "loss": 0.4042, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.274866104125977, "rewards/margins": 0.9921121597290039, "rewards/rejected": 3.2827541828155518, "step": 58350 }, { "epoch": 2.709503690979154, "grad_norm": 35.00420379638672, "learning_rate": 1.375049909466549e-07, "logits/chosen": -19.157154083251953, "logits/rejected": -18.352699279785156, "logps/chosen": -359.4755859375, "logps/rejected": -309.9906311035156, "loss": 0.5353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3495330810546875, "rewards/margins": 1.426824927330017, "rewards/rejected": 2.922708034515381, "step": 58360 }, { "epoch": 
2.7099679650865873, "grad_norm": 222.6016845703125, "learning_rate": 1.3747713450020892e-07, "logits/chosen": -19.706201553344727, "logits/rejected": -19.509489059448242, "logps/chosen": -272.59600830078125, "logps/rejected": -299.09271240234375, "loss": 1.2889, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8499313592910767, "rewards/margins": -0.22319576144218445, "rewards/rejected": 2.073127031326294, "step": 58370 }, { "epoch": 2.71043223919402, "grad_norm": 33.0339469909668, "learning_rate": 1.3744927805376293e-07, "logits/chosen": -18.658903121948242, "logits/rejected": -18.63949203491211, "logps/chosen": -324.6783142089844, "logps/rejected": -306.4197692871094, "loss": 0.6634, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5750892162323, "rewards/margins": 0.47826823592185974, "rewards/rejected": 2.0968210697174072, "step": 58380 }, { "epoch": 2.7108965133014533, "grad_norm": 9.702486991882324, "learning_rate": 1.3742142160731697e-07, "logits/chosen": -18.390727996826172, "logits/rejected": -17.71160888671875, "logps/chosen": -392.4124450683594, "logps/rejected": -306.3108825683594, "loss": 0.7141, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.561990737915039, "rewards/margins": 1.990649938583374, "rewards/rejected": 2.571341037750244, "step": 58390 }, { "epoch": 2.711360787408886, "grad_norm": 245.00758361816406, "learning_rate": 1.3739356516087096e-07, "logits/chosen": -19.377286911010742, "logits/rejected": -18.449352264404297, "logps/chosen": -453.30023193359375, "logps/rejected": -328.49945068359375, "loss": 1.0653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.495087146759033, "rewards/margins": 0.6563242077827454, "rewards/rejected": 2.838763475418091, "step": 58400 }, { "epoch": 2.7118250615163193, "grad_norm": 42.437477111816406, "learning_rate": 1.3736570871442498e-07, "logits/chosen": -19.385751724243164, "logits/rejected": -18.01937484741211, "logps/chosen": -383.9488830566406, 
"logps/rejected": -304.37164306640625, "loss": 0.3974, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.717911958694458, "rewards/margins": 1.8896172046661377, "rewards/rejected": 1.8282949924468994, "step": 58410 }, { "epoch": 2.712289335623752, "grad_norm": 199.98582458496094, "learning_rate": 1.3733785226797902e-07, "logits/chosen": -19.23440170288086, "logits/rejected": -18.4193172454834, "logps/chosen": -333.8146057128906, "logps/rejected": -290.45611572265625, "loss": 0.6299, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4079384803771973, "rewards/margins": 0.6689761281013489, "rewards/rejected": 1.7389624118804932, "step": 58420 }, { "epoch": 2.7127536097311853, "grad_norm": 29.19925880432129, "learning_rate": 1.3730999582153303e-07, "logits/chosen": -18.701162338256836, "logits/rejected": -18.463525772094727, "logps/chosen": -303.70526123046875, "logps/rejected": -317.14251708984375, "loss": 1.3494, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.111711263656616, "rewards/margins": -0.6343388557434082, "rewards/rejected": 3.7460503578186035, "step": 58430 }, { "epoch": 2.7132178838386185, "grad_norm": 5.568308353424072, "learning_rate": 1.3728213937508705e-07, "logits/chosen": -19.52872657775879, "logits/rejected": -18.26010513305664, "logps/chosen": -424.48541259765625, "logps/rejected": -310.65765380859375, "loss": 0.2893, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2790489196777344, "rewards/margins": 1.9077695608139038, "rewards/rejected": 1.3712791204452515, "step": 58440 }, { "epoch": 2.7136821579460513, "grad_norm": 55.94401931762695, "learning_rate": 1.3725428292864106e-07, "logits/chosen": -19.013748168945312, "logits/rejected": -18.063764572143555, "logps/chosen": -420.52978515625, "logps/rejected": -284.40869140625, "loss": 0.5181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.303314685821533, "rewards/margins": 1.9046932458877563, "rewards/rejected": 1.3986215591430664, 
"step": 58450 }, { "epoch": 2.7141464320534845, "grad_norm": 161.6171875, "learning_rate": 1.3722642648219508e-07, "logits/chosen": -18.968326568603516, "logits/rejected": -18.815265655517578, "logps/chosen": -442.8722229003906, "logps/rejected": -413.80035400390625, "loss": 1.0894, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.8204853534698486, "rewards/margins": -0.45396775007247925, "rewards/rejected": 4.274453163146973, "step": 58460 }, { "epoch": 2.7146107061609173, "grad_norm": 52.1121940612793, "learning_rate": 1.371985700357491e-07, "logits/chosen": -19.315465927124023, "logits/rejected": -17.976661682128906, "logps/chosen": -373.60797119140625, "logps/rejected": -231.07553100585938, "loss": 0.524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.252559661865234, "rewards/margins": 2.1569664478302, "rewards/rejected": 2.095592975616455, "step": 58470 }, { "epoch": 2.7150749802683505, "grad_norm": 174.73812866210938, "learning_rate": 1.3717071358930313e-07, "logits/chosen": -19.176921844482422, "logits/rejected": -18.01541519165039, "logps/chosen": -363.7052001953125, "logps/rejected": -276.1927490234375, "loss": 0.6869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2407665252685547, "rewards/margins": 1.7218090295791626, "rewards/rejected": 1.5189578533172607, "step": 58480 }, { "epoch": 2.7155392543757833, "grad_norm": 225.8689422607422, "learning_rate": 1.3714285714285712e-07, "logits/chosen": -17.713165283203125, "logits/rejected": -18.285078048706055, "logps/chosen": -287.1773376464844, "logps/rejected": -368.28326416015625, "loss": 1.9138, "rewards/accuracies": 0.5, "rewards/chosen": 2.7373595237731934, "rewards/margins": -0.6479926109313965, "rewards/rejected": 3.385352373123169, "step": 58490 }, { "epoch": 2.7160035284832165, "grad_norm": 30.714200973510742, "learning_rate": 1.3711500069641116e-07, "logits/chosen": -19.931751251220703, "logits/rejected": -18.828248977661133, "logps/chosen": 
-400.1658935546875, "logps/rejected": -313.91082763671875, "loss": 0.5411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9399876594543457, "rewards/margins": 1.1944081783294678, "rewards/rejected": 2.745579242706299, "step": 58500 }, { "epoch": 2.7164678025906497, "grad_norm": 54.97648239135742, "learning_rate": 1.3708714424996517e-07, "logits/chosen": -18.920459747314453, "logits/rejected": -17.438472747802734, "logps/chosen": -434.3111267089844, "logps/rejected": -358.7602844238281, "loss": 0.6798, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.972476005554199, "rewards/margins": 1.325552225112915, "rewards/rejected": 1.6469236612319946, "step": 58510 }, { "epoch": 2.7169320766980825, "grad_norm": 218.93563842773438, "learning_rate": 1.370592878035192e-07, "logits/chosen": -18.992496490478516, "logits/rejected": -19.373126983642578, "logps/chosen": -399.17034912109375, "logps/rejected": -435.18524169921875, "loss": 1.4322, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3372466564178467, "rewards/margins": -0.12027387320995331, "rewards/rejected": 3.4575207233428955, "step": 58520 }, { "epoch": 2.7173963508055157, "grad_norm": 13.18789005279541, "learning_rate": 1.370314313570732e-07, "logits/chosen": -19.298778533935547, "logits/rejected": -18.236934661865234, "logps/chosen": -392.37164306640625, "logps/rejected": -263.31695556640625, "loss": 0.2232, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9546496868133545, "rewards/margins": 2.5955681800842285, "rewards/rejected": 1.3590819835662842, "step": 58530 }, { "epoch": 2.7178606249129484, "grad_norm": 42.3459587097168, "learning_rate": 1.3700357491062724e-07, "logits/chosen": -19.070653915405273, "logits/rejected": -17.954185485839844, "logps/chosen": -321.2257385253906, "logps/rejected": -256.9444885253906, "loss": 0.6839, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.438044309616089, "rewards/margins": 0.7724016904830933, 
"rewards/rejected": 1.665642499923706, "step": 58540 }, { "epoch": 2.7183248990203817, "grad_norm": 177.08872985839844, "learning_rate": 1.3697571846418123e-07, "logits/chosen": -18.476940155029297, "logits/rejected": -17.762767791748047, "logps/chosen": -442.88311767578125, "logps/rejected": -330.3157653808594, "loss": 0.4887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6576504707336426, "rewards/margins": 1.4853501319885254, "rewards/rejected": 2.172300100326538, "step": 58550 }, { "epoch": 2.7187891731278144, "grad_norm": 19.674747467041016, "learning_rate": 1.3694786201773527e-07, "logits/chosen": -18.01554298400879, "logits/rejected": -17.83823013305664, "logps/chosen": -382.6037292480469, "logps/rejected": -357.69830322265625, "loss": 0.8876, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.212634325027466, "rewards/margins": 0.35288748145103455, "rewards/rejected": 2.8597464561462402, "step": 58560 }, { "epoch": 2.7192534472352476, "grad_norm": 204.6927032470703, "learning_rate": 1.369200055712893e-07, "logits/chosen": -18.24009132385254, "logits/rejected": -18.437397003173828, "logps/chosen": -424.7688903808594, "logps/rejected": -450.4922790527344, "loss": 1.0004, "rewards/accuracies": 0.5, "rewards/chosen": 2.9951529502868652, "rewards/margins": -0.21855445206165314, "rewards/rejected": 3.213707685470581, "step": 58570 }, { "epoch": 2.719717721342681, "grad_norm": 44.81259536743164, "learning_rate": 1.368921491248433e-07, "logits/chosen": -19.394821166992188, "logits/rejected": -17.651607513427734, "logps/chosen": -358.389404296875, "logps/rejected": -261.14593505859375, "loss": 0.6666, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6815104484558105, "rewards/margins": 1.6869754791259766, "rewards/rejected": 1.9945356845855713, "step": 58580 }, { "epoch": 2.7201819954501136, "grad_norm": 48.49032974243164, "learning_rate": 1.3686429267839732e-07, "logits/chosen": -18.83699607849121, "logits/rejected": 
-17.55436897277832, "logps/chosen": -394.16680908203125, "logps/rejected": -312.4776916503906, "loss": 0.5458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.208799839019775, "rewards/margins": 1.4519102573394775, "rewards/rejected": 2.7568893432617188, "step": 58590 }, { "epoch": 2.720646269557547, "grad_norm": 2.4298291206359863, "learning_rate": 1.3683643623195133e-07, "logits/chosen": -18.361841201782227, "logits/rejected": -17.168071746826172, "logps/chosen": -365.07916259765625, "logps/rejected": -244.78280639648438, "loss": 0.3985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2466537952423096, "rewards/margins": 1.732068419456482, "rewards/rejected": 1.5145851373672485, "step": 58600 }, { "epoch": 2.72111054366498, "grad_norm": 99.66384887695312, "learning_rate": 1.3680857978550535e-07, "logits/chosen": -18.90514373779297, "logits/rejected": -19.162954330444336, "logps/chosen": -376.8651428222656, "logps/rejected": -308.6023864746094, "loss": 1.1566, "rewards/accuracies": 0.5, "rewards/chosen": 2.869464635848999, "rewards/margins": 0.0892120823264122, "rewards/rejected": 2.780252456665039, "step": 58610 }, { "epoch": 2.721574817772413, "grad_norm": 268.9440002441406, "learning_rate": 1.3678072333905936e-07, "logits/chosen": -18.677814483642578, "logits/rejected": -18.75320053100586, "logps/chosen": -378.02935791015625, "logps/rejected": -443.5048828125, "loss": 1.1447, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.057957172393799, "rewards/margins": -0.5432831048965454, "rewards/rejected": 3.6012396812438965, "step": 58620 }, { "epoch": 2.7220390918798456, "grad_norm": 128.7799072265625, "learning_rate": 1.367528668926134e-07, "logits/chosen": -19.22608757019043, "logits/rejected": -18.962949752807617, "logps/chosen": -319.9596252441406, "logps/rejected": -297.4245910644531, "loss": 0.6723, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8499867916107178, "rewards/margins": 0.397257000207901, 
"rewards/rejected": 2.4527297019958496, "step": 58630 }, { "epoch": 2.722503365987279, "grad_norm": 19.610517501831055, "learning_rate": 1.3672501044616742e-07, "logits/chosen": -19.383901596069336, "logits/rejected": -17.78548812866211, "logps/chosen": -382.42919921875, "logps/rejected": -203.8317108154297, "loss": 0.2878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9377639293670654, "rewards/margins": 2.925856828689575, "rewards/rejected": 1.0119068622589111, "step": 58640 }, { "epoch": 2.722967640094712, "grad_norm": 29.32709312438965, "learning_rate": 1.3669715399972143e-07, "logits/chosen": -19.461881637573242, "logits/rejected": -18.550830841064453, "logps/chosen": -483.50225830078125, "logps/rejected": -363.565673828125, "loss": 0.4914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.393277645111084, "rewards/margins": 1.3084537982940674, "rewards/rejected": 3.0848240852355957, "step": 58650 }, { "epoch": 2.723431914202145, "grad_norm": 79.55834197998047, "learning_rate": 1.3666929755327545e-07, "logits/chosen": -20.37208366394043, "logits/rejected": -20.186275482177734, "logps/chosen": -342.46966552734375, "logps/rejected": -300.1087341308594, "loss": 0.9145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4703900814056396, "rewards/margins": -0.007945108227431774, "rewards/rejected": 2.47833514213562, "step": 58660 }, { "epoch": 2.723896188309578, "grad_norm": 2.140254259109497, "learning_rate": 1.3664144110682946e-07, "logits/chosen": -19.226533889770508, "logits/rejected": -18.468368530273438, "logps/chosen": -365.84832763671875, "logps/rejected": -292.37738037109375, "loss": 0.708, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1769473552703857, "rewards/margins": 1.003218650817871, "rewards/rejected": 2.1737284660339355, "step": 58670 }, { "epoch": 2.7243604624170112, "grad_norm": 72.74482727050781, "learning_rate": 1.3661358466038347e-07, "logits/chosen": -18.250886917114258, 
"logits/rejected": -18.09843635559082, "logps/chosen": -480.964111328125, "logps/rejected": -435.863037109375, "loss": 0.5837, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9347805976867676, "rewards/margins": 0.9458236694335938, "rewards/rejected": 2.988957166671753, "step": 58680 }, { "epoch": 2.724824736524444, "grad_norm": 0.193058043718338, "learning_rate": 1.3658572821393752e-07, "logits/chosen": -18.881240844726562, "logits/rejected": -18.127944946289062, "logps/chosen": -305.02264404296875, "logps/rejected": -241.61160278320312, "loss": 0.661, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.592862606048584, "rewards/margins": 2.2404301166534424, "rewards/rejected": 1.3524324893951416, "step": 58690 }, { "epoch": 2.725289010631877, "grad_norm": 8.490012168884277, "learning_rate": 1.365578717674915e-07, "logits/chosen": -19.058324813842773, "logits/rejected": -18.435989379882812, "logps/chosen": -302.1767883300781, "logps/rejected": -288.6212158203125, "loss": 0.8717, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8763952255249023, "rewards/margins": 0.983894944190979, "rewards/rejected": 1.8925002813339233, "step": 58700 }, { "epoch": 2.72575328473931, "grad_norm": 0.683914840221405, "learning_rate": 1.3653001532104554e-07, "logits/chosen": -18.734987258911133, "logits/rejected": -17.679615020751953, "logps/chosen": -321.5965270996094, "logps/rejected": -216.9576416015625, "loss": 1.0117, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3986876010894775, "rewards/margins": 0.7722169160842896, "rewards/rejected": 1.6264705657958984, "step": 58710 }, { "epoch": 2.726217558846743, "grad_norm": 1.0309629440307617, "learning_rate": 1.3650215887459956e-07, "logits/chosen": -18.417346954345703, "logits/rejected": -17.99612808227539, "logps/chosen": -332.6202087402344, "logps/rejected": -303.34075927734375, "loss": 0.8925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.28035306930542, 
"rewards/margins": 0.7471176385879517, "rewards/rejected": 2.533235549926758, "step": 58720 }, { "epoch": 2.726681832954176, "grad_norm": 155.68313598632812, "learning_rate": 1.3647430242815357e-07, "logits/chosen": -18.691875457763672, "logits/rejected": -17.970664978027344, "logps/chosen": -400.9460754394531, "logps/rejected": -307.1361389160156, "loss": 0.4029, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3632283210754395, "rewards/margins": 1.2768027782440186, "rewards/rejected": 2.08642578125, "step": 58730 }, { "epoch": 2.727146107061609, "grad_norm": 12.60615062713623, "learning_rate": 1.364464459817076e-07, "logits/chosen": -18.887401580810547, "logits/rejected": -17.195507049560547, "logps/chosen": -369.0177307128906, "logps/rejected": -186.4302520751953, "loss": 0.2408, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.952512502670288, "rewards/margins": 2.4699254035949707, "rewards/rejected": 0.48258695006370544, "step": 58740 }, { "epoch": 2.7276103811690424, "grad_norm": 120.89031219482422, "learning_rate": 1.3641858953526163e-07, "logits/chosen": -20.257457733154297, "logits/rejected": -19.670568466186523, "logps/chosen": -431.7748107910156, "logps/rejected": -383.59600830078125, "loss": 0.4398, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2198948860168457, "rewards/margins": 0.6685705184936523, "rewards/rejected": 2.5513243675231934, "step": 58750 }, { "epoch": 2.728074655276475, "grad_norm": 52.631309509277344, "learning_rate": 1.3639073308881562e-07, "logits/chosen": -18.60586929321289, "logits/rejected": -18.608966827392578, "logps/chosen": -373.81573486328125, "logps/rejected": -380.1150207519531, "loss": 0.588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8659045696258545, "rewards/margins": 0.6525095701217651, "rewards/rejected": 3.2133948802948, "step": 58760 }, { "epoch": 2.7285389293839084, "grad_norm": 4.284878730773926, "learning_rate": 1.3636287664236966e-07, "logits/chosen": 
-19.359899520874023, "logits/rejected": -18.46944808959961, "logps/chosen": -374.87969970703125, "logps/rejected": -278.7552490234375, "loss": 0.485, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.047956943511963, "rewards/margins": 1.5784189701080322, "rewards/rejected": 1.4695384502410889, "step": 58770 }, { "epoch": 2.729003203491341, "grad_norm": 91.99032592773438, "learning_rate": 1.3633502019592367e-07, "logits/chosen": -18.883962631225586, "logits/rejected": -18.95438003540039, "logps/chosen": -391.266845703125, "logps/rejected": -377.2427673339844, "loss": 0.9102, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1830737590789795, "rewards/margins": 0.36160993576049805, "rewards/rejected": 2.8214638233184814, "step": 58780 }, { "epoch": 2.7294674775987744, "grad_norm": 102.2624282836914, "learning_rate": 1.363071637494777e-07, "logits/chosen": -19.7830753326416, "logits/rejected": -18.90283966064453, "logps/chosen": -293.7518310546875, "logps/rejected": -261.9974670410156, "loss": 0.6654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.242105484008789, "rewards/margins": 0.5890175700187683, "rewards/rejected": 2.653088092803955, "step": 58790 }, { "epoch": 2.729931751706207, "grad_norm": 102.90696716308594, "learning_rate": 1.362793073030317e-07, "logits/chosen": -19.41307830810547, "logits/rejected": -18.22342300415039, "logps/chosen": -519.9776611328125, "logps/rejected": -336.60394287109375, "loss": 0.3208, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.71933126449585, "rewards/margins": 2.072108507156372, "rewards/rejected": 2.647223711013794, "step": 58800 }, { "epoch": 2.7303960258136404, "grad_norm": 103.24857330322266, "learning_rate": 1.3625145085658572e-07, "logits/chosen": -19.805038452148438, "logits/rejected": -18.904338836669922, "logps/chosen": -433.53668212890625, "logps/rejected": -369.0721130371094, "loss": 0.4628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.6156864166259766, "rewards/margins": 1.4291287660598755, "rewards/rejected": 2.1865575313568115, "step": 58810 }, { "epoch": 2.7308602999210736, "grad_norm": 64.1019287109375, "learning_rate": 1.3622359441013973e-07, "logits/chosen": -19.79228401184082, "logits/rejected": -19.02253532409668, "logps/chosen": -360.50360107421875, "logps/rejected": -259.28790283203125, "loss": 0.7429, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.18001389503479, "rewards/margins": 0.8940618634223938, "rewards/rejected": 2.28595232963562, "step": 58820 }, { "epoch": 2.7313245740285064, "grad_norm": 101.37142181396484, "learning_rate": 1.3619573796369375e-07, "logits/chosen": -19.627689361572266, "logits/rejected": -19.74891471862793, "logps/chosen": -334.83148193359375, "logps/rejected": -277.3067932128906, "loss": 0.6251, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6070332527160645, "rewards/margins": 0.774594783782959, "rewards/rejected": 2.8324387073516846, "step": 58830 }, { "epoch": 2.7317888481359396, "grad_norm": 32.306419372558594, "learning_rate": 1.361678815172478e-07, "logits/chosen": -18.620370864868164, "logits/rejected": -17.95847511291504, "logps/chosen": -311.4853820800781, "logps/rejected": -218.7779083251953, "loss": 0.4678, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2859795093536377, "rewards/margins": 0.7776161432266235, "rewards/rejected": 1.5083634853363037, "step": 58840 }, { "epoch": 2.7322531222433724, "grad_norm": 1.6045845746994019, "learning_rate": 1.361400250708018e-07, "logits/chosen": -18.592859268188477, "logits/rejected": -16.971452713012695, "logps/chosen": -500.3304748535156, "logps/rejected": -342.9280700683594, "loss": 0.83, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9984161853790283, "rewards/margins": 1.9073883295059204, "rewards/rejected": 2.0910277366638184, "step": 58850 }, { "epoch": 2.7327173963508056, "grad_norm": 58.831817626953125, "learning_rate": 
1.3611216862435582e-07, "logits/chosen": -18.75888442993164, "logits/rejected": -17.915142059326172, "logps/chosen": -459.07989501953125, "logps/rejected": -363.2654724121094, "loss": 0.6145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.167576313018799, "rewards/margins": 1.159401297569275, "rewards/rejected": 2.008174419403076, "step": 58860 }, { "epoch": 2.7331816704582383, "grad_norm": 32.01137161254883, "learning_rate": 1.3608431217790983e-07, "logits/chosen": -19.166027069091797, "logits/rejected": -18.459781646728516, "logps/chosen": -439.30572509765625, "logps/rejected": -371.5018615722656, "loss": 0.8502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.128798723220825, "rewards/margins": 0.3945097327232361, "rewards/rejected": 2.7342891693115234, "step": 58870 }, { "epoch": 2.7336459445656716, "grad_norm": 32.27577590942383, "learning_rate": 1.3605645573146385e-07, "logits/chosen": -19.212291717529297, "logits/rejected": -18.32375717163086, "logps/chosen": -481.53253173828125, "logps/rejected": -366.2876281738281, "loss": 0.4175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.586590766906738, "rewards/margins": 1.9827959537506104, "rewards/rejected": 2.603794574737549, "step": 58880 }, { "epoch": 2.7341102186731048, "grad_norm": 17.599098205566406, "learning_rate": 1.3602859928501786e-07, "logits/chosen": -18.204120635986328, "logits/rejected": -17.213537216186523, "logps/chosen": -308.3020935058594, "logps/rejected": -217.9315643310547, "loss": 0.4596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4683997631073, "rewards/margins": 1.5316803455352783, "rewards/rejected": 0.9367190599441528, "step": 58890 }, { "epoch": 2.7345744927805375, "grad_norm": 27.305770874023438, "learning_rate": 1.360007428385719e-07, "logits/chosen": -19.53216552734375, "logits/rejected": -18.264785766601562, "logps/chosen": -465.79833984375, "logps/rejected": -385.13153076171875, "loss": 0.8187, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 4.475200653076172, "rewards/margins": 1.1238385438919067, "rewards/rejected": 3.3513617515563965, "step": 58900 }, { "epoch": 2.7350387668879708, "grad_norm": 43.13807678222656, "learning_rate": 1.359728863921259e-07, "logits/chosen": -18.665008544921875, "logits/rejected": -18.43181800842285, "logps/chosen": -333.92413330078125, "logps/rejected": -319.15118408203125, "loss": 0.5147, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0682806968688965, "rewards/margins": 1.132542371749878, "rewards/rejected": 1.9357383251190186, "step": 58910 }, { "epoch": 2.7355030409954035, "grad_norm": 42.42445755004883, "learning_rate": 1.3594502994567993e-07, "logits/chosen": -18.75316047668457, "logits/rejected": -18.796648025512695, "logps/chosen": -372.9765625, "logps/rejected": -372.4486389160156, "loss": 0.781, "rewards/accuracies": 0.5, "rewards/chosen": 3.087162971496582, "rewards/margins": 0.5704144239425659, "rewards/rejected": 2.5167484283447266, "step": 58920 }, { "epoch": 2.7359673151028367, "grad_norm": 63.3141975402832, "learning_rate": 1.3591717349923394e-07, "logits/chosen": -18.29568862915039, "logits/rejected": -18.501850128173828, "logps/chosen": -324.1813049316406, "logps/rejected": -346.9815979003906, "loss": 0.6539, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4687070846557617, "rewards/margins": 0.7341762781143188, "rewards/rejected": 1.7345308065414429, "step": 58930 }, { "epoch": 2.7364315892102695, "grad_norm": 45.71302032470703, "learning_rate": 1.3588931705278796e-07, "logits/chosen": -20.054279327392578, "logits/rejected": -19.4395694732666, "logps/chosen": -420.92333984375, "logps/rejected": -479.25830078125, "loss": 0.8834, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 5.198434829711914, "rewards/margins": 0.6075781583786011, "rewards/rejected": 4.59085750579834, "step": 58940 }, { "epoch": 2.7368958633177027, "grad_norm": 60.626609802246094, "learning_rate": 
1.3586146060634197e-07, "logits/chosen": -19.514053344726562, "logits/rejected": -18.67761993408203, "logps/chosen": -349.64312744140625, "logps/rejected": -278.3044738769531, "loss": 0.4876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.297235012054443, "rewards/margins": 1.2371208667755127, "rewards/rejected": 3.0601141452789307, "step": 58950 }, { "epoch": 2.737360137425136, "grad_norm": 109.95901489257812, "learning_rate": 1.3583360415989601e-07, "logits/chosen": -19.219295501708984, "logits/rejected": -18.186214447021484, "logps/chosen": -416.97027587890625, "logps/rejected": -335.7234802246094, "loss": 1.0792, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.942931652069092, "rewards/margins": 0.9247319102287292, "rewards/rejected": 3.018200159072876, "step": 58960 }, { "epoch": 2.7378244115325687, "grad_norm": 38.86781692504883, "learning_rate": 1.3580574771345e-07, "logits/chosen": -19.525390625, "logits/rejected": -19.151641845703125, "logps/chosen": -475.527587890625, "logps/rejected": -363.49761962890625, "loss": 0.317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.556723117828369, "rewards/margins": 1.7903308868408203, "rewards/rejected": 2.766392230987549, "step": 58970 }, { "epoch": 2.738288685640002, "grad_norm": 15.997171401977539, "learning_rate": 1.3577789126700404e-07, "logits/chosen": -19.729318618774414, "logits/rejected": -19.672494888305664, "logps/chosen": -323.1788330078125, "logps/rejected": -316.2220153808594, "loss": 1.0241, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.066838026046753, "rewards/margins": 0.7628340721130371, "rewards/rejected": 2.304004192352295, "step": 58980 }, { "epoch": 2.738752959747435, "grad_norm": 173.84115600585938, "learning_rate": 1.3575003482055806e-07, "logits/chosen": -19.35971450805664, "logits/rejected": -18.14458465576172, "logps/chosen": -376.61737060546875, "logps/rejected": -315.93182373046875, "loss": 0.5181, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.915698528289795, "rewards/margins": 1.335229516029358, "rewards/rejected": 1.5804691314697266, "step": 58990 }, { "epoch": 2.739217233854868, "grad_norm": 34.33047866821289, "learning_rate": 1.3572217837411207e-07, "logits/chosen": -19.40774917602539, "logits/rejected": -18.909360885620117, "logps/chosen": -404.0766906738281, "logps/rejected": -391.9238586425781, "loss": 0.6668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9675827026367188, "rewards/margins": 0.7564017176628113, "rewards/rejected": 3.2111809253692627, "step": 59000 }, { "epoch": 2.7396815079623007, "grad_norm": 21.622230529785156, "learning_rate": 1.356943219276661e-07, "logits/chosen": -19.30168914794922, "logits/rejected": -19.0856876373291, "logps/chosen": -414.5935974121094, "logps/rejected": -394.1545104980469, "loss": 0.6628, "rewards/accuracies": 0.5, "rewards/chosen": 3.3225064277648926, "rewards/margins": 0.2408680021762848, "rewards/rejected": 3.0816383361816406, "step": 59010 }, { "epoch": 2.740145782069734, "grad_norm": 136.01025390625, "learning_rate": 1.356664654812201e-07, "logits/chosen": -18.720712661743164, "logits/rejected": -18.342754364013672, "logps/chosen": -477.31951904296875, "logps/rejected": -376.9857177734375, "loss": 0.7729, "rewards/accuracies": 0.5, "rewards/chosen": 4.327385902404785, "rewards/margins": 0.8690311312675476, "rewards/rejected": 3.4583544731140137, "step": 59020 }, { "epoch": 2.740610056177167, "grad_norm": 179.43283081054688, "learning_rate": 1.3563860903477412e-07, "logits/chosen": -18.2651309967041, "logits/rejected": -18.054067611694336, "logps/chosen": -330.68682861328125, "logps/rejected": -273.3557434082031, "loss": 0.8036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.743598222732544, "rewards/margins": 0.569196879863739, "rewards/rejected": 2.174400806427002, "step": 59030 }, { "epoch": 2.7410743302846, "grad_norm": 42.32966613769531, "learning_rate": 1.3561075258832813e-07, 
"logits/chosen": -18.39266586303711, "logits/rejected": -17.646224975585938, "logps/chosen": -325.32025146484375, "logps/rejected": -241.98593139648438, "loss": 0.5262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.773876667022705, "rewards/margins": 1.152773141860962, "rewards/rejected": 1.6211035251617432, "step": 59040 }, { "epoch": 2.741538604392033, "grad_norm": 25.46912384033203, "learning_rate": 1.3558289614188217e-07, "logits/chosen": -18.65485382080078, "logits/rejected": -18.014644622802734, "logps/chosen": -379.98504638671875, "logps/rejected": -317.2298889160156, "loss": 0.6464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7488436698913574, "rewards/margins": 1.106659173965454, "rewards/rejected": 2.6421847343444824, "step": 59050 }, { "epoch": 2.7420028784994663, "grad_norm": 5.876946926116943, "learning_rate": 1.3555503969543619e-07, "logits/chosen": -20.049652099609375, "logits/rejected": -18.81625747680664, "logps/chosen": -371.6142578125, "logps/rejected": -295.0741271972656, "loss": 0.4418, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3192715644836426, "rewards/margins": 1.1743654012680054, "rewards/rejected": 2.1449062824249268, "step": 59060 }, { "epoch": 2.742467152606899, "grad_norm": 112.81370544433594, "learning_rate": 1.355271832489902e-07, "logits/chosen": -18.631017684936523, "logits/rejected": -17.32929801940918, "logps/chosen": -255.73583984375, "logps/rejected": -194.96719360351562, "loss": 0.4687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.006871223449707, "rewards/margins": 1.1043815612792969, "rewards/rejected": 0.9024895429611206, "step": 59070 }, { "epoch": 2.742931426714332, "grad_norm": 2.51078724861145, "learning_rate": 1.3549932680254422e-07, "logits/chosen": -20.55022621154785, "logits/rejected": -18.75861930847168, "logps/chosen": -411.7186584472656, "logps/rejected": -260.87457275390625, "loss": 0.2258, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 4.835564136505127, "rewards/margins": 2.5821163654327393, "rewards/rejected": 2.253448009490967, "step": 59080 }, { "epoch": 2.743395700821765, "grad_norm": 93.36175537109375, "learning_rate": 1.3547147035609823e-07, "logits/chosen": -19.349149703979492, "logits/rejected": -18.95669174194336, "logps/chosen": -416.3714294433594, "logps/rejected": -363.1354675292969, "loss": 1.0478, "rewards/accuracies": 0.5, "rewards/chosen": 3.252440929412842, "rewards/margins": -0.2905123829841614, "rewards/rejected": 3.5429534912109375, "step": 59090 }, { "epoch": 2.7438599749291983, "grad_norm": 0.1796138882637024, "learning_rate": 1.3544361390965224e-07, "logits/chosen": -19.687673568725586, "logits/rejected": -18.690149307250977, "logps/chosen": -365.66278076171875, "logps/rejected": -264.38690185546875, "loss": 0.5406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7544169425964355, "rewards/margins": 1.7116515636444092, "rewards/rejected": 2.0427653789520264, "step": 59100 }, { "epoch": 2.744324249036631, "grad_norm": 19.237180709838867, "learning_rate": 1.3541575746320629e-07, "logits/chosen": -18.552579879760742, "logits/rejected": -18.076732635498047, "logps/chosen": -436.40325927734375, "logps/rejected": -339.77642822265625, "loss": 0.3309, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9604289531707764, "rewards/margins": 1.5223877429962158, "rewards/rejected": 2.4380412101745605, "step": 59110 }, { "epoch": 2.7447885231440643, "grad_norm": 200.2522430419922, "learning_rate": 1.3538790101676027e-07, "logits/chosen": -20.271621704101562, "logits/rejected": -19.932016372680664, "logps/chosen": -421.2586975097656, "logps/rejected": -358.4852600097656, "loss": 0.6078, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.973132610321045, "rewards/margins": 0.9051241874694824, "rewards/rejected": 3.0680088996887207, "step": 59120 }, { "epoch": 2.7452527972514975, "grad_norm": 98.06048583984375, "learning_rate": 
1.3536004457031431e-07, "logits/chosen": -19.55626678466797, "logits/rejected": -18.722606658935547, "logps/chosen": -350.30596923828125, "logps/rejected": -302.1592102050781, "loss": 0.872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6920573711395264, "rewards/margins": 0.47315025329589844, "rewards/rejected": 2.218907356262207, "step": 59130 }, { "epoch": 2.7457170713589303, "grad_norm": 175.3114013671875, "learning_rate": 1.3533218812386833e-07, "logits/chosen": -19.62625503540039, "logits/rejected": -18.98505973815918, "logps/chosen": -451.96234130859375, "logps/rejected": -335.33245849609375, "loss": 0.4789, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.424891948699951, "rewards/margins": 1.0334075689315796, "rewards/rejected": 2.391484498977661, "step": 59140 }, { "epoch": 2.7461813454663635, "grad_norm": 75.48133850097656, "learning_rate": 1.3530433167742234e-07, "logits/chosen": -18.669843673706055, "logits/rejected": -18.64168930053711, "logps/chosen": -451.20654296875, "logps/rejected": -461.559326171875, "loss": 0.9326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.931236982345581, "rewards/margins": 0.2446870356798172, "rewards/rejected": 3.686549663543701, "step": 59150 }, { "epoch": 2.7466456195737963, "grad_norm": 65.93888854980469, "learning_rate": 1.3527647523097636e-07, "logits/chosen": -18.84572982788086, "logits/rejected": -18.039981842041016, "logps/chosen": -401.7539367675781, "logps/rejected": -317.0482177734375, "loss": 0.7808, "rewards/accuracies": 0.5, "rewards/chosen": 3.952693462371826, "rewards/margins": 1.113795280456543, "rewards/rejected": 2.838898181915283, "step": 59160 }, { "epoch": 2.7471098936812295, "grad_norm": 180.2366180419922, "learning_rate": 1.352486187845304e-07, "logits/chosen": -18.608694076538086, "logits/rejected": -18.251821517944336, "logps/chosen": -337.4072570800781, "logps/rejected": -283.94622802734375, "loss": 1.5972, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.107485294342041, "rewards/margins": -0.6836915612220764, "rewards/rejected": 2.7911765575408936, "step": 59170 }, { "epoch": 2.7475741677886623, "grad_norm": 31.760940551757812, "learning_rate": 1.352207623380844e-07, "logits/chosen": -19.900644302368164, "logits/rejected": -18.933170318603516, "logps/chosen": -422.0599060058594, "logps/rejected": -310.04266357421875, "loss": 0.8364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.773808717727661, "rewards/margins": 0.8897860646247864, "rewards/rejected": 2.8840231895446777, "step": 59180 }, { "epoch": 2.7480384418960955, "grad_norm": 45.91594696044922, "learning_rate": 1.351929058916384e-07, "logits/chosen": -19.892053604125977, "logits/rejected": -19.50625228881836, "logps/chosen": -364.6783752441406, "logps/rejected": -344.2578430175781, "loss": 0.8225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8795619010925293, "rewards/margins": 0.9193552732467651, "rewards/rejected": 2.960206985473633, "step": 59190 }, { "epoch": 2.7485027160035287, "grad_norm": 153.70068359375, "learning_rate": 1.3516504944519244e-07, "logits/chosen": -19.247722625732422, "logits/rejected": -18.35863494873047, "logps/chosen": -338.71929931640625, "logps/rejected": -316.81256103515625, "loss": 0.5183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.425780773162842, "rewards/margins": 1.4171746969223022, "rewards/rejected": 2.008606195449829, "step": 59200 }, { "epoch": 2.7489669901109615, "grad_norm": 89.26194763183594, "learning_rate": 1.3513719299874646e-07, "logits/chosen": -18.409927368164062, "logits/rejected": -18.752744674682617, "logps/chosen": -289.00164794921875, "logps/rejected": -326.57000732421875, "loss": 1.7438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2804434299468994, "rewards/margins": -1.0681421756744385, "rewards/rejected": 3.348585605621338, "step": 59210 }, { "epoch": 2.7494312642183947, "grad_norm": 5.218371391296387, 
"learning_rate": 1.3510933655230047e-07, "logits/chosen": -19.096900939941406, "logits/rejected": -18.9055233001709, "logps/chosen": -378.4829406738281, "logps/rejected": -368.0540466308594, "loss": 0.8653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0427045822143555, "rewards/margins": 1.2246307134628296, "rewards/rejected": 2.818073272705078, "step": 59220 }, { "epoch": 2.7498955383258274, "grad_norm": 52.63943099975586, "learning_rate": 1.3508148010585449e-07, "logits/chosen": -18.641389846801758, "logits/rejected": -17.588130950927734, "logps/chosen": -400.0216979980469, "logps/rejected": -309.1974182128906, "loss": 0.7461, "rewards/accuracies": 0.5, "rewards/chosen": 3.1890852451324463, "rewards/margins": 0.8252977132797241, "rewards/rejected": 2.3637874126434326, "step": 59230 }, { "epoch": 2.7503598124332607, "grad_norm": 184.60911560058594, "learning_rate": 1.350536236594085e-07, "logits/chosen": -18.460224151611328, "logits/rejected": -17.669078826904297, "logps/chosen": -390.6666564941406, "logps/rejected": -299.4966125488281, "loss": 0.8257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8918709754943848, "rewards/margins": 0.9054887890815735, "rewards/rejected": 1.9863824844360352, "step": 59240 }, { "epoch": 2.7508240865406934, "grad_norm": 25.46298599243164, "learning_rate": 1.3502576721296252e-07, "logits/chosen": -18.808589935302734, "logits/rejected": -18.19732093811035, "logps/chosen": -457.4170837402344, "logps/rejected": -382.1622314453125, "loss": 0.947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.738323211669922, "rewards/margins": 0.7639613747596741, "rewards/rejected": 2.9743614196777344, "step": 59250 }, { "epoch": 2.7512883606481267, "grad_norm": 34.43467330932617, "learning_rate": 1.3499791076651656e-07, "logits/chosen": -18.93802261352539, "logits/rejected": -18.75894546508789, "logps/chosen": -394.15460205078125, "logps/rejected": -328.2576904296875, "loss": 0.3143, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 3.7885615825653076, "rewards/margins": 1.3242075443267822, "rewards/rejected": 2.4643540382385254, "step": 59260 }, { "epoch": 2.75175263475556, "grad_norm": 21.951215744018555, "learning_rate": 1.3497005432007057e-07, "logits/chosen": -18.754863739013672, "logits/rejected": -17.69598388671875, "logps/chosen": -240.43017578125, "logps/rejected": -252.5128173828125, "loss": 1.115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2226791381835938, "rewards/margins": 0.580825924873352, "rewards/rejected": 1.6418530941009521, "step": 59270 }, { "epoch": 2.7522169088629926, "grad_norm": 149.09542846679688, "learning_rate": 1.3494219787362459e-07, "logits/chosen": -18.615520477294922, "logits/rejected": -18.113183975219727, "logps/chosen": -352.8477478027344, "logps/rejected": -321.71917724609375, "loss": 0.7259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3410353660583496, "rewards/margins": 1.143976092338562, "rewards/rejected": 2.1970596313476562, "step": 59280 }, { "epoch": 2.752681182970426, "grad_norm": 65.4943618774414, "learning_rate": 1.349143414271786e-07, "logits/chosen": -18.934669494628906, "logits/rejected": -18.109689712524414, "logps/chosen": -354.183349609375, "logps/rejected": -326.91064453125, "loss": 1.4178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.301368474960327, "rewards/margins": -0.38422712683677673, "rewards/rejected": 2.685595989227295, "step": 59290 }, { "epoch": 2.7531454570778586, "grad_norm": 57.5106315612793, "learning_rate": 1.3488648498073261e-07, "logits/chosen": -19.99334716796875, "logits/rejected": -19.44976234436035, "logps/chosen": -442.6813049316406, "logps/rejected": -363.08660888671875, "loss": 0.7995, "rewards/accuracies": 0.5, "rewards/chosen": 3.8690345287323, "rewards/margins": 0.5192053318023682, "rewards/rejected": 3.3498291969299316, "step": 59300 }, { "epoch": 2.753609731185292, "grad_norm": 2.324991464614868, "learning_rate": 
1.3485862853428663e-07, "logits/chosen": -18.77833366394043, "logits/rejected": -18.104618072509766, "logps/chosen": -348.212158203125, "logps/rejected": -260.00775146484375, "loss": 0.7025, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6944327354431152, "rewards/margins": 0.8615356683731079, "rewards/rejected": 1.8328971862792969, "step": 59310 }, { "epoch": 2.7540740052927246, "grad_norm": 126.5543441772461, "learning_rate": 1.3483077208784067e-07, "logits/chosen": -19.507701873779297, "logits/rejected": -19.06338119506836, "logps/chosen": -346.8182067871094, "logps/rejected": -357.45355224609375, "loss": 0.9714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7969493865966797, "rewards/margins": 1.159506916999817, "rewards/rejected": 2.6374423503875732, "step": 59320 }, { "epoch": 2.754538279400158, "grad_norm": 1.1824547052383423, "learning_rate": 1.3480291564139466e-07, "logits/chosen": -17.594249725341797, "logits/rejected": -17.716405868530273, "logps/chosen": -264.26605224609375, "logps/rejected": -320.3725280761719, "loss": 1.4261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9233258962631226, "rewards/margins": 0.44082388281822205, "rewards/rejected": 1.4825019836425781, "step": 59330 }, { "epoch": 2.755002553507591, "grad_norm": 16.76246452331543, "learning_rate": 1.347750591949487e-07, "logits/chosen": -20.148752212524414, "logits/rejected": -19.304378509521484, "logps/chosen": -374.415283203125, "logps/rejected": -257.482177734375, "loss": 0.4301, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.026519298553467, "rewards/margins": 1.1145232915878296, "rewards/rejected": 2.9119961261749268, "step": 59340 }, { "epoch": 2.755466827615024, "grad_norm": 3.241680383682251, "learning_rate": 1.3474720274850271e-07, "logits/chosen": -18.91666603088379, "logits/rejected": -17.506122589111328, "logps/chosen": -366.72991943359375, "logps/rejected": -293.94512939453125, "loss": 0.3423, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4032111167907715, "rewards/margins": 2.0411057472229004, "rewards/rejected": 1.3621060848236084, "step": 59350 }, { "epoch": 2.755931101722457, "grad_norm": 40.89659118652344, "learning_rate": 1.3471934630205673e-07, "logits/chosen": -18.047910690307617, "logits/rejected": -17.91830825805664, "logps/chosen": -350.7098693847656, "logps/rejected": -351.64105224609375, "loss": 1.076, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.607480049133301, "rewards/margins": 0.11822564899921417, "rewards/rejected": 2.4892544746398926, "step": 59360 }, { "epoch": 2.75639537582989, "grad_norm": 55.47212219238281, "learning_rate": 1.3469148985561074e-07, "logits/chosen": -18.572742462158203, "logits/rejected": -18.303081512451172, "logps/chosen": -405.3597106933594, "logps/rejected": -340.3768005371094, "loss": 0.4744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.179499626159668, "rewards/margins": 1.017836332321167, "rewards/rejected": 3.1616640090942383, "step": 59370 }, { "epoch": 2.756859649937323, "grad_norm": 33.966678619384766, "learning_rate": 1.3466363340916478e-07, "logits/chosen": -19.161365509033203, "logits/rejected": -18.606258392333984, "logps/chosen": -409.7093200683594, "logps/rejected": -364.01397705078125, "loss": 1.298, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.121528148651123, "rewards/margins": 0.04481922835111618, "rewards/rejected": 3.076709270477295, "step": 59380 }, { "epoch": 2.757323924044756, "grad_norm": 215.88607788085938, "learning_rate": 1.3463577696271877e-07, "logits/chosen": -18.23194694519043, "logits/rejected": -18.723691940307617, "logps/chosen": -316.79852294921875, "logps/rejected": -266.72503662109375, "loss": 0.9753, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3701372146606445, "rewards/margins": 0.5536531209945679, "rewards/rejected": 1.8164842128753662, "step": 59390 }, { "epoch": 2.757788198152189, 
"grad_norm": 19.844928741455078, "learning_rate": 1.3460792051627279e-07, "logits/chosen": -19.580101013183594, "logits/rejected": -18.422746658325195, "logps/chosen": -383.5919494628906, "logps/rejected": -284.9311218261719, "loss": 0.3473, "rewards/accuracies": 1.0, "rewards/chosen": 3.779520034790039, "rewards/margins": 1.8535468578338623, "rewards/rejected": 1.9259729385375977, "step": 59400 }, { "epoch": 2.7582524722596222, "grad_norm": 7.23036003112793, "learning_rate": 1.3458006406982683e-07, "logits/chosen": -18.892078399658203, "logits/rejected": -17.953014373779297, "logps/chosen": -362.135009765625, "logps/rejected": -320.98468017578125, "loss": 0.7873, "rewards/accuracies": 0.5, "rewards/chosen": 2.7866294384002686, "rewards/margins": 0.7670997381210327, "rewards/rejected": 2.0195298194885254, "step": 59410 }, { "epoch": 2.758716746367055, "grad_norm": 126.32032012939453, "learning_rate": 1.3455220762338084e-07, "logits/chosen": -19.05886459350586, "logits/rejected": -17.663658142089844, "logps/chosen": -399.8750305175781, "logps/rejected": -308.3240966796875, "loss": 0.6064, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0768208503723145, "rewards/margins": 1.5281682014465332, "rewards/rejected": 1.5486526489257812, "step": 59420 }, { "epoch": 2.759181020474488, "grad_norm": 45.41233444213867, "learning_rate": 1.3452435117693486e-07, "logits/chosen": -18.924589157104492, "logits/rejected": -18.909503936767578, "logps/chosen": -430.70501708984375, "logps/rejected": -351.32452392578125, "loss": 0.5129, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7663848400115967, "rewards/margins": 1.2415143251419067, "rewards/rejected": 2.5248703956604004, "step": 59430 }, { "epoch": 2.7596452945819214, "grad_norm": 74.55751037597656, "learning_rate": 1.3449649473048887e-07, "logits/chosen": -18.389463424682617, "logits/rejected": -17.799406051635742, "logps/chosen": -393.0228576660156, "logps/rejected": -388.69268798828125, "loss": 
0.7898, "rewards/accuracies": 0.5, "rewards/chosen": 2.669363498687744, "rewards/margins": 0.5186188817024231, "rewards/rejected": 2.150744915008545, "step": 59440 }, { "epoch": 2.760109568689354, "grad_norm": 18.202970504760742, "learning_rate": 1.3446863828404289e-07, "logits/chosen": -19.90570831298828, "logits/rejected": -18.30870246887207, "logps/chosen": -324.80487060546875, "logps/rejected": -242.7030792236328, "loss": 0.7488, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1751017570495605, "rewards/margins": 1.675743818283081, "rewards/rejected": 1.4993584156036377, "step": 59450 }, { "epoch": 2.760573842796787, "grad_norm": 55.53717041015625, "learning_rate": 1.344407818375969e-07, "logits/chosen": -19.207929611206055, "logits/rejected": -18.9835205078125, "logps/chosen": -290.90374755859375, "logps/rejected": -308.7209777832031, "loss": 0.5991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9807772636413574, "rewards/margins": 0.7023979425430298, "rewards/rejected": 2.278379201889038, "step": 59460 }, { "epoch": 2.76103811690422, "grad_norm": 193.12940979003906, "learning_rate": 1.3441292539115094e-07, "logits/chosen": -18.662311553955078, "logits/rejected": -18.86018943786621, "logps/chosen": -308.88140869140625, "logps/rejected": -347.6560363769531, "loss": 1.4721, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.9242223501205444, "rewards/margins": -0.9714091420173645, "rewards/rejected": 2.8956313133239746, "step": 59470 }, { "epoch": 2.7615023910116534, "grad_norm": 68.56892395019531, "learning_rate": 1.3438506894470496e-07, "logits/chosen": -19.496662139892578, "logits/rejected": -18.377971649169922, "logps/chosen": -398.8756408691406, "logps/rejected": -272.49017333984375, "loss": 0.4458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9885897636413574, "rewards/margins": 1.3567087650299072, "rewards/rejected": 2.6318814754486084, "step": 59480 }, { "epoch": 2.761966665119086, "grad_norm": 
160.1490936279297, "learning_rate": 1.3435721249825897e-07, "logits/chosen": -19.181114196777344, "logits/rejected": -17.275562286376953, "logps/chosen": -365.5980529785156, "logps/rejected": -246.907958984375, "loss": 0.6164, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.778531312942505, "rewards/margins": 1.5794497728347778, "rewards/rejected": 1.1990816593170166, "step": 59490 }, { "epoch": 2.7624309392265194, "grad_norm": 0.009539098478853703, "learning_rate": 1.3432935605181298e-07, "logits/chosen": -19.24468994140625, "logits/rejected": -18.919483184814453, "logps/chosen": -378.03265380859375, "logps/rejected": -333.8011169433594, "loss": 0.7141, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5163021087646484, "rewards/margins": 1.1623265743255615, "rewards/rejected": 2.353975772857666, "step": 59500 }, { "epoch": 2.7628952133339526, "grad_norm": 127.43717193603516, "learning_rate": 1.34301499605367e-07, "logits/chosen": -19.19704246520996, "logits/rejected": -18.967266082763672, "logps/chosen": -309.47332763671875, "logps/rejected": -357.71124267578125, "loss": 1.3546, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6910324096679688, "rewards/margins": 0.23089997470378876, "rewards/rejected": 2.460132360458374, "step": 59510 }, { "epoch": 2.7633594874413854, "grad_norm": 4.503198146820068, "learning_rate": 1.3427364315892101e-07, "logits/chosen": -18.766559600830078, "logits/rejected": -18.72588539123535, "logps/chosen": -394.67999267578125, "logps/rejected": -341.2554931640625, "loss": 0.5317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.267843723297119, "rewards/margins": 0.9248016476631165, "rewards/rejected": 2.3430418968200684, "step": 59520 }, { "epoch": 2.763823761548818, "grad_norm": 39.885894775390625, "learning_rate": 1.3424578671247505e-07, "logits/chosen": -18.739055633544922, "logits/rejected": -17.542150497436523, "logps/chosen": -421.2118225097656, "logps/rejected": 
-338.36932373046875, "loss": 0.7934, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8294174671173096, "rewards/margins": 0.9733540415763855, "rewards/rejected": 1.8560632467269897, "step": 59530 }, { "epoch": 2.7642880356562514, "grad_norm": 216.45828247070312, "learning_rate": 1.3421793026602904e-07, "logits/chosen": -19.543710708618164, "logits/rejected": -18.88327407836914, "logps/chosen": -350.3052062988281, "logps/rejected": -322.1024475097656, "loss": 0.7477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.543841600418091, "rewards/margins": 1.8277860879898071, "rewards/rejected": 1.7160558700561523, "step": 59540 }, { "epoch": 2.7647523097636846, "grad_norm": 128.43704223632812, "learning_rate": 1.3419007381958308e-07, "logits/chosen": -18.318063735961914, "logits/rejected": -18.19573211669922, "logps/chosen": -373.8177490234375, "logps/rejected": -329.5249938964844, "loss": 1.5889, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6740779876708984, "rewards/margins": -0.29147031903266907, "rewards/rejected": 2.965548276901245, "step": 59550 }, { "epoch": 2.7652165838711174, "grad_norm": 51.54543685913086, "learning_rate": 1.341622173731371e-07, "logits/chosen": -19.032155990600586, "logits/rejected": -18.401525497436523, "logps/chosen": -381.37396240234375, "logps/rejected": -329.14373779296875, "loss": 0.2645, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.673109531402588, "rewards/margins": 1.5641744136810303, "rewards/rejected": 2.1089353561401367, "step": 59560 }, { "epoch": 2.7656808579785506, "grad_norm": 2.52329158782959, "learning_rate": 1.341343609266911e-07, "logits/chosen": -18.515687942504883, "logits/rejected": -17.97028160095215, "logps/chosen": -252.5813446044922, "logps/rejected": -276.08245849609375, "loss": 0.9465, "rewards/accuracies": 0.5, "rewards/chosen": 2.957451820373535, "rewards/margins": 0.6708194613456726, "rewards/rejected": 2.286632537841797, "step": 59570 }, { "epoch": 
2.766145132085984, "grad_norm": 110.68071746826172, "learning_rate": 1.3410650448024513e-07, "logits/chosen": -19.728078842163086, "logits/rejected": -18.468151092529297, "logps/chosen": -491.4425354003906, "logps/rejected": -382.17572021484375, "loss": 0.266, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.748936653137207, "rewards/margins": 1.927109956741333, "rewards/rejected": 2.821826934814453, "step": 59580 }, { "epoch": 2.7666094061934166, "grad_norm": 0.6522961258888245, "learning_rate": 1.3407864803379914e-07, "logits/chosen": -18.71283531188965, "logits/rejected": -18.85262680053711, "logps/chosen": -520.2557373046875, "logps/rejected": -446.993896484375, "loss": 0.8719, "rewards/accuracies": 0.5, "rewards/chosen": 4.420825004577637, "rewards/margins": 0.6239311695098877, "rewards/rejected": 3.796894073486328, "step": 59590 }, { "epoch": 2.7670736803008498, "grad_norm": 147.1169891357422, "learning_rate": 1.3405079158735316e-07, "logits/chosen": -18.70684814453125, "logits/rejected": -17.641342163085938, "logps/chosen": -432.5802307128906, "logps/rejected": -317.3032531738281, "loss": 0.6847, "rewards/accuracies": 0.5, "rewards/chosen": 3.4569172859191895, "rewards/margins": 0.885849118232727, "rewards/rejected": 2.5710675716400146, "step": 59600 }, { "epoch": 2.7675379544082825, "grad_norm": 82.20929718017578, "learning_rate": 1.3402293514090717e-07, "logits/chosen": -19.851579666137695, "logits/rejected": -19.006412506103516, "logps/chosen": -410.465087890625, "logps/rejected": -363.134521484375, "loss": 0.4387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.954850435256958, "rewards/margins": 1.216921329498291, "rewards/rejected": 2.737929105758667, "step": 59610 }, { "epoch": 2.7680022285157158, "grad_norm": 123.66663360595703, "learning_rate": 1.339950786944612e-07, "logits/chosen": -19.324016571044922, "logits/rejected": -18.027618408203125, "logps/chosen": -531.6319580078125, "logps/rejected": -371.94451904296875, 
"loss": 0.3061, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.812195301055908, "rewards/margins": 1.618600845336914, "rewards/rejected": 3.1935946941375732, "step": 59620 }, { "epoch": 2.7684665026231485, "grad_norm": 166.6925506591797, "learning_rate": 1.3396722224801523e-07, "logits/chosen": -18.87717056274414, "logits/rejected": -18.288423538208008, "logps/chosen": -417.28802490234375, "logps/rejected": -414.9664611816406, "loss": 0.7054, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9290614128112793, "rewards/margins": 0.7949934601783752, "rewards/rejected": 3.134068727493286, "step": 59630 }, { "epoch": 2.7689307767305817, "grad_norm": 51.580745697021484, "learning_rate": 1.3393936580156924e-07, "logits/chosen": -18.871822357177734, "logits/rejected": -18.61641502380371, "logps/chosen": -408.4404296875, "logps/rejected": -349.73809814453125, "loss": 0.6402, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7277445793151855, "rewards/margins": 1.282882809638977, "rewards/rejected": 2.444861888885498, "step": 59640 }, { "epoch": 2.769395050838015, "grad_norm": 1.198667287826538, "learning_rate": 1.3391150935512326e-07, "logits/chosen": -19.777355194091797, "logits/rejected": -18.590381622314453, "logps/chosen": -464.271728515625, "logps/rejected": -406.37652587890625, "loss": 0.5815, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.160930156707764, "rewards/margins": 1.6268322467803955, "rewards/rejected": 3.5340981483459473, "step": 59650 }, { "epoch": 2.7698593249454477, "grad_norm": 9.848145484924316, "learning_rate": 1.3388365290867727e-07, "logits/chosen": -18.758216857910156, "logits/rejected": -18.325969696044922, "logps/chosen": -352.75, "logps/rejected": -321.5440979003906, "loss": 0.6889, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4574408531188965, "rewards/margins": 0.6056663393974304, "rewards/rejected": 1.8517744541168213, "step": 59660 }, { "epoch": 2.770323599052881, 
"grad_norm": 127.00702667236328, "learning_rate": 1.3385579646223128e-07, "logits/chosen": -18.92172622680664, "logits/rejected": -18.143211364746094, "logps/chosen": -401.8511962890625, "logps/rejected": -341.30059814453125, "loss": 0.7115, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7348074913024902, "rewards/margins": 1.4699220657348633, "rewards/rejected": 2.264885425567627, "step": 59670 }, { "epoch": 2.7707878731603137, "grad_norm": 37.02269744873047, "learning_rate": 1.3382794001578533e-07, "logits/chosen": -19.155630111694336, "logits/rejected": -18.25322151184082, "logps/chosen": -352.3583984375, "logps/rejected": -339.935546875, "loss": 0.6622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.326394557952881, "rewards/margins": 1.1004493236541748, "rewards/rejected": 2.225944995880127, "step": 59680 }, { "epoch": 2.771252147267747, "grad_norm": 37.04296875, "learning_rate": 1.3380008356933934e-07, "logits/chosen": -19.95534896850586, "logits/rejected": -19.06740379333496, "logps/chosen": -299.6817626953125, "logps/rejected": -253.7994842529297, "loss": 0.9556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.394552230834961, "rewards/margins": 1.3944059610366821, "rewards/rejected": 2.0001463890075684, "step": 59690 }, { "epoch": 2.7717164213751797, "grad_norm": 152.6716766357422, "learning_rate": 1.3377222712289335e-07, "logits/chosen": -18.82090950012207, "logits/rejected": -18.696880340576172, "logps/chosen": -392.6399841308594, "logps/rejected": -317.67333984375, "loss": 1.0884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9357686042785645, "rewards/margins": 1.1947100162506104, "rewards/rejected": 2.741058349609375, "step": 59700 }, { "epoch": 2.772180695482613, "grad_norm": 0.9180259704589844, "learning_rate": 1.3374437067644737e-07, "logits/chosen": -19.305164337158203, "logits/rejected": -17.752498626708984, "logps/chosen": -425.26531982421875, "logps/rejected": -315.55401611328125, 
"loss": 0.706, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5856449604034424, "rewards/margins": 1.6872689723968506, "rewards/rejected": 1.8983761072158813, "step": 59710 }, { "epoch": 2.772644969590046, "grad_norm": 128.19761657714844, "learning_rate": 1.3371651423000138e-07, "logits/chosen": -19.718379974365234, "logits/rejected": -19.31182861328125, "logps/chosen": -393.82513427734375, "logps/rejected": -392.411376953125, "loss": 0.8133, "rewards/accuracies": 0.5, "rewards/chosen": 3.4344935417175293, "rewards/margins": -0.012775301933288574, "rewards/rejected": 3.4472689628601074, "step": 59720 }, { "epoch": 2.773109243697479, "grad_norm": 224.96656799316406, "learning_rate": 1.336886577835554e-07, "logits/chosen": -18.611257553100586, "logits/rejected": -19.148143768310547, "logps/chosen": -432.52642822265625, "logps/rejected": -367.028076171875, "loss": 0.9592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3469040393829346, "rewards/margins": 0.518261730670929, "rewards/rejected": 2.828641891479492, "step": 59730 }, { "epoch": 2.773573517804912, "grad_norm": 102.06951904296875, "learning_rate": 1.3366080133710944e-07, "logits/chosen": -20.00224494934082, "logits/rejected": -19.692996978759766, "logps/chosen": -454.03863525390625, "logps/rejected": -423.9483947753906, "loss": 0.758, "rewards/accuracies": 0.5, "rewards/chosen": 3.8064074516296387, "rewards/margins": -0.08313067257404327, "rewards/rejected": 3.889538288116455, "step": 59740 }, { "epoch": 2.774037791912345, "grad_norm": 211.1859893798828, "learning_rate": 1.3363573053530803e-07, "logits/chosen": -18.783845901489258, "logits/rejected": -18.187984466552734, "logps/chosen": -529.1303100585938, "logps/rejected": -401.240234375, "loss": 0.8816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.1090264320373535, "rewards/margins": 0.6560457348823547, "rewards/rejected": 3.4529800415039062, "step": 59750 }, { "epoch": 2.774502066019778, "grad_norm": 
96.08440399169922, "learning_rate": 1.3360787408886205e-07, "logits/chosen": -19.335798263549805, "logits/rejected": -18.734342575073242, "logps/chosen": -402.51654052734375, "logps/rejected": -412.9269104003906, "loss": 0.5705, "rewards/accuracies": 0.5, "rewards/chosen": 3.5209362506866455, "rewards/margins": 0.7306564450263977, "rewards/rejected": 2.7902796268463135, "step": 59760 }, { "epoch": 2.774966340127211, "grad_norm": 98.32647705078125, "learning_rate": 1.335800176424161e-07, "logits/chosen": -18.57681655883789, "logits/rejected": -17.174060821533203, "logps/chosen": -406.7201232910156, "logps/rejected": -272.51873779296875, "loss": 0.6225, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8552448749542236, "rewards/margins": 1.7218952178955078, "rewards/rejected": 1.1333495378494263, "step": 59770 }, { "epoch": 2.775430614234644, "grad_norm": 57.098724365234375, "learning_rate": 1.335521611959701e-07, "logits/chosen": -19.453022003173828, "logits/rejected": -18.804521560668945, "logps/chosen": -461.7474670410156, "logps/rejected": -450.1659240722656, "loss": 0.588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7869086265563965, "rewards/margins": 1.342020869255066, "rewards/rejected": 2.4448883533477783, "step": 59780 }, { "epoch": 2.7758948883420773, "grad_norm": 4.417945384979248, "learning_rate": 1.3352430474952412e-07, "logits/chosen": -18.167211532592773, "logits/rejected": -17.353944778442383, "logps/chosen": -333.98406982421875, "logps/rejected": -249.88980102539062, "loss": 0.6377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.571958065032959, "rewards/margins": 1.322416067123413, "rewards/rejected": 1.249541997909546, "step": 59790 }, { "epoch": 2.77635916244951, "grad_norm": 232.1321258544922, "learning_rate": 1.3349644830307813e-07, "logits/chosen": -19.054452896118164, "logits/rejected": -18.26755142211914, "logps/chosen": -461.0738220214844, "logps/rejected": -354.12518310546875, "loss": 0.8467, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.8879876136779785, "rewards/margins": 1.73245108127594, "rewards/rejected": 3.155536651611328, "step": 59800 }, { "epoch": 2.7768234365569433, "grad_norm": 0.010993532836437225, "learning_rate": 1.3346859185663215e-07, "logits/chosen": -19.327505111694336, "logits/rejected": -17.774391174316406, "logps/chosen": -374.991943359375, "logps/rejected": -312.7057189941406, "loss": 0.5925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.613311767578125, "rewards/margins": 1.8067634105682373, "rewards/rejected": 1.8065478801727295, "step": 59810 }, { "epoch": 2.7772877106643765, "grad_norm": 221.38096618652344, "learning_rate": 1.3344073541018616e-07, "logits/chosen": -19.12445831298828, "logits/rejected": -19.61260986328125, "logps/chosen": -400.05377197265625, "logps/rejected": -396.11590576171875, "loss": 1.2087, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.132753372192383, "rewards/margins": -0.19011399149894714, "rewards/rejected": 3.3228676319122314, "step": 59820 }, { "epoch": 2.7777519847718093, "grad_norm": 73.6674575805664, "learning_rate": 1.334128789637402e-07, "logits/chosen": -19.343856811523438, "logits/rejected": -18.49526596069336, "logps/chosen": -362.32684326171875, "logps/rejected": -325.5784912109375, "loss": 0.6168, "rewards/accuracies": 0.5, "rewards/chosen": 3.6908371448516846, "rewards/margins": 0.6910315752029419, "rewards/rejected": 2.999805450439453, "step": 59830 }, { "epoch": 2.778216258879242, "grad_norm": 4.647933006286621, "learning_rate": 1.3338502251729422e-07, "logits/chosen": -18.41779899597168, "logits/rejected": -18.042461395263672, "logps/chosen": -342.95849609375, "logps/rejected": -269.1963806152344, "loss": 0.5984, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.171255588531494, "rewards/margins": 1.6836049556732178, "rewards/rejected": 1.4876502752304077, "step": 59840 }, { "epoch": 2.7786805329866753, "grad_norm": 
11.019855499267578, "learning_rate": 1.333571660708482e-07, "logits/chosen": -19.715816497802734, "logits/rejected": -18.243396759033203, "logps/chosen": -464.88507080078125, "logps/rejected": -306.2593994140625, "loss": 0.8097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.143892765045166, "rewards/margins": 2.061298131942749, "rewards/rejected": 2.0825939178466797, "step": 59850 }, { "epoch": 2.7791448070941085, "grad_norm": 125.04791259765625, "learning_rate": 1.3332930962440225e-07, "logits/chosen": -19.64118194580078, "logits/rejected": -19.267852783203125, "logps/chosen": -467.23223876953125, "logps/rejected": -334.51214599609375, "loss": 0.677, "rewards/accuracies": 0.5, "rewards/chosen": 3.9568862915039062, "rewards/margins": 1.110822319984436, "rewards/rejected": 2.8460640907287598, "step": 59860 }, { "epoch": 2.7796090812015413, "grad_norm": 1.1972371339797974, "learning_rate": 1.3330145317795626e-07, "logits/chosen": -18.980846405029297, "logits/rejected": -18.457677841186523, "logps/chosen": -336.94439697265625, "logps/rejected": -263.61260986328125, "loss": 0.8237, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.198060989379883, "rewards/margins": 0.7382999658584595, "rewards/rejected": 2.459761142730713, "step": 59870 }, { "epoch": 2.7800733553089745, "grad_norm": 25.09601402282715, "learning_rate": 1.3327359673151028e-07, "logits/chosen": -19.067569732666016, "logits/rejected": -18.599971771240234, "logps/chosen": -300.9952697753906, "logps/rejected": -217.4537353515625, "loss": 0.6011, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4906508922576904, "rewards/margins": 0.750777006149292, "rewards/rejected": 1.7398738861083984, "step": 59880 }, { "epoch": 2.7805376294164077, "grad_norm": 97.12236022949219, "learning_rate": 1.332457402850643e-07, "logits/chosen": -20.21426773071289, "logits/rejected": -19.496244430541992, "logps/chosen": -359.82733154296875, "logps/rejected": -257.6953430175781, "loss": 
0.4796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.431532382965088, "rewards/margins": 1.4214298725128174, "rewards/rejected": 2.0101027488708496, "step": 59890 }, { "epoch": 2.7810019035238405, "grad_norm": 38.3177375793457, "learning_rate": 1.332178838386183e-07, "logits/chosen": -20.009801864624023, "logits/rejected": -18.508529663085938, "logps/chosen": -320.2009582519531, "logps/rejected": -230.39175415039062, "loss": 0.441, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.686972141265869, "rewards/margins": 1.7247833013534546, "rewards/rejected": 0.9621890783309937, "step": 59900 }, { "epoch": 2.7814661776312732, "grad_norm": 55.313453674316406, "learning_rate": 1.3319002739217232e-07, "logits/chosen": -19.549657821655273, "logits/rejected": -18.943416595458984, "logps/chosen": -424.3916015625, "logps/rejected": -323.9383850097656, "loss": 0.6328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7222607135772705, "rewards/margins": 1.4000872373580933, "rewards/rejected": 2.322173595428467, "step": 59910 }, { "epoch": 2.7819304517387065, "grad_norm": 312.27935791015625, "learning_rate": 1.3316217094572636e-07, "logits/chosen": -18.717432022094727, "logits/rejected": -17.826923370361328, "logps/chosen": -358.0661315917969, "logps/rejected": -283.238525390625, "loss": 0.7594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9700400829315186, "rewards/margins": 1.1948421001434326, "rewards/rejected": 1.775197982788086, "step": 59920 }, { "epoch": 2.7823947258461397, "grad_norm": 1.9888744354248047, "learning_rate": 1.3313431449928037e-07, "logits/chosen": -19.241180419921875, "logits/rejected": -17.97106170654297, "logps/chosen": -413.1971130371094, "logps/rejected": -260.04998779296875, "loss": 0.3554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8612637519836426, "rewards/margins": 2.3214869499206543, "rewards/rejected": 1.5397770404815674, "step": 59930 }, { "epoch": 2.7828589999535724, 
"grad_norm": 11.347968101501465, "learning_rate": 1.331064580528344e-07, "logits/chosen": -17.503894805908203, "logits/rejected": -17.539995193481445, "logps/chosen": -439.15582275390625, "logps/rejected": -487.67645263671875, "loss": 1.4824, "rewards/accuracies": 0.5, "rewards/chosen": 3.0819926261901855, "rewards/margins": 0.19560790061950684, "rewards/rejected": 2.8863847255706787, "step": 59940 }, { "epoch": 2.7833232740610057, "grad_norm": 9.495543479919434, "learning_rate": 1.330786016063884e-07, "logits/chosen": -19.68314552307129, "logits/rejected": -18.784704208374023, "logps/chosen": -415.91796875, "logps/rejected": -313.10565185546875, "loss": 0.4867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.678343772888184, "rewards/margins": 1.44680917263031, "rewards/rejected": 3.231534481048584, "step": 59950 }, { "epoch": 2.783787548168439, "grad_norm": 51.49540710449219, "learning_rate": 1.3305074515994242e-07, "logits/chosen": -18.887887954711914, "logits/rejected": -18.64859390258789, "logps/chosen": -442.26873779296875, "logps/rejected": -393.68634033203125, "loss": 1.0054, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0858688354492188, "rewards/margins": 0.09641792625188828, "rewards/rejected": 2.9894509315490723, "step": 59960 }, { "epoch": 2.7842518222758716, "grad_norm": 24.389720916748047, "learning_rate": 1.3302288871349643e-07, "logits/chosen": -18.400938034057617, "logits/rejected": -18.80746841430664, "logps/chosen": -340.5572814941406, "logps/rejected": -270.8946228027344, "loss": 0.9392, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1712026596069336, "rewards/margins": 0.12075283378362656, "rewards/rejected": 3.050449848175049, "step": 59970 }, { "epoch": 2.784716096383305, "grad_norm": 167.8523712158203, "learning_rate": 1.3299503226705047e-07, "logits/chosen": -18.69315528869629, "logits/rejected": -19.16934585571289, "logps/chosen": -350.9598388671875, "logps/rejected": -402.7890319824219, "loss": 
1.1393, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.474043846130371, "rewards/margins": -0.35539165139198303, "rewards/rejected": 2.8294355869293213, "step": 59980 }, { "epoch": 2.7851803704907376, "grad_norm": 148.1332550048828, "learning_rate": 1.329671758206045e-07, "logits/chosen": -18.477266311645508, "logits/rejected": -18.13935661315918, "logps/chosen": -397.32373046875, "logps/rejected": -358.0771484375, "loss": 0.6721, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.125129461288452, "rewards/margins": 0.8276756405830383, "rewards/rejected": 2.297454357147217, "step": 59990 }, { "epoch": 2.785644644598171, "grad_norm": 56.687469482421875, "learning_rate": 1.329393193741585e-07, "logits/chosen": -19.22671890258789, "logits/rejected": -18.207963943481445, "logps/chosen": -519.6430053710938, "logps/rejected": -401.2112731933594, "loss": 0.7904, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.032734394073486, "rewards/margins": 0.870350182056427, "rewards/rejected": 3.162384271621704, "step": 60000 }, { "epoch": 2.7861089187056036, "grad_norm": 0.6813490390777588, "learning_rate": 1.3291146292771252e-07, "logits/chosen": -19.507631301879883, "logits/rejected": -18.222633361816406, "logps/chosen": -521.0510864257812, "logps/rejected": -378.47509765625, "loss": 0.7242, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.123348236083984, "rewards/margins": 2.47792387008667, "rewards/rejected": 2.6454243659973145, "step": 60010 }, { "epoch": 2.786573192813037, "grad_norm": 50.33627700805664, "learning_rate": 1.3288360648126653e-07, "logits/chosen": -19.419931411743164, "logits/rejected": -17.903579711914062, "logps/chosen": -346.58331298828125, "logps/rejected": -242.0783233642578, "loss": 0.3165, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3845038414001465, "rewards/margins": 1.6964504718780518, "rewards/rejected": 1.6880531311035156, "step": 60020 }, { "epoch": 2.78703746692047, "grad_norm": 
10.133871078491211, "learning_rate": 1.3285575003482055e-07, "logits/chosen": -20.127262115478516, "logits/rejected": -19.268022537231445, "logps/chosen": -564.0936889648438, "logps/rejected": -387.9908752441406, "loss": 0.476, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.712136268615723, "rewards/margins": 1.4819682836532593, "rewards/rejected": 3.2301673889160156, "step": 60030 }, { "epoch": 2.787501741027903, "grad_norm": 63.9385871887207, "learning_rate": 1.3282789358837456e-07, "logits/chosen": -19.769086837768555, "logits/rejected": -17.41679573059082, "logps/chosen": -511.0823669433594, "logps/rejected": -279.06854248046875, "loss": 0.1421, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.377724647521973, "rewards/margins": 3.6219735145568848, "rewards/rejected": 0.755751371383667, "step": 60040 }, { "epoch": 2.787966015135336, "grad_norm": 53.4224739074707, "learning_rate": 1.328000371419286e-07, "logits/chosen": -19.047685623168945, "logits/rejected": -17.964786529541016, "logps/chosen": -383.6408386230469, "logps/rejected": -351.3115539550781, "loss": 0.4962, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6428122520446777, "rewards/margins": 1.484666347503662, "rewards/rejected": 1.1581456661224365, "step": 60050 }, { "epoch": 2.788430289242769, "grad_norm": 115.07659149169922, "learning_rate": 1.327721806954826e-07, "logits/chosen": -19.77732276916504, "logits/rejected": -19.81360626220703, "logps/chosen": -438.2872009277344, "logps/rejected": -405.32757568359375, "loss": 0.4922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3644514083862305, "rewards/margins": 0.7462567090988159, "rewards/rejected": 3.618195056915283, "step": 60060 }, { "epoch": 2.788894563350202, "grad_norm": 53.038795471191406, "learning_rate": 1.3274432424903663e-07, "logits/chosen": -19.204574584960938, "logits/rejected": -18.341798782348633, "logps/chosen": -371.35137939453125, "logps/rejected": -266.1070861816406, "loss": 
0.3788, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5785140991210938, "rewards/margins": 2.079113006591797, "rewards/rejected": 1.4994012117385864, "step": 60070 }, { "epoch": 2.789358837457635, "grad_norm": 77.98023223876953, "learning_rate": 1.3271646780259065e-07, "logits/chosen": -19.086217880249023, "logits/rejected": -18.187179565429688, "logps/chosen": -261.5615234375, "logps/rejected": -171.6138458251953, "loss": 0.801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1324353218078613, "rewards/margins": 1.0405056476593018, "rewards/rejected": 1.0919299125671387, "step": 60080 }, { "epoch": 2.789823111565068, "grad_norm": 82.49578857421875, "learning_rate": 1.3268861135614466e-07, "logits/chosen": -18.3158016204834, "logits/rejected": -18.8543701171875, "logps/chosen": -378.62176513671875, "logps/rejected": -414.4285583496094, "loss": 1.5173, "rewards/accuracies": 0.5, "rewards/chosen": 2.7864105701446533, "rewards/margins": -0.3448575735092163, "rewards/rejected": 3.13126802444458, "step": 60090 }, { "epoch": 2.7902873856725012, "grad_norm": 7.52430534362793, "learning_rate": 1.3266075490969867e-07, "logits/chosen": -18.882610321044922, "logits/rejected": -18.503313064575195, "logps/chosen": -487.5343322753906, "logps/rejected": -453.4459533691406, "loss": 0.926, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.885438919067383, "rewards/margins": 0.4659823477268219, "rewards/rejected": 3.4194564819335938, "step": 60100 }, { "epoch": 2.790751659779934, "grad_norm": 20.036182403564453, "learning_rate": 1.326328984632527e-07, "logits/chosen": -19.21239471435547, "logits/rejected": -18.846986770629883, "logps/chosen": -395.6483154296875, "logps/rejected": -399.57086181640625, "loss": 0.699, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.831916093826294, "rewards/margins": 0.4694945812225342, "rewards/rejected": 1.3624215126037598, "step": 60110 }, { "epoch": 2.791215933887367, "grad_norm": 
7.635195732116699, "learning_rate": 1.326050420168067e-07, "logits/chosen": -19.827953338623047, "logits/rejected": -19.071218490600586, "logps/chosen": -303.55023193359375, "logps/rejected": -339.26025390625, "loss": 0.7849, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5958282947540283, "rewards/margins": 0.657842218875885, "rewards/rejected": 1.937986135482788, "step": 60120 }, { "epoch": 2.7916802079948, "grad_norm": 52.43032455444336, "learning_rate": 1.3257718557036075e-07, "logits/chosen": -18.929744720458984, "logits/rejected": -18.538982391357422, "logps/chosen": -426.32830810546875, "logps/rejected": -449.816650390625, "loss": 1.0138, "rewards/accuracies": 0.5, "rewards/chosen": 3.5853142738342285, "rewards/margins": 0.35373252630233765, "rewards/rejected": 3.231581926345825, "step": 60130 }, { "epoch": 2.792144482102233, "grad_norm": 10.245655059814453, "learning_rate": 1.3254932912391476e-07, "logits/chosen": -18.650592803955078, "logits/rejected": -18.993152618408203, "logps/chosen": -334.24847412109375, "logps/rejected": -293.60577392578125, "loss": 1.6819, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.109609603881836, "rewards/margins": -0.7433875799179077, "rewards/rejected": 2.852997303009033, "step": 60140 }, { "epoch": 2.792608756209666, "grad_norm": 0.26851582527160645, "learning_rate": 1.3252147267746877e-07, "logits/chosen": -18.738208770751953, "logits/rejected": -17.689579010009766, "logps/chosen": -413.8472595214844, "logps/rejected": -371.8302917480469, "loss": 0.9029, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.736742973327637, "rewards/margins": 1.6366970539093018, "rewards/rejected": 3.100045680999756, "step": 60150 }, { "epoch": 2.793073030317099, "grad_norm": 22.95103645324707, "learning_rate": 1.324936162310228e-07, "logits/chosen": -19.336750030517578, "logits/rejected": -18.579971313476562, "logps/chosen": -331.06732177734375, "logps/rejected": -257.7081604003906, "loss": 0.4871, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.589282274246216, "rewards/margins": 1.140489935874939, "rewards/rejected": 2.4487926959991455, "step": 60160 }, { "epoch": 2.7935373044245324, "grad_norm": 102.09601593017578, "learning_rate": 1.324657597845768e-07, "logits/chosen": -19.072248458862305, "logits/rejected": -17.803865432739258, "logps/chosen": -469.1836853027344, "logps/rejected": -341.2926025390625, "loss": 0.4022, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.08809757232666, "rewards/margins": 1.959158182144165, "rewards/rejected": 2.1289398670196533, "step": 60170 }, { "epoch": 2.794001578531965, "grad_norm": 50.595096588134766, "learning_rate": 1.3243790333813082e-07, "logits/chosen": -19.714195251464844, "logits/rejected": -18.62726593017578, "logps/chosen": -397.25079345703125, "logps/rejected": -281.55841064453125, "loss": 0.4138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4692511558532715, "rewards/margins": 1.688916563987732, "rewards/rejected": 1.7803348302841187, "step": 60180 }, { "epoch": 2.7944658526393984, "grad_norm": 34.64838790893555, "learning_rate": 1.3241004689168486e-07, "logits/chosen": -19.414012908935547, "logits/rejected": -18.151700973510742, "logps/chosen": -235.48959350585938, "logps/rejected": -148.0179901123047, "loss": 0.3861, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4890174865722656, "rewards/margins": 2.036059856414795, "rewards/rejected": 0.45295730233192444, "step": 60190 }, { "epoch": 2.794930126746831, "grad_norm": 55.2763786315918, "learning_rate": 1.3238219044523887e-07, "logits/chosen": -18.571157455444336, "logits/rejected": -17.88980484008789, "logps/chosen": -324.4678649902344, "logps/rejected": -282.53765869140625, "loss": 0.6823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.135175943374634, "rewards/margins": 0.8867724537849426, "rewards/rejected": 2.248403310775757, "step": 60200 }, { "epoch": 2.7953944008542644, "grad_norm": 
0.4103746712207794, "learning_rate": 1.323543339987929e-07, "logits/chosen": -18.810611724853516, "logits/rejected": -17.638647079467773, "logps/chosen": -395.68133544921875, "logps/rejected": -332.80902099609375, "loss": 0.2009, "rewards/accuracies": 1.0, "rewards/chosen": 4.668247222900391, "rewards/margins": 2.5956602096557617, "rewards/rejected": 2.0725865364074707, "step": 60210 }, { "epoch": 2.795858674961697, "grad_norm": 160.0723114013672, "learning_rate": 1.323264775523469e-07, "logits/chosen": -18.22891616821289, "logits/rejected": -17.9589900970459, "logps/chosen": -550.09716796875, "logps/rejected": -430.51123046875, "loss": 0.8427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.676840782165527, "rewards/margins": 1.2532317638397217, "rewards/rejected": 3.4236087799072266, "step": 60220 }, { "epoch": 2.7963229490691304, "grad_norm": 55.64217758178711, "learning_rate": 1.3229862110590092e-07, "logits/chosen": -19.421894073486328, "logits/rejected": -18.81168556213379, "logps/chosen": -395.4251403808594, "logps/rejected": -248.855224609375, "loss": 0.3245, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9083969593048096, "rewards/margins": 2.0535545349121094, "rewards/rejected": 1.8548424243927002, "step": 60230 }, { "epoch": 2.7967872231765636, "grad_norm": 0.8210662007331848, "learning_rate": 1.3227076465945493e-07, "logits/chosen": -19.902164459228516, "logits/rejected": -18.718151092529297, "logps/chosen": -474.91241455078125, "logps/rejected": -345.4139404296875, "loss": 0.7439, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.169689655303955, "rewards/margins": 1.4167301654815674, "rewards/rejected": 2.752959728240967, "step": 60240 }, { "epoch": 2.7972514972839964, "grad_norm": 18.137557983398438, "learning_rate": 1.3224290821300895e-07, "logits/chosen": -18.833438873291016, "logits/rejected": -18.781673431396484, "logps/chosen": -368.5995788574219, "logps/rejected": -355.0928649902344, "loss": 0.779, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.670614242553711, "rewards/margins": 0.2639501094818115, "rewards/rejected": 2.4066641330718994, "step": 60250 }, { "epoch": 2.7977157713914296, "grad_norm": 33.07622528076172, "learning_rate": 1.32215051766563e-07, "logits/chosen": -19.682889938354492, "logits/rejected": -19.574604034423828, "logps/chosen": -240.3751983642578, "logps/rejected": -257.90826416015625, "loss": 0.8718, "rewards/accuracies": 0.5, "rewards/chosen": 2.258600950241089, "rewards/margins": 0.3486554026603699, "rewards/rejected": 1.9099452495574951, "step": 60260 }, { "epoch": 2.798180045498863, "grad_norm": 55.341522216796875, "learning_rate": 1.3218719532011698e-07, "logits/chosen": -19.665576934814453, "logits/rejected": -17.84994125366211, "logps/chosen": -514.2388916015625, "logps/rejected": -348.04888916015625, "loss": 0.2761, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.754696846008301, "rewards/margins": 2.3308300971984863, "rewards/rejected": 2.4238667488098145, "step": 60270 }, { "epoch": 2.7986443196062956, "grad_norm": 16.940637588500977, "learning_rate": 1.3215933887367102e-07, "logits/chosen": -18.873538970947266, "logits/rejected": -18.56118392944336, "logps/chosen": -324.251220703125, "logps/rejected": -268.28900146484375, "loss": 0.5608, "rewards/accuracies": 0.5, "rewards/chosen": 3.0881590843200684, "rewards/margins": 1.2373816967010498, "rewards/rejected": 1.85077702999115, "step": 60280 }, { "epoch": 2.7991085937137283, "grad_norm": 0.16666077077388763, "learning_rate": 1.3213148242722503e-07, "logits/chosen": -18.49599838256836, "logits/rejected": -17.011035919189453, "logps/chosen": -354.0647277832031, "logps/rejected": -167.80014038085938, "loss": 0.6862, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.198819637298584, "rewards/margins": 2.035158395767212, "rewards/rejected": 1.163661241531372, "step": 60290 }, { "epoch": 2.7995728678211615, "grad_norm": 1.5121173858642578, 
"learning_rate": 1.3210362598077905e-07, "logits/chosen": -18.751815795898438, "logits/rejected": -17.833370208740234, "logps/chosen": -318.619140625, "logps/rejected": -253.6565704345703, "loss": 0.6455, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.383389711380005, "rewards/margins": 0.8474346399307251, "rewards/rejected": 1.5359550714492798, "step": 60300 }, { "epoch": 2.8000371419285948, "grad_norm": 52.28451919555664, "learning_rate": 1.3207576953433306e-07, "logits/chosen": -19.84272575378418, "logits/rejected": -19.274322509765625, "logps/chosen": -353.47540283203125, "logps/rejected": -316.736572265625, "loss": 0.2476, "rewards/accuracies": 1.0, "rewards/chosen": 4.046445846557617, "rewards/margins": 1.802152395248413, "rewards/rejected": 2.244293689727783, "step": 60310 }, { "epoch": 2.8005014160360275, "grad_norm": 124.13054656982422, "learning_rate": 1.3204791308788707e-07, "logits/chosen": -19.782756805419922, "logits/rejected": -19.23094367980957, "logps/chosen": -400.74493408203125, "logps/rejected": -375.4144592285156, "loss": 1.012, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0706348419189453, "rewards/margins": 0.012257611379027367, "rewards/rejected": 3.058377504348755, "step": 60320 }, { "epoch": 2.8009656901434608, "grad_norm": 91.57192993164062, "learning_rate": 1.320200566414411e-07, "logits/chosen": -19.486732482910156, "logits/rejected": -18.62470245361328, "logps/chosen": -519.47900390625, "logps/rejected": -437.13702392578125, "loss": 0.8907, "rewards/accuracies": 0.5, "rewards/chosen": 4.452885150909424, "rewards/margins": 0.5720095038414001, "rewards/rejected": 3.880875825881958, "step": 60330 }, { "epoch": 2.801429964250894, "grad_norm": 0.20503875613212585, "learning_rate": 1.3199220019499513e-07, "logits/chosen": -18.016021728515625, "logits/rejected": -17.450864791870117, "logps/chosen": -379.0794677734375, "logps/rejected": -326.46124267578125, "loss": 0.9295, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.694227695465088, "rewards/margins": 0.5238481760025024, "rewards/rejected": 2.170379161834717, "step": 60340 }, { "epoch": 2.8018942383583267, "grad_norm": 95.54479217529297, "learning_rate": 1.3196434374854914e-07, "logits/chosen": -18.54638671875, "logits/rejected": -18.16357421875, "logps/chosen": -411.8387756347656, "logps/rejected": -350.0539245605469, "loss": 0.9719, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3879055976867676, "rewards/margins": 0.4152916967868805, "rewards/rejected": 2.972613573074341, "step": 60350 }, { "epoch": 2.8023585124657595, "grad_norm": 230.59791564941406, "learning_rate": 1.3193648730210316e-07, "logits/chosen": -18.892675399780273, "logits/rejected": -18.046865463256836, "logps/chosen": -412.5159606933594, "logps/rejected": -305.6805419921875, "loss": 0.6159, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8793609142303467, "rewards/margins": 1.128039002418518, "rewards/rejected": 2.751321792602539, "step": 60360 }, { "epoch": 2.8028227865731927, "grad_norm": 3.664628744125366, "learning_rate": 1.3190863085565717e-07, "logits/chosen": -18.323516845703125, "logits/rejected": -16.977691650390625, "logps/chosen": -393.94476318359375, "logps/rejected": -294.3681945800781, "loss": 0.6726, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1674957275390625, "rewards/margins": 1.857972502708435, "rewards/rejected": 2.309523105621338, "step": 60370 }, { "epoch": 2.803287060680626, "grad_norm": 207.7684326171875, "learning_rate": 1.318807744092112e-07, "logits/chosen": -18.85678482055664, "logits/rejected": -18.25372314453125, "logps/chosen": -326.27447509765625, "logps/rejected": -298.05950927734375, "loss": 0.835, "rewards/accuracies": 0.5, "rewards/chosen": 2.709676742553711, "rewards/margins": 0.2977278530597687, "rewards/rejected": 2.4119489192962646, "step": 60380 }, { "epoch": 2.8037513347880587, "grad_norm": 156.7176513671875, "learning_rate": 
1.318529179627652e-07, "logits/chosen": -18.549152374267578, "logits/rejected": -18.832460403442383, "logps/chosen": -299.93798828125, "logps/rejected": -327.55035400390625, "loss": 0.869, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7716381549835205, "rewards/margins": 0.17762663960456848, "rewards/rejected": 2.5940115451812744, "step": 60390 }, { "epoch": 2.804215608895492, "grad_norm": 214.17991638183594, "learning_rate": 1.3182506151631924e-07, "logits/chosen": -17.681987762451172, "logits/rejected": -17.75920867919922, "logps/chosen": -282.912109375, "logps/rejected": -261.9631652832031, "loss": 1.1003, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.9711847305297852, "rewards/margins": -0.39016932249069214, "rewards/rejected": 2.361354112625122, "step": 60400 }, { "epoch": 2.804679883002925, "grad_norm": 18.521142959594727, "learning_rate": 1.3179720506987326e-07, "logits/chosen": -18.86116600036621, "logits/rejected": -17.681913375854492, "logps/chosen": -464.9866638183594, "logps/rejected": -316.71563720703125, "loss": 0.315, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.687772750854492, "rewards/margins": 1.6269724369049072, "rewards/rejected": 2.060800313949585, "step": 60410 }, { "epoch": 2.805144157110358, "grad_norm": 86.40337371826172, "learning_rate": 1.3176934862342727e-07, "logits/chosen": -18.89960289001465, "logits/rejected": -17.503498077392578, "logps/chosen": -503.15716552734375, "logps/rejected": -370.34613037109375, "loss": 0.892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.8431596755981445, "rewards/margins": 1.9070850610733032, "rewards/rejected": 2.936074733734131, "step": 60420 }, { "epoch": 2.805608431217791, "grad_norm": 3.2626259326934814, "learning_rate": 1.317414921769813e-07, "logits/chosen": -19.070199966430664, "logits/rejected": -18.79216766357422, "logps/chosen": -362.92108154296875, "logps/rejected": -344.7010498046875, "loss": 1.0961, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.200615644454956, "rewards/margins": 0.9169621467590332, "rewards/rejected": 2.283653736114502, "step": 60430 }, { "epoch": 2.806072705325224, "grad_norm": 3.4830434322357178, "learning_rate": 1.317136357305353e-07, "logits/chosen": -18.616302490234375, "logits/rejected": -19.075679779052734, "logps/chosen": -301.5516357421875, "logps/rejected": -367.3711853027344, "loss": 1.9222, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6218929290771484, "rewards/margins": -0.9403187036514282, "rewards/rejected": 3.562211513519287, "step": 60440 }, { "epoch": 2.806536979432657, "grad_norm": 43.83848571777344, "learning_rate": 1.3168577928408932e-07, "logits/chosen": -19.087825775146484, "logits/rejected": -18.980398178100586, "logps/chosen": -355.3920593261719, "logps/rejected": -284.48004150390625, "loss": 1.7791, "rewards/accuracies": 0.5, "rewards/chosen": 2.1013522148132324, "rewards/margins": -0.5873622298240662, "rewards/rejected": 2.6887147426605225, "step": 60450 }, { "epoch": 2.80700125354009, "grad_norm": 213.0029754638672, "learning_rate": 1.3165792283764333e-07, "logits/chosen": -18.85761833190918, "logits/rejected": -18.206432342529297, "logps/chosen": -378.4559020996094, "logps/rejected": -291.6247863769531, "loss": 0.7312, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5560977458953857, "rewards/margins": 1.5433752536773682, "rewards/rejected": 2.0127227306365967, "step": 60460 }, { "epoch": 2.807465527647523, "grad_norm": 66.673828125, "learning_rate": 1.3163006639119737e-07, "logits/chosen": -19.062944412231445, "logits/rejected": -18.496370315551758, "logps/chosen": -477.1502380371094, "logps/rejected": -470.51116943359375, "loss": 0.6631, "rewards/accuracies": 0.5, "rewards/chosen": 3.6384758949279785, "rewards/margins": 0.530087947845459, "rewards/rejected": 3.1083881855010986, "step": 60470 }, { "epoch": 2.8079298017549563, "grad_norm": 0.7675713300704956, "learning_rate": 
1.3160220994475136e-07, "logits/chosen": -19.188587188720703, "logits/rejected": -17.582727432250977, "logps/chosen": -407.5517272949219, "logps/rejected": -264.43121337890625, "loss": 0.3302, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.948854446411133, "rewards/margins": 2.5750279426574707, "rewards/rejected": 1.3738263845443726, "step": 60480 }, { "epoch": 2.808394075862389, "grad_norm": 40.88237762451172, "learning_rate": 1.315743534983054e-07, "logits/chosen": -20.252033233642578, "logits/rejected": -20.162601470947266, "logps/chosen": -394.7894592285156, "logps/rejected": -341.8460998535156, "loss": 0.5723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.047947883605957, "rewards/margins": 0.6364110708236694, "rewards/rejected": 3.4115371704101562, "step": 60490 }, { "epoch": 2.8088583499698223, "grad_norm": 94.62081909179688, "learning_rate": 1.3154649705185942e-07, "logits/chosen": -18.05219841003418, "logits/rejected": -18.317092895507812, "logps/chosen": -309.74853515625, "logps/rejected": -383.48956298828125, "loss": 1.8749, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.193471908569336, "rewards/margins": -0.965815544128418, "rewards/rejected": 3.159287691116333, "step": 60500 }, { "epoch": 2.809322624077255, "grad_norm": 51.56110382080078, "learning_rate": 1.3151864060541343e-07, "logits/chosen": -18.716388702392578, "logits/rejected": -18.527732849121094, "logps/chosen": -375.2516784667969, "logps/rejected": -333.16363525390625, "loss": 0.809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.664787530899048, "rewards/margins": 0.2833310663700104, "rewards/rejected": 2.3814563751220703, "step": 60510 }, { "epoch": 2.8097868981846883, "grad_norm": 89.23539733886719, "learning_rate": 1.3149078415896744e-07, "logits/chosen": -18.852344512939453, "logits/rejected": -18.410953521728516, "logps/chosen": -398.44818115234375, "logps/rejected": -330.16302490234375, "loss": 0.4755, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 3.9826552867889404, "rewards/margins": 1.365472435951233, "rewards/rejected": 2.617182731628418, "step": 60520 }, { "epoch": 2.810251172292121, "grad_norm": 80.00524139404297, "learning_rate": 1.3146292771252146e-07, "logits/chosen": -19.15407943725586, "logits/rejected": -18.160526275634766, "logps/chosen": -437.35809326171875, "logps/rejected": -285.642822265625, "loss": 0.4462, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.510838508605957, "rewards/margins": 2.261514186859131, "rewards/rejected": 2.249323606491089, "step": 60530 }, { "epoch": 2.8107154463995543, "grad_norm": 7.634256839752197, "learning_rate": 1.3143507126607547e-07, "logits/chosen": -18.700607299804688, "logits/rejected": -17.726133346557617, "logps/chosen": -284.5091857910156, "logps/rejected": -204.89144897460938, "loss": 0.7745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4138693809509277, "rewards/margins": 1.6273834705352783, "rewards/rejected": 0.7864858508110046, "step": 60540 }, { "epoch": 2.8111797205069875, "grad_norm": 25.995243072509766, "learning_rate": 1.3140721481962951e-07, "logits/chosen": -19.388050079345703, "logits/rejected": -18.4669189453125, "logps/chosen": -446.7310485839844, "logps/rejected": -412.54376220703125, "loss": 0.344, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.094932556152344, "rewards/margins": 1.7233047485351562, "rewards/rejected": 3.37162709236145, "step": 60550 }, { "epoch": 2.8116439946144203, "grad_norm": 264.96484375, "learning_rate": 1.3137935837318353e-07, "logits/chosen": -19.378122329711914, "logits/rejected": -18.220346450805664, "logps/chosen": -396.04339599609375, "logps/rejected": -312.92840576171875, "loss": 0.5702, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7415249347686768, "rewards/margins": 1.8878662586212158, "rewards/rejected": 1.853658676147461, "step": 60560 }, { "epoch": 2.8121082687218535, "grad_norm": 120.30509948730469, 
"learning_rate": 1.3135150192673754e-07, "logits/chosen": -19.9459171295166, "logits/rejected": -17.757320404052734, "logps/chosen": -502.43682861328125, "logps/rejected": -291.65643310546875, "loss": 0.3082, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.588985919952393, "rewards/margins": 2.7683582305908203, "rewards/rejected": 1.820626974105835, "step": 60570 }, { "epoch": 2.8125725428292863, "grad_norm": 32.96819305419922, "learning_rate": 1.3132364548029156e-07, "logits/chosen": -19.284656524658203, "logits/rejected": -18.088741302490234, "logps/chosen": -490.10791015625, "logps/rejected": -338.0669860839844, "loss": 0.4954, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.096289157867432, "rewards/margins": 2.290219306945801, "rewards/rejected": 1.806069016456604, "step": 60580 }, { "epoch": 2.8130368169367195, "grad_norm": 0.9350904226303101, "learning_rate": 1.3129578903384557e-07, "logits/chosen": -19.410736083984375, "logits/rejected": -18.390056610107422, "logps/chosen": -525.3635864257812, "logps/rejected": -323.3400573730469, "loss": 0.4426, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.063904285430908, "rewards/margins": 2.6503708362579346, "rewards/rejected": 2.4135336875915527, "step": 60590 }, { "epoch": 2.8135010910441522, "grad_norm": 217.84095764160156, "learning_rate": 1.312679325873996e-07, "logits/chosen": -19.270227432250977, "logits/rejected": -18.907411575317383, "logps/chosen": -391.90411376953125, "logps/rejected": -381.4127197265625, "loss": 1.0337, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.4454197883605957, "rewards/margins": -0.43817591667175293, "rewards/rejected": 3.8835957050323486, "step": 60600 }, { "epoch": 2.8139653651515855, "grad_norm": 139.5398712158203, "learning_rate": 1.3124007614095363e-07, "logits/chosen": -19.289743423461914, "logits/rejected": -19.302772521972656, "logps/chosen": -368.91064453125, "logps/rejected": -310.30474853515625, "loss": 0.9696, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8647007942199707, "rewards/margins": -0.002170348074287176, "rewards/rejected": 2.8668713569641113, "step": 60610 }, { "epoch": 2.8144296392590187, "grad_norm": 198.54136657714844, "learning_rate": 1.3121221969450764e-07, "logits/chosen": -18.711380004882812, "logits/rejected": -18.425207138061523, "logps/chosen": -371.32452392578125, "logps/rejected": -371.12261962890625, "loss": 0.9268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.419546604156494, "rewards/margins": 0.291395902633667, "rewards/rejected": 3.128150701522827, "step": 60620 }, { "epoch": 2.8148939133664515, "grad_norm": 189.08164978027344, "learning_rate": 1.3118436324806163e-07, "logits/chosen": -18.04332160949707, "logits/rejected": -17.385379791259766, "logps/chosen": -264.0582275390625, "logps/rejected": -168.51901245117188, "loss": 0.7414, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.486816883087158, "rewards/margins": 1.9048635959625244, "rewards/rejected": 0.5819533467292786, "step": 60630 }, { "epoch": 2.8153581874738847, "grad_norm": 57.13983154296875, "learning_rate": 1.3115650680161567e-07, "logits/chosen": -18.464412689208984, "logits/rejected": -16.83701515197754, "logps/chosen": -393.69061279296875, "logps/rejected": -230.08932495117188, "loss": 0.2597, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9882659912109375, "rewards/margins": 2.849792718887329, "rewards/rejected": 1.1384729146957397, "step": 60640 }, { "epoch": 2.815822461581318, "grad_norm": 23.612863540649414, "learning_rate": 1.3112865035516969e-07, "logits/chosen": -19.70672035217285, "logits/rejected": -19.309545516967773, "logps/chosen": -546.8175048828125, "logps/rejected": -480.4647521972656, "loss": 0.5535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.700041770935059, "rewards/margins": 0.7929742932319641, "rewards/rejected": 3.9070677757263184, "step": 60650 }, { "epoch": 2.8162867356887507, 
"grad_norm": 17.71929168701172, "learning_rate": 1.311007939087237e-07, "logits/chosen": -19.30746078491211, "logits/rejected": -17.8762149810791, "logps/chosen": -333.28863525390625, "logps/rejected": -223.6605682373047, "loss": 0.5811, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0138869285583496, "rewards/margins": 1.996999979019165, "rewards/rejected": 1.0168869495391846, "step": 60660 }, { "epoch": 2.8167510097961834, "grad_norm": 148.8046875, "learning_rate": 1.3107293746227772e-07, "logits/chosen": -19.188993453979492, "logits/rejected": -18.668655395507812, "logps/chosen": -324.30169677734375, "logps/rejected": -294.4498596191406, "loss": 1.2053, "rewards/accuracies": 0.5, "rewards/chosen": 2.836702585220337, "rewards/margins": 0.08080185949802399, "rewards/rejected": 2.7559001445770264, "step": 60670 }, { "epoch": 2.8172152839036166, "grad_norm": 93.45429229736328, "learning_rate": 1.3104508101583176e-07, "logits/chosen": -19.273303985595703, "logits/rejected": -17.260751724243164, "logps/chosen": -327.5769958496094, "logps/rejected": -209.02392578125, "loss": 0.4095, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.373844861984253, "rewards/margins": 2.4398727416992188, "rewards/rejected": 0.9339722394943237, "step": 60680 }, { "epoch": 2.81767955801105, "grad_norm": 167.4591064453125, "learning_rate": 1.3101722456938574e-07, "logits/chosen": -20.115489959716797, "logits/rejected": -20.201416015625, "logps/chosen": -477.375244140625, "logps/rejected": -509.8675231933594, "loss": 1.4652, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.370516538619995, "rewards/margins": -0.4347567558288574, "rewards/rejected": 3.8052735328674316, "step": 60690 }, { "epoch": 2.8181438321184826, "grad_norm": 54.47417449951172, "learning_rate": 1.3098936812293979e-07, "logits/chosen": -19.306882858276367, "logits/rejected": -18.44723129272461, "logps/chosen": -385.5355224609375, "logps/rejected": -282.87286376953125, "loss": 0.4383, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1938586235046387, "rewards/margins": 1.4260517358779907, "rewards/rejected": 1.7678067684173584, "step": 60700 }, { "epoch": 2.818608106225916, "grad_norm": 117.56126403808594, "learning_rate": 1.309615116764938e-07, "logits/chosen": -19.111751556396484, "logits/rejected": -18.802982330322266, "logps/chosen": -314.0384216308594, "logps/rejected": -359.76470947265625, "loss": 1.0896, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.2387444972991943, "rewards/margins": -0.4362984597682953, "rewards/rejected": 2.6750428676605225, "step": 60710 }, { "epoch": 2.819072380333349, "grad_norm": 8.619914054870605, "learning_rate": 1.3093365523004781e-07, "logits/chosen": -19.725297927856445, "logits/rejected": -19.341064453125, "logps/chosen": -366.3616027832031, "logps/rejected": -321.22320556640625, "loss": 0.5256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.373457670211792, "rewards/margins": 1.0974500179290771, "rewards/rejected": 2.276007890701294, "step": 60720 }, { "epoch": 2.819536654440782, "grad_norm": 20.69402503967285, "learning_rate": 1.3090579878360183e-07, "logits/chosen": -19.5433292388916, "logits/rejected": -18.44797134399414, "logps/chosen": -410.53680419921875, "logps/rejected": -349.88104248046875, "loss": 0.2965, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.645501613616943, "rewards/margins": 1.7845560312271118, "rewards/rejected": 2.860945463180542, "step": 60730 }, { "epoch": 2.8200009285482146, "grad_norm": 164.9582977294922, "learning_rate": 1.3087794233715584e-07, "logits/chosen": -19.48378562927246, "logits/rejected": -17.7355899810791, "logps/chosen": -558.7599487304688, "logps/rejected": -392.7276916503906, "loss": 0.6451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.785626411437988, "rewards/margins": 2.1086928844451904, "rewards/rejected": 3.6769332885742188, "step": 60740 }, { "epoch": 2.820465202655648, "grad_norm": 
85.27350616455078, "learning_rate": 1.3085008589070986e-07, "logits/chosen": -18.41477394104004, "logits/rejected": -17.5412654876709, "logps/chosen": -300.31622314453125, "logps/rejected": -228.9553985595703, "loss": 0.4698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.976266384124756, "rewards/margins": 1.289538025856018, "rewards/rejected": 1.6867284774780273, "step": 60750 }, { "epoch": 2.820929476763081, "grad_norm": 85.27970123291016, "learning_rate": 1.308222294442639e-07, "logits/chosen": -18.6656551361084, "logits/rejected": -17.37540054321289, "logps/chosen": -390.3704833984375, "logps/rejected": -277.25994873046875, "loss": 0.3136, "rewards/accuracies": 1.0, "rewards/chosen": 3.4289557933807373, "rewards/margins": 2.031694173812866, "rewards/rejected": 1.397261619567871, "step": 60760 }, { "epoch": 2.821393750870514, "grad_norm": 0.3629871606826782, "learning_rate": 1.3079437299781791e-07, "logits/chosen": -19.128028869628906, "logits/rejected": -17.786752700805664, "logps/chosen": -487.28369140625, "logps/rejected": -354.5374755859375, "loss": 0.5927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.826320171356201, "rewards/margins": 1.3123406171798706, "rewards/rejected": 3.51397967338562, "step": 60770 }, { "epoch": 2.821858024977947, "grad_norm": 9.26563835144043, "learning_rate": 1.3076651655137193e-07, "logits/chosen": -19.26495361328125, "logits/rejected": -19.153274536132812, "logps/chosen": -493.43890380859375, "logps/rejected": -379.8627624511719, "loss": 0.5153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.184886932373047, "rewards/margins": 1.1466623544692993, "rewards/rejected": 3.038224458694458, "step": 60780 }, { "epoch": 2.8223222990853802, "grad_norm": 55.46475601196289, "learning_rate": 1.3073866010492594e-07, "logits/chosen": -19.561723709106445, "logits/rejected": -17.342849731445312, "logps/chosen": -509.8109436035156, "logps/rejected": -263.7168273925781, "loss": 0.2021, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.730109214782715, "rewards/margins": 2.961031436920166, "rewards/rejected": 1.7690784931182861, "step": 60790 }, { "epoch": 2.822786573192813, "grad_norm": 76.58504486083984, "learning_rate": 1.3071080365847996e-07, "logits/chosen": -18.729705810546875, "logits/rejected": -18.242206573486328, "logps/chosen": -419.59539794921875, "logps/rejected": -357.73406982421875, "loss": 0.8352, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.028989791870117, "rewards/margins": 0.4758409559726715, "rewards/rejected": 2.5531487464904785, "step": 60800 }, { "epoch": 2.8232508473002462, "grad_norm": 109.45211791992188, "learning_rate": 1.3068294721203397e-07, "logits/chosen": -19.345348358154297, "logits/rejected": -18.335418701171875, "logps/chosen": -375.76007080078125, "logps/rejected": -290.30364990234375, "loss": 0.3554, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.632591962814331, "rewards/margins": 1.2902133464813232, "rewards/rejected": 2.342378616333008, "step": 60810 }, { "epoch": 2.823715121407679, "grad_norm": 91.91978454589844, "learning_rate": 1.30655090765588e-07, "logits/chosen": -18.81131362915039, "logits/rejected": -17.737932205200195, "logps/chosen": -435.41473388671875, "logps/rejected": -315.1284484863281, "loss": 0.9647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7845678329467773, "rewards/margins": 1.0703649520874023, "rewards/rejected": 1.714202642440796, "step": 60820 }, { "epoch": 2.824179395515112, "grad_norm": 150.83248901367188, "learning_rate": 1.3062723431914203e-07, "logits/chosen": -18.67772102355957, "logits/rejected": -18.54364776611328, "logps/chosen": -429.85577392578125, "logps/rejected": -361.86773681640625, "loss": 0.736, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8387629985809326, "rewards/margins": 0.9421457052230835, "rewards/rejected": 2.8966171741485596, "step": 60830 }, { "epoch": 2.824643669622545, 
"grad_norm": 135.4611053466797, "learning_rate": 1.3059937787269602e-07, "logits/chosen": -18.83882713317871, "logits/rejected": -18.40869903564453, "logps/chosen": -459.66162109375, "logps/rejected": -332.2162170410156, "loss": 0.5265, "rewards/accuracies": 0.5, "rewards/chosen": 4.196540832519531, "rewards/margins": 2.0442938804626465, "rewards/rejected": 2.1522467136383057, "step": 60840 }, { "epoch": 2.825107943729978, "grad_norm": 7.757257461547852, "learning_rate": 1.3057152142625006e-07, "logits/chosen": -18.721277236938477, "logits/rejected": -17.53160285949707, "logps/chosen": -477.2076721191406, "logps/rejected": -339.0891418457031, "loss": 0.5194, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.61381196975708, "rewards/margins": 1.9343172311782837, "rewards/rejected": 2.6794943809509277, "step": 60850 }, { "epoch": 2.8255722178374114, "grad_norm": 80.45075988769531, "learning_rate": 1.3054366497980407e-07, "logits/chosen": -19.020137786865234, "logits/rejected": -18.515962600708008, "logps/chosen": -267.5530700683594, "logps/rejected": -212.68984985351562, "loss": 0.5036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1919634342193604, "rewards/margins": 1.0645087957382202, "rewards/rejected": 1.1274545192718506, "step": 60860 }, { "epoch": 2.826036491944844, "grad_norm": 131.21832275390625, "learning_rate": 1.3051580853335809e-07, "logits/chosen": -18.618337631225586, "logits/rejected": -18.766857147216797, "logps/chosen": -384.2307434082031, "logps/rejected": -427.96807861328125, "loss": 1.0878, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.5441040992736816, "rewards/margins": -0.4360150396823883, "rewards/rejected": 2.980119228363037, "step": 60870 }, { "epoch": 2.8265007660522774, "grad_norm": 99.96994018554688, "learning_rate": 1.304879520869121e-07, "logits/chosen": -19.800552368164062, "logits/rejected": -19.003965377807617, "logps/chosen": -341.58795166015625, "logps/rejected": -263.7792663574219, 
"loss": 0.524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.729041337966919, "rewards/margins": 1.1757862567901611, "rewards/rejected": 2.5532543659210205, "step": 60880 }, { "epoch": 2.82696504015971, "grad_norm": 73.18248748779297, "learning_rate": 1.3046009564046614e-07, "logits/chosen": -19.34367561340332, "logits/rejected": -17.824951171875, "logps/chosen": -452.1351623535156, "logps/rejected": -330.6581115722656, "loss": 0.195, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.458133697509766, "rewards/margins": 2.803464412689209, "rewards/rejected": 1.6546694040298462, "step": 60890 }, { "epoch": 2.8274293142671434, "grad_norm": 31.695466995239258, "learning_rate": 1.3043223919402013e-07, "logits/chosen": -18.348087310791016, "logits/rejected": -18.418548583984375, "logps/chosen": -328.64520263671875, "logps/rejected": -339.90313720703125, "loss": 0.9526, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5928139686584473, "rewards/margins": 0.705553412437439, "rewards/rejected": 1.8872604370117188, "step": 60900 }, { "epoch": 2.827893588374576, "grad_norm": 197.11175537109375, "learning_rate": 1.3040438274757417e-07, "logits/chosen": -18.28061294555664, "logits/rejected": -18.199424743652344, "logps/chosen": -314.04827880859375, "logps/rejected": -326.7762145996094, "loss": 1.2686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0311903953552246, "rewards/margins": -0.04434084892272949, "rewards/rejected": 2.075531244277954, "step": 60910 }, { "epoch": 2.8283578624820094, "grad_norm": 4.130640506744385, "learning_rate": 1.3037652630112818e-07, "logits/chosen": -18.836837768554688, "logits/rejected": -18.24308967590332, "logps/chosen": -382.55255126953125, "logps/rejected": -304.60308837890625, "loss": 0.2434, "rewards/accuracies": 1.0, "rewards/chosen": 4.189295291900635, "rewards/margins": 2.1703708171844482, "rewards/rejected": 2.018923759460449, "step": 60920 }, { "epoch": 2.8288221365894426, 
"grad_norm": 233.93722534179688, "learning_rate": 1.303486698546822e-07, "logits/chosen": -18.65788459777832, "logits/rejected": -18.132896423339844, "logps/chosen": -482.61700439453125, "logps/rejected": -376.2564697265625, "loss": 1.0604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8336288928985596, "rewards/margins": 0.9744795560836792, "rewards/rejected": 2.859149694442749, "step": 60930 }, { "epoch": 2.8292864106968754, "grad_norm": 17.567895889282227, "learning_rate": 1.3032081340823621e-07, "logits/chosen": -17.989126205444336, "logits/rejected": -17.340579986572266, "logps/chosen": -476.39984130859375, "logps/rejected": -378.7346496582031, "loss": 1.399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8610363006591797, "rewards/margins": 0.19272062182426453, "rewards/rejected": 2.6683156490325928, "step": 60940 }, { "epoch": 2.8297506848043086, "grad_norm": 225.0409393310547, "learning_rate": 1.3029295696179023e-07, "logits/chosen": -19.221683502197266, "logits/rejected": -18.405258178710938, "logps/chosen": -489.2669372558594, "logps/rejected": -434.06988525390625, "loss": 0.8722, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 4.180741310119629, "rewards/margins": 0.6020756959915161, "rewards/rejected": 3.5786662101745605, "step": 60950 }, { "epoch": 2.8302149589117414, "grad_norm": 123.2507553100586, "learning_rate": 1.3026510051534424e-07, "logits/chosen": -19.065134048461914, "logits/rejected": -18.078022003173828, "logps/chosen": -394.9527893066406, "logps/rejected": -285.78167724609375, "loss": 0.465, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6474857330322266, "rewards/margins": 1.3502153158187866, "rewards/rejected": 2.2972702980041504, "step": 60960 }, { "epoch": 2.8306792330191746, "grad_norm": 1.9926505088806152, "learning_rate": 1.3023724406889828e-07, "logits/chosen": -19.397703170776367, "logits/rejected": -19.14142417907715, "logps/chosen": -425.88311767578125, "logps/rejected": 
-351.5262756347656, "loss": 0.7763, "rewards/accuracies": 0.5, "rewards/chosen": 4.049749374389648, "rewards/margins": 1.224868893623352, "rewards/rejected": 2.824880599975586, "step": 60970 }, { "epoch": 2.8311435071266073, "grad_norm": 92.30023193359375, "learning_rate": 1.302093876224523e-07, "logits/chosen": -19.51497459411621, "logits/rejected": -18.210063934326172, "logps/chosen": -499.60858154296875, "logps/rejected": -319.42828369140625, "loss": 0.2402, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.131643772125244, "rewards/margins": 2.42026424407959, "rewards/rejected": 1.7113800048828125, "step": 60980 }, { "epoch": 2.8316077812340406, "grad_norm": 39.21818161010742, "learning_rate": 1.301815311760063e-07, "logits/chosen": -18.26702880859375, "logits/rejected": -17.690181732177734, "logps/chosen": -305.6435241699219, "logps/rejected": -242.8948516845703, "loss": 1.1361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6805708408355713, "rewards/margins": 0.7553647756576538, "rewards/rejected": 1.9252058267593384, "step": 60990 }, { "epoch": 2.8320720553414738, "grad_norm": 142.82501220703125, "learning_rate": 1.3015367472956033e-07, "logits/chosen": -18.614154815673828, "logits/rejected": -17.706132888793945, "logps/chosen": -257.8081970214844, "logps/rejected": -176.70791625976562, "loss": 0.4556, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.245826005935669, "rewards/margins": 1.0091540813446045, "rewards/rejected": 1.2366719245910645, "step": 61000 }, { "epoch": 2.8325363294489065, "grad_norm": 63.8195686340332, "learning_rate": 1.3012581828311434e-07, "logits/chosen": -19.633718490600586, "logits/rejected": -18.93602180480957, "logps/chosen": -461.752197265625, "logps/rejected": -380.56817626953125, "loss": 0.609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.633221626281738, "rewards/margins": 0.9320012927055359, "rewards/rejected": 3.701220750808716, "step": 61010 }, { "epoch": 
2.8330006035563398, "grad_norm": 1.4438185691833496, "learning_rate": 1.3009796183666836e-07, "logits/chosen": -19.269779205322266, "logits/rejected": -18.320655822753906, "logps/chosen": -368.31317138671875, "logps/rejected": -301.6342468261719, "loss": 0.6933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0678796768188477, "rewards/margins": 1.8763456344604492, "rewards/rejected": 1.191534161567688, "step": 61020 }, { "epoch": 2.8334648776637725, "grad_norm": 79.27416229248047, "learning_rate": 1.3007010539022237e-07, "logits/chosen": -18.586389541625977, "logits/rejected": -18.194435119628906, "logps/chosen": -362.4524230957031, "logps/rejected": -335.33758544921875, "loss": 0.9212, "rewards/accuracies": 0.5, "rewards/chosen": 3.8261032104492188, "rewards/margins": 1.1137663125991821, "rewards/rejected": 2.712337017059326, "step": 61030 }, { "epoch": 2.8339291517712057, "grad_norm": 1.451125144958496, "learning_rate": 1.300422489437764e-07, "logits/chosen": -19.109886169433594, "logits/rejected": -18.885576248168945, "logps/chosen": -448.11187744140625, "logps/rejected": -443.43743896484375, "loss": 1.0432, "rewards/accuracies": 0.5, "rewards/chosen": 3.698096752166748, "rewards/margins": 0.3207995295524597, "rewards/rejected": 3.3772971630096436, "step": 61040 }, { "epoch": 2.8343934258786385, "grad_norm": 77.51628875732422, "learning_rate": 1.300143924973304e-07, "logits/chosen": -19.39822006225586, "logits/rejected": -19.169055938720703, "logps/chosen": -315.1167907714844, "logps/rejected": -336.31292724609375, "loss": 0.9381, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4635562896728516, "rewards/margins": 0.5004009008407593, "rewards/rejected": 2.963155508041382, "step": 61050 }, { "epoch": 2.8348576999860717, "grad_norm": 59.7675895690918, "learning_rate": 1.2998653605088444e-07, "logits/chosen": -18.454206466674805, "logits/rejected": -18.86404037475586, "logps/chosen": -386.84149169921875, "logps/rejected": 
-309.4539489746094, "loss": 0.4113, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.294004440307617, "rewards/margins": 0.9928116798400879, "rewards/rejected": 2.3011929988861084, "step": 61060 }, { "epoch": 2.835321974093505, "grad_norm": 95.40010070800781, "learning_rate": 1.2995867960443846e-07, "logits/chosen": -19.85057258605957, "logits/rejected": -18.296428680419922, "logps/chosen": -432.25640869140625, "logps/rejected": -345.7568359375, "loss": 1.2839, "rewards/accuracies": 0.5, "rewards/chosen": 3.44102144241333, "rewards/margins": 0.5152767896652222, "rewards/rejected": 2.9257445335388184, "step": 61070 }, { "epoch": 2.8357862482009377, "grad_norm": 36.846824645996094, "learning_rate": 1.2993082315799247e-07, "logits/chosen": -20.736469268798828, "logits/rejected": -18.946857452392578, "logps/chosen": -497.4017639160156, "logps/rejected": -252.82888793945312, "loss": 0.3778, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.432773113250732, "rewards/margins": 1.9550294876098633, "rewards/rejected": 2.477743625640869, "step": 61080 }, { "epoch": 2.836250522308371, "grad_norm": 53.56031799316406, "learning_rate": 1.2990296671154648e-07, "logits/chosen": -19.695009231567383, "logits/rejected": -19.33251953125, "logps/chosen": -371.4471130371094, "logps/rejected": -290.47393798828125, "loss": 0.3239, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.857691526412964, "rewards/margins": 1.658926248550415, "rewards/rejected": 2.198765277862549, "step": 61090 }, { "epoch": 2.836714796415804, "grad_norm": 85.40975189208984, "learning_rate": 1.2987511026510053e-07, "logits/chosen": -18.34387969970703, "logits/rejected": -17.410701751708984, "logps/chosen": -420.3460998535156, "logps/rejected": -306.6360778808594, "loss": 0.6186, "rewards/accuracies": 0.5, "rewards/chosen": 3.221808910369873, "rewards/margins": 1.2994959354400635, "rewards/rejected": 1.9223129749298096, "step": 61100 }, { "epoch": 2.837179070523237, "grad_norm": 
244.26077270507812, "learning_rate": 1.2984725381865451e-07, "logits/chosen": -18.577280044555664, "logits/rejected": -17.957618713378906, "logps/chosen": -428.1689453125, "logps/rejected": -436.697021484375, "loss": 0.4373, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.2212018966674805, "rewards/margins": 1.6186403036117554, "rewards/rejected": 2.6025612354278564, "step": 61110 }, { "epoch": 2.8376433446306697, "grad_norm": 193.30897521972656, "learning_rate": 1.2981939737220855e-07, "logits/chosen": -18.321468353271484, "logits/rejected": -17.77601432800293, "logps/chosen": -366.8459777832031, "logps/rejected": -302.0293884277344, "loss": 0.5219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.596487522125244, "rewards/margins": 1.609994888305664, "rewards/rejected": 1.986492395401001, "step": 61120 }, { "epoch": 2.838107618738103, "grad_norm": 100.29283142089844, "learning_rate": 1.2979154092576257e-07, "logits/chosen": -19.01148223876953, "logits/rejected": -18.050386428833008, "logps/chosen": -327.7029724121094, "logps/rejected": -304.91241455078125, "loss": 0.5618, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.157315492630005, "rewards/margins": 1.0479692220687866, "rewards/rejected": 2.1093461513519287, "step": 61130 }, { "epoch": 2.838571892845536, "grad_norm": 37.96760940551758, "learning_rate": 1.2976368447931658e-07, "logits/chosen": -18.90803337097168, "logits/rejected": -18.903873443603516, "logps/chosen": -373.31927490234375, "logps/rejected": -330.0785217285156, "loss": 0.9594, "rewards/accuracies": 0.5, "rewards/chosen": 2.660339832305908, "rewards/margins": 0.08991684764623642, "rewards/rejected": 2.570423126220703, "step": 61140 }, { "epoch": 2.839036166952969, "grad_norm": 42.90053939819336, "learning_rate": 1.297358280328706e-07, "logits/chosen": -19.862159729003906, "logits/rejected": -17.799339294433594, "logps/chosen": -454.7547912597656, "logps/rejected": -233.7509765625, "loss": 0.2514, 
"rewards/accuracies": 1.0, "rewards/chosen": 3.983485698699951, "rewards/margins": 2.374279260635376, "rewards/rejected": 1.6092065572738647, "step": 61150 }, { "epoch": 2.839500441060402, "grad_norm": 58.652015686035156, "learning_rate": 1.297079715864246e-07, "logits/chosen": -18.408300399780273, "logits/rejected": -18.447559356689453, "logps/chosen": -347.2890625, "logps/rejected": -332.37042236328125, "loss": 1.5186, "rewards/accuracies": 0.5, "rewards/chosen": 2.6094138622283936, "rewards/margins": -0.5683904886245728, "rewards/rejected": 3.1778039932250977, "step": 61160 }, { "epoch": 2.8399647151678353, "grad_norm": 46.21342086791992, "learning_rate": 1.2968011513997863e-07, "logits/chosen": -18.487285614013672, "logits/rejected": -18.11240577697754, "logps/chosen": -381.76220703125, "logps/rejected": -336.5098571777344, "loss": 0.79, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.424560070037842, "rewards/margins": 0.4986484944820404, "rewards/rejected": 2.9259116649627686, "step": 61170 }, { "epoch": 2.840428989275268, "grad_norm": 27.9765625, "learning_rate": 1.2965225869353267e-07, "logits/chosen": -19.532833099365234, "logits/rejected": -18.734119415283203, "logps/chosen": -403.18756103515625, "logps/rejected": -347.66400146484375, "loss": 0.5736, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.668560743331909, "rewards/margins": 1.2205266952514648, "rewards/rejected": 2.4480338096618652, "step": 61180 }, { "epoch": 2.840893263382701, "grad_norm": 120.1939697265625, "learning_rate": 1.2962440224708668e-07, "logits/chosen": -19.875747680664062, "logits/rejected": -19.232942581176758, "logps/chosen": -429.52362060546875, "logps/rejected": -334.52410888671875, "loss": 0.6359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6929588317871094, "rewards/margins": 0.9277287721633911, "rewards/rejected": 2.7652299404144287, "step": 61190 }, { "epoch": 2.841357537490134, "grad_norm": 13.581438064575195, "learning_rate": 
1.295965458006407e-07, "logits/chosen": -19.520549774169922, "logits/rejected": -18.33885955810547, "logps/chosen": -372.0572509765625, "logps/rejected": -256.22479248046875, "loss": 0.2919, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.099246025085449, "rewards/margins": 2.0272433757781982, "rewards/rejected": 2.07200288772583, "step": 61200 }, { "epoch": 2.8418218115975673, "grad_norm": 15.301058769226074, "learning_rate": 1.295686893541947e-07, "logits/chosen": -18.9749698638916, "logits/rejected": -18.10650062561035, "logps/chosen": -544.1591186523438, "logps/rejected": -453.1250915527344, "loss": 0.5033, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.011785984039307, "rewards/margins": 1.144142508506775, "rewards/rejected": 2.867642879486084, "step": 61210 }, { "epoch": 2.842286085705, "grad_norm": 1.3983098268508911, "learning_rate": 1.2954083290774873e-07, "logits/chosen": -19.04926300048828, "logits/rejected": -17.890317916870117, "logps/chosen": -352.00830078125, "logps/rejected": -323.7268981933594, "loss": 0.5081, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2908730506896973, "rewards/margins": 1.1449640989303589, "rewards/rejected": 2.1459085941314697, "step": 61220 }, { "epoch": 2.8427503598124333, "grad_norm": 20.99199867248535, "learning_rate": 1.2951297646130274e-07, "logits/chosen": -20.025516510009766, "logits/rejected": -18.479019165039062, "logps/chosen": -413.3937072753906, "logps/rejected": -359.2284240722656, "loss": 0.3295, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.446322441101074, "rewards/margins": 1.7299550771713257, "rewards/rejected": 2.716367483139038, "step": 61230 }, { "epoch": 2.8432146339198665, "grad_norm": 130.73414611816406, "learning_rate": 1.2948512001485676e-07, "logits/chosen": -19.570987701416016, "logits/rejected": -18.679000854492188, "logps/chosen": -428.262451171875, "logps/rejected": -328.8089294433594, "loss": 0.5596, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 4.375758647918701, "rewards/margins": 1.8015724420547485, "rewards/rejected": 2.574185848236084, "step": 61240 }, { "epoch": 2.8436789080272993, "grad_norm": 119.80248260498047, "learning_rate": 1.294572635684108e-07, "logits/chosen": -20.201875686645508, "logits/rejected": -18.331966400146484, "logps/chosen": -452.18719482421875, "logps/rejected": -282.0046691894531, "loss": 0.2888, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.452340126037598, "rewards/margins": 2.7571024894714355, "rewards/rejected": 1.6952375173568726, "step": 61250 }, { "epoch": 2.8441431821347325, "grad_norm": 23.960073471069336, "learning_rate": 1.2942940712196478e-07, "logits/chosen": -18.489532470703125, "logits/rejected": -17.915620803833008, "logps/chosen": -362.8989562988281, "logps/rejected": -299.22021484375, "loss": 1.4193, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8989930152893066, "rewards/margins": 0.8230697512626648, "rewards/rejected": 2.075923204421997, "step": 61260 }, { "epoch": 2.8446074562421653, "grad_norm": 87.78538513183594, "learning_rate": 1.2940155067551883e-07, "logits/chosen": -19.65314483642578, "logits/rejected": -19.403722763061523, "logps/chosen": -305.7899475097656, "logps/rejected": -231.95938110351562, "loss": 0.7136, "rewards/accuracies": 0.5, "rewards/chosen": 2.7219414710998535, "rewards/margins": 0.3956538140773773, "rewards/rejected": 2.326287269592285, "step": 61270 }, { "epoch": 2.8450717303495985, "grad_norm": 220.14971923828125, "learning_rate": 1.2937369422907284e-07, "logits/chosen": -19.525583267211914, "logits/rejected": -18.780744552612305, "logps/chosen": -412.4642639160156, "logps/rejected": -384.74847412109375, "loss": 0.8083, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.174544334411621, "rewards/margins": 0.43345069885253906, "rewards/rejected": 2.741093635559082, "step": 61280 }, { "epoch": 2.8455360044570313, "grad_norm": 103.23051452636719, 
"learning_rate": 1.2934583778262685e-07, "logits/chosen": -17.914648056030273, "logits/rejected": -18.022716522216797, "logps/chosen": -284.1118469238281, "logps/rejected": -375.62664794921875, "loss": 1.2251, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0837206840515137, "rewards/margins": 0.1883390247821808, "rewards/rejected": 1.8953815698623657, "step": 61290 }, { "epoch": 2.8460002785644645, "grad_norm": 13.768566131591797, "learning_rate": 1.2931798133618087e-07, "logits/chosen": -18.363311767578125, "logits/rejected": -17.471616744995117, "logps/chosen": -445.12481689453125, "logps/rejected": -386.0799865722656, "loss": 1.3296, "rewards/accuracies": 0.5, "rewards/chosen": 4.435412406921387, "rewards/margins": 1.1593847274780273, "rewards/rejected": 3.276027202606201, "step": 61300 }, { "epoch": 2.8464645526718977, "grad_norm": 177.4146728515625, "learning_rate": 1.292901248897349e-07, "logits/chosen": -19.749406814575195, "logits/rejected": -19.298137664794922, "logps/chosen": -382.37530517578125, "logps/rejected": -370.36871337890625, "loss": 0.8955, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.939007043838501, "rewards/margins": -0.12164850533008575, "rewards/rejected": 3.060655117034912, "step": 61310 }, { "epoch": 2.8469288267793305, "grad_norm": 113.78040313720703, "learning_rate": 1.292622684432889e-07, "logits/chosen": -18.765087127685547, "logits/rejected": -18.103816986083984, "logps/chosen": -380.7839660644531, "logps/rejected": -301.623779296875, "loss": 0.6574, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.273749351501465, "rewards/margins": 1.992579698562622, "rewards/rejected": 2.2811691761016846, "step": 61320 }, { "epoch": 2.8473931008867637, "grad_norm": 4.879153728485107, "learning_rate": 1.2923441199684294e-07, "logits/chosen": -18.28795051574707, "logits/rejected": -18.630443572998047, "logps/chosen": -354.00396728515625, "logps/rejected": -262.4916687011719, "loss": 0.7072, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3424031734466553, "rewards/margins": 0.6540884971618652, "rewards/rejected": 1.6883147954940796, "step": 61330 }, { "epoch": 2.8478573749941964, "grad_norm": 31.81420135498047, "learning_rate": 1.2920655555039695e-07, "logits/chosen": -19.703805923461914, "logits/rejected": -19.850496292114258, "logps/chosen": -472.2579040527344, "logps/rejected": -336.44012451171875, "loss": 0.3345, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.985623359680176, "rewards/margins": 2.303739070892334, "rewards/rejected": 2.681884288787842, "step": 61340 }, { "epoch": 2.8483216491016297, "grad_norm": 25.227825164794922, "learning_rate": 1.2917869910395097e-07, "logits/chosen": -18.40703582763672, "logits/rejected": -17.82724952697754, "logps/chosen": -416.97247314453125, "logps/rejected": -333.62152099609375, "loss": 0.4585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3503222465515137, "rewards/margins": 1.70514714717865, "rewards/rejected": 1.6451747417449951, "step": 61350 }, { "epoch": 2.8487859232090624, "grad_norm": 2.2041938304901123, "learning_rate": 1.2915084265750498e-07, "logits/chosen": -19.136213302612305, "logits/rejected": -17.847280502319336, "logps/chosen": -421.403076171875, "logps/rejected": -262.0962219238281, "loss": 0.2637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.428823947906494, "rewards/margins": 2.4251790046691895, "rewards/rejected": 2.0036449432373047, "step": 61360 }, { "epoch": 2.8492501973164956, "grad_norm": 115.0162582397461, "learning_rate": 1.29122986211059e-07, "logits/chosen": -19.768539428710938, "logits/rejected": -18.93415069580078, "logps/chosen": -548.2692260742188, "logps/rejected": -390.47259521484375, "loss": 0.6597, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.050477504730225, "rewards/margins": 1.7920408248901367, "rewards/rejected": 3.258436679840088, "step": 61370 }, { "epoch": 2.849714471423929, "grad_norm": 
21.406484603881836, "learning_rate": 1.29095129764613e-07, "logits/chosen": -18.987642288208008, "logits/rejected": -18.067569732666016, "logps/chosen": -370.91668701171875, "logps/rejected": -322.47857666015625, "loss": 0.6507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.062441349029541, "rewards/margins": 0.7887638211250305, "rewards/rejected": 2.273677349090576, "step": 61380 }, { "epoch": 2.8501787455313616, "grad_norm": 5.942811965942383, "learning_rate": 1.2906727331816705e-07, "logits/chosen": -19.98133087158203, "logits/rejected": -18.106843948364258, "logps/chosen": -587.5795288085938, "logps/rejected": -286.1024475097656, "loss": 0.0973, "rewards/accuracies": 1.0, "rewards/chosen": 4.770535469055176, "rewards/margins": 3.726076126098633, "rewards/rejected": 1.0444591045379639, "step": 61390 }, { "epoch": 2.850643019638795, "grad_norm": 27.600879669189453, "learning_rate": 1.2903941687172107e-07, "logits/chosen": -18.636138916015625, "logits/rejected": -17.34545135498047, "logps/chosen": -306.9010925292969, "logps/rejected": -229.7641143798828, "loss": 0.9235, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9036775827407837, "rewards/margins": 0.6561813354492188, "rewards/rejected": 1.2474961280822754, "step": 61400 }, { "epoch": 2.8511072937462276, "grad_norm": 13.026374816894531, "learning_rate": 1.2901156042527508e-07, "logits/chosen": -19.2581729888916, "logits/rejected": -18.089250564575195, "logps/chosen": -337.68426513671875, "logps/rejected": -243.382080078125, "loss": 1.0541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3409764766693115, "rewards/margins": 0.7975881099700928, "rewards/rejected": 1.5433883666992188, "step": 61410 }, { "epoch": 2.851571567853661, "grad_norm": 172.94381713867188, "learning_rate": 1.289837039788291e-07, "logits/chosen": -19.904117584228516, "logits/rejected": -19.53507423400879, "logps/chosen": -400.9702453613281, "logps/rejected": -366.4508361816406, "loss": 1.2741, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2642154693603516, "rewards/margins": -0.2891313433647156, "rewards/rejected": 3.553347110748291, "step": 61420 }, { "epoch": 2.8520358419610936, "grad_norm": 186.28390502929688, "learning_rate": 1.289558475323831e-07, "logits/chosen": -20.262561798095703, "logits/rejected": -18.906042098999023, "logps/chosen": -532.6788940429688, "logps/rejected": -483.4988708496094, "loss": 0.5005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.5908613204956055, "rewards/margins": 0.9316703677177429, "rewards/rejected": 3.6591906547546387, "step": 61430 }, { "epoch": 2.852500116068527, "grad_norm": 54.77458190917969, "learning_rate": 1.2892799108593713e-07, "logits/chosen": -18.201467514038086, "logits/rejected": -18.104726791381836, "logps/chosen": -337.36444091796875, "logps/rejected": -305.0979309082031, "loss": 0.8325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.583425998687744, "rewards/margins": 0.022849297150969505, "rewards/rejected": 2.560576915740967, "step": 61440 }, { "epoch": 2.85296439017596, "grad_norm": 185.4120635986328, "learning_rate": 1.2890013463949114e-07, "logits/chosen": -18.552791595458984, "logits/rejected": -18.05196189880371, "logps/chosen": -461.0696716308594, "logps/rejected": -368.21392822265625, "loss": 0.3556, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.184088706970215, "rewards/margins": 2.578977108001709, "rewards/rejected": 2.605111598968506, "step": 61450 }, { "epoch": 2.853428664283393, "grad_norm": 0.3608390688896179, "learning_rate": 1.2887227819304518e-07, "logits/chosen": -19.502395629882812, "logits/rejected": -18.46519660949707, "logps/chosen": -440.77593994140625, "logps/rejected": -288.9714050292969, "loss": 0.2025, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.260651588439941, "rewards/margins": 2.3749232292175293, "rewards/rejected": 1.885728120803833, "step": 61460 }, { "epoch": 2.853892938390826, 
"grad_norm": 68.80633544921875, "learning_rate": 1.2884442174659917e-07, "logits/chosen": -19.275407791137695, "logits/rejected": -19.474245071411133, "logps/chosen": -328.897705078125, "logps/rejected": -320.873291015625, "loss": 0.4995, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3577327728271484, "rewards/margins": 1.0781700611114502, "rewards/rejected": 2.2795627117156982, "step": 61470 }, { "epoch": 2.8543572124982592, "grad_norm": 7.131060600280762, "learning_rate": 1.288165653001532e-07, "logits/chosen": -20.019821166992188, "logits/rejected": -19.35973358154297, "logps/chosen": -366.69219970703125, "logps/rejected": -330.8354187011719, "loss": 0.5237, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.578728437423706, "rewards/margins": 1.243829369544983, "rewards/rejected": 2.3348987102508545, "step": 61480 }, { "epoch": 2.854821486605692, "grad_norm": 112.48269653320312, "learning_rate": 1.2878870885370722e-07, "logits/chosen": -19.15219497680664, "logits/rejected": -18.30670928955078, "logps/chosen": -411.31304931640625, "logps/rejected": -287.6126708984375, "loss": 0.8266, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7211265563964844, "rewards/margins": 1.097461223602295, "rewards/rejected": 2.6236653327941895, "step": 61490 }, { "epoch": 2.855285760713125, "grad_norm": 256.72271728515625, "learning_rate": 1.2876085240726124e-07, "logits/chosen": -19.274391174316406, "logits/rejected": -18.45410919189453, "logps/chosen": -500.12115478515625, "logps/rejected": -467.28741455078125, "loss": 0.8202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.318332672119141, "rewards/margins": 0.6074284911155701, "rewards/rejected": 3.7109038829803467, "step": 61500 }, { "epoch": 2.855750034820558, "grad_norm": 35.83342742919922, "learning_rate": 1.2873299596081525e-07, "logits/chosen": -19.691463470458984, "logits/rejected": -18.27652359008789, "logps/chosen": -456.15802001953125, "logps/rejected": 
-326.03594970703125, "loss": 0.2059, "rewards/accuracies": 1.0, "rewards/chosen": 4.982100009918213, "rewards/margins": 3.0569872856140137, "rewards/rejected": 1.9251127243041992, "step": 61510 }, { "epoch": 2.856214308927991, "grad_norm": 31.32142448425293, "learning_rate": 1.287051395143693e-07, "logits/chosen": -19.527359008789062, "logits/rejected": -18.863861083984375, "logps/chosen": -417.6202087402344, "logps/rejected": -399.6742248535156, "loss": 0.642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.234095096588135, "rewards/margins": 0.9006631970405579, "rewards/rejected": 3.3334319591522217, "step": 61520 }, { "epoch": 2.856678583035424, "grad_norm": 8.92909049987793, "learning_rate": 1.2867728306792328e-07, "logits/chosen": -19.741931915283203, "logits/rejected": -17.539752960205078, "logps/chosen": -411.033935546875, "logps/rejected": -243.6111602783203, "loss": 0.1428, "rewards/accuracies": 1.0, "rewards/chosen": 4.119909763336182, "rewards/margins": 2.76731538772583, "rewards/rejected": 1.3525943756103516, "step": 61530 }, { "epoch": 2.857142857142857, "grad_norm": 83.51432800292969, "learning_rate": 1.2864942662147732e-07, "logits/chosen": -19.49905776977539, "logits/rejected": -18.723337173461914, "logps/chosen": -278.36199951171875, "logps/rejected": -286.4870300292969, "loss": 0.7164, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3241052627563477, "rewards/margins": 0.9808250665664673, "rewards/rejected": 1.3432801961898804, "step": 61540 }, { "epoch": 2.8576071312502904, "grad_norm": 17.264892578125, "learning_rate": 1.2862157017503134e-07, "logits/chosen": -19.817913055419922, "logits/rejected": -18.923294067382812, "logps/chosen": -406.13348388671875, "logps/rejected": -344.4644470214844, "loss": 0.6095, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.024235725402832, "rewards/margins": 0.9391849637031555, "rewards/rejected": 3.0850508213043213, "step": 61550 }, { "epoch": 2.858071405357723, 
"grad_norm": 31.082204818725586, "learning_rate": 1.2859371372858535e-07, "logits/chosen": -19.831165313720703, "logits/rejected": -18.89938735961914, "logps/chosen": -462.06463623046875, "logps/rejected": -382.8188781738281, "loss": 0.5321, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.705887317657471, "rewards/margins": 1.143791913986206, "rewards/rejected": 3.5620951652526855, "step": 61560 }, { "epoch": 2.858535679465156, "grad_norm": 98.4959487915039, "learning_rate": 1.2856585728213937e-07, "logits/chosen": -19.296899795532227, "logits/rejected": -18.92734718322754, "logps/chosen": -431.33660888671875, "logps/rejected": -334.05316162109375, "loss": 0.5839, "rewards/accuracies": 0.5, "rewards/chosen": 3.8378307819366455, "rewards/margins": 0.5771169066429138, "rewards/rejected": 3.260714054107666, "step": 61570 }, { "epoch": 2.858999953572589, "grad_norm": 131.44741821289062, "learning_rate": 1.2853800083569338e-07, "logits/chosen": -18.620325088500977, "logits/rejected": -18.80196189880371, "logps/chosen": -377.56634521484375, "logps/rejected": -312.8955383300781, "loss": 1.3677, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.290454864501953, "rewards/margins": -0.10321033000946045, "rewards/rejected": 2.393665313720703, "step": 61580 }, { "epoch": 2.8594642276800224, "grad_norm": 102.47694396972656, "learning_rate": 1.285101443892474e-07, "logits/chosen": -18.394926071166992, "logits/rejected": -17.974138259887695, "logps/chosen": -362.7051696777344, "logps/rejected": -262.449462890625, "loss": 0.3611, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.304288864135742, "rewards/margins": 1.7515023946762085, "rewards/rejected": 1.5527867078781128, "step": 61590 }, { "epoch": 2.859928501787455, "grad_norm": 201.7952423095703, "learning_rate": 1.2848228794280144e-07, "logits/chosen": -19.715190887451172, "logits/rejected": -19.45497703552246, "logps/chosen": -421.29730224609375, "logps/rejected": -323.14630126953125, 
"loss": 0.8069, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4600353240966797, "rewards/margins": 0.24678072333335876, "rewards/rejected": 3.213254451751709, "step": 61600 }, { "epoch": 2.8603927758948884, "grad_norm": 48.22173309326172, "learning_rate": 1.2845443149635545e-07, "logits/chosen": -19.339462280273438, "logits/rejected": -18.713932037353516, "logps/chosen": -487.48529052734375, "logps/rejected": -385.0633850097656, "loss": 0.7624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6294384002685547, "rewards/margins": 0.44514068961143494, "rewards/rejected": 3.184297800064087, "step": 61610 }, { "epoch": 2.8608570500023216, "grad_norm": 41.076297760009766, "learning_rate": 1.2842657504990944e-07, "logits/chosen": -20.37218475341797, "logits/rejected": -18.361740112304688, "logps/chosen": -493.65057373046875, "logps/rejected": -416.1650390625, "loss": 0.3852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.016749382019043, "rewards/margins": 1.8966319561004639, "rewards/rejected": 3.1201164722442627, "step": 61620 }, { "epoch": 2.8613213241097544, "grad_norm": 147.3404541015625, "learning_rate": 1.2839871860346348e-07, "logits/chosen": -18.827919006347656, "logits/rejected": -18.018280029296875, "logps/chosen": -380.21148681640625, "logps/rejected": -285.32257080078125, "loss": 0.8346, "rewards/accuracies": 0.5, "rewards/chosen": 3.6535003185272217, "rewards/margins": 1.4599844217300415, "rewards/rejected": 2.193516492843628, "step": 61630 }, { "epoch": 2.8617855982171876, "grad_norm": 0.051060114055871964, "learning_rate": 1.283708621570175e-07, "logits/chosen": -18.939592361450195, "logits/rejected": -17.415447235107422, "logps/chosen": -416.6166076660156, "logps/rejected": -256.86065673828125, "loss": 0.4811, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.563422679901123, "rewards/margins": 2.3735525608062744, "rewards/rejected": 1.1898701190948486, "step": 61640 }, { "epoch": 2.8622498723246204, 
"grad_norm": 122.30259704589844, "learning_rate": 1.283430057105715e-07, "logits/chosen": -18.785160064697266, "logits/rejected": -17.897289276123047, "logps/chosen": -348.39031982421875, "logps/rejected": -291.7398986816406, "loss": 0.4912, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.13738751411438, "rewards/margins": 1.2203971147537231, "rewards/rejected": 1.9169902801513672, "step": 61650 }, { "epoch": 2.8627141464320536, "grad_norm": 159.11761474609375, "learning_rate": 1.2831514926412553e-07, "logits/chosen": -19.274690628051758, "logits/rejected": -18.648311614990234, "logps/chosen": -387.4300231933594, "logps/rejected": -319.9922790527344, "loss": 0.9472, "rewards/accuracies": 0.5, "rewards/chosen": 3.2933528423309326, "rewards/margins": 0.33485597372055054, "rewards/rejected": 2.9584968090057373, "step": 61660 }, { "epoch": 2.8631784205394863, "grad_norm": 163.5049285888672, "learning_rate": 1.2828729281767957e-07, "logits/chosen": -18.781005859375, "logits/rejected": -17.700124740600586, "logps/chosen": -273.694580078125, "logps/rejected": -281.1107482910156, "loss": 0.4071, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6292192935943604, "rewards/margins": 1.217876672744751, "rewards/rejected": 1.4113423824310303, "step": 61670 }, { "epoch": 2.8636426946469196, "grad_norm": 13.493573188781738, "learning_rate": 1.2825943637123355e-07, "logits/chosen": -18.481637954711914, "logits/rejected": -17.806156158447266, "logps/chosen": -406.5058288574219, "logps/rejected": -320.2126159667969, "loss": 0.7262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9413018226623535, "rewards/margins": 1.4960201978683472, "rewards/rejected": 2.445281505584717, "step": 61680 }, { "epoch": 2.8641069687543528, "grad_norm": 103.56424713134766, "learning_rate": 1.282315799247876e-07, "logits/chosen": -18.894710540771484, "logits/rejected": -17.993316650390625, "logps/chosen": -346.2865295410156, "logps/rejected": -254.55386352539062, 
"loss": 0.4764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1987133026123047, "rewards/margins": 1.5024880170822144, "rewards/rejected": 1.6962254047393799, "step": 61690 }, { "epoch": 2.8645712428617855, "grad_norm": 65.50711822509766, "learning_rate": 1.282037234783416e-07, "logits/chosen": -18.934112548828125, "logits/rejected": -17.358135223388672, "logps/chosen": -384.50360107421875, "logps/rejected": -277.22711181640625, "loss": 0.4398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.120312690734863, "rewards/margins": 2.3715460300445557, "rewards/rejected": 1.748766303062439, "step": 61700 }, { "epoch": 2.8650355169692188, "grad_norm": 66.71186065673828, "learning_rate": 1.2817586703189562e-07, "logits/chosen": -18.681732177734375, "logits/rejected": -18.08798599243164, "logps/chosen": -390.74560546875, "logps/rejected": -293.10308837890625, "loss": 0.7172, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.889871835708618, "rewards/margins": 1.289609432220459, "rewards/rejected": 1.6002620458602905, "step": 61710 }, { "epoch": 2.8654997910766515, "grad_norm": 19.72673797607422, "learning_rate": 1.2814801058544964e-07, "logits/chosen": -19.56219482421875, "logits/rejected": -18.651065826416016, "logps/chosen": -368.6396789550781, "logps/rejected": -244.4879150390625, "loss": 0.6771, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7910709381103516, "rewards/margins": 1.7808603048324585, "rewards/rejected": 2.0102109909057617, "step": 61720 }, { "epoch": 2.8659640651840848, "grad_norm": 9.495028495788574, "learning_rate": 1.2812015413900368e-07, "logits/chosen": -19.80984115600586, "logits/rejected": -19.77394676208496, "logps/chosen": -362.1114807128906, "logps/rejected": -398.9052429199219, "loss": 1.1562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.040651321411133, "rewards/margins": -0.24893832206726074, "rewards/rejected": 3.2895896434783936, "step": 61730 }, { "epoch": 
2.8664283392915175, "grad_norm": 36.64958572387695, "learning_rate": 1.2809229769255767e-07, "logits/chosen": -18.803369522094727, "logits/rejected": -18.20743751525879, "logps/chosen": -399.05596923828125, "logps/rejected": -342.20123291015625, "loss": 0.4533, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9408233165740967, "rewards/margins": 1.55283522605896, "rewards/rejected": 2.387988567352295, "step": 61740 }, { "epoch": 2.8668926133989507, "grad_norm": 68.6508560180664, "learning_rate": 1.280644412461117e-07, "logits/chosen": -19.67833137512207, "logits/rejected": -18.415761947631836, "logps/chosen": -450.2728576660156, "logps/rejected": -378.07666015625, "loss": 0.5734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1239707469940186, "rewards/margins": 0.9084668159484863, "rewards/rejected": 2.2155039310455322, "step": 61750 }, { "epoch": 2.867356887506384, "grad_norm": 119.84790802001953, "learning_rate": 1.2803658479966572e-07, "logits/chosen": -18.593713760375977, "logits/rejected": -17.82597541809082, "logps/chosen": -292.2236022949219, "logps/rejected": -338.69439697265625, "loss": 1.1925, "rewards/accuracies": 0.5, "rewards/chosen": 3.2891182899475098, "rewards/margins": 0.6494773030281067, "rewards/rejected": 2.639641046524048, "step": 61760 }, { "epoch": 2.8678211616138167, "grad_norm": 9.561314582824707, "learning_rate": 1.2800872835321974e-07, "logits/chosen": -18.96902084350586, "logits/rejected": -18.179157257080078, "logps/chosen": -356.8648681640625, "logps/rejected": -280.7621154785156, "loss": 0.6429, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7607319355010986, "rewards/margins": 2.1028783321380615, "rewards/rejected": 1.6578534841537476, "step": 61770 }, { "epoch": 2.86828543572125, "grad_norm": 14.18879222869873, "learning_rate": 1.2798087190677375e-07, "logits/chosen": -19.706478118896484, "logits/rejected": -18.531734466552734, "logps/chosen": -447.89862060546875, "logps/rejected": 
-331.5962829589844, "loss": 0.3843, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.165255546569824, "rewards/margins": 1.6282612085342407, "rewards/rejected": 2.536994695663452, "step": 61780 }, { "epoch": 2.8687497098286827, "grad_norm": 18.727230072021484, "learning_rate": 1.2795301546032777e-07, "logits/chosen": -18.628782272338867, "logits/rejected": -17.618907928466797, "logps/chosen": -330.27557373046875, "logps/rejected": -329.3089294433594, "loss": 0.4909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.366199016571045, "rewards/margins": 1.2216665744781494, "rewards/rejected": 2.1445329189300537, "step": 61790 }, { "epoch": 2.869213983936116, "grad_norm": 15.488061904907227, "learning_rate": 1.2792515901388178e-07, "logits/chosen": -20.15952491760254, "logits/rejected": -19.289505004882812, "logps/chosen": -380.613525390625, "logps/rejected": -332.0251770019531, "loss": 0.8552, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.7090959548950195, "rewards/margins": 1.33552086353302, "rewards/rejected": 3.373575210571289, "step": 61800 }, { "epoch": 2.8696782580435487, "grad_norm": 0.4934312701225281, "learning_rate": 1.2789730256743582e-07, "logits/chosen": -19.83453369140625, "logits/rejected": -17.293270111083984, "logps/chosen": -433.7344665527344, "logps/rejected": -317.6955261230469, "loss": 0.4996, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.387702465057373, "rewards/margins": 2.398923397064209, "rewards/rejected": 1.9887784719467163, "step": 61810 }, { "epoch": 2.870142532150982, "grad_norm": 100.27215576171875, "learning_rate": 1.2786944612098984e-07, "logits/chosen": -17.220285415649414, "logits/rejected": -18.451820373535156, "logps/chosen": -266.72039794921875, "logps/rejected": -265.3028259277344, "loss": 1.1644, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8922582864761353, "rewards/margins": 0.16664843261241913, "rewards/rejected": 1.7256097793579102, "step": 61820 }, { 
"epoch": 2.870606806258415, "grad_norm": 36.45017623901367, "learning_rate": 1.2784158967454383e-07, "logits/chosen": -19.076021194458008, "logits/rejected": -18.358280181884766, "logps/chosen": -402.5402526855469, "logps/rejected": -349.8326110839844, "loss": 0.5402, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.675783157348633, "rewards/margins": 0.7712408304214478, "rewards/rejected": 2.9045419692993164, "step": 61830 }, { "epoch": 2.871071080365848, "grad_norm": 8.497418403625488, "learning_rate": 1.2781373322809787e-07, "logits/chosen": -20.258319854736328, "logits/rejected": -18.257699966430664, "logps/chosen": -330.0408935546875, "logps/rejected": -317.402099609375, "loss": 0.8981, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4944241046905518, "rewards/margins": 0.31528156995773315, "rewards/rejected": 3.179142475128174, "step": 61840 }, { "epoch": 2.871535354473281, "grad_norm": 35.07126235961914, "learning_rate": 1.2778587678165188e-07, "logits/chosen": -19.38172149658203, "logits/rejected": -19.045089721679688, "logps/chosen": -438.3468322753906, "logps/rejected": -366.2168884277344, "loss": 0.833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.042983531951904, "rewards/margins": 0.8745361566543579, "rewards/rejected": 4.168447971343994, "step": 61850 }, { "epoch": 2.871999628580714, "grad_norm": 0.28735852241516113, "learning_rate": 1.277580203352059e-07, "logits/chosen": -18.812166213989258, "logits/rejected": -19.29885482788086, "logps/chosen": -356.9217834472656, "logps/rejected": -310.02703857421875, "loss": 1.6059, "rewards/accuracies": 0.5, "rewards/chosen": 2.7177491188049316, "rewards/margins": -0.2183031588792801, "rewards/rejected": 2.936052083969116, "step": 61860 }, { "epoch": 2.872463902688147, "grad_norm": 5.555150985717773, "learning_rate": 1.277301638887599e-07, "logits/chosen": -18.008785247802734, "logits/rejected": -18.122684478759766, "logps/chosen": -355.2751159667969, "logps/rejected": 
-368.1480712890625, "loss": 1.1523, "rewards/accuracies": 0.5, "rewards/chosen": 2.88311767578125, "rewards/margins": -0.1147594228386879, "rewards/rejected": 2.9978766441345215, "step": 61870 }, { "epoch": 2.87292817679558, "grad_norm": 148.44786071777344, "learning_rate": 1.2770230744231395e-07, "logits/chosen": -18.280414581298828, "logits/rejected": -17.61263084411621, "logps/chosen": -389.0825500488281, "logps/rejected": -280.5442810058594, "loss": 0.7065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2553229331970215, "rewards/margins": 1.223473310470581, "rewards/rejected": 2.0318498611450195, "step": 61880 }, { "epoch": 2.873392450903013, "grad_norm": 33.881168365478516, "learning_rate": 1.2767445099586794e-07, "logits/chosen": -17.994050979614258, "logits/rejected": -17.54057502746582, "logps/chosen": -331.6745300292969, "logps/rejected": -233.6094970703125, "loss": 0.6494, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.066117525100708, "rewards/margins": 0.513871431350708, "rewards/rejected": 1.5522462129592896, "step": 61890 }, { "epoch": 2.8738567250104463, "grad_norm": 137.85768127441406, "learning_rate": 1.2764659454942198e-07, "logits/chosen": -19.050540924072266, "logits/rejected": -18.096965789794922, "logps/chosen": -388.033935546875, "logps/rejected": -372.54693603515625, "loss": 0.6848, "rewards/accuracies": 0.5, "rewards/chosen": 3.2057833671569824, "rewards/margins": 0.9032270312309265, "rewards/rejected": 2.302556276321411, "step": 61900 }, { "epoch": 2.874320999117879, "grad_norm": 210.16416931152344, "learning_rate": 1.27618738102976e-07, "logits/chosen": -19.28714370727539, "logits/rejected": -18.122303009033203, "logps/chosen": -531.954345703125, "logps/rejected": -409.3602600097656, "loss": 0.8252, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.732603073120117, "rewards/margins": 0.9620128870010376, "rewards/rejected": 2.770590305328369, "step": 61910 }, { "epoch": 2.8747852732253123, 
"grad_norm": 222.10208129882812, "learning_rate": 1.2759088165653e-07, "logits/chosen": -19.64216423034668, "logits/rejected": -19.311355590820312, "logps/chosen": -324.76904296875, "logps/rejected": -317.8683776855469, "loss": 0.6903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.790513515472412, "rewards/margins": 0.6522451639175415, "rewards/rejected": 2.138268232345581, "step": 61920 }, { "epoch": 2.8752495473327455, "grad_norm": 51.0882682800293, "learning_rate": 1.2756302521008402e-07, "logits/chosen": -19.153217315673828, "logits/rejected": -18.358673095703125, "logps/chosen": -328.68408203125, "logps/rejected": -284.5134582519531, "loss": 0.737, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.480693817138672, "rewards/margins": 0.6030797362327576, "rewards/rejected": 1.8776137828826904, "step": 61930 }, { "epoch": 2.8757138214401783, "grad_norm": 55.04555892944336, "learning_rate": 1.2753516876363806e-07, "logits/chosen": -19.33743667602539, "logits/rejected": -17.431257247924805, "logps/chosen": -354.1731872558594, "logps/rejected": -207.91818237304688, "loss": 0.3115, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9321906566619873, "rewards/margins": 2.7493844032287598, "rewards/rejected": 1.1828062534332275, "step": 61940 }, { "epoch": 2.876178095547611, "grad_norm": 55.917171478271484, "learning_rate": 1.2750731231719205e-07, "logits/chosen": -19.052555084228516, "logits/rejected": -18.559123992919922, "logps/chosen": -384.34796142578125, "logps/rejected": -298.44464111328125, "loss": 0.9179, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.361732006072998, "rewards/margins": 1.4560314416885376, "rewards/rejected": 1.9057010412216187, "step": 61950 }, { "epoch": 2.8766423696550443, "grad_norm": 95.88104248046875, "learning_rate": 1.274794558707461e-07, "logits/chosen": -19.328792572021484, "logits/rejected": -18.61056137084961, "logps/chosen": -418.4937438964844, "logps/rejected": -337.587158203125, 
"loss": 0.7611, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.762387752532959, "rewards/margins": 0.8566468358039856, "rewards/rejected": 1.9057413339614868, "step": 61960 }, { "epoch": 2.8771066437624775, "grad_norm": 60.69015884399414, "learning_rate": 1.274515994243001e-07, "logits/chosen": -18.312564849853516, "logits/rejected": -17.92943000793457, "logps/chosen": -395.57843017578125, "logps/rejected": -338.04071044921875, "loss": 1.5262, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.7470431327819824, "rewards/margins": -0.6120367050170898, "rewards/rejected": 4.3590803146362305, "step": 61970 }, { "epoch": 2.8775709178699103, "grad_norm": 11.790430068969727, "learning_rate": 1.2742374297785412e-07, "logits/chosen": -19.43935775756836, "logits/rejected": -18.539920806884766, "logps/chosen": -404.64703369140625, "logps/rejected": -336.72747802734375, "loss": 0.6619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.286496162414551, "rewards/margins": 1.1852728128433228, "rewards/rejected": 3.1012234687805176, "step": 61980 }, { "epoch": 2.8780351919773435, "grad_norm": 24.909517288208008, "learning_rate": 1.2739588653140814e-07, "logits/chosen": -19.089008331298828, "logits/rejected": -18.450061798095703, "logps/chosen": -325.67059326171875, "logps/rejected": -244.4690704345703, "loss": 0.4259, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0884041786193848, "rewards/margins": 1.023959755897522, "rewards/rejected": 1.064444661140442, "step": 61990 }, { "epoch": 2.8784994660847767, "grad_norm": 79.02584075927734, "learning_rate": 1.2736803008496215e-07, "logits/chosen": -19.768869400024414, "logits/rejected": -18.771739959716797, "logps/chosen": -552.0637817382812, "logps/rejected": -414.5755920410156, "loss": 0.5517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.972312927246094, "rewards/margins": 1.2960865497589111, "rewards/rejected": 3.6762256622314453, "step": 62000 }, { "epoch": 
2.8789637401922095, "grad_norm": 9.713190078735352, "learning_rate": 1.2734017363851617e-07, "logits/chosen": -18.587135314941406, "logits/rejected": -17.2849178314209, "logps/chosen": -556.2659912109375, "logps/rejected": -399.1429138183594, "loss": 0.3463, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.4505462646484375, "rewards/margins": 2.6307148933410645, "rewards/rejected": 2.819831371307373, "step": 62010 }, { "epoch": 2.8794280142996422, "grad_norm": 104.73357391357422, "learning_rate": 1.2731231719207018e-07, "logits/chosen": -19.07699203491211, "logits/rejected": -18.189363479614258, "logps/chosen": -306.9685363769531, "logps/rejected": -539.3859252929688, "loss": 0.8077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8570945262908936, "rewards/margins": 2.2739810943603516, "rewards/rejected": 0.5831134915351868, "step": 62020 }, { "epoch": 2.8798922884070755, "grad_norm": 244.45684814453125, "learning_rate": 1.2728446074562422e-07, "logits/chosen": -18.184844970703125, "logits/rejected": -18.64626693725586, "logps/chosen": -329.29632568359375, "logps/rejected": -344.0041198730469, "loss": 1.4727, "rewards/accuracies": 0.5, "rewards/chosen": 2.4738311767578125, "rewards/margins": -0.7185128927230835, "rewards/rejected": 3.1923441886901855, "step": 62030 }, { "epoch": 2.8803565625145087, "grad_norm": 77.0319595336914, "learning_rate": 1.272566042991782e-07, "logits/chosen": -18.621057510375977, "logits/rejected": -18.327945709228516, "logps/chosen": -440.32928466796875, "logps/rejected": -377.6973571777344, "loss": 0.8454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.649764060974121, "rewards/margins": 0.8652752041816711, "rewards/rejected": 1.7844889163970947, "step": 62040 }, { "epoch": 2.8808208366219414, "grad_norm": 128.93739318847656, "learning_rate": 1.2722874785273225e-07, "logits/chosen": -19.060028076171875, "logits/rejected": -18.49835968017578, "logps/chosen": -488.59814453125, "logps/rejected": 
-409.67840576171875, "loss": 0.7773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.417159080505371, "rewards/margins": 0.5065416097640991, "rewards/rejected": 3.9106178283691406, "step": 62050 }, { "epoch": 2.8812851107293747, "grad_norm": 122.63096618652344, "learning_rate": 1.2720089140628627e-07, "logits/chosen": -19.747161865234375, "logits/rejected": -18.57224464416504, "logps/chosen": -344.52276611328125, "logps/rejected": -253.004150390625, "loss": 0.4761, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.167538166046143, "rewards/margins": 1.9029855728149414, "rewards/rejected": 2.264552593231201, "step": 62060 }, { "epoch": 2.881749384836808, "grad_norm": 189.1659393310547, "learning_rate": 1.2717303495984028e-07, "logits/chosen": -18.472400665283203, "logits/rejected": -18.6929988861084, "logps/chosen": -309.8118591308594, "logps/rejected": -366.57269287109375, "loss": 1.0585, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.883178234100342, "rewards/margins": -0.10956878960132599, "rewards/rejected": 2.9927468299865723, "step": 62070 }, { "epoch": 2.8822136589442406, "grad_norm": 13.58247184753418, "learning_rate": 1.271451785133943e-07, "logits/chosen": -19.906457901000977, "logits/rejected": -18.231657028198242, "logps/chosen": -399.2486267089844, "logps/rejected": -233.17947387695312, "loss": 0.3953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5557150840759277, "rewards/margins": 1.7969915866851807, "rewards/rejected": 1.758723497390747, "step": 62080 }, { "epoch": 2.882677933051674, "grad_norm": 73.39716339111328, "learning_rate": 1.2711732206694834e-07, "logits/chosen": -19.138803482055664, "logits/rejected": -18.59914779663086, "logps/chosen": -398.82037353515625, "logps/rejected": -304.4642028808594, "loss": 0.3603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.901637315750122, "rewards/margins": 1.6180671453475952, "rewards/rejected": 2.2835702896118164, "step": 62090 }, { 
"epoch": 2.8831422071591066, "grad_norm": 40.745967864990234, "learning_rate": 1.2708946562050232e-07, "logits/chosen": -18.89916229248047, "logits/rejected": -19.223957061767578, "logps/chosen": -318.22393798828125, "logps/rejected": -317.7513122558594, "loss": 0.8255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5297489166259766, "rewards/margins": 0.5685023069381714, "rewards/rejected": 1.9612468481063843, "step": 62100 }, { "epoch": 2.88360648126654, "grad_norm": 175.40750122070312, "learning_rate": 1.2706160917405636e-07, "logits/chosen": -19.1790714263916, "logits/rejected": -18.07853126525879, "logps/chosen": -434.60595703125, "logps/rejected": -362.68609619140625, "loss": 0.5191, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.536339521408081, "rewards/margins": 1.175662875175476, "rewards/rejected": 2.3606765270233154, "step": 62110 }, { "epoch": 2.8840707553739726, "grad_norm": 54.5599250793457, "learning_rate": 1.2703375272761038e-07, "logits/chosen": -18.22165298461914, "logits/rejected": -17.51719856262207, "logps/chosen": -379.06671142578125, "logps/rejected": -303.1722106933594, "loss": 0.4647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5688257217407227, "rewards/margins": 1.0211784839630127, "rewards/rejected": 1.54764723777771, "step": 62120 }, { "epoch": 2.884535029481406, "grad_norm": 3.055389165878296, "learning_rate": 1.270058962811644e-07, "logits/chosen": -19.049850463867188, "logits/rejected": -18.63278579711914, "logps/chosen": -311.06658935546875, "logps/rejected": -329.3075256347656, "loss": 0.7783, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8190107345581055, "rewards/margins": 0.6577960252761841, "rewards/rejected": 2.16121506690979, "step": 62130 }, { "epoch": 2.884999303588839, "grad_norm": 13.416027069091797, "learning_rate": 1.269780398347184e-07, "logits/chosen": -19.010120391845703, "logits/rejected": -18.071748733520508, "logps/chosen": -339.8201599121094, 
"logps/rejected": -279.0923767089844, "loss": 0.3959, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.295398712158203, "rewards/margins": 1.5997183322906494, "rewards/rejected": 1.6956806182861328, "step": 62140 }, { "epoch": 2.885463577696272, "grad_norm": 120.89078521728516, "learning_rate": 1.2695018338827245e-07, "logits/chosen": -18.70223045349121, "logits/rejected": -18.07101058959961, "logps/chosen": -430.72210693359375, "logps/rejected": -319.3336486816406, "loss": 0.4906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5232558250427246, "rewards/margins": 1.3032528162002563, "rewards/rejected": 2.220003128051758, "step": 62150 }, { "epoch": 2.885927851803705, "grad_norm": 68.36161804199219, "learning_rate": 1.2692232694182644e-07, "logits/chosen": -19.224695205688477, "logits/rejected": -18.849000930786133, "logps/chosen": -543.0202026367188, "logps/rejected": -467.9376525878906, "loss": 0.876, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.577155590057373, "rewards/margins": 0.770442008972168, "rewards/rejected": 3.8067142963409424, "step": 62160 }, { "epoch": 2.886392125911138, "grad_norm": 36.40321731567383, "learning_rate": 1.2689447049538048e-07, "logits/chosen": -18.584211349487305, "logits/rejected": -18.412639617919922, "logps/chosen": -469.3899841308594, "logps/rejected": -346.27911376953125, "loss": 0.6537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.918623685836792, "rewards/margins": 1.3869385719299316, "rewards/rejected": 2.5316853523254395, "step": 62170 }, { "epoch": 2.886856400018571, "grad_norm": 6.161555290222168, "learning_rate": 1.268666140489345e-07, "logits/chosen": -19.08014678955078, "logits/rejected": -17.85210609436035, "logps/chosen": -307.46856689453125, "logps/rejected": -265.9548645019531, "loss": 0.1947, "rewards/accuracies": 1.0, "rewards/chosen": 3.3795418739318848, "rewards/margins": 2.059680461883545, "rewards/rejected": 1.319861650466919, "step": 62180 }, { 
"epoch": 2.887320674126004, "grad_norm": 124.0721435546875, "learning_rate": 1.268387576024885e-07, "logits/chosen": -18.56435775756836, "logits/rejected": -18.101741790771484, "logps/chosen": -241.6383819580078, "logps/rejected": -198.52735900878906, "loss": 0.6269, "rewards/accuracies": 0.5, "rewards/chosen": 1.4727413654327393, "rewards/margins": 0.30193981528282166, "rewards/rejected": 1.1708016395568848, "step": 62190 }, { "epoch": 2.887784948233437, "grad_norm": 62.63687515258789, "learning_rate": 1.2681090115604252e-07, "logits/chosen": -19.488149642944336, "logits/rejected": -18.394210815429688, "logps/chosen": -404.68768310546875, "logps/rejected": -327.90081787109375, "loss": 0.6516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.623889446258545, "rewards/margins": 1.344713807106018, "rewards/rejected": 3.2791755199432373, "step": 62200 }, { "epoch": 2.8882492223408702, "grad_norm": 13.754344940185547, "learning_rate": 1.2678583035424114e-07, "logits/chosen": -18.684192657470703, "logits/rejected": -17.979686737060547, "logps/chosen": -377.552001953125, "logps/rejected": -336.38323974609375, "loss": 0.9282, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.901839017868042, "rewards/margins": 0.9642478823661804, "rewards/rejected": 2.937591075897217, "step": 62210 }, { "epoch": 2.888713496448303, "grad_norm": 29.14336585998535, "learning_rate": 1.2675797390779516e-07, "logits/chosen": -18.768329620361328, "logits/rejected": -18.016769409179688, "logps/chosen": -392.36572265625, "logps/rejected": -268.9795227050781, "loss": 1.2468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.030932426452637, "rewards/margins": 1.5060685873031616, "rewards/rejected": 2.5248637199401855, "step": 62220 }, { "epoch": 2.889177770555736, "grad_norm": 102.55889892578125, "learning_rate": 1.2673011746134917e-07, "logits/chosen": -18.576480865478516, "logits/rejected": -17.82131576538086, "logps/chosen": -350.7585754394531, "logps/rejected": 
-232.392822265625, "loss": 0.658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4863967895507812, "rewards/margins": 1.485550880432129, "rewards/rejected": 2.0008461475372314, "step": 62230 }, { "epoch": 2.889642044663169, "grad_norm": 151.8412628173828, "learning_rate": 1.267022610149032e-07, "logits/chosen": -19.31711196899414, "logits/rejected": -19.973499298095703, "logps/chosen": -364.72576904296875, "logps/rejected": -440.8230895996094, "loss": 1.36, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.895343542098999, "rewards/margins": -0.5888751745223999, "rewards/rejected": 3.4842190742492676, "step": 62240 }, { "epoch": 2.890106318770602, "grad_norm": 0.1809758096933365, "learning_rate": 1.266744045684572e-07, "logits/chosen": -18.820693969726562, "logits/rejected": -18.73691749572754, "logps/chosen": -408.49603271484375, "logps/rejected": -356.8032531738281, "loss": 1.588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6568870544433594, "rewards/margins": 1.1173425912857056, "rewards/rejected": 2.539543867111206, "step": 62250 }, { "epoch": 2.890570592878035, "grad_norm": 65.37236785888672, "learning_rate": 1.2664654812201124e-07, "logits/chosen": -19.010639190673828, "logits/rejected": -17.753692626953125, "logps/chosen": -330.3941955566406, "logps/rejected": -251.73666381835938, "loss": 0.4381, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7516674995422363, "rewards/margins": 1.9266360998153687, "rewards/rejected": 0.8250313997268677, "step": 62260 }, { "epoch": 2.891034866985468, "grad_norm": 16.350849151611328, "learning_rate": 1.2661869167556526e-07, "logits/chosen": -18.473249435424805, "logits/rejected": -18.098125457763672, "logps/chosen": -398.36859130859375, "logps/rejected": -325.174560546875, "loss": 0.9126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.816295623779297, "rewards/margins": 0.6487770080566406, "rewards/rejected": 3.1675190925598145, "step": 62270 }, { 
"epoch": 2.8914991410929014, "grad_norm": 220.24929809570312, "learning_rate": 1.2659083522911927e-07, "logits/chosen": -19.063838958740234, "logits/rejected": -19.119617462158203, "logps/chosen": -294.21453857421875, "logps/rejected": -225.2875518798828, "loss": 0.92, "rewards/accuracies": 0.5, "rewards/chosen": 2.460136651992798, "rewards/margins": 0.7468788623809814, "rewards/rejected": 1.7132574319839478, "step": 62280 }, { "epoch": 2.891963415200334, "grad_norm": 66.58487701416016, "learning_rate": 1.2656297878267329e-07, "logits/chosen": -17.925090789794922, "logits/rejected": -17.943748474121094, "logps/chosen": -235.94631958007812, "logps/rejected": -213.09176635742188, "loss": 0.9987, "rewards/accuracies": 0.5, "rewards/chosen": 2.1461071968078613, "rewards/margins": 0.2662757337093353, "rewards/rejected": 1.8798316717147827, "step": 62290 }, { "epoch": 2.8924276893077674, "grad_norm": 38.690277099609375, "learning_rate": 1.265351223362273e-07, "logits/chosen": -19.624547958374023, "logits/rejected": -18.244396209716797, "logps/chosen": -352.9386291503906, "logps/rejected": -228.88290405273438, "loss": 0.3351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.025518417358398, "rewards/margins": 2.894806385040283, "rewards/rejected": 1.1307121515274048, "step": 62300 }, { "epoch": 2.8928919634152006, "grad_norm": 46.01997756958008, "learning_rate": 1.2650726588978131e-07, "logits/chosen": -18.698827743530273, "logits/rejected": -18.1844425201416, "logps/chosen": -361.50506591796875, "logps/rejected": -342.49871826171875, "loss": 0.6407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7849860191345215, "rewards/margins": 1.4033502340316772, "rewards/rejected": 2.381635904312134, "step": 62310 }, { "epoch": 2.8933562375226334, "grad_norm": 1.3261120319366455, "learning_rate": 1.2647940944333533e-07, "logits/chosen": -18.641693115234375, "logits/rejected": -18.00004768371582, "logps/chosen": -369.8255920410156, "logps/rejected": 
-272.2243347167969, "loss": 0.6294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.072155952453613, "rewards/margins": 1.5319632291793823, "rewards/rejected": 2.5401923656463623, "step": 62320 }, { "epoch": 2.893820511630066, "grad_norm": 0.27968549728393555, "learning_rate": 1.2645155299688937e-07, "logits/chosen": -18.967884063720703, "logits/rejected": -17.95828628540039, "logps/chosen": -354.0760803222656, "logps/rejected": -299.36505126953125, "loss": 0.4327, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4001986980438232, "rewards/margins": 1.3644052743911743, "rewards/rejected": 1.0357935428619385, "step": 62330 }, { "epoch": 2.8942847857374994, "grad_norm": 19.48273468017578, "learning_rate": 1.2642369655044336e-07, "logits/chosen": -18.01097869873047, "logits/rejected": -17.616363525390625, "logps/chosen": -340.6383361816406, "logps/rejected": -265.55316162109375, "loss": 0.3797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.55942440032959, "rewards/margins": 1.340733289718628, "rewards/rejected": 1.2186915874481201, "step": 62340 }, { "epoch": 2.8947490598449326, "grad_norm": 191.80059814453125, "learning_rate": 1.263958401039974e-07, "logits/chosen": -18.169158935546875, "logits/rejected": -18.71379852294922, "logps/chosen": -289.55059814453125, "logps/rejected": -298.4393310546875, "loss": 1.1807, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.7190078496932983, "rewards/margins": -0.6422898173332214, "rewards/rejected": 2.361297607421875, "step": 62350 }, { "epoch": 2.8952133339523654, "grad_norm": 25.46731948852539, "learning_rate": 1.2636798365755141e-07, "logits/chosen": -19.521015167236328, "logits/rejected": -18.966150283813477, "logps/chosen": -339.351806640625, "logps/rejected": -290.21868896484375, "loss": 0.4312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.786630630493164, "rewards/margins": 1.447021484375, "rewards/rejected": 2.339608907699585, "step": 62360 }, { 
"epoch": 2.8956776080597986, "grad_norm": 0.26087430119514465, "learning_rate": 1.2634012721110543e-07, "logits/chosen": -19.436063766479492, "logits/rejected": -18.609882354736328, "logps/chosen": -301.4085693359375, "logps/rejected": -239.99160766601562, "loss": 1.1437, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9849185943603516, "rewards/margins": 0.5366324186325073, "rewards/rejected": 1.448286533355713, "step": 62370 }, { "epoch": 2.896141882167232, "grad_norm": 58.693946838378906, "learning_rate": 1.2631227076465944e-07, "logits/chosen": -18.668062210083008, "logits/rejected": -18.403223037719727, "logps/chosen": -315.1877746582031, "logps/rejected": -297.8216857910156, "loss": 0.6658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4637739658355713, "rewards/margins": 0.3194466829299927, "rewards/rejected": 2.144327163696289, "step": 62380 }, { "epoch": 2.8966061562746646, "grad_norm": 161.97158813476562, "learning_rate": 1.2628441431821348e-07, "logits/chosen": -18.90622329711914, "logits/rejected": -18.315231323242188, "logps/chosen": -438.8282165527344, "logps/rejected": -378.31842041015625, "loss": 1.1331, "rewards/accuracies": 0.5, "rewards/chosen": 4.293440818786621, "rewards/margins": 0.6414868235588074, "rewards/rejected": 3.651953935623169, "step": 62390 }, { "epoch": 2.8970704303820973, "grad_norm": 173.55535888671875, "learning_rate": 1.2625655787176747e-07, "logits/chosen": -18.769489288330078, "logits/rejected": -18.35517692565918, "logps/chosen": -378.2778625488281, "logps/rejected": -428.30438232421875, "loss": 0.6639, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.086358070373535, "rewards/margins": 0.53779137134552, "rewards/rejected": 2.5485665798187256, "step": 62400 }, { "epoch": 2.8975347044895305, "grad_norm": 0.008123637177050114, "learning_rate": 1.262287014253215e-07, "logits/chosen": -19.2718448638916, "logits/rejected": -17.218761444091797, "logps/chosen": -490.4405822753906, 
"logps/rejected": -336.53143310546875, "loss": 0.2147, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.2592291831970215, "rewards/margins": 3.1316471099853516, "rewards/rejected": 2.127582311630249, "step": 62410 }, { "epoch": 2.8979989785969638, "grad_norm": 0.5624289512634277, "learning_rate": 1.2620084497887553e-07, "logits/chosen": -19.377988815307617, "logits/rejected": -18.08687973022461, "logps/chosen": -308.35858154296875, "logps/rejected": -298.32427978515625, "loss": 0.9038, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.936938762664795, "rewards/margins": 0.47568100690841675, "rewards/rejected": 2.4612579345703125, "step": 62420 }, { "epoch": 2.8984632527043965, "grad_norm": 130.78952026367188, "learning_rate": 1.2617298853242954e-07, "logits/chosen": -19.408077239990234, "logits/rejected": -18.909387588500977, "logps/chosen": -502.8836975097656, "logps/rejected": -396.62347412109375, "loss": 0.8577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6921467781066895, "rewards/margins": 0.5844284892082214, "rewards/rejected": 3.107717990875244, "step": 62430 }, { "epoch": 2.8989275268118297, "grad_norm": 86.87294006347656, "learning_rate": 1.2614513208598356e-07, "logits/chosen": -18.760364532470703, "logits/rejected": -17.91957664489746, "logps/chosen": -316.3088684082031, "logps/rejected": -287.6483459472656, "loss": 0.4048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.452695846557617, "rewards/margins": 1.6624656915664673, "rewards/rejected": 1.7902305126190186, "step": 62440 }, { "epoch": 2.899391800919263, "grad_norm": 39.36125946044922, "learning_rate": 1.261172756395376e-07, "logits/chosen": -17.772071838378906, "logits/rejected": -17.76903533935547, "logps/chosen": -362.6493225097656, "logps/rejected": -340.7837829589844, "loss": 1.2535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2130789756774902, "rewards/margins": -0.0312788262963295, "rewards/rejected": 3.2443575859069824, 
"step": 62450 }, { "epoch": 2.8998560750266957, "grad_norm": 15.023809432983398, "learning_rate": 1.2608941919309159e-07, "logits/chosen": -19.926128387451172, "logits/rejected": -18.746826171875, "logps/chosen": -430.132080078125, "logps/rejected": -306.97088623046875, "loss": 0.2629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3155643939971924, "rewards/margins": 2.0305960178375244, "rewards/rejected": 1.284968614578247, "step": 62460 }, { "epoch": 2.900320349134129, "grad_norm": 0.5977304577827454, "learning_rate": 1.2606156274664563e-07, "logits/chosen": -19.083568572998047, "logits/rejected": -18.36146354675293, "logps/chosen": -357.81121826171875, "logps/rejected": -269.1990661621094, "loss": 0.7277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.396366834640503, "rewards/margins": 1.4542583227157593, "rewards/rejected": 1.942108392715454, "step": 62470 }, { "epoch": 2.9007846232415617, "grad_norm": 0.13796621561050415, "learning_rate": 1.2603370630019964e-07, "logits/chosen": -19.987010955810547, "logits/rejected": -18.516454696655273, "logps/chosen": -390.2494201660156, "logps/rejected": -285.47674560546875, "loss": 0.5404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.3353400230407715, "rewards/margins": 2.3725428581237793, "rewards/rejected": 1.9627965688705444, "step": 62480 }, { "epoch": 2.901248897348995, "grad_norm": 29.969615936279297, "learning_rate": 1.2600584985375366e-07, "logits/chosen": -18.957172393798828, "logits/rejected": -17.863182067871094, "logps/chosen": -411.33038330078125, "logps/rejected": -368.7141418457031, "loss": 1.013, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2236602306365967, "rewards/margins": 0.43862050771713257, "rewards/rejected": 2.7850396633148193, "step": 62490 }, { "epoch": 2.9017131714564277, "grad_norm": 48.38371658325195, "learning_rate": 1.2597799340730767e-07, "logits/chosen": -19.461685180664062, "logits/rejected": -18.74765968322754, 
"logps/chosen": -348.61053466796875, "logps/rejected": -330.472900390625, "loss": 0.7173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.73504900932312, "rewards/margins": 0.8351361155509949, "rewards/rejected": 2.8999130725860596, "step": 62500 }, { "epoch": 2.902177445563861, "grad_norm": 36.15725326538086, "learning_rate": 1.2595013696086168e-07, "logits/chosen": -19.06442642211914, "logits/rejected": -17.89693260192871, "logps/chosen": -386.2347412109375, "logps/rejected": -279.4351806640625, "loss": 0.5748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8045332431793213, "rewards/margins": 1.2146801948547363, "rewards/rejected": 1.589853048324585, "step": 62510 }, { "epoch": 2.902641719671294, "grad_norm": 20.70462989807129, "learning_rate": 1.259222805144157e-07, "logits/chosen": -18.77583122253418, "logits/rejected": -17.280656814575195, "logps/chosen": -501.03167724609375, "logps/rejected": -247.4894561767578, "loss": 0.1769, "rewards/accuracies": 1.0, "rewards/chosen": 4.417090892791748, "rewards/margins": 3.1682422161102295, "rewards/rejected": 1.24884831905365, "step": 62520 }, { "epoch": 2.903105993778727, "grad_norm": 43.466854095458984, "learning_rate": 1.2589442406796971e-07, "logits/chosen": -19.02101707458496, "logits/rejected": -18.121681213378906, "logps/chosen": -437.7596740722656, "logps/rejected": -287.46044921875, "loss": 0.546, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4047741889953613, "rewards/margins": 1.1838760375976562, "rewards/rejected": 2.220897912979126, "step": 62530 }, { "epoch": 2.90357026788616, "grad_norm": 0.589487612247467, "learning_rate": 1.2586935326616833e-07, "logits/chosen": -19.170564651489258, "logits/rejected": -18.81783676147461, "logps/chosen": -381.99114990234375, "logps/rejected": -230.5006561279297, "loss": 0.6603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.428804636001587, "rewards/margins": 1.0115171670913696, "rewards/rejected": 
1.4172874689102173, "step": 62540 }, { "epoch": 2.904034541993593, "grad_norm": 54.500343322753906, "learning_rate": 1.2584149681972235e-07, "logits/chosen": -18.958608627319336, "logits/rejected": -17.35953712463379, "logps/chosen": -464.95928955078125, "logps/rejected": -258.4246826171875, "loss": 0.467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.477480888366699, "rewards/margins": 1.72626531124115, "rewards/rejected": 2.7512154579162598, "step": 62550 }, { "epoch": 2.904498816101026, "grad_norm": 47.14289093017578, "learning_rate": 1.2581364037327636e-07, "logits/chosen": -19.93853759765625, "logits/rejected": -18.964466094970703, "logps/chosen": -328.60601806640625, "logps/rejected": -285.76275634765625, "loss": 0.6395, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.597181558609009, "rewards/margins": 0.6894342303276062, "rewards/rejected": 1.9077472686767578, "step": 62560 }, { "epoch": 2.904963090208459, "grad_norm": 36.13286209106445, "learning_rate": 1.257857839268304e-07, "logits/chosen": -18.701536178588867, "logits/rejected": -18.0177059173584, "logps/chosen": -545.4730224609375, "logps/rejected": -398.4717102050781, "loss": 0.3715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.96357798576355, "rewards/margins": 1.829063057899475, "rewards/rejected": 2.1345152854919434, "step": 62570 }, { "epoch": 2.905427364315892, "grad_norm": 249.32254028320312, "learning_rate": 1.2575792748038442e-07, "logits/chosen": -18.83895492553711, "logits/rejected": -17.819339752197266, "logps/chosen": -507.37060546875, "logps/rejected": -418.69415283203125, "loss": 0.3586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.812893867492676, "rewards/margins": 2.172043561935425, "rewards/rejected": 2.640850782394409, "step": 62580 }, { "epoch": 2.9058916384233253, "grad_norm": 61.40977096557617, "learning_rate": 1.2573007103393843e-07, "logits/chosen": -18.05216407775879, "logits/rejected": -16.940418243408203, 
"logps/chosen": -333.20166015625, "logps/rejected": -266.3988342285156, "loss": 0.8176, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.661324977874756, "rewards/margins": 2.0718750953674316, "rewards/rejected": 1.5894502401351929, "step": 62590 }, { "epoch": 2.906355912530758, "grad_norm": 11.733933448791504, "learning_rate": 1.2570221458749245e-07, "logits/chosen": -18.62505531311035, "logits/rejected": -18.240360260009766, "logps/chosen": -371.0854797363281, "logps/rejected": -296.0819396972656, "loss": 0.763, "rewards/accuracies": 0.5, "rewards/chosen": 3.3057892322540283, "rewards/margins": 1.6563094854354858, "rewards/rejected": 1.6494795083999634, "step": 62600 }, { "epoch": 2.9068201866381913, "grad_norm": 29.45237922668457, "learning_rate": 1.2567435814104646e-07, "logits/chosen": -19.279279708862305, "logits/rejected": -18.48470687866211, "logps/chosen": -239.58584594726562, "logps/rejected": -263.2966003417969, "loss": 1.1028, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.419715404510498, "rewards/margins": 0.09614212810993195, "rewards/rejected": 2.323573350906372, "step": 62610 }, { "epoch": 2.907284460745624, "grad_norm": 105.90007781982422, "learning_rate": 1.2564650169460048e-07, "logits/chosen": -18.630842208862305, "logits/rejected": -18.04960060119629, "logps/chosen": -414.4828186035156, "logps/rejected": -358.61602783203125, "loss": 1.0694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6958136558532715, "rewards/margins": 0.5384188890457153, "rewards/rejected": 3.1573948860168457, "step": 62620 }, { "epoch": 2.9077487348530573, "grad_norm": 148.9222412109375, "learning_rate": 1.2561864524815452e-07, "logits/chosen": -18.637218475341797, "logits/rejected": -18.165775299072266, "logps/chosen": -429.15899658203125, "logps/rejected": -373.3334655761719, "loss": 0.783, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.489250659942627, "rewards/margins": 0.8288734555244446, "rewards/rejected": 
2.660377025604248, "step": 62630 }, { "epoch": 2.90821300896049, "grad_norm": 44.91228103637695, "learning_rate": 1.255907888017085e-07, "logits/chosen": -19.083290100097656, "logits/rejected": -18.357135772705078, "logps/chosen": -353.17242431640625, "logps/rejected": -327.58892822265625, "loss": 0.6539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.102735757827759, "rewards/margins": 1.1106903553009033, "rewards/rejected": 1.9920451641082764, "step": 62640 }, { "epoch": 2.9086772830679233, "grad_norm": 149.30044555664062, "learning_rate": 1.2556293235526255e-07, "logits/chosen": -18.77972412109375, "logits/rejected": -19.21807289123535, "logps/chosen": -428.382568359375, "logps/rejected": -395.649658203125, "loss": 0.7703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0746235847473145, "rewards/margins": 0.8222165107727051, "rewards/rejected": 3.2524070739746094, "step": 62650 }, { "epoch": 2.9091415571753565, "grad_norm": 27.66054916381836, "learning_rate": 1.2553507590881656e-07, "logits/chosen": -18.70888328552246, "logits/rejected": -17.476215362548828, "logps/chosen": -386.61749267578125, "logps/rejected": -222.1768798828125, "loss": 0.6703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3864669799804688, "rewards/margins": 1.6823084354400635, "rewards/rejected": 1.7041585445404053, "step": 62660 }, { "epoch": 2.9096058312827893, "grad_norm": 0.6988691091537476, "learning_rate": 1.2550721946237058e-07, "logits/chosen": -19.238723754882812, "logits/rejected": -17.478836059570312, "logps/chosen": -497.59619140625, "logps/rejected": -243.3216094970703, "loss": 0.1806, "rewards/accuracies": 1.0, "rewards/chosen": 5.141371726989746, "rewards/margins": 3.8017280101776123, "rewards/rejected": 1.3396438360214233, "step": 62670 }, { "epoch": 2.9100701053902225, "grad_norm": 234.55429077148438, "learning_rate": 1.254793630159246e-07, "logits/chosen": -19.459959030151367, "logits/rejected": -18.98923110961914, 
"logps/chosen": -411.8113708496094, "logps/rejected": -295.9830627441406, "loss": 0.6831, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2701334953308105, "rewards/margins": 0.8633264303207397, "rewards/rejected": 1.4068067073822021, "step": 62680 }, { "epoch": 2.9105343794976553, "grad_norm": 179.09974670410156, "learning_rate": 1.2545150656947863e-07, "logits/chosen": -18.85359764099121, "logits/rejected": -18.379024505615234, "logps/chosen": -364.92498779296875, "logps/rejected": -346.66021728515625, "loss": 0.6214, "rewards/accuracies": 0.5, "rewards/chosen": 4.080839157104492, "rewards/margins": 0.792039155960083, "rewards/rejected": 3.288800001144409, "step": 62690 }, { "epoch": 2.9109986536050885, "grad_norm": 57.69447326660156, "learning_rate": 1.2542365012303262e-07, "logits/chosen": -19.612533569335938, "logits/rejected": -18.495044708251953, "logps/chosen": -433.473876953125, "logps/rejected": -367.69171142578125, "loss": 0.9509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3865795135498047, "rewards/margins": 0.3916319012641907, "rewards/rejected": 2.994947910308838, "step": 62700 }, { "epoch": 2.9114629277125212, "grad_norm": 41.66677474975586, "learning_rate": 1.2539579367658666e-07, "logits/chosen": -19.417049407958984, "logits/rejected": -18.95330047607422, "logps/chosen": -437.78179931640625, "logps/rejected": -379.19439697265625, "loss": 0.3084, "rewards/accuracies": 1.0, "rewards/chosen": 4.49387788772583, "rewards/margins": 1.7248786687850952, "rewards/rejected": 2.768998861312866, "step": 62710 }, { "epoch": 2.9119272018199545, "grad_norm": 4.099466323852539, "learning_rate": 1.2536793723014068e-07, "logits/chosen": -19.300573348999023, "logits/rejected": -18.707956314086914, "logps/chosen": -340.85400390625, "logps/rejected": -236.6049346923828, "loss": 0.5565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.450498104095459, "rewards/margins": 1.6544421911239624, "rewards/rejected": 
1.796055793762207, "step": 62720 }, { "epoch": 2.9123914759273877, "grad_norm": 137.5384979248047, "learning_rate": 1.253400807836947e-07, "logits/chosen": -19.182209014892578, "logits/rejected": -18.476573944091797, "logps/chosen": -402.7552795410156, "logps/rejected": -361.18310546875, "loss": 0.495, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.971231460571289, "rewards/margins": 1.365771770477295, "rewards/rejected": 2.605459690093994, "step": 62730 }, { "epoch": 2.9128557500348204, "grad_norm": 60.144866943359375, "learning_rate": 1.253122243372487e-07, "logits/chosen": -19.47652816772461, "logits/rejected": -18.15018081665039, "logps/chosen": -389.78619384765625, "logps/rejected": -359.1827697753906, "loss": 0.2967, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.235306739807129, "rewards/margins": 2.3552730083465576, "rewards/rejected": 1.8800334930419922, "step": 62740 }, { "epoch": 2.9133200241422537, "grad_norm": 1.5529565811157227, "learning_rate": 1.2528436789080272e-07, "logits/chosen": -18.822839736938477, "logits/rejected": -19.69520378112793, "logps/chosen": -394.3839416503906, "logps/rejected": -444.83831787109375, "loss": 0.8491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8091118335723877, "rewards/margins": 0.30755820870399475, "rewards/rejected": 3.501554012298584, "step": 62750 }, { "epoch": 2.913784298249687, "grad_norm": 42.23231506347656, "learning_rate": 1.2525651144435673e-07, "logits/chosen": -19.986574172973633, "logits/rejected": -18.934362411499023, "logps/chosen": -348.62896728515625, "logps/rejected": -301.90008544921875, "loss": 0.9744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7548224925994873, "rewards/margins": 0.4945167005062103, "rewards/rejected": 2.260305643081665, "step": 62760 }, { "epoch": 2.9142485723571196, "grad_norm": 91.58769989013672, "learning_rate": 1.2522865499791075e-07, "logits/chosen": -18.692291259765625, "logits/rejected": 
-17.856861114501953, "logps/chosen": -348.7126159667969, "logps/rejected": -282.15740966796875, "loss": 0.7972, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8805668354034424, "rewards/margins": 1.0486624240875244, "rewards/rejected": 1.8319041728973389, "step": 62770 }, { "epoch": 2.9147128464645524, "grad_norm": 82.66007995605469, "learning_rate": 1.252007985514648e-07, "logits/chosen": -19.326265335083008, "logits/rejected": -18.364519119262695, "logps/chosen": -334.17840576171875, "logps/rejected": -233.35867309570312, "loss": 0.4814, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6062583923339844, "rewards/margins": 1.4821773767471313, "rewards/rejected": 1.1240811347961426, "step": 62780 }, { "epoch": 2.9151771205719856, "grad_norm": 129.94073486328125, "learning_rate": 1.251729421050188e-07, "logits/chosen": -18.940814971923828, "logits/rejected": -18.431194305419922, "logps/chosen": -355.50982666015625, "logps/rejected": -281.95001220703125, "loss": 0.7817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9017333984375, "rewards/margins": 0.8715171813964844, "rewards/rejected": 2.0302159786224365, "step": 62790 }, { "epoch": 2.915641394679419, "grad_norm": 0.3539121448993683, "learning_rate": 1.2514508565857282e-07, "logits/chosen": -20.3199405670166, "logits/rejected": -19.66843032836914, "logps/chosen": -475.2330017089844, "logps/rejected": -313.655517578125, "loss": 0.2962, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.17091178894043, "rewards/margins": 2.261132001876831, "rewards/rejected": 2.9097793102264404, "step": 62800 }, { "epoch": 2.9161056687868516, "grad_norm": 11.454623222351074, "learning_rate": 1.2511722921212683e-07, "logits/chosen": -19.477113723754883, "logits/rejected": -18.25359344482422, "logps/chosen": -425.57781982421875, "logps/rejected": -384.6851501464844, "loss": 0.9409, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.614806175231934, "rewards/margins": 
1.0748710632324219, "rewards/rejected": 3.5399348735809326, "step": 62810 }, { "epoch": 2.916569942894285, "grad_norm": 36.59568405151367, "learning_rate": 1.2508937276568085e-07, "logits/chosen": -19.90216064453125, "logits/rejected": -19.695751190185547, "logps/chosen": -419.8402404785156, "logps/rejected": -374.0797424316406, "loss": 0.8642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1399638652801514, "rewards/margins": 0.23637942969799042, "rewards/rejected": 2.9035840034484863, "step": 62820 }, { "epoch": 2.917034217001718, "grad_norm": 41.695526123046875, "learning_rate": 1.2506151631923486e-07, "logits/chosen": -20.390058517456055, "logits/rejected": -19.35464096069336, "logps/chosen": -387.2584228515625, "logps/rejected": -271.91204833984375, "loss": 0.5702, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.0783586502075195, "rewards/margins": 1.5532430410385132, "rewards/rejected": 2.525115489959717, "step": 62830 }, { "epoch": 2.917498491109151, "grad_norm": 194.67442321777344, "learning_rate": 1.250336598727889e-07, "logits/chosen": -19.164104461669922, "logits/rejected": -17.560054779052734, "logps/chosen": -397.422119140625, "logps/rejected": -319.38714599609375, "loss": 0.6212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.304039478302002, "rewards/margins": 1.6246525049209595, "rewards/rejected": 2.679387092590332, "step": 62840 }, { "epoch": 2.9179627652165836, "grad_norm": 183.90672302246094, "learning_rate": 1.250058034263429e-07, "logits/chosen": -19.248136520385742, "logits/rejected": -17.83863067626953, "logps/chosen": -463.70281982421875, "logps/rejected": -482.049560546875, "loss": 0.6839, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9655888080596924, "rewards/margins": 0.5341842770576477, "rewards/rejected": 3.4314045906066895, "step": 62850 }, { "epoch": 2.918427039324017, "grad_norm": 44.521602630615234, "learning_rate": 1.2497794697989693e-07, "logits/chosen": 
-19.14954948425293, "logits/rejected": -18.723581314086914, "logps/chosen": -387.0622863769531, "logps/rejected": -288.53399658203125, "loss": 1.5537, "rewards/accuracies": 0.5, "rewards/chosen": 2.1861512660980225, "rewards/margins": -0.14434058964252472, "rewards/rejected": 2.3304920196533203, "step": 62860 }, { "epoch": 2.91889131343145, "grad_norm": 1.0623745918273926, "learning_rate": 1.2495009053345095e-07, "logits/chosen": -18.9771728515625, "logits/rejected": -18.291431427001953, "logps/chosen": -463.97393798828125, "logps/rejected": -402.76690673828125, "loss": 0.5458, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.187462329864502, "rewards/margins": 1.3093957901000977, "rewards/rejected": 2.878065824508667, "step": 62870 }, { "epoch": 2.919355587538883, "grad_norm": 137.73828125, "learning_rate": 1.2492223408700496e-07, "logits/chosen": -20.709918975830078, "logits/rejected": -19.398040771484375, "logps/chosen": -387.0171813964844, "logps/rejected": -263.52545166015625, "loss": 0.4411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.755711793899536, "rewards/margins": 0.9890413284301758, "rewards/rejected": 2.7666707038879395, "step": 62880 }, { "epoch": 2.919819861646316, "grad_norm": 55.02535629272461, "learning_rate": 1.2489437764055898e-07, "logits/chosen": -18.547222137451172, "logits/rejected": -19.004314422607422, "logps/chosen": -267.8970031738281, "logps/rejected": -367.5880126953125, "loss": 1.1255, "rewards/accuracies": 0.5, "rewards/chosen": 2.4304299354553223, "rewards/margins": -0.49186745285987854, "rewards/rejected": 2.922297716140747, "step": 62890 }, { "epoch": 2.9202841357537492, "grad_norm": 87.91539001464844, "learning_rate": 1.2486652119411302e-07, "logits/chosen": -19.408212661743164, "logits/rejected": -20.319089889526367, "logps/chosen": -452.545654296875, "logps/rejected": -478.1493225097656, "loss": 1.0712, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6342689990997314, 
"rewards/margins": -0.050693608820438385, "rewards/rejected": 3.684962034225464, "step": 62900 }, { "epoch": 2.920748409861182, "grad_norm": 155.93386840820312, "learning_rate": 1.24838664747667e-07, "logits/chosen": -19.544986724853516, "logits/rejected": -19.009830474853516, "logps/chosen": -369.2158508300781, "logps/rejected": -348.58477783203125, "loss": 0.8335, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.549792766571045, "rewards/margins": 0.5167677402496338, "rewards/rejected": 3.033025026321411, "step": 62910 }, { "epoch": 2.921212683968615, "grad_norm": 67.18598175048828, "learning_rate": 1.2481080830122105e-07, "logits/chosen": -18.846023559570312, "logits/rejected": -18.19553565979004, "logps/chosen": -358.3208923339844, "logps/rejected": -334.6431884765625, "loss": 1.1807, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.60227632522583, "rewards/margins": 0.002938568592071533, "rewards/rejected": 2.5993378162384033, "step": 62920 }, { "epoch": 2.921676958076048, "grad_norm": 117.17798614501953, "learning_rate": 1.2478295185477506e-07, "logits/chosen": -17.979013442993164, "logits/rejected": -18.059181213378906, "logps/chosen": -313.7515869140625, "logps/rejected": -308.9848937988281, "loss": 0.7767, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.081892967224121, "rewards/margins": 0.40653347969055176, "rewards/rejected": 1.6753594875335693, "step": 62930 }, { "epoch": 2.922141232183481, "grad_norm": 149.9634246826172, "learning_rate": 1.2475509540832908e-07, "logits/chosen": -19.178359985351562, "logits/rejected": -18.563983917236328, "logps/chosen": -366.9504089355469, "logps/rejected": -271.13275146484375, "loss": 0.652, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.599259853363037, "rewards/margins": 1.2306534051895142, "rewards/rejected": 1.3686063289642334, "step": 62940 }, { "epoch": 2.922605506290914, "grad_norm": 0.06389550119638443, "learning_rate": 1.247272389618831e-07, 
"logits/chosen": -19.866243362426758, "logits/rejected": -18.63780403137207, "logps/chosen": -396.1796569824219, "logps/rejected": -279.99200439453125, "loss": 0.6134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.513380765914917, "rewards/margins": 1.7831695079803467, "rewards/rejected": 1.7302110195159912, "step": 62950 }, { "epoch": 2.923069780398347, "grad_norm": 278.673828125, "learning_rate": 1.246993825154371e-07, "logits/chosen": -18.901615142822266, "logits/rejected": -19.375764846801758, "logps/chosen": -392.93902587890625, "logps/rejected": -448.2421875, "loss": 1.2901, "rewards/accuracies": 0.5, "rewards/chosen": 2.769594669342041, "rewards/margins": -0.6242049932479858, "rewards/rejected": 3.3937995433807373, "step": 62960 }, { "epoch": 2.9235340545057804, "grad_norm": 6.535861968994141, "learning_rate": 1.2467152606899112e-07, "logits/chosen": -19.19675064086914, "logits/rejected": -17.92098617553711, "logps/chosen": -369.954345703125, "logps/rejected": -214.821044921875, "loss": 0.3575, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3367931842803955, "rewards/margins": 1.6764005422592163, "rewards/rejected": 0.6603926420211792, "step": 62970 }, { "epoch": 2.923998328613213, "grad_norm": 79.546630859375, "learning_rate": 1.2464366962254513e-07, "logits/chosen": -19.101465225219727, "logits/rejected": -18.61997413635254, "logps/chosen": -372.0353698730469, "logps/rejected": -303.87408447265625, "loss": 0.7995, "rewards/accuracies": 0.5, "rewards/chosen": 2.6431033611297607, "rewards/margins": 0.632031261920929, "rewards/rejected": 2.0110721588134766, "step": 62980 }, { "epoch": 2.9244626027206464, "grad_norm": 45.313087463378906, "learning_rate": 1.2461581317609917e-07, "logits/chosen": -19.826021194458008, "logits/rejected": -19.949825286865234, "logps/chosen": -352.0834655761719, "logps/rejected": -370.77423095703125, "loss": 1.036, "rewards/accuracies": 0.5, "rewards/chosen": 3.881840229034424, "rewards/margins": 
-0.010292601771652699, "rewards/rejected": 3.8921332359313965, "step": 62990 }, { "epoch": 2.924926876828079, "grad_norm": 16.8588924407959, "learning_rate": 1.245879567296532e-07, "logits/chosen": -19.507850646972656, "logits/rejected": -18.901885986328125, "logps/chosen": -391.31842041015625, "logps/rejected": -338.780517578125, "loss": 0.9295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.840381622314453, "rewards/margins": 1.5740995407104492, "rewards/rejected": 3.266281843185425, "step": 63000 }, { "epoch": 2.9253911509355124, "grad_norm": 0.059909168630838394, "learning_rate": 1.245601002832072e-07, "logits/chosen": -18.415042877197266, "logits/rejected": -17.706331253051758, "logps/chosen": -383.5172119140625, "logps/rejected": -314.36334228515625, "loss": 0.5992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.236913681030273, "rewards/margins": 1.778658151626587, "rewards/rejected": 2.4582555294036865, "step": 63010 }, { "epoch": 2.925855425042945, "grad_norm": 14.430535316467285, "learning_rate": 1.2453224383676122e-07, "logits/chosen": -18.295963287353516, "logits/rejected": -17.651132583618164, "logps/chosen": -405.2996520996094, "logps/rejected": -380.9566650390625, "loss": 0.9139, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0534234046936035, "rewards/margins": 0.7719090580940247, "rewards/rejected": 2.2815146446228027, "step": 63020 }, { "epoch": 2.9263196991503784, "grad_norm": 157.2518310546875, "learning_rate": 1.2450438739031523e-07, "logits/chosen": -18.292375564575195, "logits/rejected": -17.339330673217773, "logps/chosen": -431.9559631347656, "logps/rejected": -277.3096618652344, "loss": 0.7404, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6171231269836426, "rewards/margins": 0.7551721930503845, "rewards/rejected": 2.8619508743286133, "step": 63030 }, { "epoch": 2.9267839732578116, "grad_norm": 1.4410008192062378, "learning_rate": 1.2447653094386925e-07, "logits/chosen": 
-18.902708053588867, "logits/rejected": -17.664125442504883, "logps/chosen": -337.6425476074219, "logps/rejected": -302.09100341796875, "loss": 0.5956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.693281888961792, "rewards/margins": 1.2727535963058472, "rewards/rejected": 1.4205282926559448, "step": 63040 }, { "epoch": 2.9272482473652444, "grad_norm": 10.675631523132324, "learning_rate": 1.244486744974233e-07, "logits/chosen": -19.411212921142578, "logits/rejected": -19.028568267822266, "logps/chosen": -499.71759033203125, "logps/rejected": -400.2601013183594, "loss": 0.4827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.868002891540527, "rewards/margins": 1.0590204000473022, "rewards/rejected": 3.8089821338653564, "step": 63050 }, { "epoch": 2.9277125214726776, "grad_norm": 78.49425506591797, "learning_rate": 1.2442081805097728e-07, "logits/chosen": -18.451358795166016, "logits/rejected": -17.974899291992188, "logps/chosen": -468.9566955566406, "logps/rejected": -387.3644104003906, "loss": 0.8224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.582103729248047, "rewards/margins": 0.8419790267944336, "rewards/rejected": 2.7401249408721924, "step": 63060 }, { "epoch": 2.9281767955801103, "grad_norm": 100.59871673583984, "learning_rate": 1.2439296160453132e-07, "logits/chosen": -18.80712890625, "logits/rejected": -18.11930274963379, "logps/chosen": -418.3861389160156, "logps/rejected": -383.0146789550781, "loss": 0.6968, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.846240520477295, "rewards/margins": 1.525750756263733, "rewards/rejected": 2.3204901218414307, "step": 63070 }, { "epoch": 2.9286410696875436, "grad_norm": 83.14065551757812, "learning_rate": 1.2436510515808533e-07, "logits/chosen": -18.73243522644043, "logits/rejected": -18.609256744384766, "logps/chosen": -258.4059753417969, "logps/rejected": -250.1883544921875, "loss": 0.886, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
1.8485418558120728, "rewards/margins": 0.20116452872753143, "rewards/rejected": 1.647377371788025, "step": 63080 }, { "epoch": 2.9291053437949763, "grad_norm": 24.379352569580078, "learning_rate": 1.2433724871163935e-07, "logits/chosen": -18.634357452392578, "logits/rejected": -18.624874114990234, "logps/chosen": -467.88043212890625, "logps/rejected": -361.44842529296875, "loss": 0.737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6916096210479736, "rewards/margins": 0.8999456167221069, "rewards/rejected": 2.7916643619537354, "step": 63090 }, { "epoch": 2.9295696179024096, "grad_norm": 186.3354034423828, "learning_rate": 1.2430939226519336e-07, "logits/chosen": -18.86119842529297, "logits/rejected": -18.988834381103516, "logps/chosen": -322.93792724609375, "logps/rejected": -383.773681640625, "loss": 1.4699, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.201920747756958, "rewards/margins": -0.5724945068359375, "rewards/rejected": 3.7744152545928955, "step": 63100 }, { "epoch": 2.9300338920098428, "grad_norm": 8.76284122467041, "learning_rate": 1.242815358187474e-07, "logits/chosen": -19.045700073242188, "logits/rejected": -18.789745330810547, "logps/chosen": -399.240234375, "logps/rejected": -421.86505126953125, "loss": 1.1179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9928126335144043, "rewards/margins": 0.24834999442100525, "rewards/rejected": 3.744462490081787, "step": 63110 }, { "epoch": 2.9304981661172755, "grad_norm": 44.191734313964844, "learning_rate": 1.242536793723014e-07, "logits/chosen": -19.032808303833008, "logits/rejected": -18.67921257019043, "logps/chosen": -382.07073974609375, "logps/rejected": -345.28643798828125, "loss": 0.6798, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9784996509552, "rewards/margins": 0.5325437784194946, "rewards/rejected": 3.445955991744995, "step": 63120 }, { "epoch": 2.9309624402247088, "grad_norm": 1.4035018682479858, "learning_rate": 
1.242258229258554e-07, "logits/chosen": -19.28877830505371, "logits/rejected": -17.730485916137695, "logps/chosen": -318.88250732421875, "logps/rejected": -198.52059936523438, "loss": 0.3613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6366190910339355, "rewards/margins": 2.167433500289917, "rewards/rejected": 0.46918588876724243, "step": 63130 }, { "epoch": 2.931426714332142, "grad_norm": 44.4755859375, "learning_rate": 1.2419796647940945e-07, "logits/chosen": -19.837251663208008, "logits/rejected": -18.349109649658203, "logps/chosen": -526.3748779296875, "logps/rejected": -427.17950439453125, "loss": 0.2884, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.983584403991699, "rewards/margins": 2.4971160888671875, "rewards/rejected": 2.4864680767059326, "step": 63140 }, { "epoch": 2.9318909884395747, "grad_norm": 268.3562316894531, "learning_rate": 1.2417011003296346e-07, "logits/chosen": -20.325151443481445, "logits/rejected": -20.088897705078125, "logps/chosen": -388.21356201171875, "logps/rejected": -422.1708068847656, "loss": 0.8783, "rewards/accuracies": 0.5, "rewards/chosen": 3.4958534240722656, "rewards/margins": 0.30763375759124756, "rewards/rejected": 3.1882195472717285, "step": 63150 }, { "epoch": 2.9323552625470075, "grad_norm": 19.014652252197266, "learning_rate": 1.2414225358651747e-07, "logits/chosen": -18.5540828704834, "logits/rejected": -17.825878143310547, "logps/chosen": -404.8799743652344, "logps/rejected": -348.10430908203125, "loss": 0.9884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.05765700340271, "rewards/margins": 0.4145488142967224, "rewards/rejected": 2.643108606338501, "step": 63160 }, { "epoch": 2.9328195366544407, "grad_norm": 77.17476654052734, "learning_rate": 1.241143971400715e-07, "logits/chosen": -19.311777114868164, "logits/rejected": -18.535951614379883, "logps/chosen": -418.72412109375, "logps/rejected": -382.8927001953125, "loss": 0.3767, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 4.687860488891602, "rewards/margins": 1.928154706954956, "rewards/rejected": 2.7597060203552246, "step": 63170 }, { "epoch": 2.933283810761874, "grad_norm": 123.97725677490234, "learning_rate": 1.240865406936255e-07, "logits/chosen": -18.73661994934082, "logits/rejected": -18.579286575317383, "logps/chosen": -498.7777404785156, "logps/rejected": -460.484130859375, "loss": 0.8325, "rewards/accuracies": 0.5, "rewards/chosen": 3.988929271697998, "rewards/margins": 0.28971830010414124, "rewards/rejected": 3.6992106437683105, "step": 63180 }, { "epoch": 2.9337480848693067, "grad_norm": 11.017008781433105, "learning_rate": 1.2405868424717952e-07, "logits/chosen": -19.18942642211914, "logits/rejected": -18.241228103637695, "logps/chosen": -407.6993408203125, "logps/rejected": -312.97412109375, "loss": 0.7765, "rewards/accuracies": 0.5, "rewards/chosen": 2.6746749877929688, "rewards/margins": 0.8562618494033813, "rewards/rejected": 1.8184131383895874, "step": 63190 }, { "epoch": 2.93421235897674, "grad_norm": 45.73081588745117, "learning_rate": 1.2403082780073356e-07, "logits/chosen": -19.643238067626953, "logits/rejected": -18.923131942749023, "logps/chosen": -482.0834045410156, "logps/rejected": -399.1572265625, "loss": 0.9773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.360517978668213, "rewards/margins": 1.0738340616226196, "rewards/rejected": 3.286684036254883, "step": 63200 }, { "epoch": 2.934676633084173, "grad_norm": 9.729272842407227, "learning_rate": 1.2400297135428757e-07, "logits/chosen": -19.024959564208984, "logits/rejected": -18.42575454711914, "logps/chosen": -419.05133056640625, "logps/rejected": -344.7752990722656, "loss": 0.469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.071516752243042, "rewards/margins": 1.2377127408981323, "rewards/rejected": 1.8338041305541992, "step": 63210 }, { "epoch": 2.935140907191606, "grad_norm": 63.4168815612793, "learning_rate": 1.239751149078416e-07, "logits/chosen": 
-18.615554809570312, "logits/rejected": -17.657888412475586, "logps/chosen": -455.75091552734375, "logps/rejected": -320.7696838378906, "loss": 0.4915, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8331351280212402, "rewards/margins": 1.3335388898849487, "rewards/rejected": 2.499596118927002, "step": 63220 }, { "epoch": 2.9356051812990387, "grad_norm": 149.12098693847656, "learning_rate": 1.239472584613956e-07, "logits/chosen": -19.412878036499023, "logits/rejected": -18.861156463623047, "logps/chosen": -506.9388732910156, "logps/rejected": -491.9967346191406, "loss": 0.9613, "rewards/accuracies": 0.5, "rewards/chosen": 5.2085089683532715, "rewards/margins": 0.851709246635437, "rewards/rejected": 4.356799602508545, "step": 63230 }, { "epoch": 2.936069455406472, "grad_norm": 0.9298703074455261, "learning_rate": 1.2391940201494962e-07, "logits/chosen": -18.50896453857422, "logits/rejected": -18.288955688476562, "logps/chosen": -406.3099670410156, "logps/rejected": -326.8017272949219, "loss": 0.6882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5211398601531982, "rewards/margins": 2.0599918365478516, "rewards/rejected": 1.4611479043960571, "step": 63240 }, { "epoch": 2.936533729513905, "grad_norm": 24.728782653808594, "learning_rate": 1.2389154556850363e-07, "logits/chosen": -18.059595108032227, "logits/rejected": -17.3575439453125, "logps/chosen": -418.76055908203125, "logps/rejected": -334.7601013183594, "loss": 0.6912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9631292819976807, "rewards/margins": 1.0878849029541016, "rewards/rejected": 1.875244379043579, "step": 63250 }, { "epoch": 2.936998003621338, "grad_norm": 156.51918029785156, "learning_rate": 1.2386368912205767e-07, "logits/chosen": -18.79909324645996, "logits/rejected": -18.052953720092773, "logps/chosen": -351.0901794433594, "logps/rejected": -289.7085876464844, "loss": 1.0402, "rewards/accuracies": 0.5, "rewards/chosen": 3.504054546356201, 
"rewards/margins": 0.8648084402084351, "rewards/rejected": 2.6392464637756348, "step": 63260 }, { "epoch": 2.937462277728771, "grad_norm": 31.21949577331543, "learning_rate": 1.2383583267561166e-07, "logits/chosen": -19.02462387084961, "logits/rejected": -18.93207359313965, "logps/chosen": -331.3222961425781, "logps/rejected": -339.5601806640625, "loss": 1.0168, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.677410840988159, "rewards/margins": 0.5244124531745911, "rewards/rejected": 2.152998447418213, "step": 63270 }, { "epoch": 2.9379265518362043, "grad_norm": 250.6995391845703, "learning_rate": 1.238079762291657e-07, "logits/chosen": -18.580257415771484, "logits/rejected": -18.149250030517578, "logps/chosen": -406.13226318359375, "logps/rejected": -326.3709411621094, "loss": 0.556, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.515235424041748, "rewards/margins": 1.258453369140625, "rewards/rejected": 2.256782054901123, "step": 63280 }, { "epoch": 2.938390825943637, "grad_norm": 266.2303771972656, "learning_rate": 1.2378011978271972e-07, "logits/chosen": -19.7463436126709, "logits/rejected": -18.566692352294922, "logps/chosen": -452.30181884765625, "logps/rejected": -311.1459655761719, "loss": 1.0453, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.414236545562744, "rewards/margins": 1.5024759769439697, "rewards/rejected": 2.9117603302001953, "step": 63290 }, { "epoch": 2.9388551000510703, "grad_norm": 0.793285608291626, "learning_rate": 1.2375226333627373e-07, "logits/chosen": -18.97340202331543, "logits/rejected": -18.13518714904785, "logps/chosen": -365.14874267578125, "logps/rejected": -329.0491943359375, "loss": 0.8145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.67457914352417, "rewards/margins": 1.1058406829833984, "rewards/rejected": 2.5687386989593506, "step": 63300 }, { "epoch": 2.939319374158503, "grad_norm": 19.037961959838867, "learning_rate": 1.2372440688982775e-07, "logits/chosen": 
-18.703319549560547, "logits/rejected": -18.268434524536133, "logps/chosen": -304.62811279296875, "logps/rejected": -259.18536376953125, "loss": 0.4955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5932822227478027, "rewards/margins": 1.0480753183364868, "rewards/rejected": 1.5452067852020264, "step": 63310 }, { "epoch": 2.9397836482659363, "grad_norm": 50.325225830078125, "learning_rate": 1.2369655044338179e-07, "logits/chosen": -19.99062728881836, "logits/rejected": -19.228910446166992, "logps/chosen": -337.24554443359375, "logps/rejected": -267.02581787109375, "loss": 0.3395, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4452648162841797, "rewards/margins": 1.68511164188385, "rewards/rejected": 1.76015305519104, "step": 63320 }, { "epoch": 2.940247922373369, "grad_norm": 45.62361526489258, "learning_rate": 1.2366869399693577e-07, "logits/chosen": -17.827316284179688, "logits/rejected": -17.65073013305664, "logps/chosen": -317.71405029296875, "logps/rejected": -244.1896514892578, "loss": 0.9654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0707452297210693, "rewards/margins": 0.49116286635398865, "rewards/rejected": 1.5795822143554688, "step": 63330 }, { "epoch": 2.9407121964808023, "grad_norm": 2.1540422439575195, "learning_rate": 1.236408375504898e-07, "logits/chosen": -18.5123291015625, "logits/rejected": -17.708791732788086, "logps/chosen": -343.9941711425781, "logps/rejected": -217.57852172851562, "loss": 0.8272, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1287267208099365, "rewards/margins": 1.2689182758331299, "rewards/rejected": 0.859808623790741, "step": 63340 }, { "epoch": 2.9411764705882355, "grad_norm": 23.242965698242188, "learning_rate": 1.2361298110404383e-07, "logits/chosen": -18.55341148376465, "logits/rejected": -17.250581741333008, "logps/chosen": -461.1244201660156, "logps/rejected": -318.6314392089844, "loss": 0.7319, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.811680555343628, "rewards/margins": 1.978959083557129, "rewards/rejected": 1.832721471786499, "step": 63350 }, { "epoch": 2.9416407446956683, "grad_norm": 16.028854370117188, "learning_rate": 1.2358512465759784e-07, "logits/chosen": -20.031574249267578, "logits/rejected": -18.83942222595215, "logps/chosen": -414.332275390625, "logps/rejected": -311.09912109375, "loss": 0.4641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.141493797302246, "rewards/margins": 1.5516201257705688, "rewards/rejected": 2.5898735523223877, "step": 63360 }, { "epoch": 2.9421050188031015, "grad_norm": 146.34664916992188, "learning_rate": 1.2355726821115186e-07, "logits/chosen": -18.38018035888672, "logits/rejected": -19.03474235534668, "logps/chosen": -363.5600891113281, "logps/rejected": -407.3187561035156, "loss": 1.3166, "rewards/accuracies": 0.5, "rewards/chosen": 3.1119112968444824, "rewards/margins": -0.4625609517097473, "rewards/rejected": 3.574471950531006, "step": 63370 }, { "epoch": 2.9425692929105343, "grad_norm": 170.6849822998047, "learning_rate": 1.2352941176470587e-07, "logits/chosen": -18.044645309448242, "logits/rejected": -17.796714782714844, "logps/chosen": -318.97845458984375, "logps/rejected": -278.2467956542969, "loss": 1.0376, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.589561700820923, "rewards/margins": 0.12593479454517365, "rewards/rejected": 2.4636266231536865, "step": 63380 }, { "epoch": 2.9430335670179675, "grad_norm": 10.968884468078613, "learning_rate": 1.235015553182599e-07, "logits/chosen": -18.947275161743164, "logits/rejected": -18.049068450927734, "logps/chosen": -388.3095703125, "logps/rejected": -271.19464111328125, "loss": 0.3591, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.927011013031006, "rewards/margins": 1.3642466068267822, "rewards/rejected": 1.5627641677856445, "step": 63390 }, { "epoch": 2.9434978411254002, "grad_norm": 61.63882064819336, "learning_rate": 1.234736988718139e-07, 
"logits/chosen": -18.616741180419922, "logits/rejected": -18.355060577392578, "logps/chosen": -355.6933898925781, "logps/rejected": -298.1930236816406, "loss": 0.7325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4113850593566895, "rewards/margins": 1.0373393297195435, "rewards/rejected": 2.3740456104278564, "step": 63400 }, { "epoch": 2.9439621152328335, "grad_norm": 3.9541876316070557, "learning_rate": 1.2344584242536794e-07, "logits/chosen": -19.317230224609375, "logits/rejected": -17.74283218383789, "logps/chosen": -410.7726135253906, "logps/rejected": -268.1795959472656, "loss": 0.5084, "rewards/accuracies": 0.5, "rewards/chosen": 3.0870542526245117, "rewards/margins": 1.6292909383773804, "rewards/rejected": 1.457763433456421, "step": 63410 }, { "epoch": 2.9444263893402667, "grad_norm": 38.33717727661133, "learning_rate": 1.2341798597892196e-07, "logits/chosen": -19.101512908935547, "logits/rejected": -18.30257797241211, "logps/chosen": -399.03021240234375, "logps/rejected": -297.6232604980469, "loss": 0.5436, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6347692012786865, "rewards/margins": 1.0794543027877808, "rewards/rejected": 1.5553147792816162, "step": 63420 }, { "epoch": 2.9448906634476995, "grad_norm": 30.582080841064453, "learning_rate": 1.2339012953247597e-07, "logits/chosen": -19.209774017333984, "logits/rejected": -18.564212799072266, "logps/chosen": -344.37249755859375, "logps/rejected": -298.5837707519531, "loss": 0.4652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1503612995147705, "rewards/margins": 1.412006139755249, "rewards/rejected": 1.7383549213409424, "step": 63430 }, { "epoch": 2.9453549375551327, "grad_norm": 116.2315444946289, "learning_rate": 1.2336227308603e-07, "logits/chosen": -19.964366912841797, "logits/rejected": -18.664752960205078, "logps/chosen": -422.85247802734375, "logps/rejected": -231.61355590820312, "loss": 0.2954, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 4.46588659286499, "rewards/margins": 2.5183184146881104, "rewards/rejected": 1.9475685358047485, "step": 63440 }, { "epoch": 2.9458192116625654, "grad_norm": 112.31964111328125, "learning_rate": 1.23334416639584e-07, "logits/chosen": -19.903724670410156, "logits/rejected": -19.817564010620117, "logps/chosen": -354.0345153808594, "logps/rejected": -352.0708312988281, "loss": 0.8642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.04890775680542, "rewards/margins": 0.20310716331005096, "rewards/rejected": 2.8458003997802734, "step": 63450 }, { "epoch": 2.9462834857699987, "grad_norm": 0.5179330110549927, "learning_rate": 1.2330656019313802e-07, "logits/chosen": -19.207645416259766, "logits/rejected": -17.828292846679688, "logps/chosen": -377.24932861328125, "logps/rejected": -294.1837463378906, "loss": 0.3401, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.2224273681640625, "rewards/margins": 1.9955894947052002, "rewards/rejected": 2.226837635040283, "step": 63460 }, { "epoch": 2.9467477598774314, "grad_norm": 88.70304107666016, "learning_rate": 1.2327870374669206e-07, "logits/chosen": -19.092985153198242, "logits/rejected": -18.512531280517578, "logps/chosen": -398.59039306640625, "logps/rejected": -374.40460205078125, "loss": 0.7599, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5019054412841797, "rewards/margins": 0.9391781091690063, "rewards/rejected": 2.562727212905884, "step": 63470 }, { "epoch": 2.9472120339848646, "grad_norm": 25.922626495361328, "learning_rate": 1.2325084730024605e-07, "logits/chosen": -18.477962493896484, "logits/rejected": -18.011924743652344, "logps/chosen": -429.4297790527344, "logps/rejected": -390.6921691894531, "loss": 1.3902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.815417766571045, "rewards/margins": 0.05004405975341797, "rewards/rejected": 2.765373706817627, "step": 63480 }, { "epoch": 2.947676308092298, "grad_norm": 46.770294189453125, 
"learning_rate": 1.2322299085380009e-07, "logits/chosen": -19.181079864501953, "logits/rejected": -17.46030044555664, "logps/chosen": -479.49102783203125, "logps/rejected": -320.2474670410156, "loss": 0.3628, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.49935245513916, "rewards/margins": 2.114464044570923, "rewards/rejected": 2.3848888874053955, "step": 63490 }, { "epoch": 2.9481405821997306, "grad_norm": 148.5366668701172, "learning_rate": 1.231951344073541e-07, "logits/chosen": -18.69766616821289, "logits/rejected": -18.093381881713867, "logps/chosen": -386.64947509765625, "logps/rejected": -347.77337646484375, "loss": 0.4481, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5009846687316895, "rewards/margins": 0.8228254318237305, "rewards/rejected": 1.6781591176986694, "step": 63500 }, { "epoch": 2.948604856307164, "grad_norm": 125.87996673583984, "learning_rate": 1.2316727796090812e-07, "logits/chosen": -19.215900421142578, "logits/rejected": -17.80264663696289, "logps/chosen": -433.7227478027344, "logps/rejected": -333.5296630859375, "loss": 0.3853, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.8595781326293945, "rewards/margins": 2.100510358810425, "rewards/rejected": 2.7590675354003906, "step": 63510 }, { "epoch": 2.9490691304145966, "grad_norm": 85.05292510986328, "learning_rate": 1.2313942151446213e-07, "logits/chosen": -18.77684783935547, "logits/rejected": -18.864727020263672, "logps/chosen": -464.500244140625, "logps/rejected": -438.18243408203125, "loss": 1.4859, "rewards/accuracies": 0.5, "rewards/chosen": 2.970569133758545, "rewards/margins": -0.43403902649879456, "rewards/rejected": 3.4046082496643066, "step": 63520 }, { "epoch": 2.94953340452203, "grad_norm": 37.50617980957031, "learning_rate": 1.2311156506801614e-07, "logits/chosen": -19.20108413696289, "logits/rejected": -18.396820068359375, "logps/chosen": -409.1972351074219, "logps/rejected": -264.1435852050781, "loss": 0.4692, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.9525012969970703, "rewards/margins": 1.827039361000061, "rewards/rejected": 2.125462055206299, "step": 63530 }, { "epoch": 2.9499976786294626, "grad_norm": 52.28197479248047, "learning_rate": 1.2308370862157016e-07, "logits/chosen": -19.07817268371582, "logits/rejected": -19.148698806762695, "logps/chosen": -419.6564025878906, "logps/rejected": -432.83709716796875, "loss": 0.6734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8669593334198, "rewards/margins": 0.6287382245063782, "rewards/rejected": 3.2382214069366455, "step": 63540 }, { "epoch": 2.950461952736896, "grad_norm": 102.94093322753906, "learning_rate": 1.2305585217512417e-07, "logits/chosen": -19.527271270751953, "logits/rejected": -18.407608032226562, "logps/chosen": -478.5140686035156, "logps/rejected": -281.4930725097656, "loss": 0.3362, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8021843433380127, "rewards/margins": 1.5201300382614136, "rewards/rejected": 2.2820544242858887, "step": 63550 }, { "epoch": 2.950926226844329, "grad_norm": 127.30760955810547, "learning_rate": 1.2302799572867821e-07, "logits/chosen": -18.51503562927246, "logits/rejected": -18.70294952392578, "logps/chosen": -403.3014831542969, "logps/rejected": -414.293701171875, "loss": 0.9207, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6667609214782715, "rewards/margins": 0.45726051926612854, "rewards/rejected": 3.209501266479492, "step": 63560 }, { "epoch": 2.951390500951762, "grad_norm": 17.42603874206543, "learning_rate": 1.2300013928223223e-07, "logits/chosen": -19.50979995727539, "logits/rejected": -18.887752532958984, "logps/chosen": -417.9083557128906, "logps/rejected": -333.4619445800781, "loss": 0.4314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.127957344055176, "rewards/margins": 1.6844831705093384, "rewards/rejected": 1.443474531173706, "step": 63570 }, { "epoch": 2.951854775059195, "grad_norm": 232.0288848876953, 
"learning_rate": 1.2297228283578624e-07, "logits/chosen": -18.899845123291016, "logits/rejected": -18.94328498840332, "logps/chosen": -442.94268798828125, "logps/rejected": -401.1689147949219, "loss": 0.7865, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.642665386199951, "rewards/margins": 0.8824507594108582, "rewards/rejected": 2.7602150440216064, "step": 63580 }, { "epoch": 2.9523190491666282, "grad_norm": 10.000239372253418, "learning_rate": 1.2294442638934026e-07, "logits/chosen": -19.140113830566406, "logits/rejected": -18.38023567199707, "logps/chosen": -381.2231750488281, "logps/rejected": -363.64532470703125, "loss": 0.5952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.925858974456787, "rewards/margins": 1.379305362701416, "rewards/rejected": 2.54655385017395, "step": 63590 }, { "epoch": 2.952783323274061, "grad_norm": 167.98483276367188, "learning_rate": 1.2291656994289427e-07, "logits/chosen": -19.10354232788086, "logits/rejected": -17.727428436279297, "logps/chosen": -527.4906005859375, "logps/rejected": -357.42706298828125, "loss": 0.9867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9869887828826904, "rewards/margins": 1.2957899570465088, "rewards/rejected": 2.6911990642547607, "step": 63600 }, { "epoch": 2.953247597381494, "grad_norm": 25.794832229614258, "learning_rate": 1.228887134964483e-07, "logits/chosen": -19.958026885986328, "logits/rejected": -19.390377044677734, "logps/chosen": -472.28118896484375, "logps/rejected": -401.0390319824219, "loss": 0.6731, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.261407375335693, "rewards/margins": 1.3597888946533203, "rewards/rejected": 2.901618480682373, "step": 63610 }, { "epoch": 2.953711871488927, "grad_norm": 44.18943405151367, "learning_rate": 1.2286085705000233e-07, "logits/chosen": -19.129663467407227, "logits/rejected": -18.219961166381836, "logps/chosen": -375.3142395019531, "logps/rejected": -375.7696228027344, "loss": 0.717, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.535454273223877, "rewards/margins": 0.6497462391853333, "rewards/rejected": 2.8857078552246094, "step": 63620 }, { "epoch": 2.95417614559636, "grad_norm": 44.06411361694336, "learning_rate": 1.2283300060355634e-07, "logits/chosen": -18.958459854125977, "logits/rejected": -19.61864471435547, "logps/chosen": -314.66668701171875, "logps/rejected": -438.47943115234375, "loss": 1.6168, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 3.3049964904785156, "rewards/margins": -0.8950517773628235, "rewards/rejected": 4.200048446655273, "step": 63630 }, { "epoch": 2.954640419703793, "grad_norm": 67.39635467529297, "learning_rate": 1.2280514415711036e-07, "logits/chosen": -18.468143463134766, "logits/rejected": -17.637142181396484, "logps/chosen": -372.7109375, "logps/rejected": -294.9708251953125, "loss": 0.4084, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.037006378173828, "rewards/margins": 2.40446138381958, "rewards/rejected": 1.6325451135635376, "step": 63640 }, { "epoch": 2.955104693811226, "grad_norm": 41.1019401550293, "learning_rate": 1.2277728771066437e-07, "logits/chosen": -18.692991256713867, "logits/rejected": -19.081180572509766, "logps/chosen": -331.1502990722656, "logps/rejected": -318.52459716796875, "loss": 1.0127, "rewards/accuracies": 0.5, "rewards/chosen": 2.6238489151000977, "rewards/margins": -0.1883992850780487, "rewards/rejected": 2.8122479915618896, "step": 63650 }, { "epoch": 2.9555689679186594, "grad_norm": 118.77913665771484, "learning_rate": 1.2274943126421839e-07, "logits/chosen": -19.30475616455078, "logits/rejected": -18.184391021728516, "logps/chosen": -498.6061096191406, "logps/rejected": -440.5833435058594, "loss": 0.7921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.072699069976807, "rewards/margins": 1.1466240882873535, "rewards/rejected": 2.926074504852295, "step": 63660 }, { "epoch": 2.956033242026092, "grad_norm": 94.33154296875, 
"learning_rate": 1.227215748177724e-07, "logits/chosen": -18.84605598449707, "logits/rejected": -17.064952850341797, "logps/chosen": -296.27545166015625, "logps/rejected": -218.33486938476562, "loss": 0.9982, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7298026084899902, "rewards/margins": 1.9731206893920898, "rewards/rejected": 0.7566819787025452, "step": 63670 }, { "epoch": 2.956497516133525, "grad_norm": 113.36267852783203, "learning_rate": 1.2269371837132644e-07, "logits/chosen": -20.133440017700195, "logits/rejected": -18.88907814025879, "logps/chosen": -528.8902587890625, "logps/rejected": -438.861083984375, "loss": 0.531, "rewards/accuracies": 0.5, "rewards/chosen": 5.3553667068481445, "rewards/margins": 1.4260390996932983, "rewards/rejected": 3.9293274879455566, "step": 63680 }, { "epoch": 2.956961790240958, "grad_norm": 199.0067901611328, "learning_rate": 1.2266586192488043e-07, "logits/chosen": -18.525920867919922, "logits/rejected": -18.1220703125, "logps/chosen": -349.93341064453125, "logps/rejected": -305.03643798828125, "loss": 0.8579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6868515014648438, "rewards/margins": 0.7979318499565125, "rewards/rejected": 2.8889200687408447, "step": 63690 }, { "epoch": 2.9574260643483914, "grad_norm": 78.45294189453125, "learning_rate": 1.2263800547843447e-07, "logits/chosen": -18.887203216552734, "logits/rejected": -18.470523834228516, "logps/chosen": -379.86651611328125, "logps/rejected": -429.64923095703125, "loss": 1.3151, "rewards/accuracies": 0.5, "rewards/chosen": 3.80877947807312, "rewards/margins": -0.025210117921233177, "rewards/rejected": 3.8339900970458984, "step": 63700 }, { "epoch": 2.957890338455824, "grad_norm": 31.86623191833496, "learning_rate": 1.2261014903198849e-07, "logits/chosen": -19.254289627075195, "logits/rejected": -17.632884979248047, "logps/chosen": -447.51165771484375, "logps/rejected": -247.42208862304688, "loss": 0.3003, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 5.637128829956055, "rewards/margins": 3.8140792846679688, "rewards/rejected": 1.8230502605438232, "step": 63710 }, { "epoch": 2.9583546125632574, "grad_norm": 45.84700393676758, "learning_rate": 1.225822925855425e-07, "logits/chosen": -18.95406150817871, "logits/rejected": -17.856992721557617, "logps/chosen": -326.82855224609375, "logps/rejected": -221.8740234375, "loss": 0.4462, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2556629180908203, "rewards/margins": 1.8632926940917969, "rewards/rejected": 1.3923704624176025, "step": 63720 }, { "epoch": 2.9588188866706906, "grad_norm": 2.0920963287353516, "learning_rate": 1.2255443613909651e-07, "logits/chosen": -19.321178436279297, "logits/rejected": -18.550922393798828, "logps/chosen": -408.12591552734375, "logps/rejected": -325.0787048339844, "loss": 0.246, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.0205302238464355, "rewards/margins": 2.8636248111724854, "rewards/rejected": 2.15690541267395, "step": 63730 }, { "epoch": 2.9592831607781234, "grad_norm": 68.25883483886719, "learning_rate": 1.2252657969265053e-07, "logits/chosen": -19.104978561401367, "logits/rejected": -18.62061882019043, "logps/chosen": -315.13653564453125, "logps/rejected": -207.3303985595703, "loss": 0.8541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2733497619628906, "rewards/margins": 0.9686471819877625, "rewards/rejected": 1.3047025203704834, "step": 63740 }, { "epoch": 2.9597474348855566, "grad_norm": 274.4633483886719, "learning_rate": 1.2249872324620454e-07, "logits/chosen": -18.914440155029297, "logits/rejected": -18.918010711669922, "logps/chosen": -455.5723571777344, "logps/rejected": -450.09112548828125, "loss": 1.2364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.655297756195068, "rewards/margins": 0.9817547798156738, "rewards/rejected": 3.6735432147979736, "step": 63750 }, { "epoch": 2.9602117089929894, "grad_norm": 
34.38950729370117, "learning_rate": 1.2247086679975856e-07, "logits/chosen": -18.7713623046875, "logits/rejected": -19.222126007080078, "logps/chosen": -369.970947265625, "logps/rejected": -363.3882751464844, "loss": 1.4773, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.427046060562134, "rewards/margins": -0.2181689441204071, "rewards/rejected": 2.6452150344848633, "step": 63760 }, { "epoch": 2.9606759831004226, "grad_norm": 137.7391357421875, "learning_rate": 1.224430103533126e-07, "logits/chosen": -18.80488395690918, "logits/rejected": -18.39596939086914, "logps/chosen": -341.0167541503906, "logps/rejected": -314.81707763671875, "loss": 0.5028, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8852386474609375, "rewards/margins": 1.2191001176834106, "rewards/rejected": 1.6661386489868164, "step": 63770 }, { "epoch": 2.9611402572078553, "grad_norm": 28.183916091918945, "learning_rate": 1.2241515390686661e-07, "logits/chosen": -19.231552124023438, "logits/rejected": -18.129817962646484, "logps/chosen": -353.51446533203125, "logps/rejected": -301.9259948730469, "loss": 0.3535, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.228870391845703, "rewards/margins": 1.595800518989563, "rewards/rejected": 1.633069634437561, "step": 63780 }, { "epoch": 2.9616045313152886, "grad_norm": 53.410606384277344, "learning_rate": 1.2238729746042063e-07, "logits/chosen": -18.80279541015625, "logits/rejected": -17.70252799987793, "logps/chosen": -517.8049926757812, "logps/rejected": -322.86614990234375, "loss": 0.176, "rewards/accuracies": 1.0, "rewards/chosen": 4.599024295806885, "rewards/margins": 2.4394302368164062, "rewards/rejected": 2.1595940589904785, "step": 63790 }, { "epoch": 2.9620688054227218, "grad_norm": 125.25298309326172, "learning_rate": 1.2235944101397464e-07, "logits/chosen": -19.698810577392578, "logits/rejected": -19.635251998901367, "logps/chosen": -401.6846008300781, "logps/rejected": -406.73638916015625, "loss": 1.1622, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.269310474395752, "rewards/margins": 0.3212178647518158, "rewards/rejected": 3.9480929374694824, "step": 63800 }, { "epoch": 2.9625330795301545, "grad_norm": 31.75448226928711, "learning_rate": 1.2233158456752866e-07, "logits/chosen": -19.29983139038086, "logits/rejected": -18.247188568115234, "logps/chosen": -409.1187744140625, "logps/rejected": -214.1638946533203, "loss": 0.503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.180431842803955, "rewards/margins": 2.008575916290283, "rewards/rejected": 1.1718556880950928, "step": 63810 }, { "epoch": 2.9629973536375878, "grad_norm": 145.45704650878906, "learning_rate": 1.2230372812108267e-07, "logits/chosen": -20.284955978393555, "logits/rejected": -19.285358428955078, "logps/chosen": -441.8108825683594, "logps/rejected": -378.0392150878906, "loss": 0.5363, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.120347023010254, "rewards/margins": 1.2419769763946533, "rewards/rejected": 2.8783700466156006, "step": 63820 }, { "epoch": 2.9634616277450205, "grad_norm": 138.13348388671875, "learning_rate": 1.222758716746367e-07, "logits/chosen": -19.358409881591797, "logits/rejected": -19.52202606201172, "logps/chosen": -371.81549072265625, "logps/rejected": -412.40203857421875, "loss": 1.0209, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 4.136293888092041, "rewards/margins": 0.05911605432629585, "rewards/rejected": 4.07717752456665, "step": 63830 }, { "epoch": 2.9639259018524537, "grad_norm": 45.506813049316406, "learning_rate": 1.2224801522819073e-07, "logits/chosen": -18.713228225708008, "logits/rejected": -17.81283950805664, "logps/chosen": -433.96881103515625, "logps/rejected": -334.7733459472656, "loss": 0.3758, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5728201866149902, "rewards/margins": 1.742384910583496, "rewards/rejected": 1.8304357528686523, "step": 63840 }, { "epoch": 2.9643901759598865, 
"grad_norm": 12.081475257873535, "learning_rate": 1.2222015878174474e-07, "logits/chosen": -20.25067710876465, "logits/rejected": -19.854215621948242, "logps/chosen": -461.7444763183594, "logps/rejected": -382.2015380859375, "loss": 0.498, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.929969310760498, "rewards/margins": 1.4482700824737549, "rewards/rejected": 2.4816994667053223, "step": 63850 }, { "epoch": 2.9648544500673197, "grad_norm": 76.39878845214844, "learning_rate": 1.2219230233529876e-07, "logits/chosen": -18.931232452392578, "logits/rejected": -18.332212448120117, "logps/chosen": -405.9678649902344, "logps/rejected": -369.2850646972656, "loss": 0.5757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6228466033935547, "rewards/margins": 1.53281569480896, "rewards/rejected": 2.090031147003174, "step": 63860 }, { "epoch": 2.965318724174753, "grad_norm": 133.27110290527344, "learning_rate": 1.2216444588885277e-07, "logits/chosen": -19.242698669433594, "logits/rejected": -19.107229232788086, "logps/chosen": -272.5975036621094, "logps/rejected": -264.73779296875, "loss": 0.5606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.734653949737549, "rewards/margins": 1.0979512929916382, "rewards/rejected": 1.6367027759552002, "step": 63870 }, { "epoch": 2.9657829982821857, "grad_norm": 19.06425666809082, "learning_rate": 1.2213658944240679e-07, "logits/chosen": -19.14792251586914, "logits/rejected": -17.848846435546875, "logps/chosen": -460.971923828125, "logps/rejected": -258.8851318359375, "loss": 0.2441, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.399494647979736, "rewards/margins": 2.710395336151123, "rewards/rejected": 1.6890993118286133, "step": 63880 }, { "epoch": 2.966247272389619, "grad_norm": 154.4697265625, "learning_rate": 1.2210873299596083e-07, "logits/chosen": -18.285633087158203, "logits/rejected": -17.969924926757812, "logps/chosen": -452.166748046875, "logps/rejected": -370.90570068359375, 
"loss": 1.121, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.5373382568359375, "rewards/margins": 1.0929749011993408, "rewards/rejected": 3.444364070892334, "step": 63890 }, { "epoch": 2.9667115464970517, "grad_norm": 106.26935577392578, "learning_rate": 1.2208087654951481e-07, "logits/chosen": -19.3161678314209, "logits/rejected": -17.80945587158203, "logps/chosen": -447.72845458984375, "logps/rejected": -248.40707397460938, "loss": 0.4114, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.050907850265503, "rewards/margins": 1.701864242553711, "rewards/rejected": 1.349043607711792, "step": 63900 }, { "epoch": 2.967175820604485, "grad_norm": 3.376495361328125, "learning_rate": 1.2205302010306886e-07, "logits/chosen": -19.27178955078125, "logits/rejected": -17.54263687133789, "logps/chosen": -539.9395141601562, "logps/rejected": -372.11883544921875, "loss": 0.131, "rewards/accuracies": 1.0, "rewards/chosen": 5.394608020782471, "rewards/margins": 3.1320154666900635, "rewards/rejected": 2.2625927925109863, "step": 63910 }, { "epoch": 2.9676400947119177, "grad_norm": 70.8698959350586, "learning_rate": 1.2202516365662287e-07, "logits/chosen": -18.303478240966797, "logits/rejected": -17.269020080566406, "logps/chosen": -413.34210205078125, "logps/rejected": -307.7635803222656, "loss": 0.5269, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2404332160949707, "rewards/margins": 1.592773199081421, "rewards/rejected": 1.6476600170135498, "step": 63920 }, { "epoch": 2.968104368819351, "grad_norm": 39.42675018310547, "learning_rate": 1.2199730721017688e-07, "logits/chosen": -19.0245418548584, "logits/rejected": -17.850753784179688, "logps/chosen": -405.8117370605469, "logps/rejected": -276.0010681152344, "loss": 0.8156, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3434700965881348, "rewards/margins": 1.297572374343872, "rewards/rejected": 2.0458974838256836, "step": 63930 }, { "epoch": 2.968568642926784, "grad_norm": 
39.99640655517578, "learning_rate": 1.219694507637309e-07, "logits/chosen": -19.195171356201172, "logits/rejected": -18.319625854492188, "logps/chosen": -353.46612548828125, "logps/rejected": -262.229248046875, "loss": 0.7931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2525992393493652, "rewards/margins": 1.3359348773956299, "rewards/rejected": 1.916664481163025, "step": 63940 }, { "epoch": 2.969032917034217, "grad_norm": 85.84696197509766, "learning_rate": 1.2194159431728491e-07, "logits/chosen": -18.540016174316406, "logits/rejected": -17.787872314453125, "logps/chosen": -424.54119873046875, "logps/rejected": -411.62750244140625, "loss": 1.1771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4362776279449463, "rewards/margins": 0.21374063193798065, "rewards/rejected": 2.22253680229187, "step": 63950 }, { "epoch": 2.96949719114165, "grad_norm": 96.24098205566406, "learning_rate": 1.2191373787083893e-07, "logits/chosen": -20.5037899017334, "logits/rejected": -19.07777214050293, "logps/chosen": -476.91070556640625, "logps/rejected": -362.1976623535156, "loss": 0.5708, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.415069103240967, "rewards/margins": 1.483910322189331, "rewards/rejected": 2.931159019470215, "step": 63960 }, { "epoch": 2.9699614652490833, "grad_norm": 222.88662719726562, "learning_rate": 1.2188588142439294e-07, "logits/chosen": -19.110525131225586, "logits/rejected": -18.960315704345703, "logps/chosen": -453.86309814453125, "logps/rejected": -409.35882568359375, "loss": 0.6203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.8706254959106445, "rewards/margins": 0.8402649164199829, "rewards/rejected": 4.030360698699951, "step": 63970 }, { "epoch": 2.970425739356516, "grad_norm": 0.5039806365966797, "learning_rate": 1.2185802497794698e-07, "logits/chosen": -20.17214012145996, "logits/rejected": -19.198930740356445, "logps/chosen": -373.76190185546875, "logps/rejected": -262.64617919921875, 
"loss": 1.0471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.843053102493286, "rewards/margins": 1.4483888149261475, "rewards/rejected": 2.3946645259857178, "step": 63980 }, { "epoch": 2.970890013463949, "grad_norm": 129.89320373535156, "learning_rate": 1.21830168531501e-07, "logits/chosen": -19.185632705688477, "logits/rejected": -17.30551528930664, "logps/chosen": -425.02569580078125, "logps/rejected": -327.9666442871094, "loss": 0.4434, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.425545692443848, "rewards/margins": 1.6576306819915771, "rewards/rejected": 2.7679147720336914, "step": 63990 }, { "epoch": 2.971354287571382, "grad_norm": 283.4378662109375, "learning_rate": 1.21802312085055e-07, "logits/chosen": -18.8944034576416, "logits/rejected": -18.30862045288086, "logps/chosen": -338.99169921875, "logps/rejected": -329.74810791015625, "loss": 0.6911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3514609336853027, "rewards/margins": 1.0019134283065796, "rewards/rejected": 2.349547863006592, "step": 64000 }, { "epoch": 2.9718185616788153, "grad_norm": 39.21377944946289, "learning_rate": 1.2177445563860903e-07, "logits/chosen": -18.038358688354492, "logits/rejected": -17.659927368164062, "logps/chosen": -287.95635986328125, "logps/rejected": -245.2133331298828, "loss": 0.6362, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.76071298122406, "rewards/margins": 0.9876848459243774, "rewards/rejected": 0.7730280160903931, "step": 64010 }, { "epoch": 2.972282835786248, "grad_norm": 46.10438919067383, "learning_rate": 1.2174659919216304e-07, "logits/chosen": -18.895431518554688, "logits/rejected": -18.556440353393555, "logps/chosen": -441.29595947265625, "logps/rejected": -399.350341796875, "loss": 0.9985, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.3593125343322754, "rewards/margins": 0.37140750885009766, "rewards/rejected": 2.9879047870635986, "step": 64020 }, { "epoch": 2.9727471098936813, 
"grad_norm": 11.882829666137695, "learning_rate": 1.2171874274571706e-07, "logits/chosen": -18.005067825317383, "logits/rejected": -18.415206909179688, "logps/chosen": -357.9290466308594, "logps/rejected": -373.3701171875, "loss": 1.2559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.041152238845825, "rewards/margins": -0.047185372561216354, "rewards/rejected": 3.0883374214172363, "step": 64030 }, { "epoch": 2.9732113840011145, "grad_norm": 207.1134796142578, "learning_rate": 1.216908862992711e-07, "logits/chosen": -18.58978271484375, "logits/rejected": -17.767847061157227, "logps/chosen": -400.32611083984375, "logps/rejected": -259.18170166015625, "loss": 0.7792, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8075637817382812, "rewards/margins": 1.408119559288025, "rewards/rejected": 2.399444580078125, "step": 64040 }, { "epoch": 2.9736756581085473, "grad_norm": 213.890869140625, "learning_rate": 1.216630298528251e-07, "logits/chosen": -18.860904693603516, "logits/rejected": -18.83856773376465, "logps/chosen": -501.16650390625, "logps/rejected": -430.59716796875, "loss": 0.8131, "rewards/accuracies": 0.5, "rewards/chosen": 3.948638916015625, "rewards/margins": 0.0648735985159874, "rewards/rejected": 3.883765459060669, "step": 64050 }, { "epoch": 2.97413993221598, "grad_norm": 68.17223358154297, "learning_rate": 1.2163517340637913e-07, "logits/chosen": -20.31269073486328, "logits/rejected": -18.677709579467773, "logps/chosen": -448.5021057128906, "logps/rejected": -359.2538146972656, "loss": 0.3137, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8167762756347656, "rewards/margins": 1.7238504886627197, "rewards/rejected": 2.092925548553467, "step": 64060 }, { "epoch": 2.9746042063234133, "grad_norm": 138.52879333496094, "learning_rate": 1.2160731695993314e-07, "logits/chosen": -20.08664894104004, "logits/rejected": -19.02035903930664, "logps/chosen": -380.71246337890625, "logps/rejected": -316.52398681640625, "loss": 
0.4251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.930208683013916, "rewards/margins": 1.402045488357544, "rewards/rejected": 2.528162717819214, "step": 64070 }, { "epoch": 2.9750684804308465, "grad_norm": 3.8201069831848145, "learning_rate": 1.2157946051348716e-07, "logits/chosen": -18.117389678955078, "logits/rejected": -18.186992645263672, "logps/chosen": -366.4815368652344, "logps/rejected": -310.0675964355469, "loss": 0.6898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2840943336486816, "rewards/margins": 0.9539925456047058, "rewards/rejected": 2.330101490020752, "step": 64080 }, { "epoch": 2.9755327545382793, "grad_norm": 8.136971473693848, "learning_rate": 1.2155160406704117e-07, "logits/chosen": -19.126602172851562, "logits/rejected": -17.579360961914062, "logps/chosen": -398.7028503417969, "logps/rejected": -231.6571502685547, "loss": 0.2969, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.231182098388672, "rewards/margins": 2.4892284870147705, "rewards/rejected": 0.7419537305831909, "step": 64090 }, { "epoch": 2.9759970286457125, "grad_norm": 9.691360473632812, "learning_rate": 1.215237476205952e-07, "logits/chosen": -19.29201316833496, "logits/rejected": -18.322710037231445, "logps/chosen": -434.3641662597656, "logps/rejected": -376.15911865234375, "loss": 1.0346, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8254921436309814, "rewards/margins": 0.7137900590896606, "rewards/rejected": 3.1117022037506104, "step": 64100 }, { "epoch": 2.9764613027531457, "grad_norm": 164.0042266845703, "learning_rate": 1.214958911741492e-07, "logits/chosen": -18.800874710083008, "logits/rejected": -18.2817325592041, "logps/chosen": -394.9722595214844, "logps/rejected": -338.7725830078125, "loss": 0.7471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8053555488586426, "rewards/margins": 1.2534137964248657, "rewards/rejected": 2.5519416332244873, "step": 64110 }, { "epoch": 2.9769255768605785, 
"grad_norm": 173.35284423828125, "learning_rate": 1.2146803472770321e-07, "logits/chosen": -18.172168731689453, "logits/rejected": -18.162601470947266, "logps/chosen": -385.64691162109375, "logps/rejected": -344.15863037109375, "loss": 1.0438, "rewards/accuracies": 0.5, "rewards/chosen": 2.5784621238708496, "rewards/margins": -0.21741795539855957, "rewards/rejected": 2.795880079269409, "step": 64120 }, { "epoch": 2.9773898509680117, "grad_norm": 196.7581024169922, "learning_rate": 1.2144017828125726e-07, "logits/chosen": -19.01729965209961, "logits/rejected": -18.84572982788086, "logps/chosen": -385.1590270996094, "logps/rejected": -405.50872802734375, "loss": 1.1231, "rewards/accuracies": 0.5, "rewards/chosen": 2.760784387588501, "rewards/margins": -0.027541542425751686, "rewards/rejected": 2.788325786590576, "step": 64130 }, { "epoch": 2.9778541250754444, "grad_norm": 0.15877749025821686, "learning_rate": 1.2141232183481127e-07, "logits/chosen": -20.392017364501953, "logits/rejected": -18.87306785583496, "logps/chosen": -401.79742431640625, "logps/rejected": -304.0054626464844, "loss": 0.4172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.073412895202637, "rewards/margins": 2.4508206844329834, "rewards/rejected": 1.6225922107696533, "step": 64140 }, { "epoch": 2.9783183991828777, "grad_norm": 53.32210159301758, "learning_rate": 1.2138446538836528e-07, "logits/chosen": -19.33675193786621, "logits/rejected": -18.599660873413086, "logps/chosen": -521.9488525390625, "logps/rejected": -375.1151123046875, "loss": 0.2036, "rewards/accuracies": 1.0, "rewards/chosen": 5.5824127197265625, "rewards/margins": 2.711699962615967, "rewards/rejected": 2.8707122802734375, "step": 64150 }, { "epoch": 2.9787826732903104, "grad_norm": 44.400177001953125, "learning_rate": 1.213566089419193e-07, "logits/chosen": -18.416818618774414, "logits/rejected": -18.642362594604492, "logps/chosen": -388.1515197753906, "logps/rejected": -382.8945007324219, "loss": 0.9517, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.465738296508789, "rewards/margins": -0.029565125703811646, "rewards/rejected": 3.4953033924102783, "step": 64160 }, { "epoch": 2.9792469473977437, "grad_norm": 274.20318603515625, "learning_rate": 1.213287524954733e-07, "logits/chosen": -18.60968017578125, "logits/rejected": -17.900421142578125, "logps/chosen": -332.7911376953125, "logps/rejected": -336.12677001953125, "loss": 1.1121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6359305381774902, "rewards/margins": 0.725354790687561, "rewards/rejected": 2.9105758666992188, "step": 64170 }, { "epoch": 2.979711221505177, "grad_norm": 31.105985641479492, "learning_rate": 1.2130089604902733e-07, "logits/chosen": -19.2836856842041, "logits/rejected": -18.239248275756836, "logps/chosen": -420.64019775390625, "logps/rejected": -344.7051696777344, "loss": 0.598, "rewards/accuracies": 0.5, "rewards/chosen": 3.3652420043945312, "rewards/margins": 1.0967686176300049, "rewards/rejected": 2.2684731483459473, "step": 64180 }, { "epoch": 2.9801754956126096, "grad_norm": 15.614175796508789, "learning_rate": 1.2127303960258137e-07, "logits/chosen": -19.007707595825195, "logits/rejected": -18.05939483642578, "logps/chosen": -344.3915100097656, "logps/rejected": -262.150634765625, "loss": 0.2278, "rewards/accuracies": 1.0, "rewards/chosen": 3.3729796409606934, "rewards/margins": 1.8799511194229126, "rewards/rejected": 1.4930285215377808, "step": 64190 }, { "epoch": 2.980639769720043, "grad_norm": 129.42190551757812, "learning_rate": 1.2124518315613538e-07, "logits/chosen": -19.095388412475586, "logits/rejected": -18.90387535095215, "logps/chosen": -469.2435607910156, "logps/rejected": -396.1465759277344, "loss": 0.604, "rewards/accuracies": 0.5, "rewards/chosen": 3.936702013015747, "rewards/margins": 1.0078147649765015, "rewards/rejected": 2.928886890411377, "step": 64200 }, { "epoch": 2.9811040438274756, "grad_norm": 30.10332679748535, "learning_rate": 
1.212173267096894e-07, "logits/chosen": -19.73411750793457, "logits/rejected": -19.22500228881836, "logps/chosen": -476.1123046875, "logps/rejected": -348.82440185546875, "loss": 0.8083, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9936954975128174, "rewards/margins": 0.8754755258560181, "rewards/rejected": 3.1182198524475098, "step": 64210 }, { "epoch": 2.981568317934909, "grad_norm": 66.70604705810547, "learning_rate": 1.211894702632434e-07, "logits/chosen": -18.522686004638672, "logits/rejected": -18.570756912231445, "logps/chosen": -315.7674865722656, "logps/rejected": -321.0806579589844, "loss": 1.0909, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7806692123413086, "rewards/margins": -0.11617939174175262, "rewards/rejected": 2.896848201751709, "step": 64220 }, { "epoch": 2.9820325920423416, "grad_norm": 36.56636047363281, "learning_rate": 1.2116161381679743e-07, "logits/chosen": -19.046472549438477, "logits/rejected": -18.189861297607422, "logps/chosen": -395.67108154296875, "logps/rejected": -351.4520568847656, "loss": 0.331, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.031434535980225, "rewards/margins": 1.2869547605514526, "rewards/rejected": 2.7444794178009033, "step": 64230 }, { "epoch": 2.982496866149775, "grad_norm": 155.43467712402344, "learning_rate": 1.2113375737035144e-07, "logits/chosen": -18.744037628173828, "logits/rejected": -17.825090408325195, "logps/chosen": -312.3990478515625, "logps/rejected": -273.5506591796875, "loss": 0.9577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.489445209503174, "rewards/margins": 1.1015207767486572, "rewards/rejected": 2.3879239559173584, "step": 64240 }, { "epoch": 2.982961140257208, "grad_norm": 7.196521282196045, "learning_rate": 1.2110590092390548e-07, "logits/chosen": -19.273862838745117, "logits/rejected": -18.042999267578125, "logps/chosen": -516.4002685546875, "logps/rejected": -331.40673828125, "loss": 0.2932, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 5.107203483581543, "rewards/margins": 2.5052731037139893, "rewards/rejected": 2.601930618286133, "step": 64250 }, { "epoch": 2.983425414364641, "grad_norm": 239.3807373046875, "learning_rate": 1.210780444774595e-07, "logits/chosen": -19.308216094970703, "logits/rejected": -17.77383041381836, "logps/chosen": -428.5785217285156, "logps/rejected": -287.58203125, "loss": 0.5052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.6659040451049805, "rewards/margins": 2.8346734046936035, "rewards/rejected": 1.831230878829956, "step": 64260 }, { "epoch": 2.983889688472074, "grad_norm": 10.130461692810059, "learning_rate": 1.210501880310135e-07, "logits/chosen": -19.08708953857422, "logits/rejected": -17.656272888183594, "logps/chosen": -444.80029296875, "logps/rejected": -290.3206481933594, "loss": 0.5048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.555037498474121, "rewards/margins": 2.221529006958008, "rewards/rejected": 2.333508253097534, "step": 64270 }, { "epoch": 2.984353962579507, "grad_norm": 230.83721923828125, "learning_rate": 1.2102233158456753e-07, "logits/chosen": -18.512815475463867, "logits/rejected": -18.23600959777832, "logps/chosen": -434.88134765625, "logps/rejected": -404.78509521484375, "loss": 1.0242, "rewards/accuracies": 0.5, "rewards/chosen": 4.312552452087402, "rewards/margins": 0.22220131754875183, "rewards/rejected": 4.090351581573486, "step": 64280 }, { "epoch": 2.98481823668694, "grad_norm": 0.305070161819458, "learning_rate": 1.2099447513812154e-07, "logits/chosen": -19.105628967285156, "logits/rejected": -19.077110290527344, "logps/chosen": -380.15667724609375, "logps/rejected": -372.21868896484375, "loss": 1.3349, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2374091148376465, "rewards/margins": 0.5222323536872864, "rewards/rejected": 2.715176582336426, "step": 64290 }, { "epoch": 2.985282510794373, "grad_norm": 200.61477661132812, "learning_rate": 
1.2096661869167556e-07, "logits/chosen": -20.61256980895996, "logits/rejected": -19.412166595458984, "logps/chosen": -392.80474853515625, "logps/rejected": -277.7825927734375, "loss": 0.6278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3022968769073486, "rewards/margins": 1.710873007774353, "rewards/rejected": 1.5914238691329956, "step": 64300 }, { "epoch": 2.985746784901806, "grad_norm": 209.11436462402344, "learning_rate": 1.209387622452296e-07, "logits/chosen": -17.959712982177734, "logits/rejected": -18.176494598388672, "logps/chosen": -356.7060852050781, "logps/rejected": -365.8431091308594, "loss": 1.4193, "rewards/accuracies": 0.5, "rewards/chosen": 3.1588077545166016, "rewards/margins": 0.31531038880348206, "rewards/rejected": 2.8434975147247314, "step": 64310 }, { "epoch": 2.986211059009239, "grad_norm": 77.74962615966797, "learning_rate": 1.2091090579878358e-07, "logits/chosen": -18.754018783569336, "logits/rejected": -17.68705177307129, "logps/chosen": -331.76898193359375, "logps/rejected": -246.08670043945312, "loss": 0.3546, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4914612770080566, "rewards/margins": 1.6659332513809204, "rewards/rejected": 1.8255283832550049, "step": 64320 }, { "epoch": 2.986675333116672, "grad_norm": 174.05357360839844, "learning_rate": 1.208830493523376e-07, "logits/chosen": -19.470605850219727, "logits/rejected": -18.380107879638672, "logps/chosen": -426.6705017089844, "logps/rejected": -369.8118896484375, "loss": 0.5169, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.585986137390137, "rewards/margins": 1.4111415147781372, "rewards/rejected": 3.174844980239868, "step": 64330 }, { "epoch": 2.987139607224105, "grad_norm": 112.97296905517578, "learning_rate": 1.2085519290589164e-07, "logits/chosen": -20.36801528930664, "logits/rejected": -18.697856903076172, "logps/chosen": -390.02496337890625, "logps/rejected": -287.175537109375, "loss": 0.3381, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 3.3191866874694824, "rewards/margins": 1.4440345764160156, "rewards/rejected": 1.8751518726348877, "step": 64340 }, { "epoch": 2.987603881331538, "grad_norm": 3.0089635848999023, "learning_rate": 1.2082733645944565e-07, "logits/chosen": -19.86684799194336, "logits/rejected": -19.50387191772461, "logps/chosen": -362.77880859375, "logps/rejected": -319.9146728515625, "loss": 0.9727, "rewards/accuracies": 0.5, "rewards/chosen": 2.919081211090088, "rewards/margins": 0.6670271158218384, "rewards/rejected": 2.252054214477539, "step": 64350 }, { "epoch": 2.988068155438971, "grad_norm": 80.33084869384766, "learning_rate": 1.2079948001299967e-07, "logits/chosen": -19.107242584228516, "logits/rejected": -18.220727920532227, "logps/chosen": -412.15643310546875, "logps/rejected": -355.352294921875, "loss": 0.5365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.71899676322937, "rewards/margins": 0.8419191241264343, "rewards/rejected": 1.877077341079712, "step": 64360 }, { "epoch": 2.988532429546404, "grad_norm": 50.58264923095703, "learning_rate": 1.2077162356655368e-07, "logits/chosen": -18.646387100219727, "logits/rejected": -17.770998001098633, "logps/chosen": -400.6419982910156, "logps/rejected": -320.75189208984375, "loss": 0.9265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.121213674545288, "rewards/margins": 1.1175421476364136, "rewards/rejected": 2.003671646118164, "step": 64370 }, { "epoch": 2.988996703653837, "grad_norm": 6.4397735595703125, "learning_rate": 1.207437671201077e-07, "logits/chosen": -18.652053833007812, "logits/rejected": -17.654573440551758, "logps/chosen": -390.5979309082031, "logps/rejected": -359.46942138671875, "loss": 0.7315, "rewards/accuracies": 0.5, "rewards/chosen": 2.417217969894409, "rewards/margins": 0.31177768111228943, "rewards/rejected": 2.105440139770508, "step": 64380 }, { "epoch": 2.9894609777612704, "grad_norm": 52.05686950683594, "learning_rate": 
1.207159106736617e-07, "logits/chosen": -18.753650665283203, "logits/rejected": -17.978809356689453, "logps/chosen": -457.4375, "logps/rejected": -369.6938171386719, "loss": 0.7109, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3658719062805176, "rewards/margins": 0.8535563349723816, "rewards/rejected": 2.512315273284912, "step": 64390 }, { "epoch": 2.989925251868703, "grad_norm": 52.93905258178711, "learning_rate": 1.2068805422721575e-07, "logits/chosen": -17.928905487060547, "logits/rejected": -18.259336471557617, "logps/chosen": -327.3734130859375, "logps/rejected": -326.08660888671875, "loss": 0.9191, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.391080379486084, "rewards/margins": 0.41651520133018494, "rewards/rejected": 1.9745655059814453, "step": 64400 }, { "epoch": 2.9903895259761364, "grad_norm": 143.6074981689453, "learning_rate": 1.2066019778076977e-07, "logits/chosen": -18.653051376342773, "logits/rejected": -17.607975006103516, "logps/chosen": -342.0133361816406, "logps/rejected": -237.23056030273438, "loss": 0.5944, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3575992584228516, "rewards/margins": 2.1110992431640625, "rewards/rejected": 1.246500015258789, "step": 64410 }, { "epoch": 2.9908538000835696, "grad_norm": 223.19920349121094, "learning_rate": 1.2063234133432378e-07, "logits/chosen": -19.424686431884766, "logits/rejected": -20.544336318969727, "logps/chosen": -339.5098876953125, "logps/rejected": -437.71429443359375, "loss": 1.1919, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.0526576042175293, "rewards/margins": -0.5462062358856201, "rewards/rejected": 3.5988636016845703, "step": 64420 }, { "epoch": 2.9913180741910024, "grad_norm": 3.181626319885254, "learning_rate": 1.206044848878778e-07, "logits/chosen": -18.956817626953125, "logits/rejected": -17.992441177368164, "logps/chosen": -559.1787109375, "logps/rejected": -388.36175537109375, "loss": 0.3996, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 4.424035549163818, "rewards/margins": 1.7230879068374634, "rewards/rejected": 2.7009475231170654, "step": 64430 }, { "epoch": 2.991782348298435, "grad_norm": 65.2066879272461, "learning_rate": 1.205766284414318e-07, "logits/chosen": -19.555423736572266, "logits/rejected": -18.589582443237305, "logps/chosen": -394.0574035644531, "logps/rejected": -290.71710205078125, "loss": 0.717, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.668409824371338, "rewards/margins": 0.8125098943710327, "rewards/rejected": 2.8558993339538574, "step": 64440 }, { "epoch": 2.9922466224058684, "grad_norm": 185.37606811523438, "learning_rate": 1.2054877199498583e-07, "logits/chosen": -19.097965240478516, "logits/rejected": -19.057785034179688, "logps/chosen": -429.02685546875, "logps/rejected": -402.12554931640625, "loss": 0.5725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.140786170959473, "rewards/margins": 1.3395397663116455, "rewards/rejected": 2.801246166229248, "step": 64450 }, { "epoch": 2.9927108965133016, "grad_norm": 269.9489440917969, "learning_rate": 1.2052091554853987e-07, "logits/chosen": -19.796152114868164, "logits/rejected": -18.511518478393555, "logps/chosen": -464.21710205078125, "logps/rejected": -388.9322509765625, "loss": 0.5927, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.348628997802734, "rewards/margins": 1.5651615858078003, "rewards/rejected": 2.7834672927856445, "step": 64460 }, { "epoch": 2.9931751706207343, "grad_norm": 33.198486328125, "learning_rate": 1.2049305910209388e-07, "logits/chosen": -19.58572769165039, "logits/rejected": -18.67075538635254, "logps/chosen": -437.2193298339844, "logps/rejected": -346.2158203125, "loss": 0.611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5716922283172607, "rewards/margins": 0.9782058000564575, "rewards/rejected": 2.5934863090515137, "step": 64470 }, { "epoch": 2.9936394447281676, "grad_norm": 0.00599694112315774, 
"learning_rate": 1.204652026556479e-07, "logits/chosen": -19.690732955932617, "logits/rejected": -18.185041427612305, "logps/chosen": -392.13140869140625, "logps/rejected": -288.60125732421875, "loss": 0.3246, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9589591026306152, "rewards/margins": 2.992157459259033, "rewards/rejected": 0.9668019413948059, "step": 64480 }, { "epoch": 2.994103718835601, "grad_norm": 234.4952392578125, "learning_rate": 1.204373462092019e-07, "logits/chosen": -18.527742385864258, "logits/rejected": -17.450191497802734, "logps/chosen": -417.2098083496094, "logps/rejected": -375.28594970703125, "loss": 0.6433, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.36154842376709, "rewards/margins": 1.108314871788025, "rewards/rejected": 3.2532334327697754, "step": 64490 }, { "epoch": 2.9945679929430336, "grad_norm": 7.288191795349121, "learning_rate": 1.2040948976275593e-07, "logits/chosen": -18.85489845275879, "logits/rejected": -18.417705535888672, "logps/chosen": -312.6552429199219, "logps/rejected": -220.81661987304688, "loss": 0.5736, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3886559009552, "rewards/margins": 1.1382191181182861, "rewards/rejected": 1.2504366636276245, "step": 64500 }, { "epoch": 2.9950322670504663, "grad_norm": 23.506450653076172, "learning_rate": 1.2038163331630994e-07, "logits/chosen": -19.709209442138672, "logits/rejected": -20.06084632873535, "logps/chosen": -296.2295227050781, "logps/rejected": -340.5021057128906, "loss": 0.985, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0164663791656494, "rewards/margins": 0.22204530239105225, "rewards/rejected": 2.7944211959838867, "step": 64510 }, { "epoch": 2.9954965411578995, "grad_norm": 10.328542709350586, "learning_rate": 1.2035377686986395e-07, "logits/chosen": -19.647937774658203, "logits/rejected": -18.405620574951172, "logps/chosen": -380.13922119140625, "logps/rejected": -315.4535827636719, "loss": 0.6817, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.5694580078125, "rewards/margins": 0.7924138307571411, "rewards/rejected": 2.7770445346832275, "step": 64520 }, { "epoch": 2.9959608152653328, "grad_norm": 157.89306640625, "learning_rate": 1.2032592042341797e-07, "logits/chosen": -18.926027297973633, "logits/rejected": -18.08266830444336, "logps/chosen": -407.32952880859375, "logps/rejected": -324.36834716796875, "loss": 0.4295, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.685080051422119, "rewards/margins": 1.1922162771224976, "rewards/rejected": 2.492863416671753, "step": 64530 }, { "epoch": 2.9964250893727655, "grad_norm": 0.7803380489349365, "learning_rate": 1.2029806397697198e-07, "logits/chosen": -19.060611724853516, "logits/rejected": -18.267520904541016, "logps/chosen": -344.699462890625, "logps/rejected": -278.3884582519531, "loss": 0.4751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.471585750579834, "rewards/margins": 1.6616929769515991, "rewards/rejected": 1.8098926544189453, "step": 64540 }, { "epoch": 2.9968893634801987, "grad_norm": 19.230548858642578, "learning_rate": 1.2027020753052602e-07, "logits/chosen": -19.10866355895996, "logits/rejected": -18.777881622314453, "logps/chosen": -334.23040771484375, "logps/rejected": -309.90240478515625, "loss": 0.6594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2210259437561035, "rewards/margins": 0.5043861269950867, "rewards/rejected": 2.7166402339935303, "step": 64550 }, { "epoch": 2.997353637587632, "grad_norm": 64.62399291992188, "learning_rate": 1.2024235108408004e-07, "logits/chosen": -20.16602325439453, "logits/rejected": -19.323043823242188, "logps/chosen": -364.7366943359375, "logps/rejected": -183.87860107421875, "loss": 0.624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6564135551452637, "rewards/margins": 2.217129945755005, "rewards/rejected": 1.439283847808838, "step": 64560 }, { "epoch": 2.9978179116950647, "grad_norm": 
125.55875396728516, "learning_rate": 1.2021449463763405e-07, "logits/chosen": -19.88956642150879, "logits/rejected": -19.261619567871094, "logps/chosen": -377.39752197265625, "logps/rejected": -304.0572204589844, "loss": 0.3457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.417261600494385, "rewards/margins": 1.6556522846221924, "rewards/rejected": 2.7616093158721924, "step": 64570 }, { "epoch": 2.998282185802498, "grad_norm": 36.45766067504883, "learning_rate": 1.2018663819118807e-07, "logits/chosen": -18.908878326416016, "logits/rejected": -17.85309600830078, "logps/chosen": -430.95294189453125, "logps/rejected": -318.60052490234375, "loss": 0.4037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.531857490539551, "rewards/margins": 1.8968334197998047, "rewards/rejected": 2.6350245475769043, "step": 64580 }, { "epoch": 2.9987464599099307, "grad_norm": 205.3402099609375, "learning_rate": 1.2015878174474208e-07, "logits/chosen": -19.56607437133789, "logits/rejected": -18.93722915649414, "logps/chosen": -565.2418823242188, "logps/rejected": -419.3599548339844, "loss": 0.7287, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.214271545410156, "rewards/margins": 1.0189855098724365, "rewards/rejected": 4.195286273956299, "step": 64590 }, { "epoch": 2.999210734017364, "grad_norm": 81.94379425048828, "learning_rate": 1.201309252982961e-07, "logits/chosen": -18.564167022705078, "logits/rejected": -18.026323318481445, "logps/chosen": -470.23681640625, "logps/rejected": -357.7309265136719, "loss": 0.4921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.099911689758301, "rewards/margins": 1.0524822473526, "rewards/rejected": 4.047430038452148, "step": 64600 }, { "epoch": 2.9996750081247967, "grad_norm": 80.26226043701172, "learning_rate": 1.2010306885185014e-07, "logits/chosen": -18.91799545288086, "logits/rejected": -18.01226043701172, "logps/chosen": -541.7880859375, "logps/rejected": -435.0931091308594, "loss": 0.2936, 
"rewards/accuracies": 1.0, "rewards/chosen": 6.129482746124268, "rewards/margins": 1.4630687236785889, "rewards/rejected": 4.6664137840271, "step": 64610 }, { "epoch": 3.00013928223223, "grad_norm": 74.26380920410156, "learning_rate": 1.2007521240540415e-07, "logits/chosen": -19.235065460205078, "logits/rejected": -18.788497924804688, "logps/chosen": -276.00164794921875, "logps/rejected": -254.0408935546875, "loss": 0.6735, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5415079593658447, "rewards/margins": 0.2128789871931076, "rewards/rejected": 1.3286291360855103, "step": 64620 }, { "epoch": 3.000603556339663, "grad_norm": 40.17082214355469, "learning_rate": 1.2004735595895817e-07, "logits/chosen": -18.811107635498047, "logits/rejected": -18.501869201660156, "logps/chosen": -455.907470703125, "logps/rejected": -406.3310546875, "loss": 1.0039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.529476165771484, "rewards/margins": 1.1614290475845337, "rewards/rejected": 3.368046998977661, "step": 64630 }, { "epoch": 3.001067830447096, "grad_norm": 200.8292236328125, "learning_rate": 1.2001949951251218e-07, "logits/chosen": -18.713924407958984, "logits/rejected": -18.495319366455078, "logps/chosen": -302.65618896484375, "logps/rejected": -279.7051696777344, "loss": 0.9657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2827789783477783, "rewards/margins": 0.21153104305267334, "rewards/rejected": 2.0712478160858154, "step": 64640 }, { "epoch": 3.001532104554529, "grad_norm": 107.77404022216797, "learning_rate": 1.199916430660662e-07, "logits/chosen": -19.770872116088867, "logits/rejected": -19.284587860107422, "logps/chosen": -472.7762145996094, "logps/rejected": -336.7151794433594, "loss": 0.53, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.814549684524536, "rewards/margins": 1.5181798934936523, "rewards/rejected": 2.2963693141937256, "step": 64650 }, { "epoch": 3.001996378661962, "grad_norm": 60.66337966918945, 
"learning_rate": 1.199637866196202e-07, "logits/chosen": -19.641693115234375, "logits/rejected": -18.952518463134766, "logps/chosen": -363.8907470703125, "logps/rejected": -301.62994384765625, "loss": 0.7614, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2424380779266357, "rewards/margins": 1.4869658946990967, "rewards/rejected": 1.755472183227539, "step": 64660 }, { "epoch": 3.002460652769395, "grad_norm": 17.215675354003906, "learning_rate": 1.1993593017317425e-07, "logits/chosen": -18.822891235351562, "logits/rejected": -17.879230499267578, "logps/chosen": -444.42413330078125, "logps/rejected": -296.1541442871094, "loss": 0.2595, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.316620826721191, "rewards/margins": 2.2542059421539307, "rewards/rejected": 2.0624148845672607, "step": 64670 }, { "epoch": 3.002924926876828, "grad_norm": 8.993124008178711, "learning_rate": 1.1990807372672827e-07, "logits/chosen": -18.52834701538086, "logits/rejected": -17.808887481689453, "logps/chosen": -290.9493103027344, "logps/rejected": -210.32510375976562, "loss": 0.8399, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5481467247009277, "rewards/margins": 1.1998674869537354, "rewards/rejected": 1.348279595375061, "step": 64680 }, { "epoch": 3.003389200984261, "grad_norm": 178.01058959960938, "learning_rate": 1.1988021728028228e-07, "logits/chosen": -18.60512924194336, "logits/rejected": -17.850984573364258, "logps/chosen": -324.46490478515625, "logps/rejected": -262.11505126953125, "loss": 1.3757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.080566644668579, "rewards/margins": 0.24474883079528809, "rewards/rejected": 1.8358176946640015, "step": 64690 }, { "epoch": 3.0038534750916943, "grad_norm": 53.75853729248047, "learning_rate": 1.198523608338363e-07, "logits/chosen": -18.6743221282959, "logits/rejected": -17.10338020324707, "logps/chosen": -339.92071533203125, "logps/rejected": -173.68516540527344, "loss": 0.2668, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3330001831054688, "rewards/margins": 2.443250894546509, "rewards/rejected": 0.8897490501403809, "step": 64700 }, { "epoch": 3.004317749199127, "grad_norm": 8.222171783447266, "learning_rate": 1.198245043873903e-07, "logits/chosen": -19.431087493896484, "logits/rejected": -18.389265060424805, "logps/chosen": -469.60662841796875, "logps/rejected": -362.2907409667969, "loss": 0.3208, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.499505996704102, "rewards/margins": 1.323202133178711, "rewards/rejected": 3.1763041019439697, "step": 64710 }, { "epoch": 3.0047820233065603, "grad_norm": 33.954368591308594, "learning_rate": 1.1979664794094432e-07, "logits/chosen": -18.86257553100586, "logits/rejected": -18.616708755493164, "logps/chosen": -332.3872375488281, "logps/rejected": -281.60888671875, "loss": 0.8016, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7007269859313965, "rewards/margins": 1.152644157409668, "rewards/rejected": 1.5480825901031494, "step": 64720 }, { "epoch": 3.005246297413993, "grad_norm": 2.4700403213500977, "learning_rate": 1.1976879149449834e-07, "logits/chosen": -19.529216766357422, "logits/rejected": -18.215221405029297, "logps/chosen": -342.2017822265625, "logps/rejected": -255.686279296875, "loss": 1.078, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1141014099121094, "rewards/margins": 1.0334417819976807, "rewards/rejected": 2.0806593894958496, "step": 64730 }, { "epoch": 3.0057105715214263, "grad_norm": 271.95013427734375, "learning_rate": 1.1974093504805235e-07, "logits/chosen": -18.59337615966797, "logits/rejected": -18.126222610473633, "logps/chosen": -318.376953125, "logps/rejected": -314.20562744140625, "loss": 0.8617, "rewards/accuracies": 0.5, "rewards/chosen": 2.342254400253296, "rewards/margins": 0.34502825140953064, "rewards/rejected": 1.9972261190414429, "step": 64740 }, { "epoch": 3.006174845628859, "grad_norm": 
13.046271324157715, "learning_rate": 1.1971307860160637e-07, "logits/chosen": -19.907766342163086, "logits/rejected": -18.343059539794922, "logps/chosen": -455.6669921875, "logps/rejected": -313.0898132324219, "loss": 0.2789, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.628331661224365, "rewards/margins": 2.346170663833618, "rewards/rejected": 2.282161235809326, "step": 64750 }, { "epoch": 3.0066391197362923, "grad_norm": 1.8115191459655762, "learning_rate": 1.196852221551604e-07, "logits/chosen": -19.320323944091797, "logits/rejected": -19.231094360351562, "logps/chosen": -404.31268310546875, "logps/rejected": -364.7272033691406, "loss": 0.5142, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8101372718811035, "rewards/margins": 1.6453901529312134, "rewards/rejected": 2.1647469997406006, "step": 64760 }, { "epoch": 3.0071033938437255, "grad_norm": 78.79339599609375, "learning_rate": 1.1965736570871442e-07, "logits/chosen": -18.201038360595703, "logits/rejected": -18.960580825805664, "logps/chosen": -221.10006713867188, "logps/rejected": -258.9930419921875, "loss": 1.4053, "rewards/accuracies": 0.5, "rewards/chosen": 2.030574321746826, "rewards/margins": -0.4084796905517578, "rewards/rejected": 2.439054012298584, "step": 64770 }, { "epoch": 3.0075676679511583, "grad_norm": 290.6842346191406, "learning_rate": 1.1962950926226844e-07, "logits/chosen": -19.154438018798828, "logits/rejected": -19.126188278198242, "logps/chosen": -388.9305419921875, "logps/rejected": -433.51336669921875, "loss": 0.749, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.587472915649414, "rewards/margins": 0.746667742729187, "rewards/rejected": 2.8408048152923584, "step": 64780 }, { "epoch": 3.0080319420585915, "grad_norm": 195.17837524414062, "learning_rate": 1.1960165281582245e-07, "logits/chosen": -17.976581573486328, "logits/rejected": -18.24064064025879, "logps/chosen": -295.1453552246094, "logps/rejected": -291.0044860839844, "loss": 0.6714, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3841238021850586, "rewards/margins": 1.050382375717163, "rewards/rejected": 1.3337414264678955, "step": 64790 }, { "epoch": 3.0084962161660243, "grad_norm": 204.4738006591797, "learning_rate": 1.1957379636937647e-07, "logits/chosen": -18.649707794189453, "logits/rejected": -18.207313537597656, "logps/chosen": -503.18292236328125, "logps/rejected": -451.33740234375, "loss": 0.919, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.410820960998535, "rewards/margins": 0.7353631258010864, "rewards/rejected": 3.67545747756958, "step": 64800 }, { "epoch": 3.0089604902734575, "grad_norm": 34.41299819946289, "learning_rate": 1.1954593992293048e-07, "logits/chosen": -19.752506256103516, "logits/rejected": -19.022464752197266, "logps/chosen": -551.4189453125, "logps/rejected": -393.0900573730469, "loss": 0.2952, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.603704452514648, "rewards/margins": 1.7597557306289673, "rewards/rejected": 2.8439488410949707, "step": 64810 }, { "epoch": 3.0094247643808907, "grad_norm": 83.78395080566406, "learning_rate": 1.1951808347648452e-07, "logits/chosen": -18.921451568603516, "logits/rejected": -17.691909790039062, "logps/chosen": -315.0304260253906, "logps/rejected": -220.7499237060547, "loss": 0.3095, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2860569953918457, "rewards/margins": 2.0003857612609863, "rewards/rejected": 1.2856711149215698, "step": 64820 }, { "epoch": 3.0098890384883235, "grad_norm": 0.020748617127537727, "learning_rate": 1.1949022703003854e-07, "logits/chosen": -19.013568878173828, "logits/rejected": -17.521488189697266, "logps/chosen": -588.6182861328125, "logps/rejected": -349.0877685546875, "loss": 0.1499, "rewards/accuracies": 1.0, "rewards/chosen": 5.78695011138916, "rewards/margins": 3.355442762374878, "rewards/rejected": 2.4315075874328613, "step": 64830 }, { "epoch": 3.0103533125957567, "grad_norm": 
127.9753189086914, "learning_rate": 1.1946237058359255e-07, "logits/chosen": -19.53829002380371, "logits/rejected": -19.220369338989258, "logps/chosen": -414.8720703125, "logps/rejected": -348.13677978515625, "loss": 0.7013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5791409015655518, "rewards/margins": 0.7710908651351929, "rewards/rejected": 2.8080499172210693, "step": 64840 }, { "epoch": 3.0108175867031894, "grad_norm": 1.731637954711914, "learning_rate": 1.1943451413714657e-07, "logits/chosen": -19.066390991210938, "logits/rejected": -17.357044219970703, "logps/chosen": -365.70611572265625, "logps/rejected": -253.5333709716797, "loss": 0.2684, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9452641010284424, "rewards/margins": 2.6427950859069824, "rewards/rejected": 1.30246901512146, "step": 64850 }, { "epoch": 3.0112818608106227, "grad_norm": 45.61575698852539, "learning_rate": 1.1940665769070058e-07, "logits/chosen": -18.532325744628906, "logits/rejected": -18.169462203979492, "logps/chosen": -378.14337158203125, "logps/rejected": -422.55841064453125, "loss": 0.8647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.072436809539795, "rewards/margins": 0.3830795884132385, "rewards/rejected": 2.689357042312622, "step": 64860 }, { "epoch": 3.0117461349180554, "grad_norm": 35.48735427856445, "learning_rate": 1.193788012442546e-07, "logits/chosen": -18.809816360473633, "logits/rejected": -18.413299560546875, "logps/chosen": -392.3854064941406, "logps/rejected": -303.5623474121094, "loss": 0.6195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.497109889984131, "rewards/margins": 0.8192240595817566, "rewards/rejected": 2.6778857707977295, "step": 64870 }, { "epoch": 3.0122104090254886, "grad_norm": 14.375560760498047, "learning_rate": 1.1935094479780864e-07, "logits/chosen": -19.14576530456543, "logits/rejected": -17.847759246826172, "logps/chosen": -404.97137451171875, "logps/rejected": -243.097412109375, 
"loss": 0.6446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.62786865234375, "rewards/margins": 1.1000951528549194, "rewards/rejected": 2.527773380279541, "step": 64880 }, { "epoch": 3.012674683132922, "grad_norm": 79.60875701904297, "learning_rate": 1.1932308835136265e-07, "logits/chosen": -19.111433029174805, "logits/rejected": -18.70041847229004, "logps/chosen": -357.7613830566406, "logps/rejected": -331.60662841796875, "loss": 0.7209, "rewards/accuracies": 0.5, "rewards/chosen": 3.4641952514648438, "rewards/margins": 0.7389789819717407, "rewards/rejected": 2.7252161502838135, "step": 64890 }, { "epoch": 3.0131389572403546, "grad_norm": 94.56990051269531, "learning_rate": 1.1929523190491667e-07, "logits/chosen": -18.702566146850586, "logits/rejected": -18.374177932739258, "logps/chosen": -334.475830078125, "logps/rejected": -301.4249267578125, "loss": 0.9379, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4319698810577393, "rewards/margins": 0.228533074259758, "rewards/rejected": 3.2034366130828857, "step": 64900 }, { "epoch": 3.013603231347788, "grad_norm": 6.776856422424316, "learning_rate": 1.1926737545847068e-07, "logits/chosen": -18.822837829589844, "logits/rejected": -17.48075294494629, "logps/chosen": -294.3408203125, "logps/rejected": -145.9044189453125, "loss": 0.3661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7434852123260498, "rewards/margins": 2.0180907249450684, "rewards/rejected": -0.27460554242134094, "step": 64910 }, { "epoch": 3.0140675054552206, "grad_norm": 41.1859130859375, "learning_rate": 1.192395190120247e-07, "logits/chosen": -18.418249130249023, "logits/rejected": -18.270526885986328, "logps/chosen": -348.1296081542969, "logps/rejected": -314.2161865234375, "loss": 0.5775, "rewards/accuracies": 0.5, "rewards/chosen": 3.0762481689453125, "rewards/margins": 0.7686794996261597, "rewards/rejected": 2.3075690269470215, "step": 64920 }, { "epoch": 3.014531779562654, "grad_norm": 
39.02682876586914, "learning_rate": 1.192116625655787e-07, "logits/chosen": -18.624847412109375, "logits/rejected": -18.747005462646484, "logps/chosen": -292.9224853515625, "logps/rejected": -291.63153076171875, "loss": 0.6076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4940197467803955, "rewards/margins": 0.8835132718086243, "rewards/rejected": 1.6105066537857056, "step": 64930 }, { "epoch": 3.0149960536700866, "grad_norm": 6.15322732925415, "learning_rate": 1.1918380611913272e-07, "logits/chosen": -18.818279266357422, "logits/rejected": -17.90388298034668, "logps/chosen": -427.9385681152344, "logps/rejected": -278.29754638671875, "loss": 0.6196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2440505027770996, "rewards/margins": 1.6142867803573608, "rewards/rejected": 1.6297638416290283, "step": 64940 }, { "epoch": 3.01546032777752, "grad_norm": 184.80384826660156, "learning_rate": 1.1915594967268675e-07, "logits/chosen": -18.549833297729492, "logits/rejected": -18.69428253173828, "logps/chosen": -369.23077392578125, "logps/rejected": -383.75531005859375, "loss": 1.5931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9070992469787598, "rewards/margins": -0.5411055684089661, "rewards/rejected": 3.4482052326202393, "step": 64950 }, { "epoch": 3.015924601884953, "grad_norm": 40.81291198730469, "learning_rate": 1.1912809322624077e-07, "logits/chosen": -19.15131378173828, "logits/rejected": -18.58438491821289, "logps/chosen": -322.2998962402344, "logps/rejected": -267.0969543457031, "loss": 0.3563, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6957156658172607, "rewards/margins": 1.5514323711395264, "rewards/rejected": 1.144283652305603, "step": 64960 }, { "epoch": 3.016388875992386, "grad_norm": 74.14146423339844, "learning_rate": 1.191002367797948e-07, "logits/chosen": -19.604995727539062, "logits/rejected": -17.38119888305664, "logps/chosen": -397.50897216796875, "logps/rejected": -163.2513427734375, 
"loss": 0.4793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9916179180145264, "rewards/margins": 1.8950235843658447, "rewards/rejected": 1.0965943336486816, "step": 64970 }, { "epoch": 3.016853150099819, "grad_norm": 57.74214172363281, "learning_rate": 1.190723803333488e-07, "logits/chosen": -18.348745346069336, "logits/rejected": -18.339019775390625, "logps/chosen": -365.1567687988281, "logps/rejected": -336.16302490234375, "loss": 1.0671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.072402238845825, "rewards/margins": 0.7131373286247253, "rewards/rejected": 2.359264612197876, "step": 64980 }, { "epoch": 3.017317424207252, "grad_norm": 2.5373785495758057, "learning_rate": 1.1904452388690282e-07, "logits/chosen": -18.242107391357422, "logits/rejected": -16.970598220825195, "logps/chosen": -473.4823303222656, "logps/rejected": -287.12554931640625, "loss": 0.4944, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.291997909545898, "rewards/margins": 2.068328857421875, "rewards/rejected": 2.2236692905426025, "step": 64990 }, { "epoch": 3.017781698314685, "grad_norm": 16.157733917236328, "learning_rate": 1.1901666744045684e-07, "logits/chosen": -19.345333099365234, "logits/rejected": -19.110822677612305, "logps/chosen": -424.8900451660156, "logps/rejected": -393.66156005859375, "loss": 0.7829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.908501386642456, "rewards/margins": 0.35605549812316895, "rewards/rejected": 3.5524463653564453, "step": 65000 }, { "epoch": 3.0182459724221182, "grad_norm": 122.41165924072266, "learning_rate": 1.1898881099401087e-07, "logits/chosen": -19.27615737915039, "logits/rejected": -18.507640838623047, "logps/chosen": -539.7161254882812, "logps/rejected": -392.29290771484375, "loss": 0.7479, "rewards/accuracies": 0.5, "rewards/chosen": 3.626492738723755, "rewards/margins": 0.35841959714889526, "rewards/rejected": 3.268073320388794, "step": 65010 }, { "epoch": 3.018710246529551, 
"grad_norm": 182.49281311035156, "learning_rate": 1.1896095454756488e-07, "logits/chosen": -20.025693893432617, "logits/rejected": -18.998775482177734, "logps/chosen": -430.08770751953125, "logps/rejected": -369.6161804199219, "loss": 0.6325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.401181221008301, "rewards/margins": 0.7268285155296326, "rewards/rejected": 3.6743526458740234, "step": 65020 }, { "epoch": 3.019174520636984, "grad_norm": 18.973615646362305, "learning_rate": 1.189330981011189e-07, "logits/chosen": -17.768037796020508, "logits/rejected": -17.481178283691406, "logps/chosen": -428.2223205566406, "logps/rejected": -363.60906982421875, "loss": 1.0552, "rewards/accuracies": 0.5, "rewards/chosen": 2.7703633308410645, "rewards/margins": 0.18398301303386688, "rewards/rejected": 2.5863804817199707, "step": 65030 }, { "epoch": 3.019638794744417, "grad_norm": 80.02201843261719, "learning_rate": 1.1890524165467291e-07, "logits/chosen": -18.962175369262695, "logits/rejected": -18.4091796875, "logps/chosen": -324.35882568359375, "logps/rejected": -291.9663391113281, "loss": 0.804, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1358582973480225, "rewards/margins": 0.6419151425361633, "rewards/rejected": 1.493943214416504, "step": 65040 }, { "epoch": 3.02010306885185, "grad_norm": 2.6369662284851074, "learning_rate": 1.1887738520822694e-07, "logits/chosen": -18.650732040405273, "logits/rejected": -18.384510040283203, "logps/chosen": -412.8544006347656, "logps/rejected": -373.85260009765625, "loss": 0.6447, "rewards/accuracies": 0.5, "rewards/chosen": 3.3415141105651855, "rewards/margins": 1.0327186584472656, "rewards/rejected": 2.30879545211792, "step": 65050 }, { "epoch": 3.020567342959283, "grad_norm": 35.836280822753906, "learning_rate": 1.1884952876178095e-07, "logits/chosen": -20.215396881103516, "logits/rejected": -18.620248794555664, "logps/chosen": -434.49017333984375, "logps/rejected": -299.1611633300781, "loss": 0.1814, 
"rewards/accuracies": 1.0, "rewards/chosen": 4.821777820587158, "rewards/margins": 2.2537949085235596, "rewards/rejected": 2.5679826736450195, "step": 65060 }, { "epoch": 3.021031617066716, "grad_norm": 205.5990753173828, "learning_rate": 1.1882167231533498e-07, "logits/chosen": -18.324100494384766, "logits/rejected": -17.836360931396484, "logps/chosen": -244.57003784179688, "logps/rejected": -232.03988647460938, "loss": 1.2816, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.4992268085479736, "rewards/margins": 0.2097339630126953, "rewards/rejected": 1.2894928455352783, "step": 65070 }, { "epoch": 3.0214958911741494, "grad_norm": 20.374807357788086, "learning_rate": 1.1879381586888898e-07, "logits/chosen": -18.91117286682129, "logits/rejected": -18.409374237060547, "logps/chosen": -364.4774475097656, "logps/rejected": -339.2406921386719, "loss": 0.3103, "rewards/accuracies": 1.0, "rewards/chosen": 3.8386969566345215, "rewards/margins": 1.8058035373687744, "rewards/rejected": 2.032893657684326, "step": 65080 }, { "epoch": 3.021960165281582, "grad_norm": 73.15592193603516, "learning_rate": 1.1876595942244301e-07, "logits/chosen": -19.910783767700195, "logits/rejected": -19.240482330322266, "logps/chosen": -441.16534423828125, "logps/rejected": -363.8753356933594, "loss": 0.5838, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8134522438049316, "rewards/margins": 1.601378083229065, "rewards/rejected": 2.212073564529419, "step": 65090 }, { "epoch": 3.0224244393890154, "grad_norm": 0.6215285062789917, "learning_rate": 1.1873810297599702e-07, "logits/chosen": -18.83808708190918, "logits/rejected": -17.937904357910156, "logps/chosen": -358.18341064453125, "logps/rejected": -196.4701690673828, "loss": 0.4075, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0988802909851074, "rewards/margins": 2.096494197845459, "rewards/rejected": 1.0023858547210693, "step": 65100 }, { "epoch": 3.022888713496448, "grad_norm": 41.649261474609375, 
"learning_rate": 1.1871024652955104e-07, "logits/chosen": -19.043455123901367, "logits/rejected": -18.41672134399414, "logps/chosen": -518.2286376953125, "logps/rejected": -415.9820251464844, "loss": 0.3859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.888348579406738, "rewards/margins": 2.252443552017212, "rewards/rejected": 2.6359057426452637, "step": 65110 }, { "epoch": 3.0233529876038814, "grad_norm": 227.76254272460938, "learning_rate": 1.1868239008310506e-07, "logits/chosen": -19.218387603759766, "logits/rejected": -18.555221557617188, "logps/chosen": -408.22454833984375, "logps/rejected": -364.8914489746094, "loss": 0.6492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3649725914001465, "rewards/margins": 1.180690050125122, "rewards/rejected": 2.1842827796936035, "step": 65120 }, { "epoch": 3.023817261711314, "grad_norm": 0.9008312225341797, "learning_rate": 1.1865453363665907e-07, "logits/chosen": -18.957876205444336, "logits/rejected": -18.199846267700195, "logps/chosen": -385.10955810546875, "logps/rejected": -275.4433288574219, "loss": 1.0799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.844735860824585, "rewards/margins": 1.2695682048797607, "rewards/rejected": 1.5751678943634033, "step": 65130 }, { "epoch": 3.0242815358187474, "grad_norm": 33.438899993896484, "learning_rate": 1.186266771902131e-07, "logits/chosen": -19.327314376831055, "logits/rejected": -18.908977508544922, "logps/chosen": -333.555419921875, "logps/rejected": -277.29864501953125, "loss": 0.4751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.79504656791687, "rewards/margins": 1.084086537361145, "rewards/rejected": 2.7109599113464355, "step": 65140 }, { "epoch": 3.0247458099261806, "grad_norm": 168.69375610351562, "learning_rate": 1.1859882074376711e-07, "logits/chosen": -18.901103973388672, "logits/rejected": -18.496963500976562, "logps/chosen": -426.34820556640625, "logps/rejected": -397.4333190917969, "loss": 0.7015, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.298388719558716, "rewards/margins": 0.9481647610664368, "rewards/rejected": 2.350224018096924, "step": 65150 }, { "epoch": 3.0252100840336134, "grad_norm": 0.305258572101593, "learning_rate": 1.1857096429732114e-07, "logits/chosen": -19.616840362548828, "logits/rejected": -18.40278434753418, "logps/chosen": -445.7720642089844, "logps/rejected": -281.1853942871094, "loss": 0.5654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.130155563354492, "rewards/margins": 1.9045501947402954, "rewards/rejected": 2.2256052494049072, "step": 65160 }, { "epoch": 3.0256743581410466, "grad_norm": 39.32895278930664, "learning_rate": 1.1854310785087515e-07, "logits/chosen": -19.302061080932617, "logits/rejected": -17.288936614990234, "logps/chosen": -428.1477966308594, "logps/rejected": -295.75140380859375, "loss": 0.2597, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.648787260055542, "rewards/margins": 1.9643917083740234, "rewards/rejected": 1.684395432472229, "step": 65170 }, { "epoch": 3.0261386322484793, "grad_norm": 15.346819877624512, "learning_rate": 1.1851525140442918e-07, "logits/chosen": -19.567129135131836, "logits/rejected": -18.694351196289062, "logps/chosen": -456.34735107421875, "logps/rejected": -400.6210632324219, "loss": 0.2663, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.2151007652282715, "rewards/margins": 2.093723773956299, "rewards/rejected": 2.1213772296905518, "step": 65180 }, { "epoch": 3.0266029063559126, "grad_norm": 2.8140106201171875, "learning_rate": 1.1848739495798318e-07, "logits/chosen": -19.410091400146484, "logits/rejected": -17.884685516357422, "logps/chosen": -465.4248962402344, "logps/rejected": -343.6781311035156, "loss": 0.1451, "rewards/accuracies": 1.0, "rewards/chosen": 4.816763877868652, "rewards/margins": 2.9475350379943848, "rewards/rejected": 1.8692289590835571, "step": 65190 }, { "epoch": 3.0270671804633458, "grad_norm": 
134.09854125976562, "learning_rate": 1.1845953851153721e-07, "logits/chosen": -20.229909896850586, "logits/rejected": -19.723670959472656, "logps/chosen": -403.53277587890625, "logps/rejected": -382.08489990234375, "loss": 0.4579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1160383224487305, "rewards/margins": 1.507442831993103, "rewards/rejected": 2.608595132827759, "step": 65200 }, { "epoch": 3.0275314545707785, "grad_norm": 0.5236588716506958, "learning_rate": 1.1843168206509122e-07, "logits/chosen": -19.236488342285156, "logits/rejected": -17.963388442993164, "logps/chosen": -464.9892578125, "logps/rejected": -266.4188232421875, "loss": 0.4995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.562780857086182, "rewards/margins": 2.5703165531158447, "rewards/rejected": 1.9924646615982056, "step": 65210 }, { "epoch": 3.0279957286782118, "grad_norm": 0.09351092576980591, "learning_rate": 1.1840382561864525e-07, "logits/chosen": -18.74970245361328, "logits/rejected": -17.617595672607422, "logps/chosen": -420.437255859375, "logps/rejected": -300.52911376953125, "loss": 0.4385, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.0256571769714355, "rewards/margins": 3.0630786418914795, "rewards/rejected": 1.9625790119171143, "step": 65220 }, { "epoch": 3.0284600027856445, "grad_norm": 5.758636474609375, "learning_rate": 1.1837596917219926e-07, "logits/chosen": -18.480873107910156, "logits/rejected": -18.063051223754883, "logps/chosen": -354.4757385253906, "logps/rejected": -312.6415100097656, "loss": 0.4931, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.838238477706909, "rewards/margins": 1.1057698726654053, "rewards/rejected": 2.732468843460083, "step": 65230 }, { "epoch": 3.0289242768930777, "grad_norm": 30.11172103881836, "learning_rate": 1.1834811272575328e-07, "logits/chosen": -19.362735748291016, "logits/rejected": -19.11617660522461, "logps/chosen": -445.8998107910156, "logps/rejected": -431.70867919921875, 
"loss": 0.8601, "rewards/accuracies": 0.5, "rewards/chosen": 4.009389400482178, "rewards/margins": 0.33368343114852905, "rewards/rejected": 3.675706386566162, "step": 65240 }, { "epoch": 3.0293885510005105, "grad_norm": 57.21851348876953, "learning_rate": 1.183202562793073e-07, "logits/chosen": -18.369007110595703, "logits/rejected": -18.05186653137207, "logps/chosen": -347.5479431152344, "logps/rejected": -289.8197021484375, "loss": 0.7061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0338406562805176, "rewards/margins": 0.9449882507324219, "rewards/rejected": 2.0888524055480957, "step": 65250 }, { "epoch": 3.0298528251079437, "grad_norm": 53.16253662109375, "learning_rate": 1.1829239983286132e-07, "logits/chosen": -19.753101348876953, "logits/rejected": -19.37843894958496, "logps/chosen": -450.54327392578125, "logps/rejected": -412.98699951171875, "loss": 0.4654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0972208976745605, "rewards/margins": 1.2328157424926758, "rewards/rejected": 2.8644049167633057, "step": 65260 }, { "epoch": 3.030317099215377, "grad_norm": 177.10121154785156, "learning_rate": 1.1826454338641534e-07, "logits/chosen": -18.6683292388916, "logits/rejected": -17.661619186401367, "logps/chosen": -435.2184143066406, "logps/rejected": -316.92828369140625, "loss": 0.4284, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3730149269104004, "rewards/margins": 2.005842685699463, "rewards/rejected": 1.3671724796295166, "step": 65270 }, { "epoch": 3.0307813733228097, "grad_norm": 22.600156784057617, "learning_rate": 1.1823668693996936e-07, "logits/chosen": -19.81585693359375, "logits/rejected": -18.072566986083984, "logps/chosen": -496.777587890625, "logps/rejected": -311.2998046875, "loss": 0.3792, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.239666938781738, "rewards/margins": 1.60683274269104, "rewards/rejected": 2.632833957672119, "step": 65280 }, { "epoch": 3.031245647430243, "grad_norm": 
145.30198669433594, "learning_rate": 1.1820883049352336e-07, "logits/chosen": -18.423702239990234, "logits/rejected": -17.843528747558594, "logps/chosen": -354.99041748046875, "logps/rejected": -342.60357666015625, "loss": 0.6747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.445736885070801, "rewards/margins": 0.7175105810165405, "rewards/rejected": 1.728226661682129, "step": 65290 }, { "epoch": 3.0317099215376757, "grad_norm": 3.7551703453063965, "learning_rate": 1.1818097404707738e-07, "logits/chosen": -20.038089752197266, "logits/rejected": -19.219409942626953, "logps/chosen": -509.2640686035156, "logps/rejected": -379.5403137207031, "loss": 0.5435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.053174018859863, "rewards/margins": 1.4582531452178955, "rewards/rejected": 2.5949206352233887, "step": 65300 }, { "epoch": 3.032174195645109, "grad_norm": 169.08924865722656, "learning_rate": 1.1815311760063141e-07, "logits/chosen": -18.93584632873535, "logits/rejected": -18.439958572387695, "logps/chosen": -266.6789245605469, "logps/rejected": -260.4061584472656, "loss": 0.927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8560594320297241, "rewards/margins": 0.10133805125951767, "rewards/rejected": 1.7547214031219482, "step": 65310 }, { "epoch": 3.0326384697525417, "grad_norm": 20.724889755249023, "learning_rate": 1.1812526115418542e-07, "logits/chosen": -19.710771560668945, "logits/rejected": -18.656360626220703, "logps/chosen": -509.91168212890625, "logps/rejected": -465.6630859375, "loss": 0.8541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.469882965087891, "rewards/margins": 1.2261409759521484, "rewards/rejected": 3.243741989135742, "step": 65320 }, { "epoch": 3.033102743859975, "grad_norm": 61.45072555541992, "learning_rate": 1.1809740470773945e-07, "logits/chosen": -19.08738136291504, "logits/rejected": -17.701534271240234, "logps/chosen": -405.6854553222656, "logps/rejected": -335.1871643066406, 
"loss": 0.4342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2255940437316895, "rewards/margins": 1.4987766742706299, "rewards/rejected": 1.7268174886703491, "step": 65330 }, { "epoch": 3.033567017967408, "grad_norm": 143.54302978515625, "learning_rate": 1.1806954826129345e-07, "logits/chosen": -18.354660034179688, "logits/rejected": -18.38283348083496, "logps/chosen": -360.4578857421875, "logps/rejected": -367.60345458984375, "loss": 1.2722, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7147631645202637, "rewards/margins": -0.3810580372810364, "rewards/rejected": 3.0958211421966553, "step": 65340 }, { "epoch": 3.034031292074841, "grad_norm": 7.588525772094727, "learning_rate": 1.1804169181484748e-07, "logits/chosen": -19.39322280883789, "logits/rejected": -18.819246292114258, "logps/chosen": -394.4112243652344, "logps/rejected": -370.80841064453125, "loss": 0.5528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7593135833740234, "rewards/margins": 0.8755123019218445, "rewards/rejected": 2.883800983428955, "step": 65350 }, { "epoch": 3.034495566182274, "grad_norm": 45.43462371826172, "learning_rate": 1.1801383536840149e-07, "logits/chosen": -18.05387306213379, "logits/rejected": -17.70193099975586, "logps/chosen": -254.08447265625, "logps/rejected": -252.37649536132812, "loss": 0.7161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0276265144348145, "rewards/margins": 0.6241396069526672, "rewards/rejected": 1.403486967086792, "step": 65360 }, { "epoch": 3.034959840289707, "grad_norm": 0.2350730299949646, "learning_rate": 1.1798597892195552e-07, "logits/chosen": -18.43389129638672, "logits/rejected": -16.90018653869629, "logps/chosen": -459.7789001464844, "logps/rejected": -259.58148193359375, "loss": 0.7533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.341010570526123, "rewards/margins": 2.5528924465179443, "rewards/rejected": 1.7881183624267578, "step": 65370 }, { "epoch": 
3.03542411439714, "grad_norm": 46.614952087402344, "learning_rate": 1.1795812247550954e-07, "logits/chosen": -17.668106079101562, "logits/rejected": -18.069063186645508, "logps/chosen": -292.0118103027344, "logps/rejected": -275.9633483886719, "loss": 1.5389, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.161398410797119, "rewards/margins": -0.09238789230585098, "rewards/rejected": 2.253786325454712, "step": 65380 }, { "epoch": 3.0358883885045733, "grad_norm": 58.76260757446289, "learning_rate": 1.1793026602906356e-07, "logits/chosen": -18.60114288330078, "logits/rejected": -18.59439468383789, "logps/chosen": -388.7102355957031, "logps/rejected": -381.5356750488281, "loss": 0.7368, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4066009521484375, "rewards/margins": 0.4520389139652252, "rewards/rejected": 2.954562187194824, "step": 65390 }, { "epoch": 3.036352662612006, "grad_norm": 0.4193838834762573, "learning_rate": 1.1790240958261756e-07, "logits/chosen": -18.728681564331055, "logits/rejected": -18.086687088012695, "logps/chosen": -514.4826049804688, "logps/rejected": -330.6346435546875, "loss": 0.3087, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.279025554656982, "rewards/margins": 2.867213249206543, "rewards/rejected": 2.4118130207061768, "step": 65400 }, { "epoch": 3.0368169367194393, "grad_norm": 0.08740785717964172, "learning_rate": 1.1787455313617159e-07, "logits/chosen": -18.887022018432617, "logits/rejected": -18.191852569580078, "logps/chosen": -249.32687377929688, "logps/rejected": -176.98155212402344, "loss": 1.0485, "rewards/accuracies": 0.5, "rewards/chosen": 2.066199779510498, "rewards/margins": 1.0117623805999756, "rewards/rejected": 1.0544376373291016, "step": 65410 }, { "epoch": 3.037281210826872, "grad_norm": 35.522216796875, "learning_rate": 1.1784669668972561e-07, "logits/chosen": -18.61029624938965, "logits/rejected": -18.27273941040039, "logps/chosen": -411.19677734375, "logps/rejected": 
-372.02362060546875, "loss": 0.4836, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4984405040740967, "rewards/margins": 1.8932483196258545, "rewards/rejected": 1.605191946029663, "step": 65420 }, { "epoch": 3.0377454849343053, "grad_norm": 87.27299499511719, "learning_rate": 1.1781884024327963e-07, "logits/chosen": -19.129894256591797, "logits/rejected": -17.800838470458984, "logps/chosen": -364.7395935058594, "logps/rejected": -265.81024169921875, "loss": 0.5157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7635085582733154, "rewards/margins": 1.4737097024917603, "rewards/rejected": 2.2897984981536865, "step": 65430 }, { "epoch": 3.038209759041738, "grad_norm": 9.345403671264648, "learning_rate": 1.1779098379683365e-07, "logits/chosen": -18.725290298461914, "logits/rejected": -18.59339714050293, "logps/chosen": -347.1850280761719, "logps/rejected": -306.11383056640625, "loss": 0.4808, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7890450954437256, "rewards/margins": 1.017459511756897, "rewards/rejected": 1.7715858221054077, "step": 65440 }, { "epoch": 3.0386740331491713, "grad_norm": 98.12567901611328, "learning_rate": 1.1776312735038766e-07, "logits/chosen": -18.862323760986328, "logits/rejected": -19.308917999267578, "logps/chosen": -354.5274353027344, "logps/rejected": -437.9103088378906, "loss": 1.214, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4416162967681885, "rewards/margins": -0.10409190505743027, "rewards/rejected": 2.545708179473877, "step": 65450 }, { "epoch": 3.0391383072566045, "grad_norm": 190.71517944335938, "learning_rate": 1.1773527090394168e-07, "logits/chosen": -18.532642364501953, "logits/rejected": -18.615888595581055, "logps/chosen": -352.2543640136719, "logps/rejected": -382.1916198730469, "loss": 0.892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5900940895080566, "rewards/margins": 0.15152224898338318, "rewards/rejected": 2.4385714530944824, "step": 65460 }, 
{ "epoch": 3.0396025813640373, "grad_norm": 150.30589294433594, "learning_rate": 1.177074144574957e-07, "logits/chosen": -18.721065521240234, "logits/rejected": -18.26059341430664, "logps/chosen": -395.67254638671875, "logps/rejected": -359.615966796875, "loss": 1.0017, "rewards/accuracies": 0.5, "rewards/chosen": 3.3155674934387207, "rewards/margins": 0.4687426686286926, "rewards/rejected": 2.846824884414673, "step": 65470 }, { "epoch": 3.0400668554714705, "grad_norm": 68.14845275878906, "learning_rate": 1.1767955801104972e-07, "logits/chosen": -19.27129364013672, "logits/rejected": -18.165985107421875, "logps/chosen": -388.6185607910156, "logps/rejected": -313.00360107421875, "loss": 0.2943, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.757795810699463, "rewards/margins": 2.085439682006836, "rewards/rejected": 1.6723558902740479, "step": 65480 }, { "epoch": 3.0405311295789033, "grad_norm": 111.05321502685547, "learning_rate": 1.1765170156460375e-07, "logits/chosen": -18.00252342224121, "logits/rejected": -18.661785125732422, "logps/chosen": -291.5115661621094, "logps/rejected": -328.9423828125, "loss": 1.6751, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6936206817626953, "rewards/margins": -0.8806449174880981, "rewards/rejected": 2.574265480041504, "step": 65490 }, { "epoch": 3.0409954036863365, "grad_norm": 126.01097106933594, "learning_rate": 1.1762384511815775e-07, "logits/chosen": -19.167285919189453, "logits/rejected": -18.947856903076172, "logps/chosen": -405.86773681640625, "logps/rejected": -400.6535339355469, "loss": 0.657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.281686305999756, "rewards/margins": 0.7611933946609497, "rewards/rejected": 2.5204930305480957, "step": 65500 }, { "epoch": 3.0414596777937692, "grad_norm": 38.453617095947266, "learning_rate": 1.1759598867171176e-07, "logits/chosen": -19.395517349243164, "logits/rejected": -18.56939697265625, "logps/chosen": -313.9688720703125, 
"logps/rejected": -311.488037109375, "loss": 0.3902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0690789222717285, "rewards/margins": 1.4628835916519165, "rewards/rejected": 1.6061954498291016, "step": 65510 }, { "epoch": 3.0419239519012025, "grad_norm": 32.31110763549805, "learning_rate": 1.1756813222526579e-07, "logits/chosen": -18.638620376586914, "logits/rejected": -18.71898651123047, "logps/chosen": -421.9159240722656, "logps/rejected": -420.80670166015625, "loss": 1.4817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.389666795730591, "rewards/margins": 0.02174677886068821, "rewards/rejected": 3.367920398712158, "step": 65520 }, { "epoch": 3.0423882260086357, "grad_norm": 61.68999481201172, "learning_rate": 1.175402757788198e-07, "logits/chosen": -19.124053955078125, "logits/rejected": -19.011329650878906, "logps/chosen": -402.2270202636719, "logps/rejected": -418.039306640625, "loss": 0.5593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2422642707824707, "rewards/margins": 0.9009034037590027, "rewards/rejected": 2.341360569000244, "step": 65530 }, { "epoch": 3.0428525001160684, "grad_norm": 160.6179962158203, "learning_rate": 1.1751241933237383e-07, "logits/chosen": -18.718942642211914, "logits/rejected": -18.30453109741211, "logps/chosen": -458.7665100097656, "logps/rejected": -389.69732666015625, "loss": 0.868, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.465266704559326, "rewards/margins": -0.019998610019683838, "rewards/rejected": 3.485265016555786, "step": 65540 }, { "epoch": 3.0433167742235017, "grad_norm": 230.92774963378906, "learning_rate": 1.1748456288592784e-07, "logits/chosen": -18.80719566345215, "logits/rejected": -18.899383544921875, "logps/chosen": -358.56640625, "logps/rejected": -337.11004638671875, "loss": 1.1697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2789766788482666, "rewards/margins": 0.14407792687416077, "rewards/rejected": 2.1348986625671387, 
"step": 65550 }, { "epoch": 3.0437810483309344, "grad_norm": 20.86433219909668, "learning_rate": 1.1745670643948186e-07, "logits/chosen": -18.62791633605957, "logits/rejected": -18.859676361083984, "logps/chosen": -391.7728576660156, "logps/rejected": -429.37481689453125, "loss": 1.0172, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.99257230758667, "rewards/margins": -0.04233352094888687, "rewards/rejected": 3.0349059104919434, "step": 65560 }, { "epoch": 3.0442453224383677, "grad_norm": 170.14297485351562, "learning_rate": 1.1742884999303588e-07, "logits/chosen": -19.25775909423828, "logits/rejected": -18.450389862060547, "logps/chosen": -346.35003662109375, "logps/rejected": -250.9130096435547, "loss": 1.0881, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0762429237365723, "rewards/margins": 0.751091480255127, "rewards/rejected": 2.3251519203186035, "step": 65570 }, { "epoch": 3.0447095965458004, "grad_norm": 258.7189025878906, "learning_rate": 1.174009935465899e-07, "logits/chosen": -18.531354904174805, "logits/rejected": -17.99212074279785, "logps/chosen": -376.14605712890625, "logps/rejected": -390.7828674316406, "loss": 0.6771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6443068981170654, "rewards/margins": 1.1224174499511719, "rewards/rejected": 2.5218894481658936, "step": 65580 }, { "epoch": 3.0451738706532336, "grad_norm": 63.942626953125, "learning_rate": 1.1737313710014392e-07, "logits/chosen": -19.534948348999023, "logits/rejected": -18.538162231445312, "logps/chosen": -515.2427978515625, "logps/rejected": -309.7139587402344, "loss": 0.3218, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.889350175857544, "rewards/margins": 2.0562281608581543, "rewards/rejected": 1.8331222534179688, "step": 65590 }, { "epoch": 3.045638144760667, "grad_norm": 45.39886474609375, "learning_rate": 1.1734528065369795e-07, "logits/chosen": -18.418569564819336, "logits/rejected": -17.792205810546875, 
"logps/chosen": -308.4320983886719, "logps/rejected": -289.17291259765625, "loss": 0.7025, "rewards/accuracies": 0.5, "rewards/chosen": 2.1114649772644043, "rewards/margins": 0.6011210083961487, "rewards/rejected": 1.5103439092636108, "step": 65600 }, { "epoch": 3.0461024188680996, "grad_norm": 1.204017996788025, "learning_rate": 1.1731742420725195e-07, "logits/chosen": -19.786535263061523, "logits/rejected": -19.027128219604492, "logps/chosen": -313.15911865234375, "logps/rejected": -243.8239288330078, "loss": 0.4429, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5191330909729004, "rewards/margins": 1.9189571142196655, "rewards/rejected": 0.6001760363578796, "step": 65610 }, { "epoch": 3.046566692975533, "grad_norm": 31.22004508972168, "learning_rate": 1.1728956776080598e-07, "logits/chosen": -18.70255470275879, "logits/rejected": -18.255964279174805, "logps/chosen": -362.9483337402344, "logps/rejected": -283.9992980957031, "loss": 0.9249, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3495945930480957, "rewards/margins": 0.45856136083602905, "rewards/rejected": 1.8910331726074219, "step": 65620 }, { "epoch": 3.0470309670829656, "grad_norm": 3.3863725662231445, "learning_rate": 1.1726171131435999e-07, "logits/chosen": -19.187480926513672, "logits/rejected": -17.896638870239258, "logps/chosen": -511.5516662597656, "logps/rejected": -320.062744140625, "loss": 0.3576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.462841510772705, "rewards/margins": 2.6585793495178223, "rewards/rejected": 1.8042619228363037, "step": 65630 }, { "epoch": 3.047495241190399, "grad_norm": 120.62364196777344, "learning_rate": 1.1723385486791402e-07, "logits/chosen": -19.55696678161621, "logits/rejected": -19.090438842773438, "logps/chosen": -392.8951721191406, "logps/rejected": -404.2881774902344, "loss": 1.4023, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5605361461639404, "rewards/margins": -0.13988251984119415, 
"rewards/rejected": 3.700418472290039, "step": 65640 }, { "epoch": 3.047959515297832, "grad_norm": 1.2523518800735474, "learning_rate": 1.1720599842146803e-07, "logits/chosen": -18.838977813720703, "logits/rejected": -18.3169002532959, "logps/chosen": -385.6014709472656, "logps/rejected": -359.9165954589844, "loss": 1.6474, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.04638409614563, "rewards/margins": 0.31074896454811096, "rewards/rejected": 2.735635280609131, "step": 65650 }, { "epoch": 3.048423789405265, "grad_norm": 57.865936279296875, "learning_rate": 1.1717814197502205e-07, "logits/chosen": -18.696523666381836, "logits/rejected": -18.47295570373535, "logps/chosen": -390.1653137207031, "logps/rejected": -403.2447509765625, "loss": 0.5319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.277453422546387, "rewards/margins": 1.1633518934249878, "rewards/rejected": 3.1141018867492676, "step": 65660 }, { "epoch": 3.048888063512698, "grad_norm": 22.181625366210938, "learning_rate": 1.1715028552857606e-07, "logits/chosen": -20.293405532836914, "logits/rejected": -19.13498878479004, "logps/chosen": -397.87078857421875, "logps/rejected": -267.6587829589844, "loss": 0.3033, "rewards/accuracies": 1.0, "rewards/chosen": 3.428370952606201, "rewards/margins": 2.1506965160369873, "rewards/rejected": 1.2776744365692139, "step": 65670 }, { "epoch": 3.049352337620131, "grad_norm": 45.872982025146484, "learning_rate": 1.1712242908213009e-07, "logits/chosen": -19.043777465820312, "logits/rejected": -18.381072998046875, "logps/chosen": -342.51727294921875, "logps/rejected": -313.78729248046875, "loss": 0.7202, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0015320777893066, "rewards/margins": 0.7828377485275269, "rewards/rejected": 2.2186942100524902, "step": 65680 }, { "epoch": 3.049816611727564, "grad_norm": 16.369766235351562, "learning_rate": 1.170945726356841e-07, "logits/chosen": -19.33030128479004, "logits/rejected": 
-18.636775970458984, "logps/chosen": -397.3421936035156, "logps/rejected": -244.72109985351562, "loss": 0.4718, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.887329578399658, "rewards/margins": 1.975857138633728, "rewards/rejected": 1.9114720821380615, "step": 65690 }, { "epoch": 3.050280885834997, "grad_norm": 165.94351196289062, "learning_rate": 1.170667161892381e-07, "logits/chosen": -19.499757766723633, "logits/rejected": -19.205142974853516, "logps/chosen": -415.1053161621094, "logps/rejected": -368.10662841796875, "loss": 0.7079, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5872929096221924, "rewards/margins": 0.8258379697799683, "rewards/rejected": 2.7614550590515137, "step": 65700 }, { "epoch": 3.05074515994243, "grad_norm": 39.63022994995117, "learning_rate": 1.1703885974279213e-07, "logits/chosen": -18.279699325561523, "logits/rejected": -17.359346389770508, "logps/chosen": -357.66436767578125, "logps/rejected": -290.17901611328125, "loss": 0.4842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.398982286453247, "rewards/margins": 0.9353117942810059, "rewards/rejected": 1.4636703729629517, "step": 65710 }, { "epoch": 3.0512094340498632, "grad_norm": 19.9770450592041, "learning_rate": 1.1701100329634615e-07, "logits/chosen": -19.598522186279297, "logits/rejected": -18.479948043823242, "logps/chosen": -383.50213623046875, "logps/rejected": -269.65802001953125, "loss": 0.6564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.793280839920044, "rewards/margins": 1.4865052700042725, "rewards/rejected": 2.3067753314971924, "step": 65720 }, { "epoch": 3.051673708157296, "grad_norm": 17.258012771606445, "learning_rate": 1.1698314684990018e-07, "logits/chosen": -19.89318084716797, "logits/rejected": -19.35818099975586, "logps/chosen": -449.8636779785156, "logps/rejected": -393.197509765625, "loss": 0.4374, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4970386028289795, "rewards/margins": 
0.8854115605354309, "rewards/rejected": 2.6116273403167725, "step": 65730 }, { "epoch": 3.052137982264729, "grad_norm": 1.9355276823043823, "learning_rate": 1.1695529040345419e-07, "logits/chosen": -19.465896606445312, "logits/rejected": -18.028564453125, "logps/chosen": -335.7074890136719, "logps/rejected": -241.30130004882812, "loss": 0.3852, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9843430519104004, "rewards/margins": 2.6257386207580566, "rewards/rejected": 1.3586041927337646, "step": 65740 }, { "epoch": 3.052602256372162, "grad_norm": 38.943023681640625, "learning_rate": 1.1692743395700822e-07, "logits/chosen": -19.310155868530273, "logits/rejected": -18.64425277709961, "logps/chosen": -424.97821044921875, "logps/rejected": -357.76068115234375, "loss": 0.5292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.286144256591797, "rewards/margins": 1.1149089336395264, "rewards/rejected": 3.1712355613708496, "step": 65750 }, { "epoch": 3.053066530479595, "grad_norm": 66.15296173095703, "learning_rate": 1.1689957751056222e-07, "logits/chosen": -18.299114227294922, "logits/rejected": -18.38266944885254, "logps/chosen": -394.5773010253906, "logps/rejected": -352.3059997558594, "loss": 0.6305, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1939327716827393, "rewards/margins": 1.0767799615859985, "rewards/rejected": 2.117152690887451, "step": 65760 }, { "epoch": 3.053530804587028, "grad_norm": 43.32225036621094, "learning_rate": 1.1687172106411625e-07, "logits/chosen": -18.993675231933594, "logits/rejected": -17.523344039916992, "logps/chosen": -351.9111328125, "logps/rejected": -261.94232177734375, "loss": 0.3584, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.801654815673828, "rewards/margins": 1.9734296798706055, "rewards/rejected": 1.828224539756775, "step": 65770 }, { "epoch": 3.053995078694461, "grad_norm": 265.5395202636719, "learning_rate": 1.1684386461767026e-07, "logits/chosen": -18.969873428344727, 
"logits/rejected": -17.818422317504883, "logps/chosen": -475.98614501953125, "logps/rejected": -322.810302734375, "loss": 0.7263, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.801247596740723, "rewards/margins": 1.5235345363616943, "rewards/rejected": 3.2777130603790283, "step": 65780 }, { "epoch": 3.0544593528018944, "grad_norm": 50.270790100097656, "learning_rate": 1.1681600817122429e-07, "logits/chosen": -18.462596893310547, "logits/rejected": -17.475772857666016, "logps/chosen": -406.0968322753906, "logps/rejected": -301.2444152832031, "loss": 0.4021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6851532459259033, "rewards/margins": 1.9871124029159546, "rewards/rejected": 1.6980407238006592, "step": 65790 }, { "epoch": 3.054923626909327, "grad_norm": 7.8938446044921875, "learning_rate": 1.167881517247783e-07, "logits/chosen": -19.39281463623047, "logits/rejected": -18.242368698120117, "logps/chosen": -259.5730285644531, "logps/rejected": -200.04940795898438, "loss": 0.3699, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.433411121368408, "rewards/margins": 1.7609329223632812, "rewards/rejected": 1.6724780797958374, "step": 65800 }, { "epoch": 3.0553879010167604, "grad_norm": 65.09043884277344, "learning_rate": 1.1676029527833233e-07, "logits/chosen": -19.22539710998535, "logits/rejected": -18.84186553955078, "logps/chosen": -351.2909851074219, "logps/rejected": -322.1044006347656, "loss": 0.3855, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.593761920928955, "rewards/margins": 1.5585525035858154, "rewards/rejected": 2.0352091789245605, "step": 65810 }, { "epoch": 3.055852175124193, "grad_norm": 125.0597915649414, "learning_rate": 1.1673243883188633e-07, "logits/chosen": -18.57049560546875, "logits/rejected": -18.200302124023438, "logps/chosen": -381.9825439453125, "logps/rejected": -351.1324462890625, "loss": 0.6303, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.267932415008545, 
"rewards/margins": 0.4591507017612457, "rewards/rejected": 1.808781623840332, "step": 65820 }, { "epoch": 3.0563164492316264, "grad_norm": 19.93975257873535, "learning_rate": 1.1670458238544036e-07, "logits/chosen": -18.904939651489258, "logits/rejected": -18.109725952148438, "logps/chosen": -428.7832946777344, "logps/rejected": -325.90081787109375, "loss": 0.467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.597295045852661, "rewards/margins": 1.2182055711746216, "rewards/rejected": 2.37908935546875, "step": 65830 }, { "epoch": 3.0567807233390596, "grad_norm": 3.0812907218933105, "learning_rate": 1.1667672593899438e-07, "logits/chosen": -19.186059951782227, "logits/rejected": -17.821332931518555, "logps/chosen": -373.30535888671875, "logps/rejected": -246.80673217773438, "loss": 0.263, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8184115886688232, "rewards/margins": 2.776386260986328, "rewards/rejected": 1.0420252084732056, "step": 65840 }, { "epoch": 3.0572449974464924, "grad_norm": 88.58146667480469, "learning_rate": 1.166488694925484e-07, "logits/chosen": -19.44405746459961, "logits/rejected": -19.513286590576172, "logps/chosen": -399.0174255371094, "logps/rejected": -325.37103271484375, "loss": 1.0393, "rewards/accuracies": 0.5, "rewards/chosen": 3.8693995475769043, "rewards/margins": 0.04145312309265137, "rewards/rejected": 3.827946901321411, "step": 65850 }, { "epoch": 3.0577092715539256, "grad_norm": 24.88422203063965, "learning_rate": 1.1662101304610242e-07, "logits/chosen": -19.614421844482422, "logits/rejected": -17.91279411315918, "logps/chosen": -342.5053405761719, "logps/rejected": -245.0054931640625, "loss": 0.3384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9831252098083496, "rewards/margins": 2.1789889335632324, "rewards/rejected": 1.8041362762451172, "step": 65860 }, { "epoch": 3.0581735456613584, "grad_norm": 45.03822708129883, "learning_rate": 1.1659315659965643e-07, "logits/chosen": 
-19.4093074798584, "logits/rejected": -19.467538833618164, "logps/chosen": -369.66162109375, "logps/rejected": -383.93865966796875, "loss": 1.0807, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6786766052246094, "rewards/margins": 0.13525691628456116, "rewards/rejected": 3.543419361114502, "step": 65870 }, { "epoch": 3.0586378197687916, "grad_norm": 12.009915351867676, "learning_rate": 1.1656530015321045e-07, "logits/chosen": -18.80412483215332, "logits/rejected": -17.712726593017578, "logps/chosen": -417.86883544921875, "logps/rejected": -319.8647766113281, "loss": 0.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.220590591430664, "rewards/margins": 1.55600905418396, "rewards/rejected": 1.6645816564559937, "step": 65880 }, { "epoch": 3.0591020938762243, "grad_norm": 24.583572387695312, "learning_rate": 1.1653744370676448e-07, "logits/chosen": -18.772228240966797, "logits/rejected": -18.190568923950195, "logps/chosen": -453.080078125, "logps/rejected": -330.1734313964844, "loss": 0.6425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5422940254211426, "rewards/margins": 0.9472543001174927, "rewards/rejected": 2.5950398445129395, "step": 65890 }, { "epoch": 3.0595663679836576, "grad_norm": 52.01295471191406, "learning_rate": 1.1650958726031849e-07, "logits/chosen": -18.93203353881836, "logits/rejected": -18.230979919433594, "logps/chosen": -438.2674865722656, "logps/rejected": -319.5267639160156, "loss": 0.2438, "rewards/accuracies": 1.0, "rewards/chosen": 3.7900638580322266, "rewards/margins": 2.0120253562927246, "rewards/rejected": 1.7780386209487915, "step": 65900 }, { "epoch": 3.0600306420910908, "grad_norm": 56.22468566894531, "learning_rate": 1.1648173081387249e-07, "logits/chosen": -19.20456314086914, "logits/rejected": -17.65781593322754, "logps/chosen": -422.7762145996094, "logps/rejected": -354.05889892578125, "loss": 1.3618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.163924217224121, 
"rewards/margins": 0.7625974416732788, "rewards/rejected": 3.4013266563415527, "step": 65910 }, { "epoch": 3.0604949161985235, "grad_norm": 158.21632385253906, "learning_rate": 1.1645387436742652e-07, "logits/chosen": -19.327573776245117, "logits/rejected": -18.15005111694336, "logps/chosen": -360.2961730957031, "logps/rejected": -317.8775329589844, "loss": 0.5814, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3639039993286133, "rewards/margins": 1.2474409341812134, "rewards/rejected": 1.1164629459381104, "step": 65920 }, { "epoch": 3.0609591903059568, "grad_norm": 37.52373123168945, "learning_rate": 1.1642601792098053e-07, "logits/chosen": -18.368000030517578, "logits/rejected": -18.015342712402344, "logps/chosen": -331.74053955078125, "logps/rejected": -319.1133728027344, "loss": 0.6827, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1327898502349854, "rewards/margins": 0.5635767579078674, "rewards/rejected": 2.569213390350342, "step": 65930 }, { "epoch": 3.0614234644133895, "grad_norm": 47.84119415283203, "learning_rate": 1.1639816147453456e-07, "logits/chosen": -19.801868438720703, "logits/rejected": -19.609214782714844, "logps/chosen": -358.13555908203125, "logps/rejected": -270.13482666015625, "loss": 1.3431, "rewards/accuracies": 0.5, "rewards/chosen": 2.7127811908721924, "rewards/margins": 0.11852145195007324, "rewards/rejected": 2.594259738922119, "step": 65940 }, { "epoch": 3.0618877385208227, "grad_norm": 2.8359673023223877, "learning_rate": 1.1637030502808858e-07, "logits/chosen": -19.48849868774414, "logits/rejected": -18.351076126098633, "logps/chosen": -362.6304016113281, "logps/rejected": -287.2043151855469, "loss": 0.6276, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.760653257369995, "rewards/margins": 1.7321714162826538, "rewards/rejected": 2.028481960296631, "step": 65950 }, { "epoch": 3.0623520126282555, "grad_norm": 89.53739929199219, "learning_rate": 1.163424485816426e-07, "logits/chosen": 
-18.67723274230957, "logits/rejected": -18.40813446044922, "logps/chosen": -219.2836456298828, "logps/rejected": -197.8950653076172, "loss": 0.7689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6198651790618896, "rewards/margins": 0.31670793890953064, "rewards/rejected": 1.3031569719314575, "step": 65960 }, { "epoch": 3.0628162867356887, "grad_norm": 39.67695617675781, "learning_rate": 1.163145921351966e-07, "logits/chosen": -20.01321792602539, "logits/rejected": -19.32199478149414, "logps/chosen": -363.16741943359375, "logps/rejected": -293.38226318359375, "loss": 0.6462, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6167449951171875, "rewards/margins": 0.99481600522995, "rewards/rejected": 2.6219289302825928, "step": 65970 }, { "epoch": 3.063280560843122, "grad_norm": 15.48923397064209, "learning_rate": 1.1628673568875063e-07, "logits/chosen": -19.244070053100586, "logits/rejected": -18.353652954101562, "logps/chosen": -468.7652282714844, "logps/rejected": -417.6898498535156, "loss": 0.6102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.1835036277771, "rewards/margins": 0.9383661150932312, "rewards/rejected": 3.2451376914978027, "step": 65980 }, { "epoch": 3.0637448349505547, "grad_norm": 0.5781447291374207, "learning_rate": 1.1625887924230465e-07, "logits/chosen": -18.919836044311523, "logits/rejected": -18.50545310974121, "logps/chosen": -353.93707275390625, "logps/rejected": -307.9155578613281, "loss": 0.9088, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.448284864425659, "rewards/margins": 1.120057225227356, "rewards/rejected": 2.3282275199890137, "step": 65990 }, { "epoch": 3.064209109057988, "grad_norm": 17.057188034057617, "learning_rate": 1.1623102279585867e-07, "logits/chosen": -19.4190673828125, "logits/rejected": -18.833229064941406, "logps/chosen": -518.17919921875, "logps/rejected": -382.23773193359375, "loss": 0.3796, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
3.9063220024108887, "rewards/margins": 1.226630449295044, "rewards/rejected": 2.679691791534424, "step": 66000 }, { "epoch": 3.0646733831654207, "grad_norm": 242.17005920410156, "learning_rate": 1.1620316634941269e-07, "logits/chosen": -19.193510055541992, "logits/rejected": -19.211732864379883, "logps/chosen": -447.53375244140625, "logps/rejected": -389.50970458984375, "loss": 0.8555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.463486194610596, "rewards/margins": 0.9522415399551392, "rewards/rejected": 3.511244535446167, "step": 66010 }, { "epoch": 3.065137657272854, "grad_norm": 59.89949417114258, "learning_rate": 1.1617530990296672e-07, "logits/chosen": -18.964786529541016, "logits/rejected": -18.5172176361084, "logps/chosen": -453.1319885253906, "logps/rejected": -350.4217529296875, "loss": 0.6716, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.5836381912231445, "rewards/margins": 1.2264868021011353, "rewards/rejected": 3.357151508331299, "step": 66020 }, { "epoch": 3.0656019313802867, "grad_norm": 129.9123992919922, "learning_rate": 1.1614745345652072e-07, "logits/chosen": -18.055601119995117, "logits/rejected": -17.714984893798828, "logps/chosen": -351.0016784667969, "logps/rejected": -271.4361877441406, "loss": 0.7929, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0332114696502686, "rewards/margins": 0.8939601182937622, "rewards/rejected": 1.139251470565796, "step": 66030 }, { "epoch": 3.06606620548772, "grad_norm": 10.81173324584961, "learning_rate": 1.1611959701007475e-07, "logits/chosen": -18.957374572753906, "logits/rejected": -17.936939239501953, "logps/chosen": -395.5898132324219, "logps/rejected": -230.12081909179688, "loss": 0.3372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.012246608734131, "rewards/margins": 1.917624831199646, "rewards/rejected": 1.0946218967437744, "step": 66040 }, { "epoch": 3.066530479595153, "grad_norm": 24.417890548706055, "learning_rate": 
1.1609174056362876e-07, "logits/chosen": -18.622224807739258, "logits/rejected": -17.58414077758789, "logps/chosen": -331.39007568359375, "logps/rejected": -252.70401000976562, "loss": 0.3345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6519181728363037, "rewards/margins": 1.7421754598617554, "rewards/rejected": 0.909742534160614, "step": 66050 }, { "epoch": 3.066994753702586, "grad_norm": 81.9676513671875, "learning_rate": 1.1606388411718279e-07, "logits/chosen": -18.612586975097656, "logits/rejected": -18.384164810180664, "logps/chosen": -395.9844055175781, "logps/rejected": -341.72308349609375, "loss": 0.6116, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0548176765441895, "rewards/margins": 0.8988627195358276, "rewards/rejected": 2.1559548377990723, "step": 66060 }, { "epoch": 3.067459027810019, "grad_norm": 272.3399353027344, "learning_rate": 1.160360276707368e-07, "logits/chosen": -19.739688873291016, "logits/rejected": -19.699726104736328, "logps/chosen": -389.49896240234375, "logps/rejected": -423.9664611816406, "loss": 1.0232, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.0698447227478027, "rewards/margins": -0.20299629867076874, "rewards/rejected": 3.272841215133667, "step": 66070 }, { "epoch": 3.067923301917452, "grad_norm": 0.03918199986219406, "learning_rate": 1.1600817122429082e-07, "logits/chosen": -19.54303741455078, "logits/rejected": -18.80105209350586, "logps/chosen": -368.8663330078125, "logps/rejected": -303.8175048828125, "loss": 1.1297, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3484134674072266, "rewards/margins": 0.48196133971214294, "rewards/rejected": 1.8664519786834717, "step": 66080 }, { "epoch": 3.068387576024885, "grad_norm": 124.65621948242188, "learning_rate": 1.1598031477784483e-07, "logits/chosen": -18.527694702148438, "logits/rejected": -18.446269989013672, "logps/chosen": -344.7884216308594, "logps/rejected": -348.24560546875, "loss": 0.7711, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.4998939037323, "rewards/margins": 0.6713827848434448, "rewards/rejected": 2.8285109996795654, "step": 66090 }, { "epoch": 3.0688518501323183, "grad_norm": 0.4222617447376251, "learning_rate": 1.1595245833139885e-07, "logits/chosen": -19.011600494384766, "logits/rejected": -18.634639739990234, "logps/chosen": -300.49749755859375, "logps/rejected": -319.3083190917969, "loss": 1.4516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9137685298919678, "rewards/margins": 0.10196392238140106, "rewards/rejected": 2.811805248260498, "step": 66100 }, { "epoch": 3.069316124239751, "grad_norm": 23.15744400024414, "learning_rate": 1.1592460188495287e-07, "logits/chosen": -19.88494110107422, "logits/rejected": -19.68293571472168, "logps/chosen": -432.1783142089844, "logps/rejected": -424.2992248535156, "loss": 0.6704, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.584360122680664, "rewards/margins": 0.4345766603946686, "rewards/rejected": 3.1497836112976074, "step": 66110 }, { "epoch": 3.0697803983471843, "grad_norm": 102.33800506591797, "learning_rate": 1.1589674543850688e-07, "logits/chosen": -19.041860580444336, "logits/rejected": -18.47649383544922, "logps/chosen": -394.22528076171875, "logps/rejected": -335.6827087402344, "loss": 0.7884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7102279663085938, "rewards/margins": 0.9185444116592407, "rewards/rejected": 2.7916836738586426, "step": 66120 }, { "epoch": 3.070244672454617, "grad_norm": 33.007266998291016, "learning_rate": 1.158688889920609e-07, "logits/chosen": -19.87752914428711, "logits/rejected": -19.066877365112305, "logps/chosen": -494.96484375, "logps/rejected": -378.9730529785156, "loss": 0.3095, "rewards/accuracies": 1.0, "rewards/chosen": 5.0148115158081055, "rewards/margins": 1.7468703985214233, "rewards/rejected": 3.26794171333313, "step": 66130 }, { "epoch": 3.0707089465620503, "grad_norm": 11.754403114318848, 
"learning_rate": 1.1584103254561492e-07, "logits/chosen": -19.614463806152344, "logits/rejected": -19.006433486938477, "logps/chosen": -472.92401123046875, "logps/rejected": -405.0945739746094, "loss": 0.3826, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.1123576164245605, "rewards/margins": 2.0332813262939453, "rewards/rejected": 3.0790772438049316, "step": 66140 }, { "epoch": 3.071173220669483, "grad_norm": 108.77973175048828, "learning_rate": 1.1581317609916895e-07, "logits/chosen": -18.23562240600586, "logits/rejected": -17.489599227905273, "logps/chosen": -389.09051513671875, "logps/rejected": -300.4625549316406, "loss": 0.3765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.175393581390381, "rewards/margins": 1.6493608951568604, "rewards/rejected": 1.5260326862335205, "step": 66150 }, { "epoch": 3.0716374947769163, "grad_norm": 69.92823028564453, "learning_rate": 1.1578531965272296e-07, "logits/chosen": -18.72879981994629, "logits/rejected": -18.252492904663086, "logps/chosen": -533.7346801757812, "logps/rejected": -435.45361328125, "loss": 0.8566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.959155082702637, "rewards/margins": 1.0557188987731934, "rewards/rejected": 3.9034359455108643, "step": 66160 }, { "epoch": 3.0721017688843495, "grad_norm": 26.830171585083008, "learning_rate": 1.1575746320627699e-07, "logits/chosen": -18.81497573852539, "logits/rejected": -17.612531661987305, "logps/chosen": -416.03265380859375, "logps/rejected": -301.84515380859375, "loss": 0.924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.586495876312256, "rewards/margins": 1.9499146938323975, "rewards/rejected": 1.6365816593170166, "step": 66170 }, { "epoch": 3.0725660429917823, "grad_norm": 42.37699508666992, "learning_rate": 1.1572960675983099e-07, "logits/chosen": -19.675823211669922, "logits/rejected": -18.52693748474121, "logps/chosen": -398.41717529296875, "logps/rejected": -337.8659973144531, "loss": 0.5171, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.4283647537231445, "rewards/margins": 1.7783300876617432, "rewards/rejected": 2.650034189224243, "step": 66180 }, { "epoch": 3.0730303170992155, "grad_norm": 48.69160842895508, "learning_rate": 1.1570175031338502e-07, "logits/chosen": -18.614307403564453, "logits/rejected": -18.548736572265625, "logps/chosen": -433.00103759765625, "logps/rejected": -373.240966796875, "loss": 0.3847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.797452926635742, "rewards/margins": 1.0697386264801025, "rewards/rejected": 2.7277140617370605, "step": 66190 }, { "epoch": 3.0734945912066483, "grad_norm": 33.214439392089844, "learning_rate": 1.1567389386693903e-07, "logits/chosen": -19.089580535888672, "logits/rejected": -18.8167781829834, "logps/chosen": -360.1977233886719, "logps/rejected": -340.6993713378906, "loss": 0.718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4652695655822754, "rewards/margins": 0.4630666673183441, "rewards/rejected": 2.0022027492523193, "step": 66200 }, { "epoch": 3.0739588653140815, "grad_norm": 143.39654541015625, "learning_rate": 1.1564603742049306e-07, "logits/chosen": -19.40575408935547, "logits/rejected": -19.037160873413086, "logps/chosen": -400.1837158203125, "logps/rejected": -375.6471252441406, "loss": 0.8579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.211133003234863, "rewards/margins": 0.5972931385040283, "rewards/rejected": 3.6138393878936768, "step": 66210 }, { "epoch": 3.0744231394215147, "grad_norm": 20.671266555786133, "learning_rate": 1.1561818097404707e-07, "logits/chosen": -18.489225387573242, "logits/rejected": -17.279943466186523, "logps/chosen": -404.4412536621094, "logps/rejected": -268.1880798339844, "loss": 0.4607, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.886531352996826, "rewards/margins": 2.610291004180908, "rewards/rejected": 1.2762398719787598, "step": 66220 }, { "epoch": 3.0748874135289475, 
"grad_norm": 2.0497076511383057, "learning_rate": 1.155903245276011e-07, "logits/chosen": -18.846981048583984, "logits/rejected": -18.593711853027344, "logps/chosen": -387.68975830078125, "logps/rejected": -267.2164001464844, "loss": 0.5503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.807584762573242, "rewards/margins": 1.4521262645721436, "rewards/rejected": 1.3554580211639404, "step": 66230 }, { "epoch": 3.0753516876363807, "grad_norm": 12.081122398376465, "learning_rate": 1.155624680811551e-07, "logits/chosen": -19.18600082397461, "logits/rejected": -18.302690505981445, "logps/chosen": -359.0491027832031, "logps/rejected": -279.80474853515625, "loss": 0.8872, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.475170135498047, "rewards/margins": 1.1739970445632935, "rewards/rejected": 2.301173210144043, "step": 66240 }, { "epoch": 3.0758159617438134, "grad_norm": 0.04843546450138092, "learning_rate": 1.1553461163470913e-07, "logits/chosen": -18.478191375732422, "logits/rejected": -17.593128204345703, "logps/chosen": -472.3604431152344, "logps/rejected": -343.552978515625, "loss": 0.929, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.272411346435547, "rewards/margins": 2.429642915725708, "rewards/rejected": 1.842768907546997, "step": 66250 }, { "epoch": 3.0762802358512467, "grad_norm": 197.95285034179688, "learning_rate": 1.1550675518826315e-07, "logits/chosen": -19.227590560913086, "logits/rejected": -18.838857650756836, "logps/chosen": -386.30511474609375, "logps/rejected": -339.54437255859375, "loss": 0.5089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.015377998352051, "rewards/margins": 1.2446386814117432, "rewards/rejected": 2.7707393169403076, "step": 66260 }, { "epoch": 3.0767445099586794, "grad_norm": 33.599178314208984, "learning_rate": 1.1547889874181717e-07, "logits/chosen": -18.889074325561523, "logits/rejected": -18.212236404418945, "logps/chosen": -395.2259216308594, "logps/rejected": 
-269.57110595703125, "loss": 0.443, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.56689715385437, "rewards/margins": 1.669263243675232, "rewards/rejected": 1.8976342678070068, "step": 66270 }, { "epoch": 3.0772087840661126, "grad_norm": 1.4551820755004883, "learning_rate": 1.1545104229537119e-07, "logits/chosen": -18.796268463134766, "logits/rejected": -18.000873565673828, "logps/chosen": -435.26348876953125, "logps/rejected": -313.9231872558594, "loss": 1.105, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.618597507476807, "rewards/margins": 1.4138715267181396, "rewards/rejected": 3.204725742340088, "step": 66280 }, { "epoch": 3.077673058173546, "grad_norm": 0.38179248571395874, "learning_rate": 1.1542318584892519e-07, "logits/chosen": -18.66800880432129, "logits/rejected": -18.362545013427734, "logps/chosen": -350.8585205078125, "logps/rejected": -244.9369354248047, "loss": 0.9256, "rewards/accuracies": 0.5, "rewards/chosen": 2.2827987670898438, "rewards/margins": 0.4460299015045166, "rewards/rejected": 1.8367688655853271, "step": 66290 }, { "epoch": 3.0781373322809786, "grad_norm": 157.65086364746094, "learning_rate": 1.1539532940247922e-07, "logits/chosen": -19.24032974243164, "logits/rejected": -19.35284996032715, "logps/chosen": -404.716552734375, "logps/rejected": -429.68292236328125, "loss": 0.8132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.1171064376831055, "rewards/margins": 0.7872132062911987, "rewards/rejected": 3.329893112182617, "step": 66300 }, { "epoch": 3.078601606388412, "grad_norm": 65.16325378417969, "learning_rate": 1.1536747295603323e-07, "logits/chosen": -18.617013931274414, "logits/rejected": -17.665386199951172, "logps/chosen": -515.0370483398438, "logps/rejected": -354.242919921875, "loss": 0.507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.451320171356201, "rewards/margins": 1.16579270362854, "rewards/rejected": 2.2855277061462402, "step": 66310 }, { "epoch": 
3.0790658804958446, "grad_norm": 24.850555419921875, "learning_rate": 1.1533961650958726e-07, "logits/chosen": -17.571731567382812, "logits/rejected": -17.152912139892578, "logps/chosen": -231.25350952148438, "logps/rejected": -205.0401611328125, "loss": 0.6556, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3522759675979614, "rewards/margins": 0.8412330746650696, "rewards/rejected": 0.5110427141189575, "step": 66320 }, { "epoch": 3.079530154603278, "grad_norm": 71.38986206054688, "learning_rate": 1.1531176006314126e-07, "logits/chosen": -19.295013427734375, "logits/rejected": -18.747379302978516, "logps/chosen": -372.7110595703125, "logps/rejected": -298.2708435058594, "loss": 0.288, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.382537364959717, "rewards/margins": 1.5907598733901978, "rewards/rejected": 1.7917778491973877, "step": 66330 }, { "epoch": 3.0799944287107106, "grad_norm": 120.13450622558594, "learning_rate": 1.1528390361669529e-07, "logits/chosen": -18.302364349365234, "logits/rejected": -18.04378890991211, "logps/chosen": -448.40911865234375, "logps/rejected": -402.352294921875, "loss": 0.4792, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.27946662902832, "rewards/margins": 1.6170494556427002, "rewards/rejected": 2.66241717338562, "step": 66340 }, { "epoch": 3.080458702818144, "grad_norm": 75.56692504882812, "learning_rate": 1.152560471702493e-07, "logits/chosen": -19.285629272460938, "logits/rejected": -18.023223876953125, "logps/chosen": -374.3695068359375, "logps/rejected": -317.9856872558594, "loss": 0.4355, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.575812578201294, "rewards/margins": 1.795255422592163, "rewards/rejected": 1.7805572748184204, "step": 66350 }, { "epoch": 3.080922976925577, "grad_norm": 34.241416931152344, "learning_rate": 1.1522819072380333e-07, "logits/chosen": -19.943552017211914, "logits/rejected": -19.056156158447266, "logps/chosen": -388.00128173828125, 
"logps/rejected": -313.8125, "loss": 0.8166, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.392210960388184, "rewards/margins": 1.6229890584945679, "rewards/rejected": 2.7692222595214844, "step": 66360 }, { "epoch": 3.08138725103301, "grad_norm": 34.587135314941406, "learning_rate": 1.1520033427735735e-07, "logits/chosen": -20.248218536376953, "logits/rejected": -19.06503677368164, "logps/chosen": -270.78839111328125, "logps/rejected": -261.7185974121094, "loss": 0.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.624904155731201, "rewards/margins": 1.624528169631958, "rewards/rejected": 2.0003762245178223, "step": 66370 }, { "epoch": 3.081851525140443, "grad_norm": 26.649124145507812, "learning_rate": 1.1517247783091137e-07, "logits/chosen": -19.2844181060791, "logits/rejected": -18.873292922973633, "logps/chosen": -373.1421813964844, "logps/rejected": -318.2558898925781, "loss": 0.575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8537392616271973, "rewards/margins": 1.17717707157135, "rewards/rejected": 2.6765620708465576, "step": 66380 }, { "epoch": 3.082315799247876, "grad_norm": 305.13421630859375, "learning_rate": 1.1514462138446537e-07, "logits/chosen": -18.162832260131836, "logits/rejected": -18.18544578552246, "logps/chosen": -368.9871520996094, "logps/rejected": -417.6725158691406, "loss": 1.4654, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5674935579299927, "rewards/margins": -0.8581429719924927, "rewards/rejected": 2.4256365299224854, "step": 66390 }, { "epoch": 3.082780073355309, "grad_norm": 0.8625071048736572, "learning_rate": 1.151167649380194e-07, "logits/chosen": -18.063114166259766, "logits/rejected": -17.29492950439453, "logps/chosen": -372.7585754394531, "logps/rejected": -302.39727783203125, "loss": 0.386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.507181167602539, "rewards/margins": 2.143646717071533, "rewards/rejected": 1.3635343313217163, "step": 66400 }, { 
"epoch": 3.083244347462742, "grad_norm": 171.34121704101562, "learning_rate": 1.1508890849157342e-07, "logits/chosen": -20.19678497314453, "logits/rejected": -19.46656608581543, "logps/chosen": -538.6659545898438, "logps/rejected": -380.81414794921875, "loss": 0.273, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.847005844116211, "rewards/margins": 2.093447208404541, "rewards/rejected": 3.753559112548828, "step": 66410 }, { "epoch": 3.083708621570175, "grad_norm": 33.78889846801758, "learning_rate": 1.1506105204512744e-07, "logits/chosen": -18.44780921936035, "logits/rejected": -18.957401275634766, "logps/chosen": -405.4013366699219, "logps/rejected": -463.72625732421875, "loss": 1.6371, "rewards/accuracies": 0.5, "rewards/chosen": 3.298476457595825, "rewards/margins": -0.5094574689865112, "rewards/rejected": 3.807934284210205, "step": 66420 }, { "epoch": 3.084172895677608, "grad_norm": 19.289831161499023, "learning_rate": 1.1503319559868146e-07, "logits/chosen": -19.16415023803711, "logits/rejected": -17.618762969970703, "logps/chosen": -355.80035400390625, "logps/rejected": -248.92416381835938, "loss": 0.4674, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.90873646736145, "rewards/margins": 1.873318076133728, "rewards/rejected": 2.035418748855591, "step": 66430 }, { "epoch": 3.084637169785041, "grad_norm": 48.891357421875, "learning_rate": 1.1500533915223549e-07, "logits/chosen": -19.98491668701172, "logits/rejected": -18.493051528930664, "logps/chosen": -485.431640625, "logps/rejected": -382.10174560546875, "loss": 0.3102, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.078632831573486, "rewards/margins": 1.9611765146255493, "rewards/rejected": 2.1174559593200684, "step": 66440 }, { "epoch": 3.085101443892474, "grad_norm": 237.8365478515625, "learning_rate": 1.1497748270578949e-07, "logits/chosen": -19.073816299438477, "logits/rejected": -18.705326080322266, "logps/chosen": -388.1728210449219, "logps/rejected": 
-308.03253173828125, "loss": 1.0286, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.78180193901062, "rewards/margins": 1.6336148977279663, "rewards/rejected": 2.148186206817627, "step": 66450 }, { "epoch": 3.085565717999907, "grad_norm": 18.00642204284668, "learning_rate": 1.1494962625934352e-07, "logits/chosen": -19.519977569580078, "logits/rejected": -18.40237045288086, "logps/chosen": -371.505859375, "logps/rejected": -276.63531494140625, "loss": 0.2622, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.611672401428223, "rewards/margins": 2.1977601051330566, "rewards/rejected": 2.413912296295166, "step": 66460 }, { "epoch": 3.08602999210734, "grad_norm": 228.5675048828125, "learning_rate": 1.1492176981289753e-07, "logits/chosen": -19.06328582763672, "logits/rejected": -17.90316390991211, "logps/chosen": -339.05316162109375, "logps/rejected": -230.6685028076172, "loss": 0.7466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0653443336486816, "rewards/margins": 1.84273362159729, "rewards/rejected": 1.2226107120513916, "step": 66470 }, { "epoch": 3.0864942662147734, "grad_norm": 158.5803985595703, "learning_rate": 1.1489391336645156e-07, "logits/chosen": -19.326553344726562, "logits/rejected": -18.8377628326416, "logps/chosen": -578.7924194335938, "logps/rejected": -478.6268615722656, "loss": 0.5532, "rewards/accuracies": 0.5, "rewards/chosen": 5.2754926681518555, "rewards/margins": 1.0877232551574707, "rewards/rejected": 4.187769412994385, "step": 66480 }, { "epoch": 3.086958540322206, "grad_norm": 14.16277027130127, "learning_rate": 1.1486605692000557e-07, "logits/chosen": -18.87337875366211, "logits/rejected": -18.62118148803711, "logps/chosen": -338.1708679199219, "logps/rejected": -281.9027099609375, "loss": 0.6278, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9448189735412598, "rewards/margins": 0.9002315402030945, "rewards/rejected": 2.0445876121520996, "step": 66490 }, { "epoch": 3.0874228144296394, 
"grad_norm": 0.04463633522391319, "learning_rate": 1.1483820047355957e-07, "logits/chosen": -20.18872833251953, "logits/rejected": -18.936552047729492, "logps/chosen": -423.31658935546875, "logps/rejected": -325.3919372558594, "loss": 0.5132, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.050202369689941, "rewards/margins": 2.016181468963623, "rewards/rejected": 2.03402042388916, "step": 66500 }, { "epoch": 3.087887088537072, "grad_norm": 222.2135772705078, "learning_rate": 1.148103440271136e-07, "logits/chosen": -19.364192962646484, "logits/rejected": -18.534013748168945, "logps/chosen": -419.34588623046875, "logps/rejected": -299.9665832519531, "loss": 0.6036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.042979717254639, "rewards/margins": 2.5102429389953613, "rewards/rejected": 2.5327370166778564, "step": 66510 }, { "epoch": 3.0883513626445054, "grad_norm": 4.528761863708496, "learning_rate": 1.1478248758066762e-07, "logits/chosen": -19.467023849487305, "logits/rejected": -19.66616439819336, "logps/chosen": -298.12176513671875, "logps/rejected": -290.7601013183594, "loss": 1.5311, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8736188411712646, "rewards/margins": 0.041863441467285156, "rewards/rejected": 2.8317551612854004, "step": 66520 }, { "epoch": 3.088815636751938, "grad_norm": 20.0657901763916, "learning_rate": 1.1475463113422164e-07, "logits/chosen": -20.169103622436523, "logits/rejected": -18.943111419677734, "logps/chosen": -380.60919189453125, "logps/rejected": -332.9676818847656, "loss": 1.0275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4374184608459473, "rewards/margins": 0.47913092374801636, "rewards/rejected": 2.958287239074707, "step": 66530 }, { "epoch": 3.0892799108593714, "grad_norm": 95.45450592041016, "learning_rate": 1.1472677468777565e-07, "logits/chosen": -19.120840072631836, "logits/rejected": -18.7499942779541, "logps/chosen": -356.5346374511719, "logps/rejected": 
-313.4200134277344, "loss": 0.9194, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3617489337921143, "rewards/margins": 0.07239391654729843, "rewards/rejected": 2.2893548011779785, "step": 66540 }, { "epoch": 3.0897441849668046, "grad_norm": 31.701416015625, "learning_rate": 1.1469891824132967e-07, "logits/chosen": -19.09398078918457, "logits/rejected": -18.102584838867188, "logps/chosen": -365.6229248046875, "logps/rejected": -268.8418884277344, "loss": 1.0389, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.273686170578003, "rewards/margins": 0.9469336271286011, "rewards/rejected": 2.3267524242401123, "step": 66550 }, { "epoch": 3.0902084590742374, "grad_norm": 15.61379623413086, "learning_rate": 1.1467106179488369e-07, "logits/chosen": -18.539165496826172, "logits/rejected": -17.577335357666016, "logps/chosen": -444.98773193359375, "logps/rejected": -309.34112548828125, "loss": 0.5314, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9458134174346924, "rewards/margins": 1.0198843479156494, "rewards/rejected": 1.9259288311004639, "step": 66560 }, { "epoch": 3.0906727331816706, "grad_norm": 59.16691207885742, "learning_rate": 1.1464320534843772e-07, "logits/chosen": -18.55667495727539, "logits/rejected": -18.548295974731445, "logps/chosen": -347.7099304199219, "logps/rejected": -374.7320251464844, "loss": 1.3566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5941758155822754, "rewards/margins": -0.43474873900413513, "rewards/rejected": 3.0289244651794434, "step": 66570 }, { "epoch": 3.0911370072891033, "grad_norm": 129.8006134033203, "learning_rate": 1.1461534890199173e-07, "logits/chosen": -19.70531463623047, "logits/rejected": -18.184293746948242, "logps/chosen": -422.876953125, "logps/rejected": -277.0671691894531, "loss": 0.545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.712313175201416, "rewards/margins": 1.9953725337982178, "rewards/rejected": 1.7169402837753296, "step": 66580 }, { 
"epoch": 3.0916012813965366, "grad_norm": 0.4296107590198517, "learning_rate": 1.1458749245554576e-07, "logits/chosen": -18.95619773864746, "logits/rejected": -17.468229293823242, "logps/chosen": -349.914306640625, "logps/rejected": -212.2705078125, "loss": 0.4219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.513343095779419, "rewards/margins": 2.551222562789917, "rewards/rejected": 0.962120532989502, "step": 66590 }, { "epoch": 3.0920655555039693, "grad_norm": 74.51094818115234, "learning_rate": 1.1455963600909976e-07, "logits/chosen": -19.025279998779297, "logits/rejected": -18.184162139892578, "logps/chosen": -361.12969970703125, "logps/rejected": -295.7315979003906, "loss": 0.6515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0373566150665283, "rewards/margins": 1.0448437929153442, "rewards/rejected": 1.9925124645233154, "step": 66600 }, { "epoch": 3.0925298296114025, "grad_norm": 73.78241729736328, "learning_rate": 1.1453177956265379e-07, "logits/chosen": -18.667789459228516, "logits/rejected": -17.824203491210938, "logps/chosen": -268.3610534667969, "logps/rejected": -188.25790405273438, "loss": 0.419, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.220919132232666, "rewards/margins": 1.5065511465072632, "rewards/rejected": 0.7143679857254028, "step": 66610 }, { "epoch": 3.0929941037188358, "grad_norm": 79.73027038574219, "learning_rate": 1.145039231162078e-07, "logits/chosen": -17.59648323059082, "logits/rejected": -17.59098243713379, "logps/chosen": -433.63427734375, "logps/rejected": -436.605712890625, "loss": 0.9634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.886805772781372, "rewards/margins": 0.4678807854652405, "rewards/rejected": 3.4189248085021973, "step": 66620 }, { "epoch": 3.0934583778262685, "grad_norm": 197.17373657226562, "learning_rate": 1.1447606666976183e-07, "logits/chosen": -17.94784927368164, "logits/rejected": -17.716869354248047, "logps/chosen": -259.69403076171875, 
"logps/rejected": -253.1649932861328, "loss": 0.9212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.882584810256958, "rewards/margins": 0.8334003686904907, "rewards/rejected": 1.0491844415664673, "step": 66630 }, { "epoch": 3.0939226519337018, "grad_norm": 54.78140640258789, "learning_rate": 1.1444821022331584e-07, "logits/chosen": -18.549718856811523, "logits/rejected": -17.821460723876953, "logps/chosen": -327.4386291503906, "logps/rejected": -266.2513122558594, "loss": 0.661, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4039974212646484, "rewards/margins": 1.5881140232086182, "rewards/rejected": 1.8158830404281616, "step": 66640 }, { "epoch": 3.0943869260411345, "grad_norm": 22.7796688079834, "learning_rate": 1.1442035377686987e-07, "logits/chosen": -19.367040634155273, "logits/rejected": -18.442636489868164, "logps/chosen": -455.21392822265625, "logps/rejected": -398.7106018066406, "loss": 0.2857, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.087313652038574, "rewards/margins": 1.7227847576141357, "rewards/rejected": 2.3645291328430176, "step": 66650 }, { "epoch": 3.0948512001485677, "grad_norm": 24.693527221679688, "learning_rate": 1.1439528297506848e-07, "logits/chosen": -18.660200119018555, "logits/rejected": -17.308759689331055, "logps/chosen": -417.83343505859375, "logps/rejected": -292.1025390625, "loss": 1.1309, "rewards/accuracies": 0.5, "rewards/chosen": 3.4628148078918457, "rewards/margins": 1.0829801559448242, "rewards/rejected": 2.3798344135284424, "step": 66660 }, { "epoch": 3.095315474256001, "grad_norm": 44.42707061767578, "learning_rate": 1.143674265286225e-07, "logits/chosen": -20.017244338989258, "logits/rejected": -18.775066375732422, "logps/chosen": -364.42254638671875, "logps/rejected": -310.53558349609375, "loss": 0.5157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4447293281555176, "rewards/margins": 1.1516746282577515, "rewards/rejected": 2.2930545806884766, "step": 66670 }, 
{ "epoch": 3.0957797483634337, "grad_norm": 26.43155288696289, "learning_rate": 1.1433957008217652e-07, "logits/chosen": -19.566402435302734, "logits/rejected": -18.989566802978516, "logps/chosen": -350.7408447265625, "logps/rejected": -280.47088623046875, "loss": 0.4158, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.701305627822876, "rewards/margins": 1.104381799697876, "rewards/rejected": 1.596923589706421, "step": 66680 }, { "epoch": 3.096244022470867, "grad_norm": 32.8138427734375, "learning_rate": 1.1431171363573052e-07, "logits/chosen": -20.00524139404297, "logits/rejected": -18.694232940673828, "logps/chosen": -451.04345703125, "logps/rejected": -307.9918518066406, "loss": 0.3119, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.810675621032715, "rewards/margins": 2.366389513015747, "rewards/rejected": 2.4442861080169678, "step": 66690 }, { "epoch": 3.0967082965782997, "grad_norm": 143.203857421875, "learning_rate": 1.1428385718928455e-07, "logits/chosen": -20.02630043029785, "logits/rejected": -19.940624237060547, "logps/chosen": -488.03656005859375, "logps/rejected": -384.63714599609375, "loss": 0.8482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.881772994995117, "rewards/margins": 0.7111079692840576, "rewards/rejected": 4.1706647872924805, "step": 66700 }, { "epoch": 3.097172570685733, "grad_norm": 4.759716510772705, "learning_rate": 1.1425600074283856e-07, "logits/chosen": -18.781841278076172, "logits/rejected": -17.99764060974121, "logps/chosen": -369.08074951171875, "logps/rejected": -336.40460205078125, "loss": 0.4334, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7428040504455566, "rewards/margins": 1.7634910345077515, "rewards/rejected": 1.9793126583099365, "step": 66710 }, { "epoch": 3.0976368447931657, "grad_norm": 146.62030029296875, "learning_rate": 1.1422814429639259e-07, "logits/chosen": -18.808246612548828, "logits/rejected": -17.700775146484375, "logps/chosen": -491.85260009765625, 
"logps/rejected": -334.20941162109375, "loss": 0.6891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3625648021698, "rewards/margins": 1.0601229667663574, "rewards/rejected": 2.3024418354034424, "step": 66720 }, { "epoch": 3.098101118900599, "grad_norm": 47.28657913208008, "learning_rate": 1.1420028784994661e-07, "logits/chosen": -18.592876434326172, "logits/rejected": -18.296953201293945, "logps/chosen": -418.69952392578125, "logps/rejected": -376.36248779296875, "loss": 0.8062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8151772022247314, "rewards/margins": 0.9450546503067017, "rewards/rejected": 1.8701225519180298, "step": 66730 }, { "epoch": 3.098565393008032, "grad_norm": 16.343847274780273, "learning_rate": 1.1417243140350064e-07, "logits/chosen": -18.637168884277344, "logits/rejected": -17.36216163635254, "logps/chosen": -303.97760009765625, "logps/rejected": -227.42575073242188, "loss": 0.6805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5297672748565674, "rewards/margins": 0.6999168992042542, "rewards/rejected": 1.829850435256958, "step": 66740 }, { "epoch": 3.099029667115465, "grad_norm": 84.50820922851562, "learning_rate": 1.1414457495705464e-07, "logits/chosen": -19.727680206298828, "logits/rejected": -19.43231201171875, "logps/chosen": -304.01715087890625, "logps/rejected": -285.8614807128906, "loss": 0.5797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2337520122528076, "rewards/margins": 0.6392213702201843, "rewards/rejected": 2.5945305824279785, "step": 66750 }, { "epoch": 3.099493941222898, "grad_norm": 25.943273544311523, "learning_rate": 1.1411671851060865e-07, "logits/chosen": -19.84617805480957, "logits/rejected": -19.319599151611328, "logps/chosen": -418.6039123535156, "logps/rejected": -390.17987060546875, "loss": 0.7283, "rewards/accuracies": 0.5, "rewards/chosen": 4.16648530960083, "rewards/margins": 1.25941801071167, "rewards/rejected": 2.9070675373077393, "step": 66760 }, 
{ "epoch": 3.099958215330331, "grad_norm": 77.33187103271484, "learning_rate": 1.1408886206416268e-07, "logits/chosen": -19.522411346435547, "logits/rejected": -18.583263397216797, "logps/chosen": -370.43035888671875, "logps/rejected": -262.77435302734375, "loss": 0.3659, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.398355960845947, "rewards/margins": 2.2180397510528564, "rewards/rejected": 2.180316209793091, "step": 66770 }, { "epoch": 3.100422489437764, "grad_norm": 12.63890552520752, "learning_rate": 1.1406100561771669e-07, "logits/chosen": -19.713777542114258, "logits/rejected": -18.98276710510254, "logps/chosen": -436.3150939941406, "logps/rejected": -372.02532958984375, "loss": 0.4387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.52497673034668, "rewards/margins": 1.9667236804962158, "rewards/rejected": 2.558253288269043, "step": 66780 }, { "epoch": 3.100886763545197, "grad_norm": 2.3489327430725098, "learning_rate": 1.1403314917127072e-07, "logits/chosen": -19.13698959350586, "logits/rejected": -20.03108024597168, "logps/chosen": -312.2425842285156, "logps/rejected": -392.984375, "loss": 1.3808, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.208569049835205, "rewards/margins": -0.1497945487499237, "rewards/rejected": 3.358363389968872, "step": 66790 }, { "epoch": 3.10135103765263, "grad_norm": 199.43408203125, "learning_rate": 1.1400529272482472e-07, "logits/chosen": -18.468708038330078, "logits/rejected": -18.625463485717773, "logps/chosen": -288.85711669921875, "logps/rejected": -252.608154296875, "loss": 1.0446, "rewards/accuracies": 0.5, "rewards/chosen": 1.7454038858413696, "rewards/margins": 0.06501825153827667, "rewards/rejected": 1.6803855895996094, "step": 66800 }, { "epoch": 3.1018153117600633, "grad_norm": 158.6285400390625, "learning_rate": 1.1397743627837875e-07, "logits/chosen": -19.718069076538086, "logits/rejected": -19.26660919189453, "logps/chosen": -337.25067138671875, "logps/rejected": 
-270.01788330078125, "loss": 0.5763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7421491146087646, "rewards/margins": 1.1261957883834839, "rewards/rejected": 1.6159532070159912, "step": 66810 }, { "epoch": 3.102279585867496, "grad_norm": 45.768619537353516, "learning_rate": 1.1394957983193276e-07, "logits/chosen": -18.440465927124023, "logits/rejected": -18.34907341003418, "logps/chosen": -449.69464111328125, "logps/rejected": -444.4087829589844, "loss": 1.2729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.724104404449463, "rewards/margins": 0.7899040579795837, "rewards/rejected": 2.9342005252838135, "step": 66820 }, { "epoch": 3.1027438599749293, "grad_norm": 180.5124969482422, "learning_rate": 1.1392172338548679e-07, "logits/chosen": -19.908090591430664, "logits/rejected": -18.655643463134766, "logps/chosen": -507.4718322753906, "logps/rejected": -386.6015625, "loss": 0.8347, "rewards/accuracies": 0.5, "rewards/chosen": 4.134166717529297, "rewards/margins": 1.1276775598526, "rewards/rejected": 3.0064890384674072, "step": 66830 }, { "epoch": 3.103208134082362, "grad_norm": 4.886013984680176, "learning_rate": 1.138938669390408e-07, "logits/chosen": -18.582422256469727, "logits/rejected": -17.629770278930664, "logps/chosen": -447.7030334472656, "logps/rejected": -305.35943603515625, "loss": 0.227, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.145058631896973, "rewards/margins": 2.3875980377197266, "rewards/rejected": 1.757460594177246, "step": 66840 }, { "epoch": 3.1036724081897953, "grad_norm": 94.7003173828125, "learning_rate": 1.1386601049259482e-07, "logits/chosen": -19.015127182006836, "logits/rejected": -19.043224334716797, "logps/chosen": -387.55316162109375, "logps/rejected": -417.02142333984375, "loss": 0.7225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6304078102111816, "rewards/margins": 0.8871790766716003, "rewards/rejected": 2.7432289123535156, "step": 66850 }, { "epoch": 
3.104136682297228, "grad_norm": 120.27532958984375, "learning_rate": 1.1383815404614884e-07, "logits/chosen": -19.20401954650879, "logits/rejected": -19.21893310546875, "logps/chosen": -361.2694091796875, "logps/rejected": -354.5793762207031, "loss": 1.2544, "rewards/accuracies": 0.5, "rewards/chosen": 3.5568275451660156, "rewards/margins": 0.32877349853515625, "rewards/rejected": 3.2280540466308594, "step": 66860 }, { "epoch": 3.1046009564046613, "grad_norm": 1.117595911026001, "learning_rate": 1.1381029759970286e-07, "logits/chosen": -18.814847946166992, "logits/rejected": -18.90951919555664, "logps/chosen": -273.52032470703125, "logps/rejected": -314.10150146484375, "loss": 0.9955, "rewards/accuracies": 0.5, "rewards/chosen": 2.066580057144165, "rewards/margins": 0.33517318964004517, "rewards/rejected": 1.7314069271087646, "step": 66870 }, { "epoch": 3.1050652305120945, "grad_norm": 7.091444492340088, "learning_rate": 1.1378244115325688e-07, "logits/chosen": -19.70234489440918, "logits/rejected": -18.453893661499023, "logps/chosen": -481.6611328125, "logps/rejected": -351.7064208984375, "loss": 0.4061, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.191405296325684, "rewards/margins": 2.7803688049316406, "rewards/rejected": 2.411036491394043, "step": 66880 }, { "epoch": 3.1055295046195273, "grad_norm": 49.50304412841797, "learning_rate": 1.137545847068109e-07, "logits/chosen": -18.47310447692871, "logits/rejected": -18.98880958557129, "logps/chosen": -247.2539825439453, "logps/rejected": -330.97833251953125, "loss": 1.6149, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.5894787311553955, "rewards/margins": -0.5952485799789429, "rewards/rejected": 3.184727191925049, "step": 66890 }, { "epoch": 3.1059937787269605, "grad_norm": 29.409101486206055, "learning_rate": 1.1372672826036491e-07, "logits/chosen": -19.173198699951172, "logits/rejected": -18.440872192382812, "logps/chosen": -367.8867492675781, "logps/rejected": -284.498291015625, 
"loss": 0.3605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9262642860412598, "rewards/margins": 1.6073710918426514, "rewards/rejected": 2.3188929557800293, "step": 66900 }, { "epoch": 3.1064580528343932, "grad_norm": 199.62220764160156, "learning_rate": 1.1369887181391894e-07, "logits/chosen": -18.506425857543945, "logits/rejected": -18.304580688476562, "logps/chosen": -307.92352294921875, "logps/rejected": -346.5059814453125, "loss": 1.1813, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2221555709838867, "rewards/margins": -0.39362069964408875, "rewards/rejected": 2.6157760620117188, "step": 66910 }, { "epoch": 3.1069223269418265, "grad_norm": 2.6002442836761475, "learning_rate": 1.1367101536747295e-07, "logits/chosen": -19.359088897705078, "logits/rejected": -18.07248306274414, "logps/chosen": -410.4520568847656, "logps/rejected": -312.68182373046875, "loss": 0.5056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.83599591255188, "rewards/margins": 1.613994836807251, "rewards/rejected": 2.222001552581787, "step": 66920 }, { "epoch": 3.1073866010492597, "grad_norm": 54.3389892578125, "learning_rate": 1.1364315892102698e-07, "logits/chosen": -18.04263687133789, "logits/rejected": -18.09185218811035, "logps/chosen": -338.24749755859375, "logps/rejected": -381.79388427734375, "loss": 1.7696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.293541193008423, "rewards/margins": -0.7307166457176208, "rewards/rejected": 3.0242576599121094, "step": 66930 }, { "epoch": 3.1078508751566924, "grad_norm": 155.9128875732422, "learning_rate": 1.1361530247458099e-07, "logits/chosen": -19.802757263183594, "logits/rejected": -19.710391998291016, "logps/chosen": -317.5160217285156, "logps/rejected": -266.252685546875, "loss": 1.1814, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0068411827087402, "rewards/margins": -0.2891191840171814, "rewards/rejected": 3.2959606647491455, "step": 66940 }, { "epoch": 
3.1083151492641257, "grad_norm": 45.084739685058594, "learning_rate": 1.1358744602813499e-07, "logits/chosen": -20.183574676513672, "logits/rejected": -20.044307708740234, "logps/chosen": -358.889892578125, "logps/rejected": -386.7212219238281, "loss": 0.771, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.911508560180664, "rewards/margins": 1.188947081565857, "rewards/rejected": 2.7225615978240967, "step": 66950 }, { "epoch": 3.1087794233715584, "grad_norm": 236.652587890625, "learning_rate": 1.1355958958168902e-07, "logits/chosen": -20.387683868408203, "logits/rejected": -18.712547302246094, "logps/chosen": -461.199462890625, "logps/rejected": -343.59210205078125, "loss": 0.8326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.359091758728027, "rewards/margins": 1.2149817943572998, "rewards/rejected": 3.1441099643707275, "step": 66960 }, { "epoch": 3.1092436974789917, "grad_norm": 6.764323711395264, "learning_rate": 1.1353173313524304e-07, "logits/chosen": -19.585180282592773, "logits/rejected": -18.712505340576172, "logps/chosen": -343.04168701171875, "logps/rejected": -249.2831573486328, "loss": 0.4056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.867526054382324, "rewards/margins": 1.4521942138671875, "rewards/rejected": 1.4153318405151367, "step": 66970 }, { "epoch": 3.1097079715864244, "grad_norm": 43.62452697753906, "learning_rate": 1.1350387668879706e-07, "logits/chosen": -19.952104568481445, "logits/rejected": -19.57330894470215, "logps/chosen": -462.6958923339844, "logps/rejected": -415.5152893066406, "loss": 0.8581, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.209109306335449, "rewards/margins": 1.0235705375671387, "rewards/rejected": 3.1855387687683105, "step": 66980 }, { "epoch": 3.1101722456938576, "grad_norm": 144.6719970703125, "learning_rate": 1.1347602024235108e-07, "logits/chosen": -18.908113479614258, "logits/rejected": -19.120813369750977, "logps/chosen": -298.97894287109375, 
"logps/rejected": -336.3805236816406, "loss": 1.1297, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6446304321289062, "rewards/margins": 0.1742049753665924, "rewards/rejected": 2.470425605773926, "step": 66990 }, { "epoch": 3.110636519801291, "grad_norm": 45.93043518066406, "learning_rate": 1.134481637959051e-07, "logits/chosen": -20.024526596069336, "logits/rejected": -18.796688079833984, "logps/chosen": -506.0901794433594, "logps/rejected": -418.1231994628906, "loss": 0.2866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.150625705718994, "rewards/margins": 2.0039145946502686, "rewards/rejected": 3.1467103958129883, "step": 67000 }, { "epoch": 3.1111007939087236, "grad_norm": 1.0028330087661743, "learning_rate": 1.1342030734945911e-07, "logits/chosen": -18.937040328979492, "logits/rejected": -17.823572158813477, "logps/chosen": -397.4775390625, "logps/rejected": -325.84320068359375, "loss": 0.142, "rewards/accuracies": 1.0, "rewards/chosen": 4.1475324630737305, "rewards/margins": 2.4364876747131348, "rewards/rejected": 1.7110445499420166, "step": 67010 }, { "epoch": 3.111565068016157, "grad_norm": 62.73357391357422, "learning_rate": 1.1339245090301313e-07, "logits/chosen": -18.59276580810547, "logits/rejected": -19.080303192138672, "logps/chosen": -403.21795654296875, "logps/rejected": -423.32708740234375, "loss": 1.0346, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8152036666870117, "rewards/margins": 0.05283334106206894, "rewards/rejected": 2.7623701095581055, "step": 67020 }, { "epoch": 3.1120293421235896, "grad_norm": 269.6317138671875, "learning_rate": 1.1336459445656715e-07, "logits/chosen": -19.743671417236328, "logits/rejected": -19.398847579956055, "logps/chosen": -447.8219299316406, "logps/rejected": -454.4864196777344, "loss": 0.954, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.427377223968506, "rewards/margins": 0.03847801685333252, "rewards/rejected": 4.388899326324463, "step": 67030 }, 
{ "epoch": 3.112493616231023, "grad_norm": 39.84797286987305, "learning_rate": 1.1333673801012118e-07, "logits/chosen": -17.71625328063965, "logits/rejected": -18.028310775756836, "logps/chosen": -303.1602783203125, "logps/rejected": -270.39508056640625, "loss": 0.5287, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.488387107849121, "rewards/margins": 0.9409855008125305, "rewards/rejected": 1.547401785850525, "step": 67040 }, { "epoch": 3.112957890338456, "grad_norm": 61.72859191894531, "learning_rate": 1.1330888156367518e-07, "logits/chosen": -18.18740463256836, "logits/rejected": -17.75364112854004, "logps/chosen": -248.3064727783203, "logps/rejected": -216.1739044189453, "loss": 1.0002, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7443159222602844, "rewards/margins": 0.0008850872400216758, "rewards/rejected": 0.7434309124946594, "step": 67050 }, { "epoch": 3.113422164445889, "grad_norm": 36.716712951660156, "learning_rate": 1.132810251172292e-07, "logits/chosen": -19.37521743774414, "logits/rejected": -18.48681640625, "logps/chosen": -466.39312744140625, "logps/rejected": -288.275390625, "loss": 0.2566, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.288741111755371, "rewards/margins": 2.008000373840332, "rewards/rejected": 2.280740261077881, "step": 67060 }, { "epoch": 3.113886438553322, "grad_norm": 176.23046875, "learning_rate": 1.1325316867078322e-07, "logits/chosen": -19.7840576171875, "logits/rejected": -19.12393569946289, "logps/chosen": -370.7850341796875, "logps/rejected": -317.92657470703125, "loss": 0.8307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6404201984405518, "rewards/margins": 0.631384015083313, "rewards/rejected": 3.009036064147949, "step": 67070 }, { "epoch": 3.114350712660755, "grad_norm": 61.7280387878418, "learning_rate": 1.1322531222433725e-07, "logits/chosen": -18.93716812133789, "logits/rejected": -18.852079391479492, "logps/chosen": -267.28076171875, "logps/rejected": 
-270.0380859375, "loss": 0.8373, "rewards/accuracies": 0.5, "rewards/chosen": 1.7555134296417236, "rewards/margins": 0.27625638246536255, "rewards/rejected": 1.4792569875717163, "step": 67080 }, { "epoch": 3.114814986768188, "grad_norm": 15.064223289489746, "learning_rate": 1.1319745577789126e-07, "logits/chosen": -19.241840362548828, "logits/rejected": -18.079177856445312, "logps/chosen": -329.6924133300781, "logps/rejected": -258.5164794921875, "loss": 0.3488, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9938764572143555, "rewards/margins": 1.7026214599609375, "rewards/rejected": 1.2912547588348389, "step": 67090 }, { "epoch": 3.115279260875621, "grad_norm": 73.00105285644531, "learning_rate": 1.1316959933144529e-07, "logits/chosen": -19.7717342376709, "logits/rejected": -18.423946380615234, "logps/chosen": -353.28277587890625, "logps/rejected": -209.5947723388672, "loss": 0.4676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3769354820251465, "rewards/margins": 1.6156864166259766, "rewards/rejected": 0.761248767375946, "step": 67100 }, { "epoch": 3.115743534983054, "grad_norm": 68.47991180419922, "learning_rate": 1.1314174288499929e-07, "logits/chosen": -18.62782096862793, "logits/rejected": -18.79193115234375, "logps/chosen": -404.9267883300781, "logps/rejected": -405.02740478515625, "loss": 0.8269, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1394829750061035, "rewards/margins": 0.15882106125354767, "rewards/rejected": 2.9806621074676514, "step": 67110 }, { "epoch": 3.1162078090904872, "grad_norm": 50.79605484008789, "learning_rate": 1.1311388643855332e-07, "logits/chosen": -20.12552261352539, "logits/rejected": -19.326725006103516, "logps/chosen": -375.3333740234375, "logps/rejected": -274.206787109375, "loss": 0.5745, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.853236675262451, "rewards/margins": 1.903649091720581, "rewards/rejected": 1.9495874643325806, "step": 67120 }, { "epoch": 
3.11667208319792, "grad_norm": 211.93203735351562, "learning_rate": 1.1308602999210733e-07, "logits/chosen": -20.88617515563965, "logits/rejected": -19.383358001708984, "logps/chosen": -323.4493103027344, "logps/rejected": -279.01214599609375, "loss": 0.4673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8060946464538574, "rewards/margins": 1.296852946281433, "rewards/rejected": 1.5092418193817139, "step": 67130 }, { "epoch": 3.117136357305353, "grad_norm": 73.7500228881836, "learning_rate": 1.1305817354566136e-07, "logits/chosen": -18.769412994384766, "logits/rejected": -17.759613037109375, "logps/chosen": -375.6852111816406, "logps/rejected": -322.0465393066406, "loss": 0.6366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2928779125213623, "rewards/margins": 0.9870203137397766, "rewards/rejected": 2.3058576583862305, "step": 67140 }, { "epoch": 3.117600631412786, "grad_norm": 17.15449333190918, "learning_rate": 1.1303031709921538e-07, "logits/chosen": -19.621898651123047, "logits/rejected": -18.91791534423828, "logps/chosen": -371.2992248535156, "logps/rejected": -344.106689453125, "loss": 0.7675, "rewards/accuracies": 0.5, "rewards/chosen": 3.8635241985321045, "rewards/margins": 0.6237925291061401, "rewards/rejected": 3.239731550216675, "step": 67150 }, { "epoch": 3.118064905520219, "grad_norm": 3.643505811691284, "learning_rate": 1.1300246065276938e-07, "logits/chosen": -18.136455535888672, "logits/rejected": -18.606897354125977, "logps/chosen": -267.52117919921875, "logps/rejected": -323.24066162109375, "loss": 0.9379, "rewards/accuracies": 0.5, "rewards/chosen": 2.138364315032959, "rewards/margins": 0.5009730458259583, "rewards/rejected": 1.6373916864395142, "step": 67160 }, { "epoch": 3.118529179627652, "grad_norm": 0.03937371447682381, "learning_rate": 1.129746042063234e-07, "logits/chosen": -19.857318878173828, "logits/rejected": -19.28955078125, "logps/chosen": -390.8401794433594, "logps/rejected": -288.6545104980469, 
"loss": 0.3968, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.930548906326294, "rewards/margins": 1.8061549663543701, "rewards/rejected": 2.1243934631347656, "step": 67170 }, { "epoch": 3.118993453735085, "grad_norm": 26.89557647705078, "learning_rate": 1.1294674775987742e-07, "logits/chosen": -20.073259353637695, "logits/rejected": -19.297616958618164, "logps/chosen": -461.2373046875, "logps/rejected": -415.4537658691406, "loss": 0.7281, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.064743995666504, "rewards/margins": 0.8800773620605469, "rewards/rejected": 3.184666633605957, "step": 67180 }, { "epoch": 3.1194577278425184, "grad_norm": 14.472155570983887, "learning_rate": 1.1291889131343145e-07, "logits/chosen": -19.195199966430664, "logits/rejected": -18.985416412353516, "logps/chosen": -340.288330078125, "logps/rejected": -363.43353271484375, "loss": 1.024, "rewards/accuracies": 0.5, "rewards/chosen": 2.7458503246307373, "rewards/margins": 0.13555726408958435, "rewards/rejected": 2.61029314994812, "step": 67190 }, { "epoch": 3.119922001949951, "grad_norm": 10.528304100036621, "learning_rate": 1.1289103486698546e-07, "logits/chosen": -18.882877349853516, "logits/rejected": -17.74367332458496, "logps/chosen": -302.235107421875, "logps/rejected": -236.29660034179688, "loss": 0.3753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0732035636901855, "rewards/margins": 1.4387470483779907, "rewards/rejected": 0.6344567537307739, "step": 67200 }, { "epoch": 3.1203862760573844, "grad_norm": 0.48929938673973083, "learning_rate": 1.1286317842053949e-07, "logits/chosen": -19.586498260498047, "logits/rejected": -17.521251678466797, "logps/chosen": -422.28350830078125, "logps/rejected": -216.022705078125, "loss": 0.3185, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.348813533782959, "rewards/margins": 3.045175075531006, "rewards/rejected": 2.3036389350891113, "step": 67210 }, { "epoch": 3.120850550164817, 
"grad_norm": 69.8687744140625, "learning_rate": 1.1283532197409349e-07, "logits/chosen": -19.14596176147461, "logits/rejected": -18.85504150390625, "logps/chosen": -521.4150390625, "logps/rejected": -439.578857421875, "loss": 1.4557, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7131247520446777, "rewards/margins": -0.5273540019989014, "rewards/rejected": 4.240479469299316, "step": 67220 }, { "epoch": 3.1213148242722504, "grad_norm": 102.91175842285156, "learning_rate": 1.1280746552764752e-07, "logits/chosen": -19.223392486572266, "logits/rejected": -19.24676513671875, "logps/chosen": -469.32440185546875, "logps/rejected": -436.6070251464844, "loss": 0.7833, "rewards/accuracies": 0.5, "rewards/chosen": 4.571046352386475, "rewards/margins": 0.07002434879541397, "rewards/rejected": 4.501021862030029, "step": 67230 }, { "epoch": 3.121779098379683, "grad_norm": 162.5532989501953, "learning_rate": 1.1277960908120153e-07, "logits/chosen": -18.882183074951172, "logits/rejected": -18.04250717163086, "logps/chosen": -501.4503479003906, "logps/rejected": -323.88214111328125, "loss": 0.6642, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.957012176513672, "rewards/margins": 1.9712730646133423, "rewards/rejected": 1.9857393503189087, "step": 67240 }, { "epoch": 3.1222433724871164, "grad_norm": 4.467339515686035, "learning_rate": 1.1275175263475556e-07, "logits/chosen": -19.75029754638672, "logits/rejected": -19.430883407592773, "logps/chosen": -352.48956298828125, "logps/rejected": -310.1704406738281, "loss": 0.4539, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9754672050476074, "rewards/margins": 1.1944868564605713, "rewards/rejected": 2.780980348587036, "step": 67250 }, { "epoch": 3.1227076465945496, "grad_norm": 123.52070617675781, "learning_rate": 1.1272389618830956e-07, "logits/chosen": -18.832569122314453, "logits/rejected": -19.138614654541016, "logps/chosen": -318.60357666015625, "logps/rejected": -358.08843994140625, 
"loss": 0.7591, "rewards/accuracies": 0.5, "rewards/chosen": 2.515779495239258, "rewards/margins": 0.02422969415783882, "rewards/rejected": 2.4915499687194824, "step": 67260 }, { "epoch": 3.1231719207019824, "grad_norm": 228.66165161132812, "learning_rate": 1.1269603974186359e-07, "logits/chosen": -18.267410278320312, "logits/rejected": -18.098783493041992, "logps/chosen": -274.1250915527344, "logps/rejected": -267.24951171875, "loss": 0.6636, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.298283338546753, "rewards/margins": 1.0089728832244873, "rewards/rejected": 2.2893104553222656, "step": 67270 }, { "epoch": 3.1236361948094156, "grad_norm": 92.25291442871094, "learning_rate": 1.126681832954176e-07, "logits/chosen": -19.71935272216797, "logits/rejected": -18.578210830688477, "logps/chosen": -494.2062072753906, "logps/rejected": -371.83184814453125, "loss": 0.9085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.338141441345215, "rewards/margins": 0.9530990719795227, "rewards/rejected": 3.3850417137145996, "step": 67280 }, { "epoch": 3.1241004689168483, "grad_norm": 139.45156860351562, "learning_rate": 1.1264032684897163e-07, "logits/chosen": -19.346494674682617, "logits/rejected": -18.762332916259766, "logps/chosen": -473.16119384765625, "logps/rejected": -366.11431884765625, "loss": 0.3971, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.316367149353027, "rewards/margins": 1.8842461109161377, "rewards/rejected": 2.4321205615997314, "step": 67290 }, { "epoch": 3.1245647430242816, "grad_norm": 34.92865753173828, "learning_rate": 1.1261247040252565e-07, "logits/chosen": -20.473735809326172, "logits/rejected": -19.03647232055664, "logps/chosen": -452.66082763671875, "logps/rejected": -314.20220947265625, "loss": 0.4316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9678597450256348, "rewards/margins": 2.2150793075561523, "rewards/rejected": 1.752780556678772, "step": 67300 }, { "epoch": 3.1250290171317148, 
"grad_norm": 70.01771545410156, "learning_rate": 1.1258461395607968e-07, "logits/chosen": -18.672672271728516, "logits/rejected": -18.029115676879883, "logps/chosen": -397.15814208984375, "logps/rejected": -315.05682373046875, "loss": 0.4643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.912963390350342, "rewards/margins": 1.7685964107513428, "rewards/rejected": 2.14436674118042, "step": 67310 }, { "epoch": 3.1254932912391475, "grad_norm": 0.02162094973027706, "learning_rate": 1.1255675750963368e-07, "logits/chosen": -18.786582946777344, "logits/rejected": -17.86394500732422, "logps/chosen": -420.3421936035156, "logps/rejected": -283.82904052734375, "loss": 1.0512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4008426666259766, "rewards/margins": 1.521348237991333, "rewards/rejected": 1.8794949054718018, "step": 67320 }, { "epoch": 3.1259575653465808, "grad_norm": 1.3204612731933594, "learning_rate": 1.125289010631877e-07, "logits/chosen": -18.57138442993164, "logits/rejected": -17.469045639038086, "logps/chosen": -293.79681396484375, "logps/rejected": -283.23822021484375, "loss": 0.6822, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.497936964035034, "rewards/margins": 1.1372952461242676, "rewards/rejected": 1.3606420755386353, "step": 67330 }, { "epoch": 3.1264218394540135, "grad_norm": 4.990281581878662, "learning_rate": 1.1250104461674172e-07, "logits/chosen": -19.041519165039062, "logits/rejected": -18.678869247436523, "logps/chosen": -381.8709716796875, "logps/rejected": -309.1249084472656, "loss": 0.5522, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8242125511169434, "rewards/margins": 1.246817946434021, "rewards/rejected": 1.5773944854736328, "step": 67340 }, { "epoch": 3.1268861135614467, "grad_norm": 23.621118545532227, "learning_rate": 1.1247318817029573e-07, "logits/chosen": -20.049821853637695, "logits/rejected": -19.160083770751953, "logps/chosen": -397.34063720703125, "logps/rejected": 
-319.1978454589844, "loss": 0.2579, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.0820536613464355, "rewards/margins": 1.9761730432510376, "rewards/rejected": 2.1058809757232666, "step": 67350 }, { "epoch": 3.1273503876688795, "grad_norm": 143.13844299316406, "learning_rate": 1.1244533172384976e-07, "logits/chosen": -19.558691024780273, "logits/rejected": -19.73038101196289, "logps/chosen": -340.2951354980469, "logps/rejected": -372.2430114746094, "loss": 1.0257, "rewards/accuracies": 0.5, "rewards/chosen": 3.1900858879089355, "rewards/margins": -0.03613276407122612, "rewards/rejected": 3.2262184619903564, "step": 67360 }, { "epoch": 3.1278146617763127, "grad_norm": 0.28995564579963684, "learning_rate": 1.1241747527740376e-07, "logits/chosen": -18.22031593322754, "logits/rejected": -18.19266128540039, "logps/chosen": -467.99072265625, "logps/rejected": -450.10052490234375, "loss": 1.0635, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.8964359760284424, "rewards/margins": 0.321045458316803, "rewards/rejected": 3.575390577316284, "step": 67370 }, { "epoch": 3.128278935883746, "grad_norm": 233.78125, "learning_rate": 1.1238961883095779e-07, "logits/chosen": -19.201351165771484, "logits/rejected": -18.755191802978516, "logps/chosen": -339.90509033203125, "logps/rejected": -309.8544616699219, "loss": 0.5409, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0758161544799805, "rewards/margins": 1.0051485300064087, "rewards/rejected": 2.0706677436828613, "step": 67380 }, { "epoch": 3.1287432099911787, "grad_norm": 56.96598815917969, "learning_rate": 1.123617623845118e-07, "logits/chosen": -18.841712951660156, "logits/rejected": -17.845020294189453, "logps/chosen": -396.90960693359375, "logps/rejected": -343.90716552734375, "loss": 0.6083, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0533528327941895, "rewards/margins": 0.9437028169631958, "rewards/rejected": 2.109650135040283, "step": 67390 }, { "epoch": 
3.129207484098612, "grad_norm": 26.597322463989258, "learning_rate": 1.1233390593806583e-07, "logits/chosen": -19.096500396728516, "logits/rejected": -18.673433303833008, "logps/chosen": -344.87506103515625, "logps/rejected": -339.5132751464844, "loss": 0.9059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7190394401550293, "rewards/margins": 0.806576132774353, "rewards/rejected": 1.9124631881713867, "step": 67400 }, { "epoch": 3.1296717582060447, "grad_norm": 98.05272674560547, "learning_rate": 1.1230604949161985e-07, "logits/chosen": -18.060874938964844, "logits/rejected": -17.952110290527344, "logps/chosen": -273.8930358886719, "logps/rejected": -304.5846862792969, "loss": 1.5044, "rewards/accuracies": 0.5, "rewards/chosen": 2.4412219524383545, "rewards/margins": -0.1061679869890213, "rewards/rejected": 2.5473897457122803, "step": 67410 }, { "epoch": 3.130136032313478, "grad_norm": 3.068965196609497, "learning_rate": 1.1227819304517388e-07, "logits/chosen": -20.25973129272461, "logits/rejected": -20.00333595275879, "logps/chosen": -357.0611572265625, "logps/rejected": -333.71966552734375, "loss": 0.6361, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5250041484832764, "rewards/margins": 1.2217024564743042, "rewards/rejected": 2.3033015727996826, "step": 67420 }, { "epoch": 3.130600306420911, "grad_norm": 37.20942306518555, "learning_rate": 1.1225033659872788e-07, "logits/chosen": -19.33968734741211, "logits/rejected": -18.687021255493164, "logps/chosen": -362.55706787109375, "logps/rejected": -269.28131103515625, "loss": 0.6369, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.13774037361145, "rewards/margins": 1.4321962594985962, "rewards/rejected": 1.705544114112854, "step": 67430 }, { "epoch": 3.131064580528344, "grad_norm": 22.587806701660156, "learning_rate": 1.122224801522819e-07, "logits/chosen": -18.781967163085938, "logits/rejected": -17.295515060424805, "logps/chosen": -429.2831115722656, "logps/rejected": 
-252.29330444335938, "loss": 0.4552, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.210994243621826, "rewards/margins": 3.1215624809265137, "rewards/rejected": 1.0894317626953125, "step": 67440 }, { "epoch": 3.131528854635777, "grad_norm": 147.95596313476562, "learning_rate": 1.1219462370583592e-07, "logits/chosen": -18.576784133911133, "logits/rejected": -18.25528335571289, "logps/chosen": -324.049560546875, "logps/rejected": -307.44610595703125, "loss": 1.9363, "rewards/accuracies": 0.5, "rewards/chosen": 1.76835036277771, "rewards/margins": -0.4042932987213135, "rewards/rejected": 2.1726438999176025, "step": 67450 }, { "epoch": 3.13199312874321, "grad_norm": 166.6796417236328, "learning_rate": 1.1216676725938995e-07, "logits/chosen": -18.607675552368164, "logits/rejected": -18.36624526977539, "logps/chosen": -356.695556640625, "logps/rejected": -323.3208312988281, "loss": 0.8185, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.431520462036133, "rewards/margins": -0.005088520236313343, "rewards/rejected": 2.4366087913513184, "step": 67460 }, { "epoch": 3.132457402850643, "grad_norm": 5.830918312072754, "learning_rate": 1.1213891081294395e-07, "logits/chosen": -19.204631805419922, "logits/rejected": -18.689105987548828, "logps/chosen": -384.69659423828125, "logps/rejected": -315.7993469238281, "loss": 0.5111, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3417601585388184, "rewards/margins": 1.296878457069397, "rewards/rejected": 2.044882297515869, "step": 67470 }, { "epoch": 3.132921676958076, "grad_norm": 46.34821319580078, "learning_rate": 1.1211105436649798e-07, "logits/chosen": -18.95269775390625, "logits/rejected": -19.228036880493164, "logps/chosen": -285.838623046875, "logps/rejected": -289.36407470703125, "loss": 1.4186, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2732269763946533, "rewards/margins": -0.5351839065551758, "rewards/rejected": 2.808410882949829, "step": 67480 }, { "epoch": 
3.133385951065509, "grad_norm": 3.800290584564209, "learning_rate": 1.1208319792005199e-07, "logits/chosen": -19.38300895690918, "logits/rejected": -17.85775375366211, "logps/chosen": -518.7071533203125, "logps/rejected": -467.20318603515625, "loss": 1.1332, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.7506492137908936, "rewards/margins": 0.06824161857366562, "rewards/rejected": 3.6824073791503906, "step": 67490 }, { "epoch": 3.1338502251729423, "grad_norm": 9.73112964630127, "learning_rate": 1.1205534147360602e-07, "logits/chosen": -19.971805572509766, "logits/rejected": -19.118053436279297, "logps/chosen": -364.5368957519531, "logps/rejected": -302.71405029296875, "loss": 0.5648, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.740523099899292, "rewards/margins": 1.0467486381530762, "rewards/rejected": 2.6937742233276367, "step": 67500 }, { "epoch": 3.134314499280375, "grad_norm": 27.900306701660156, "learning_rate": 1.1202748502716003e-07, "logits/chosen": -18.981000900268555, "logits/rejected": -19.453643798828125, "logps/chosen": -410.21600341796875, "logps/rejected": -393.84906005859375, "loss": 1.161, "rewards/accuracies": 0.5, "rewards/chosen": 3.377899169921875, "rewards/margins": -0.1878890097141266, "rewards/rejected": 3.5657877922058105, "step": 67510 }, { "epoch": 3.1347787733878083, "grad_norm": 40.372398376464844, "learning_rate": 1.1199962858071406e-07, "logits/chosen": -18.654827117919922, "logits/rejected": -17.985393524169922, "logps/chosen": -318.3391418457031, "logps/rejected": -277.44927978515625, "loss": 0.4403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.963120222091675, "rewards/margins": 1.236045002937317, "rewards/rejected": 2.7270750999450684, "step": 67520 }, { "epoch": 3.135243047495241, "grad_norm": 72.13267517089844, "learning_rate": 1.1197177213426806e-07, "logits/chosen": -19.985408782958984, "logits/rejected": -18.211597442626953, "logps/chosen": -436.85382080078125, "logps/rejected": 
-268.0945739746094, "loss": 0.5002, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.572814464569092, "rewards/margins": 2.2380080223083496, "rewards/rejected": 1.3348065614700317, "step": 67530 }, { "epoch": 3.1357073216026743, "grad_norm": 128.9110565185547, "learning_rate": 1.1194391568782208e-07, "logits/chosen": -19.127248764038086, "logits/rejected": -18.28445053100586, "logps/chosen": -312.3387451171875, "logps/rejected": -291.35650634765625, "loss": 0.861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.34696626663208, "rewards/margins": 1.1533647775650024, "rewards/rejected": 2.193601369857788, "step": 67540 }, { "epoch": 3.136171595710107, "grad_norm": 65.4751968383789, "learning_rate": 1.119160592413761e-07, "logits/chosen": -20.28384780883789, "logits/rejected": -19.106159210205078, "logps/chosen": -376.6419372558594, "logps/rejected": -285.220703125, "loss": 0.5514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.563960075378418, "rewards/margins": 1.941442847251892, "rewards/rejected": 2.6225171089172363, "step": 67550 }, { "epoch": 3.1366358698175403, "grad_norm": 32.05051040649414, "learning_rate": 1.1188820279493012e-07, "logits/chosen": -19.741130828857422, "logits/rejected": -18.294200897216797, "logps/chosen": -538.4124755859375, "logps/rejected": -362.1669921875, "loss": 0.2375, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.8521528244018555, "rewards/margins": 2.3457164764404297, "rewards/rejected": 2.506436824798584, "step": 67560 }, { "epoch": 3.1371001439249735, "grad_norm": 331.28729248046875, "learning_rate": 1.1186034634848415e-07, "logits/chosen": -18.581790924072266, "logits/rejected": -18.60847282409668, "logps/chosen": -367.377197265625, "logps/rejected": -423.1844177246094, "loss": 1.5372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.762892484664917, "rewards/margins": 0.2749643921852112, "rewards/rejected": 3.4879279136657715, "step": 67570 }, { "epoch": 
3.1375644180324063, "grad_norm": 31.182708740234375, "learning_rate": 1.1183248990203815e-07, "logits/chosen": -20.248119354248047, "logits/rejected": -19.967945098876953, "logps/chosen": -466.645751953125, "logps/rejected": -370.500244140625, "loss": 0.554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.247349739074707, "rewards/margins": 1.075840711593628, "rewards/rejected": 3.1715087890625, "step": 67580 }, { "epoch": 3.1380286921398395, "grad_norm": 3.3094077110290527, "learning_rate": 1.1180463345559218e-07, "logits/chosen": -18.32380485534668, "logits/rejected": -17.992517471313477, "logps/chosen": -335.2715148925781, "logps/rejected": -298.4505615234375, "loss": 0.7414, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4563021659851074, "rewards/margins": 0.7977116107940674, "rewards/rejected": 1.658590316772461, "step": 67590 }, { "epoch": 3.1384929662472723, "grad_norm": 187.95184326171875, "learning_rate": 1.1177677700914619e-07, "logits/chosen": -20.22177505493164, "logits/rejected": -20.248294830322266, "logps/chosen": -361.3135986328125, "logps/rejected": -381.6344299316406, "loss": 1.0209, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3597488403320312, "rewards/margins": 0.04220477491617203, "rewards/rejected": 3.3175442218780518, "step": 67600 }, { "epoch": 3.1389572403547055, "grad_norm": 20.32349395751953, "learning_rate": 1.1174892056270022e-07, "logits/chosen": -18.100631713867188, "logits/rejected": -17.269060134887695, "logps/chosen": -361.2782287597656, "logps/rejected": -299.7896423339844, "loss": 1.1222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2302117347717285, "rewards/margins": 0.9444392919540405, "rewards/rejected": 1.2857722043991089, "step": 67610 }, { "epoch": 3.1394215144621382, "grad_norm": 262.3802185058594, "learning_rate": 1.1172106411625423e-07, "logits/chosen": -19.18158721923828, "logits/rejected": -17.895336151123047, "logps/chosen": -363.1780700683594, 
"logps/rejected": -322.6040954589844, "loss": 0.938, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9681410789489746, "rewards/margins": 0.6507616639137268, "rewards/rejected": 2.3173794746398926, "step": 67620 }, { "epoch": 3.1398857885695715, "grad_norm": 35.88960266113281, "learning_rate": 1.1169320766980826e-07, "logits/chosen": -19.01852798461914, "logits/rejected": -17.977645874023438, "logps/chosen": -495.9833068847656, "logps/rejected": -367.02069091796875, "loss": 0.2917, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.566054344177246, "rewards/margins": 2.055922746658325, "rewards/rejected": 2.5101318359375, "step": 67630 }, { "epoch": 3.1403500626770047, "grad_norm": 18.944063186645508, "learning_rate": 1.1166535122336226e-07, "logits/chosen": -19.347238540649414, "logits/rejected": -18.132701873779297, "logps/chosen": -422.87628173828125, "logps/rejected": -365.39837646484375, "loss": 0.8175, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4190850257873535, "rewards/margins": 0.9294290542602539, "rewards/rejected": 2.4896559715270996, "step": 67640 }, { "epoch": 3.1408143367844374, "grad_norm": 116.2606201171875, "learning_rate": 1.1163749477691629e-07, "logits/chosen": -18.391937255859375, "logits/rejected": -18.548120498657227, "logps/chosen": -310.3907165527344, "logps/rejected": -382.3633728027344, "loss": 1.0634, "rewards/accuracies": 0.5, "rewards/chosen": 3.616783857345581, "rewards/margins": 0.4385995864868164, "rewards/rejected": 3.1781840324401855, "step": 67650 }, { "epoch": 3.1412786108918707, "grad_norm": 26.622295379638672, "learning_rate": 1.116096383304703e-07, "logits/chosen": -19.006704330444336, "logits/rejected": -18.42068099975586, "logps/chosen": -299.1139221191406, "logps/rejected": -308.29107666015625, "loss": 0.6123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8243812322616577, "rewards/margins": 0.5751848816871643, "rewards/rejected": 1.2491967678070068, "step": 67660 }, 
{ "epoch": 3.1417428849993034, "grad_norm": 166.45156860351562, "learning_rate": 1.1158178188402433e-07, "logits/chosen": -19.387866973876953, "logits/rejected": -18.413415908813477, "logps/chosen": -388.12713623046875, "logps/rejected": -257.8258056640625, "loss": 0.5205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.411710023880005, "rewards/margins": 1.6535097360610962, "rewards/rejected": 1.75819993019104, "step": 67670 }, { "epoch": 3.1422071591067366, "grad_norm": 8.231731414794922, "learning_rate": 1.1155392543757833e-07, "logits/chosen": -18.312185287475586, "logits/rejected": -18.2823486328125, "logps/chosen": -291.6966857910156, "logps/rejected": -271.5682067871094, "loss": 1.2532, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.473211407661438, "rewards/margins": -0.0384468175470829, "rewards/rejected": 1.5116581916809082, "step": 67680 }, { "epoch": 3.1426714332141694, "grad_norm": 12.869405746459961, "learning_rate": 1.1152606899113236e-07, "logits/chosen": -18.853271484375, "logits/rejected": -18.865764617919922, "logps/chosen": -281.3760986328125, "logps/rejected": -244.363525390625, "loss": 0.7331, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9655356407165527, "rewards/margins": 0.7297953367233276, "rewards/rejected": 2.2357401847839355, "step": 67690 }, { "epoch": 3.1431357073216026, "grad_norm": 0.726262629032135, "learning_rate": 1.1149821254468637e-07, "logits/chosen": -18.408245086669922, "logits/rejected": -17.98659324645996, "logps/chosen": -355.198486328125, "logps/rejected": -325.80535888671875, "loss": 0.5741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6716015338897705, "rewards/margins": 1.7078498601913452, "rewards/rejected": 1.9637514352798462, "step": 67700 }, { "epoch": 3.143599981429036, "grad_norm": 140.8992919921875, "learning_rate": 1.114703560982404e-07, "logits/chosen": -18.825103759765625, "logits/rejected": -18.217138290405273, "logps/chosen": -337.6034240722656, 
"logps/rejected": -313.86480712890625, "loss": 0.5906, "rewards/accuracies": 0.5, "rewards/chosen": 3.3837456703186035, "rewards/margins": 0.5050088167190552, "rewards/rejected": 2.878736972808838, "step": 67710 }, { "epoch": 3.1440642555364686, "grad_norm": 0.3533756732940674, "learning_rate": 1.1144249965179442e-07, "logits/chosen": -17.775070190429688, "logits/rejected": -17.167888641357422, "logps/chosen": -499.5586853027344, "logps/rejected": -352.32293701171875, "loss": 0.6477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.717392921447754, "rewards/margins": 1.529691219329834, "rewards/rejected": 3.18770170211792, "step": 67720 }, { "epoch": 3.144528529643902, "grad_norm": 6.001247406005859, "learning_rate": 1.1141464320534844e-07, "logits/chosen": -18.29806900024414, "logits/rejected": -17.60444450378418, "logps/chosen": -299.2725524902344, "logps/rejected": -200.76144409179688, "loss": 0.3726, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.432309627532959, "rewards/margins": 1.6743812561035156, "rewards/rejected": 0.7579278349876404, "step": 67730 }, { "epoch": 3.1449928037513346, "grad_norm": 46.415287017822266, "learning_rate": 1.1138678675890245e-07, "logits/chosen": -18.990270614624023, "logits/rejected": -19.58794593811035, "logps/chosen": -376.16839599609375, "logps/rejected": -438.99993896484375, "loss": 1.8628, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.8603663444519043, "rewards/margins": -1.2028921842575073, "rewards/rejected": 4.063258171081543, "step": 67740 }, { "epoch": 3.145457077858768, "grad_norm": 51.42028045654297, "learning_rate": 1.1135893031245646e-07, "logits/chosen": -19.347719192504883, "logits/rejected": -18.42439842224121, "logps/chosen": -261.9188537597656, "logps/rejected": -168.90066528320312, "loss": 0.3723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6472415924072266, "rewards/margins": 2.0916495323181152, "rewards/rejected": 0.5555919408798218, "step": 67750 
}, { "epoch": 3.145921351966201, "grad_norm": 39.88371276855469, "learning_rate": 1.1133107386601049e-07, "logits/chosen": -19.423328399658203, "logits/rejected": -17.839664459228516, "logps/chosen": -344.2627258300781, "logps/rejected": -225.80770874023438, "loss": 0.5279, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0577454566955566, "rewards/margins": 1.983464241027832, "rewards/rejected": 1.0742809772491455, "step": 67760 }, { "epoch": 3.146385626073634, "grad_norm": 96.93882751464844, "learning_rate": 1.113032174195645e-07, "logits/chosen": -18.957489013671875, "logits/rejected": -18.298519134521484, "logps/chosen": -385.677001953125, "logps/rejected": -297.72015380859375, "loss": 0.5832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9780163764953613, "rewards/margins": 0.6203301548957825, "rewards/rejected": 2.3576865196228027, "step": 67770 }, { "epoch": 3.146849900181067, "grad_norm": 109.5901107788086, "learning_rate": 1.1127536097311853e-07, "logits/chosen": -18.411602020263672, "logits/rejected": -18.45684242248535, "logps/chosen": -359.13995361328125, "logps/rejected": -357.56988525390625, "loss": 0.5475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7954068183898926, "rewards/margins": 0.5698077082633972, "rewards/rejected": 2.2255990505218506, "step": 67780 }, { "epoch": 3.1473141742885, "grad_norm": 80.52981567382812, "learning_rate": 1.1124750452667253e-07, "logits/chosen": -18.977584838867188, "logits/rejected": -19.120622634887695, "logps/chosen": -268.6859130859375, "logps/rejected": -271.8958740234375, "loss": 1.3002, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3250648975372314, "rewards/margins": 0.09640560299158096, "rewards/rejected": 2.228659152984619, "step": 67790 }, { "epoch": 3.147778448395933, "grad_norm": 0.3305119574069977, "learning_rate": 1.1121964808022656e-07, "logits/chosen": -19.262537002563477, "logits/rejected": -17.944692611694336, "logps/chosen": 
-457.2770080566406, "logps/rejected": -265.8458557128906, "loss": 0.2788, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.839890718460083, "rewards/margins": 2.117677927017212, "rewards/rejected": 1.722212553024292, "step": 67800 }, { "epoch": 3.148242722503366, "grad_norm": 1.345057725906372, "learning_rate": 1.1119179163378057e-07, "logits/chosen": -18.53824234008789, "logits/rejected": -17.409381866455078, "logps/chosen": -507.92230224609375, "logps/rejected": -357.1383972167969, "loss": 0.3067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.006165504455566, "rewards/margins": 2.4489808082580566, "rewards/rejected": 1.557185411453247, "step": 67810 }, { "epoch": 3.148706996610799, "grad_norm": 97.06560516357422, "learning_rate": 1.111639351873346e-07, "logits/chosen": -20.163433074951172, "logits/rejected": -18.932283401489258, "logps/chosen": -406.377197265625, "logps/rejected": -325.2530822753906, "loss": 0.4664, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.605661392211914, "rewards/margins": 1.204580307006836, "rewards/rejected": 2.401080846786499, "step": 67820 }, { "epoch": 3.149171270718232, "grad_norm": 20.432483673095703, "learning_rate": 1.1113607874088862e-07, "logits/chosen": -19.473180770874023, "logits/rejected": -18.203937530517578, "logps/chosen": -325.07135009765625, "logps/rejected": -244.15853881835938, "loss": 0.8829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.917041778564453, "rewards/margins": 0.8491213917732239, "rewards/rejected": 2.067920446395874, "step": 67830 }, { "epoch": 3.149635544825665, "grad_norm": 3.909454584121704, "learning_rate": 1.1110822229444264e-07, "logits/chosen": -19.306015014648438, "logits/rejected": -18.67731285095215, "logps/chosen": -409.4932556152344, "logps/rejected": -300.660888671875, "loss": 0.3123, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.20490026473999, "rewards/margins": 1.921508550643921, "rewards/rejected": 
2.2833914756774902, "step": 67840 }, { "epoch": 3.150099818933098, "grad_norm": 0.7074463367462158, "learning_rate": 1.1108036584799665e-07, "logits/chosen": -18.730575561523438, "logits/rejected": -17.93370246887207, "logps/chosen": -461.79144287109375, "logps/rejected": -314.4241027832031, "loss": 0.3851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.127175331115723, "rewards/margins": 2.1366100311279297, "rewards/rejected": 1.9905650615692139, "step": 67850 }, { "epoch": 3.150564093040531, "grad_norm": 1.7490566968917847, "learning_rate": 1.1105250940155067e-07, "logits/chosen": -19.691234588623047, "logits/rejected": -19.22678565979004, "logps/chosen": -381.966552734375, "logps/rejected": -261.90875244140625, "loss": 0.5719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.320624589920044, "rewards/margins": 1.1211236715316772, "rewards/rejected": 2.199500560760498, "step": 67860 }, { "epoch": 3.151028367147964, "grad_norm": 124.00191497802734, "learning_rate": 1.1102465295510469e-07, "logits/chosen": -18.4786319732666, "logits/rejected": -18.81761360168457, "logps/chosen": -333.47283935546875, "logps/rejected": -321.3233947753906, "loss": 0.7418, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9923958778381348, "rewards/margins": 0.4455599784851074, "rewards/rejected": 2.5468358993530273, "step": 67870 }, { "epoch": 3.1514926412553974, "grad_norm": 68.85584259033203, "learning_rate": 1.1099679650865872e-07, "logits/chosen": -19.035242080688477, "logits/rejected": -18.400903701782227, "logps/chosen": -414.5741271972656, "logps/rejected": -323.92279052734375, "loss": 0.7243, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.609121322631836, "rewards/margins": 1.0990631580352783, "rewards/rejected": 1.510058045387268, "step": 67880 }, { "epoch": 3.15195691536283, "grad_norm": 24.7147159576416, "learning_rate": 1.1096894006221272e-07, "logits/chosen": -18.932239532470703, "logits/rejected": -19.236095428466797, 
"logps/chosen": -378.69207763671875, "logps/rejected": -299.23040771484375, "loss": 1.0357, "rewards/accuracies": 0.5, "rewards/chosen": 2.8128538131713867, "rewards/margins": 0.020796965807676315, "rewards/rejected": 2.7920567989349365, "step": 67890 }, { "epoch": 3.1524211894702634, "grad_norm": 151.24835205078125, "learning_rate": 1.1094108361576674e-07, "logits/chosen": -18.2246150970459, "logits/rejected": -17.792987823486328, "logps/chosen": -395.39825439453125, "logps/rejected": -243.4188995361328, "loss": 1.1122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6991279125213623, "rewards/margins": 0.8821069598197937, "rewards/rejected": 1.8170210123062134, "step": 67900 }, { "epoch": 3.152885463577696, "grad_norm": 38.45341110229492, "learning_rate": 1.1091322716932076e-07, "logits/chosen": -17.8448486328125, "logits/rejected": -17.88711929321289, "logps/chosen": -249.1478729248047, "logps/rejected": -284.16546630859375, "loss": 0.9048, "rewards/accuracies": 0.5, "rewards/chosen": 0.8820955157279968, "rewards/margins": 0.009283995255827904, "rewards/rejected": 0.872811496257782, "step": 67910 }, { "epoch": 3.1533497376851294, "grad_norm": 29.783845901489258, "learning_rate": 1.1088537072287479e-07, "logits/chosen": -18.66315269470215, "logits/rejected": -17.584918975830078, "logps/chosen": -510.61688232421875, "logps/rejected": -407.37457275390625, "loss": 0.2163, "rewards/accuracies": 1.0, "rewards/chosen": 4.191469669342041, "rewards/margins": 1.8306057453155518, "rewards/rejected": 2.3608641624450684, "step": 67920 }, { "epoch": 3.153814011792562, "grad_norm": 276.231201171875, "learning_rate": 1.108575142764288e-07, "logits/chosen": -17.915813446044922, "logits/rejected": -18.734201431274414, "logps/chosen": -399.80755615234375, "logps/rejected": -440.1841735839844, "loss": 1.3839, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.3318679332733154, "rewards/margins": -0.7749344110488892, "rewards/rejected": 4.106802463531494, 
"step": 67930 }, { "epoch": 3.1542782858999954, "grad_norm": 4.581385612487793, "learning_rate": 1.108296578299828e-07, "logits/chosen": -20.356182098388672, "logits/rejected": -18.781620025634766, "logps/chosen": -356.8648376464844, "logps/rejected": -294.1785888671875, "loss": 0.3858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.164430141448975, "rewards/margins": 1.7228014469146729, "rewards/rejected": 2.441628932952881, "step": 67940 }, { "epoch": 3.1547425600074286, "grad_norm": 33.558563232421875, "learning_rate": 1.1080180138353683e-07, "logits/chosen": -20.32682991027832, "logits/rejected": -19.517759323120117, "logps/chosen": -280.12725830078125, "logps/rejected": -248.82156372070312, "loss": 0.5351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7908036708831787, "rewards/margins": 1.584360122680664, "rewards/rejected": 1.2064435482025146, "step": 67950 }, { "epoch": 3.1552068341148614, "grad_norm": 192.90725708007812, "learning_rate": 1.1077394493709085e-07, "logits/chosen": -18.632667541503906, "logits/rejected": -18.687416076660156, "logps/chosen": -365.06036376953125, "logps/rejected": -413.3694763183594, "loss": 1.2616, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.232959747314453, "rewards/margins": -0.02242562733590603, "rewards/rejected": 3.255385160446167, "step": 67960 }, { "epoch": 3.1556711082222946, "grad_norm": 138.9315185546875, "learning_rate": 1.1074608849064487e-07, "logits/chosen": -19.911022186279297, "logits/rejected": -19.42508316040039, "logps/chosen": -295.16082763671875, "logps/rejected": -248.79763793945312, "loss": 0.6121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9273593425750732, "rewards/margins": 0.6744133234024048, "rewards/rejected": 2.252946138381958, "step": 67970 }, { "epoch": 3.1561353823297273, "grad_norm": 252.77001953125, "learning_rate": 1.1071823204419889e-07, "logits/chosen": -19.790075302124023, "logits/rejected": -19.388755798339844, 
"logps/chosen": -451.10693359375, "logps/rejected": -428.909423828125, "loss": 0.7463, "rewards/accuracies": 0.5, "rewards/chosen": 4.672123908996582, "rewards/margins": 1.0919151306152344, "rewards/rejected": 3.5802085399627686, "step": 67980 }, { "epoch": 3.1565996564371606, "grad_norm": 63.239192962646484, "learning_rate": 1.1069037559775292e-07, "logits/chosen": -18.68536949157715, "logits/rejected": -18.115314483642578, "logps/chosen": -447.79327392578125, "logps/rejected": -389.547607421875, "loss": 0.6702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8705601692199707, "rewards/margins": 0.822219729423523, "rewards/rejected": 3.048340320587158, "step": 67990 }, { "epoch": 3.1570639305445933, "grad_norm": 6.586266040802002, "learning_rate": 1.1066251915130692e-07, "logits/chosen": -18.760608673095703, "logits/rejected": -18.099231719970703, "logps/chosen": -524.3489990234375, "logps/rejected": -461.42718505859375, "loss": 0.74, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.186939239501953, "rewards/margins": 1.2080861330032349, "rewards/rejected": 2.978853702545166, "step": 68000 }, { "epoch": 3.1575282046520265, "grad_norm": 0.3955233097076416, "learning_rate": 1.1063466270486094e-07, "logits/chosen": -19.18838119506836, "logits/rejected": -18.46931266784668, "logps/chosen": -370.8367614746094, "logps/rejected": -270.2848205566406, "loss": 0.5048, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.029760360717773, "rewards/margins": 2.2627756595611572, "rewards/rejected": 1.7669847011566162, "step": 68010 }, { "epoch": 3.1579924787594598, "grad_norm": 24.064373016357422, "learning_rate": 1.1060680625841496e-07, "logits/chosen": -19.892858505249023, "logits/rejected": -19.538331985473633, "logps/chosen": -438.7418518066406, "logps/rejected": -381.4739990234375, "loss": 0.7654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.111691474914551, "rewards/margins": 0.38463902473449707, "rewards/rejected": 
3.7270522117614746, "step": 68020 }, { "epoch": 3.1584567528668925, "grad_norm": 53.63321304321289, "learning_rate": 1.1057894981196899e-07, "logits/chosen": -19.867216110229492, "logits/rejected": -17.736133575439453, "logps/chosen": -549.24267578125, "logps/rejected": -372.5638732910156, "loss": 0.2617, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.8359808921813965, "rewards/margins": 2.7635114192962646, "rewards/rejected": 2.072470188140869, "step": 68030 }, { "epoch": 3.1589210269743258, "grad_norm": 22.539043426513672, "learning_rate": 1.10551093365523e-07, "logits/chosen": -19.49342155456543, "logits/rejected": -19.945079803466797, "logps/chosen": -448.85595703125, "logps/rejected": -380.95721435546875, "loss": 0.508, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.94577169418335, "rewards/margins": 1.8114601373672485, "rewards/rejected": 3.1343114376068115, "step": 68040 }, { "epoch": 3.1593853010817585, "grad_norm": 185.8501434326172, "learning_rate": 1.1052323691907703e-07, "logits/chosen": -18.40803337097168, "logits/rejected": -18.173587799072266, "logps/chosen": -377.88531494140625, "logps/rejected": -301.88287353515625, "loss": 0.43, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.948906660079956, "rewards/margins": 1.6731650829315186, "rewards/rejected": 1.2757412195205688, "step": 68050 }, { "epoch": 3.1598495751891917, "grad_norm": 34.709075927734375, "learning_rate": 1.1049538047263103e-07, "logits/chosen": -19.462940216064453, "logits/rejected": -17.965904235839844, "logps/chosen": -404.10150146484375, "logps/rejected": -333.00482177734375, "loss": 0.4783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.072052955627441, "rewards/margins": 1.4778894186019897, "rewards/rejected": 2.594163417816162, "step": 68060 }, { "epoch": 3.1603138492966245, "grad_norm": 0.44607725739479065, "learning_rate": 1.1046752402618506e-07, "logits/chosen": -19.20211410522461, "logits/rejected": -18.157997131347656, 
"logps/chosen": -377.19976806640625, "logps/rejected": -256.38275146484375, "loss": 0.5178, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.960254192352295, "rewards/margins": 2.1086626052856445, "rewards/rejected": 1.8515913486480713, "step": 68070 }, { "epoch": 3.1607781234040577, "grad_norm": 0.11594489961862564, "learning_rate": 1.1043966757973907e-07, "logits/chosen": -19.35939598083496, "logits/rejected": -18.195510864257812, "logps/chosen": -472.2618713378906, "logps/rejected": -341.88433837890625, "loss": 0.4907, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.682271957397461, "rewards/margins": 1.9916372299194336, "rewards/rejected": 2.6906344890594482, "step": 68080 }, { "epoch": 3.161242397511491, "grad_norm": 55.23953628540039, "learning_rate": 1.104118111332931e-07, "logits/chosen": -19.251432418823242, "logits/rejected": -17.91812515258789, "logps/chosen": -374.138916015625, "logps/rejected": -248.8013916015625, "loss": 0.4881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0920374393463135, "rewards/margins": 1.7867443561553955, "rewards/rejected": 1.3052928447723389, "step": 68090 }, { "epoch": 3.1617066716189237, "grad_norm": 17.088987350463867, "learning_rate": 1.103839546868471e-07, "logits/chosen": -18.905797958374023, "logits/rejected": -18.131437301635742, "logps/chosen": -286.373046875, "logps/rejected": -232.13967895507812, "loss": 0.5759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9703831672668457, "rewards/margins": 1.1900907754898071, "rewards/rejected": 1.7802921533584595, "step": 68100 }, { "epoch": 3.162170945726357, "grad_norm": 43.227455139160156, "learning_rate": 1.1035609824040113e-07, "logits/chosen": -19.055625915527344, "logits/rejected": -18.503938674926758, "logps/chosen": -262.8768005371094, "logps/rejected": -250.41830444335938, "loss": 0.6971, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8334720134735107, "rewards/margins": 0.5961283445358276, 
"rewards/rejected": 1.237343668937683, "step": 68110 }, { "epoch": 3.1626352198337897, "grad_norm": 90.20095825195312, "learning_rate": 1.1032824179395514e-07, "logits/chosen": -19.23174285888672, "logits/rejected": -18.324785232543945, "logps/chosen": -441.7724609375, "logps/rejected": -317.83233642578125, "loss": 0.6043, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.876388072967529, "rewards/margins": 2.0965065956115723, "rewards/rejected": 2.779881715774536, "step": 68120 }, { "epoch": 3.163099493941223, "grad_norm": 74.97570037841797, "learning_rate": 1.1030038534750916e-07, "logits/chosen": -18.11699104309082, "logits/rejected": -18.361616134643555, "logps/chosen": -293.63330078125, "logps/rejected": -295.9806213378906, "loss": 1.0932, "rewards/accuracies": 0.5, "rewards/chosen": 1.9356276988983154, "rewards/margins": -0.03382381796836853, "rewards/rejected": 1.969451665878296, "step": 68130 }, { "epoch": 3.163563768048656, "grad_norm": 150.9591522216797, "learning_rate": 1.1027252890106319e-07, "logits/chosen": -20.051605224609375, "logits/rejected": -20.042316436767578, "logps/chosen": -334.026611328125, "logps/rejected": -332.1244812011719, "loss": 0.9045, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.025604724884033, "rewards/margins": 0.9624274969100952, "rewards/rejected": 3.0631771087646484, "step": 68140 }, { "epoch": 3.164028042156089, "grad_norm": 12.354416847229004, "learning_rate": 1.1024467245461719e-07, "logits/chosen": -19.004304885864258, "logits/rejected": -19.11546516418457, "logps/chosen": -343.32196044921875, "logps/rejected": -394.23895263671875, "loss": 1.1119, "rewards/accuracies": 0.5, "rewards/chosen": 2.6753716468811035, "rewards/margins": 0.2822244465351105, "rewards/rejected": 2.3931469917297363, "step": 68150 }, { "epoch": 3.164492316263522, "grad_norm": 2.192196846008301, "learning_rate": 1.1021681600817122e-07, "logits/chosen": -19.658597946166992, "logits/rejected": -19.14400291442871, 
"logps/chosen": -374.15093994140625, "logps/rejected": -361.5331726074219, "loss": 0.8739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.785080671310425, "rewards/margins": 0.8766793012619019, "rewards/rejected": 2.9084014892578125, "step": 68160 }, { "epoch": 3.164956590370955, "grad_norm": 22.226741790771484, "learning_rate": 1.1018895956172523e-07, "logits/chosen": -18.45103645324707, "logits/rejected": -17.070205688476562, "logps/chosen": -346.9111022949219, "logps/rejected": -237.43603515625, "loss": 0.4251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0082955360412598, "rewards/margins": 1.988968849182129, "rewards/rejected": 1.0193264484405518, "step": 68170 }, { "epoch": 3.165420864478388, "grad_norm": 55.61634826660156, "learning_rate": 1.1016110311527926e-07, "logits/chosen": -19.247472763061523, "logits/rejected": -18.520225524902344, "logps/chosen": -463.9085388183594, "logps/rejected": -369.6249084472656, "loss": 0.5266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.613638401031494, "rewards/margins": 2.323185443878174, "rewards/rejected": 2.290452718734741, "step": 68180 }, { "epoch": 3.165885138585821, "grad_norm": 17.923248291015625, "learning_rate": 1.1013324666883327e-07, "logits/chosen": -19.58073616027832, "logits/rejected": -19.128034591674805, "logps/chosen": -368.1246032714844, "logps/rejected": -327.0129699707031, "loss": 0.6919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7925121784210205, "rewards/margins": 1.4774134159088135, "rewards/rejected": 2.315098524093628, "step": 68190 }, { "epoch": 3.166349412693254, "grad_norm": 90.6485824584961, "learning_rate": 1.101053902223873e-07, "logits/chosen": -18.924152374267578, "logits/rejected": -17.980915069580078, "logps/chosen": -316.7118835449219, "logps/rejected": -266.19744873046875, "loss": 0.4842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0529444217681885, "rewards/margins": 0.9351035952568054, 
"rewards/rejected": 1.1178408861160278, "step": 68200 }, { "epoch": 3.1668136868006873, "grad_norm": 211.15528869628906, "learning_rate": 1.100775337759413e-07, "logits/chosen": -19.032323837280273, "logits/rejected": -17.9054012298584, "logps/chosen": -321.2216796875, "logps/rejected": -268.60064697265625, "loss": 0.9432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7363767623901367, "rewards/margins": 0.9669805765151978, "rewards/rejected": 1.7693960666656494, "step": 68210 }, { "epoch": 3.16727796090812, "grad_norm": 111.49701690673828, "learning_rate": 1.1004967732949533e-07, "logits/chosen": -18.77387237548828, "logits/rejected": -18.33336067199707, "logps/chosen": -451.1414489746094, "logps/rejected": -423.92840576171875, "loss": 0.8853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.266918897628784, "rewards/margins": 0.3279055655002594, "rewards/rejected": 2.9390132427215576, "step": 68220 }, { "epoch": 3.1677422350155533, "grad_norm": 52.194366455078125, "learning_rate": 1.1002182088304934e-07, "logits/chosen": -19.52987289428711, "logits/rejected": -18.414203643798828, "logps/chosen": -392.82080078125, "logps/rejected": -273.8404235839844, "loss": 0.3716, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5604724884033203, "rewards/margins": 1.5545907020568848, "rewards/rejected": 2.0058817863464355, "step": 68230 }, { "epoch": 3.168206509122986, "grad_norm": 6.931065559387207, "learning_rate": 1.0999396443660337e-07, "logits/chosen": -18.823650360107422, "logits/rejected": -18.339176177978516, "logps/chosen": -451.71563720703125, "logps/rejected": -329.83172607421875, "loss": 0.6408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.51110577583313, "rewards/margins": 1.8185508251190186, "rewards/rejected": 1.6925547122955322, "step": 68240 }, { "epoch": 3.1686707832304193, "grad_norm": 64.00712585449219, "learning_rate": 1.0996610799015739e-07, "logits/chosen": -18.99392318725586, "logits/rejected": 
-18.42926025390625, "logps/chosen": -405.19537353515625, "logps/rejected": -321.2706604003906, "loss": 0.914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.104580402374268, "rewards/margins": 1.4432015419006348, "rewards/rejected": 2.661378860473633, "step": 68250 }, { "epoch": 3.1691350573378525, "grad_norm": 3.1586415767669678, "learning_rate": 1.0993825154371141e-07, "logits/chosen": -19.222591400146484, "logits/rejected": -17.687671661376953, "logps/chosen": -425.68768310546875, "logps/rejected": -376.59490966796875, "loss": 0.5787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.680510997772217, "rewards/margins": 1.3677524328231812, "rewards/rejected": 2.312758445739746, "step": 68260 }, { "epoch": 3.1695993314452853, "grad_norm": 9.177261352539062, "learning_rate": 1.0991039509726542e-07, "logits/chosen": -19.26082992553711, "logits/rejected": -18.772933959960938, "logps/chosen": -409.17608642578125, "logps/rejected": -406.02825927734375, "loss": 0.7137, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5369675159454346, "rewards/margins": 1.2309490442276, "rewards/rejected": 2.306018829345703, "step": 68270 }, { "epoch": 3.1700636055527185, "grad_norm": 103.33097076416016, "learning_rate": 1.0988253865081944e-07, "logits/chosen": -20.123716354370117, "logits/rejected": -19.252355575561523, "logps/chosen": -373.06243896484375, "logps/rejected": -340.0602722167969, "loss": 1.0097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6528549194335938, "rewards/margins": 1.187545895576477, "rewards/rejected": 2.4653091430664062, "step": 68280 }, { "epoch": 3.1705278796601513, "grad_norm": 17.34539794921875, "learning_rate": 1.0985468220437346e-07, "logits/chosen": -19.460107803344727, "logits/rejected": -19.268224716186523, "logps/chosen": -319.43096923828125, "logps/rejected": -316.48968505859375, "loss": 0.7634, "rewards/accuracies": 0.5, "rewards/chosen": 4.0524187088012695, "rewards/margins": 
1.3786277770996094, "rewards/rejected": 2.673790693283081, "step": 68290 }, { "epoch": 3.1709921537675845, "grad_norm": 8.366166114807129, "learning_rate": 1.0982682575792749e-07, "logits/chosen": -18.139408111572266, "logits/rejected": -17.62753677368164, "logps/chosen": -343.08917236328125, "logps/rejected": -261.35687255859375, "loss": 1.0717, "rewards/accuracies": 0.5, "rewards/chosen": 1.8840484619140625, "rewards/margins": 0.21989139914512634, "rewards/rejected": 1.6641571521759033, "step": 68300 }, { "epoch": 3.1714564278750172, "grad_norm": 0.04451523721218109, "learning_rate": 1.0979896931148149e-07, "logits/chosen": -18.403852462768555, "logits/rejected": -17.46871566772461, "logps/chosen": -369.13067626953125, "logps/rejected": -201.75633239746094, "loss": 0.5227, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2642219066619873, "rewards/margins": 2.1381473541259766, "rewards/rejected": 1.1260745525360107, "step": 68310 }, { "epoch": 3.1719207019824505, "grad_norm": 160.0174560546875, "learning_rate": 1.0977111286503551e-07, "logits/chosen": -19.659496307373047, "logits/rejected": -18.034751892089844, "logps/chosen": -386.80303955078125, "logps/rejected": -235.89981079101562, "loss": 0.8135, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.6645331382751465, "rewards/margins": 2.489140272140503, "rewards/rejected": 2.175393581390381, "step": 68320 }, { "epoch": 3.1723849760898837, "grad_norm": 107.37340545654297, "learning_rate": 1.0974325641858953e-07, "logits/chosen": -18.9517879486084, "logits/rejected": -18.877086639404297, "logps/chosen": -333.65008544921875, "logps/rejected": -358.72772216796875, "loss": 1.0408, "rewards/accuracies": 0.5, "rewards/chosen": 4.100558280944824, "rewards/margins": 0.44602879881858826, "rewards/rejected": 3.654529094696045, "step": 68330 }, { "epoch": 3.1728492501973165, "grad_norm": 47.002140045166016, "learning_rate": 1.0971539997214354e-07, "logits/chosen": -19.48446273803711, 
"logits/rejected": -17.864696502685547, "logps/chosen": -511.59259033203125, "logps/rejected": -325.0458679199219, "loss": 0.4025, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.1737284660339355, "rewards/margins": 2.4486660957336426, "rewards/rejected": 2.725062608718872, "step": 68340 }, { "epoch": 3.1733135243047497, "grad_norm": 134.05874633789062, "learning_rate": 1.0968754352569757e-07, "logits/chosen": -19.693906784057617, "logits/rejected": -18.134485244750977, "logps/chosen": -538.4920043945312, "logps/rejected": -393.37109375, "loss": 0.5883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.52190637588501, "rewards/margins": 1.5379947423934937, "rewards/rejected": 2.9839115142822266, "step": 68350 }, { "epoch": 3.1737777984121824, "grad_norm": 66.12909698486328, "learning_rate": 1.0965968707925157e-07, "logits/chosen": -19.346548080444336, "logits/rejected": -18.703445434570312, "logps/chosen": -436.9710998535156, "logps/rejected": -378.1803894042969, "loss": 0.3649, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.656464576721191, "rewards/margins": 1.4284660816192627, "rewards/rejected": 3.2279982566833496, "step": 68360 }, { "epoch": 3.1742420725196157, "grad_norm": 67.25330352783203, "learning_rate": 1.096318306328056e-07, "logits/chosen": -18.908924102783203, "logits/rejected": -18.85658073425293, "logps/chosen": -376.4052429199219, "logps/rejected": -317.9288024902344, "loss": 0.9604, "rewards/accuracies": 0.5, "rewards/chosen": 3.082261323928833, "rewards/margins": 0.5701707005500793, "rewards/rejected": 2.5120906829833984, "step": 68370 }, { "epoch": 3.1747063466270484, "grad_norm": 165.7770233154297, "learning_rate": 1.0960397418635961e-07, "logits/chosen": -18.50640106201172, "logits/rejected": -17.740833282470703, "logps/chosen": -343.6598205566406, "logps/rejected": -255.2808074951172, "loss": 1.0357, "rewards/accuracies": 0.5, "rewards/chosen": 2.3476498126983643, "rewards/margins": 0.36832955479621887, 
"rewards/rejected": 1.9793202877044678, "step": 68380 }, { "epoch": 3.1751706207344816, "grad_norm": 56.41627883911133, "learning_rate": 1.0957611773991364e-07, "logits/chosen": -19.4384822845459, "logits/rejected": -18.885852813720703, "logps/chosen": -366.87481689453125, "logps/rejected": -311.32513427734375, "loss": 0.4054, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.614138126373291, "rewards/margins": 1.200486421585083, "rewards/rejected": 2.413651704788208, "step": 68390 }, { "epoch": 3.175634894841915, "grad_norm": 35.78605270385742, "learning_rate": 1.0954826129346766e-07, "logits/chosen": -19.62906265258789, "logits/rejected": -18.955190658569336, "logps/chosen": -326.20709228515625, "logps/rejected": -225.2067413330078, "loss": 0.4017, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9267563819885254, "rewards/margins": 1.8336937427520752, "rewards/rejected": 1.0930625200271606, "step": 68400 }, { "epoch": 3.1760991689493476, "grad_norm": 32.9229850769043, "learning_rate": 1.0952040484702168e-07, "logits/chosen": -20.772153854370117, "logits/rejected": -19.151142120361328, "logps/chosen": -474.811279296875, "logps/rejected": -370.5951232910156, "loss": 0.4182, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.249810695648193, "rewards/margins": 1.4087852239608765, "rewards/rejected": 2.8410255908966064, "step": 68410 }, { "epoch": 3.176563443056781, "grad_norm": 257.07965087890625, "learning_rate": 1.0949254840057569e-07, "logits/chosen": -18.990713119506836, "logits/rejected": -18.604482650756836, "logps/chosen": -321.4752502441406, "logps/rejected": -359.4991760253906, "loss": 0.997, "rewards/accuracies": 0.5, "rewards/chosen": 2.930248975753784, "rewards/margins": 0.4663841128349304, "rewards/rejected": 2.463864803314209, "step": 68420 }, { "epoch": 3.1770277171642136, "grad_norm": 68.27055358886719, "learning_rate": 1.0946469195412971e-07, "logits/chosen": -18.397748947143555, "logits/rejected": 
-17.74515724182129, "logps/chosen": -284.4368591308594, "logps/rejected": -237.55838012695312, "loss": 0.6349, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.578824043273926, "rewards/margins": 1.3636163473129272, "rewards/rejected": 1.215207576751709, "step": 68430 }, { "epoch": 3.177491991271647, "grad_norm": 93.86469268798828, "learning_rate": 1.0943683550768373e-07, "logits/chosen": -18.47391128540039, "logits/rejected": -17.618728637695312, "logps/chosen": -426.3583068847656, "logps/rejected": -347.23883056640625, "loss": 0.7145, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.202012300491333, "rewards/margins": 1.8457332849502563, "rewards/rejected": 1.3562790155410767, "step": 68440 }, { "epoch": 3.1779562653790796, "grad_norm": 90.26561737060547, "learning_rate": 1.0940897906123776e-07, "logits/chosen": -19.558448791503906, "logits/rejected": -19.516990661621094, "logps/chosen": -399.35809326171875, "logps/rejected": -420.78851318359375, "loss": 1.1048, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.0782129764556885, "rewards/margins": 0.023143697530031204, "rewards/rejected": 3.0550692081451416, "step": 68450 }, { "epoch": 3.178420539486513, "grad_norm": 144.57470703125, "learning_rate": 1.0938112261479177e-07, "logits/chosen": -18.549001693725586, "logits/rejected": -17.175357818603516, "logps/chosen": -355.67547607421875, "logps/rejected": -229.7722930908203, "loss": 0.4612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3701882362365723, "rewards/margins": 0.9472250938415527, "rewards/rejected": 1.4229631423950195, "step": 68460 }, { "epoch": 3.178884813593946, "grad_norm": 34.04212188720703, "learning_rate": 1.093532661683458e-07, "logits/chosen": -18.45718002319336, "logits/rejected": -18.324644088745117, "logps/chosen": -372.3563232421875, "logps/rejected": -391.4097595214844, "loss": 0.6316, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3800759315490723, "rewards/margins": 
0.8447655439376831, "rewards/rejected": 2.5353105068206787, "step": 68470 }, { "epoch": 3.179349087701379, "grad_norm": 79.46736907958984, "learning_rate": 1.093254097218998e-07, "logits/chosen": -18.383838653564453, "logits/rejected": -17.53903579711914, "logps/chosen": -294.78033447265625, "logps/rejected": -279.05108642578125, "loss": 0.7546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.423985004425049, "rewards/margins": 0.6040952801704407, "rewards/rejected": 1.8198894262313843, "step": 68480 }, { "epoch": 3.179813361808812, "grad_norm": 56.542842864990234, "learning_rate": 1.0929755327545383e-07, "logits/chosen": -20.0806941986084, "logits/rejected": -19.2558536529541, "logps/chosen": -366.64593505859375, "logps/rejected": -339.91680908203125, "loss": 0.5543, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.210982322692871, "rewards/margins": 1.2086158990859985, "rewards/rejected": 3.002366542816162, "step": 68490 }, { "epoch": 3.180277635916245, "grad_norm": 75.53116607666016, "learning_rate": 1.0926969682900784e-07, "logits/chosen": -18.534366607666016, "logits/rejected": -18.604747772216797, "logps/chosen": -282.0377502441406, "logps/rejected": -283.40380859375, "loss": 0.925, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.369328498840332, "rewards/margins": -0.01980757713317871, "rewards/rejected": 2.3891360759735107, "step": 68500 }, { "epoch": 3.180741910023678, "grad_norm": 10.48018741607666, "learning_rate": 1.0924184038256187e-07, "logits/chosen": -20.304302215576172, "logits/rejected": -18.735637664794922, "logps/chosen": -462.91729736328125, "logps/rejected": -316.4124755859375, "loss": 0.6161, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.111922264099121, "rewards/margins": 1.903792142868042, "rewards/rejected": 3.2081298828125, "step": 68510 }, { "epoch": 3.181206184131111, "grad_norm": 36.2362174987793, "learning_rate": 1.0921398393611587e-07, "logits/chosen": -18.55573081970215, 
"logits/rejected": -18.646419525146484, "logps/chosen": -326.9488830566406, "logps/rejected": -313.5172424316406, "loss": 1.0871, "rewards/accuracies": 0.5, "rewards/chosen": 2.4347314834594727, "rewards/margins": 0.2366320639848709, "rewards/rejected": 2.198099374771118, "step": 68520 }, { "epoch": 3.181670458238544, "grad_norm": 38.9339485168457, "learning_rate": 1.0918612748966989e-07, "logits/chosen": -19.630725860595703, "logits/rejected": -17.333919525146484, "logps/chosen": -408.7131652832031, "logps/rejected": -231.25131225585938, "loss": 0.1312, "rewards/accuracies": 1.0, "rewards/chosen": 4.544763088226318, "rewards/margins": 3.1864490509033203, "rewards/rejected": 1.3583142757415771, "step": 68530 }, { "epoch": 3.182134732345977, "grad_norm": 25.133935928344727, "learning_rate": 1.0915827104322391e-07, "logits/chosen": -17.93060874938965, "logits/rejected": -18.522035598754883, "logps/chosen": -372.23095703125, "logps/rejected": -352.4480895996094, "loss": 0.7855, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.616187572479248, "rewards/margins": 0.7890372276306152, "rewards/rejected": 1.8271503448486328, "step": 68540 }, { "epoch": 3.18259900645341, "grad_norm": 28.127620697021484, "learning_rate": 1.0913041459677793e-07, "logits/chosen": -19.34780502319336, "logits/rejected": -18.470378875732422, "logps/chosen": -327.5384216308594, "logps/rejected": -241.98312377929688, "loss": 0.5517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.193864345550537, "rewards/margins": 1.146851897239685, "rewards/rejected": 2.0470123291015625, "step": 68550 }, { "epoch": 3.183063280560843, "grad_norm": 97.31346130371094, "learning_rate": 1.0910255815033196e-07, "logits/chosen": -19.000133514404297, "logits/rejected": -18.428070068359375, "logps/chosen": -453.88555908203125, "logps/rejected": -379.279052734375, "loss": 0.2845, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.190891981124878, "rewards/margins": 1.7297741174697876, 
"rewards/rejected": 1.4611178636550903, "step": 68560 }, { "epoch": 3.183527554668276, "grad_norm": 120.59506225585938, "learning_rate": 1.0907470170388596e-07, "logits/chosen": -18.548236846923828, "logits/rejected": -17.739320755004883, "logps/chosen": -549.90234375, "logps/rejected": -397.61297607421875, "loss": 0.4711, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.561661243438721, "rewards/margins": 1.9706121683120728, "rewards/rejected": 2.5910494327545166, "step": 68570 }, { "epoch": 3.183991828775709, "grad_norm": 135.42245483398438, "learning_rate": 1.0904684525743998e-07, "logits/chosen": -19.034486770629883, "logits/rejected": -18.533527374267578, "logps/chosen": -446.156005859375, "logps/rejected": -348.5486145019531, "loss": 1.3706, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7932205200195312, "rewards/margins": 0.15368804335594177, "rewards/rejected": 3.6395328044891357, "step": 68580 }, { "epoch": 3.1844561028831424, "grad_norm": 21.644500732421875, "learning_rate": 1.09018988810994e-07, "logits/chosen": -18.02371597290039, "logits/rejected": -17.938011169433594, "logps/chosen": -424.82086181640625, "logps/rejected": -384.5283508300781, "loss": 1.0152, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.462601184844971, "rewards/margins": 0.5021817088127136, "rewards/rejected": 3.9604194164276123, "step": 68590 }, { "epoch": 3.184920376990575, "grad_norm": 146.79225158691406, "learning_rate": 1.0899113236454803e-07, "logits/chosen": -19.57693099975586, "logits/rejected": -18.540685653686523, "logps/chosen": -506.906982421875, "logps/rejected": -397.7213134765625, "loss": 0.4437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.392029762268066, "rewards/margins": 1.20845627784729, "rewards/rejected": 3.1835734844207764, "step": 68600 }, { "epoch": 3.1853846510980084, "grad_norm": 199.6684112548828, "learning_rate": 1.0896327591810204e-07, "logits/chosen": -18.70730972290039, "logits/rejected": 
-18.204620361328125, "logps/chosen": -429.0865173339844, "logps/rejected": -418.03216552734375, "loss": 1.6204, "rewards/accuracies": 0.5, "rewards/chosen": 3.844331741333008, "rewards/margins": -0.20352813601493835, "rewards/rejected": 4.047860145568848, "step": 68610 }, { "epoch": 3.185848925205441, "grad_norm": 42.16857147216797, "learning_rate": 1.0893541947165607e-07, "logits/chosen": -19.354785919189453, "logits/rejected": -18.6592960357666, "logps/chosen": -353.864501953125, "logps/rejected": -387.3486633300781, "loss": 0.5913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.533919334411621, "rewards/margins": 1.109128475189209, "rewards/rejected": 1.424790859222412, "step": 68620 }, { "epoch": 3.1863131993128744, "grad_norm": 52.48959732055664, "learning_rate": 1.0890756302521007e-07, "logits/chosen": -19.334341049194336, "logits/rejected": -18.358951568603516, "logps/chosen": -369.7778015136719, "logps/rejected": -336.239013671875, "loss": 0.373, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.876307249069214, "rewards/margins": 1.4671111106872559, "rewards/rejected": 2.409196138381958, "step": 68630 }, { "epoch": 3.186777473420307, "grad_norm": 86.74628448486328, "learning_rate": 1.088797065787641e-07, "logits/chosen": -19.13850975036621, "logits/rejected": -18.6162052154541, "logps/chosen": -367.5865783691406, "logps/rejected": -336.19744873046875, "loss": 0.5731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.413534641265869, "rewards/margins": 1.5536787509918213, "rewards/rejected": 1.8598560094833374, "step": 68640 }, { "epoch": 3.1872417475277404, "grad_norm": 39.726951599121094, "learning_rate": 1.0885185013231811e-07, "logits/chosen": -18.506311416625977, "logits/rejected": -18.16008758544922, "logps/chosen": -272.6474609375, "logps/rejected": -270.0380554199219, "loss": 0.5471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3163344860076904, "rewards/margins": 1.3522355556488037, 
"rewards/rejected": 0.9640989303588867, "step": 68650 }, { "epoch": 3.1877060216351736, "grad_norm": 87.48185729980469, "learning_rate": 1.0882399368587214e-07, "logits/chosen": -19.10683822631836, "logits/rejected": -18.2887020111084, "logps/chosen": -523.2523193359375, "logps/rejected": -467.7843322753906, "loss": 0.4616, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.835925579071045, "rewards/margins": 1.0949018001556396, "rewards/rejected": 3.741023540496826, "step": 68660 }, { "epoch": 3.1881702957426064, "grad_norm": 21.801481246948242, "learning_rate": 1.0879613723942616e-07, "logits/chosen": -18.213115692138672, "logits/rejected": -17.397348403930664, "logps/chosen": -378.889892578125, "logps/rejected": -305.3233337402344, "loss": 0.4641, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.481015682220459, "rewards/margins": 1.3911316394805908, "rewards/rejected": 2.0898842811584473, "step": 68670 }, { "epoch": 3.1886345698500396, "grad_norm": 127.9286880493164, "learning_rate": 1.0876828079298018e-07, "logits/chosen": -19.315946578979492, "logits/rejected": -18.814998626708984, "logps/chosen": -360.68194580078125, "logps/rejected": -343.52899169921875, "loss": 0.6774, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1291582584381104, "rewards/margins": 1.0056124925613403, "rewards/rejected": 2.1235454082489014, "step": 68680 }, { "epoch": 3.1890988439574723, "grad_norm": 57.21497344970703, "learning_rate": 1.0874042434653418e-07, "logits/chosen": -19.572429656982422, "logits/rejected": -18.451248168945312, "logps/chosen": -461.2955627441406, "logps/rejected": -390.8405456542969, "loss": 0.3907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7215373516082764, "rewards/margins": 1.449802041053772, "rewards/rejected": 2.271735191345215, "step": 68690 }, { "epoch": 3.1895631180649056, "grad_norm": 202.46337890625, "learning_rate": 1.0871256790008821e-07, "logits/chosen": -19.192615509033203, "logits/rejected": 
-18.282936096191406, "logps/chosen": -314.49481201171875, "logps/rejected": -291.9347229003906, "loss": 0.9368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1290886402130127, "rewards/margins": 1.017238974571228, "rewards/rejected": 2.111849546432495, "step": 68700 }, { "epoch": 3.1900273921723388, "grad_norm": 85.85551452636719, "learning_rate": 1.0868471145364223e-07, "logits/chosen": -18.22703742980957, "logits/rejected": -17.096118927001953, "logps/chosen": -353.6813049316406, "logps/rejected": -241.19033813476562, "loss": 0.3769, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.890976905822754, "rewards/margins": 1.689601182937622, "rewards/rejected": 1.201375961303711, "step": 68710 }, { "epoch": 3.1904916662797715, "grad_norm": 0.589860200881958, "learning_rate": 1.0865685500719625e-07, "logits/chosen": -19.060571670532227, "logits/rejected": -17.664260864257812, "logps/chosen": -335.04107666015625, "logps/rejected": -270.0552673339844, "loss": 0.3667, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.214930772781372, "rewards/margins": 1.5667693614959717, "rewards/rejected": 1.6481612920761108, "step": 68720 }, { "epoch": 3.1909559403872048, "grad_norm": 111.82848358154297, "learning_rate": 1.0862899856075026e-07, "logits/chosen": -17.719770431518555, "logits/rejected": -17.708892822265625, "logps/chosen": -477.82940673828125, "logps/rejected": -403.28582763671875, "loss": 0.8637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8670527935028076, "rewards/margins": 1.2814090251922607, "rewards/rejected": 2.585643768310547, "step": 68730 }, { "epoch": 3.1914202144946375, "grad_norm": 25.572938919067383, "learning_rate": 1.0860114211430427e-07, "logits/chosen": -18.601810455322266, "logits/rejected": -17.173053741455078, "logps/chosen": -444.4144592285156, "logps/rejected": -262.3202819824219, "loss": 0.1742, "rewards/accuracies": 1.0, "rewards/chosen": 3.8423123359680176, "rewards/margins": 
2.5363359451293945, "rewards/rejected": 1.305976390838623, "step": 68740 }, { "epoch": 3.1918844886020707, "grad_norm": 0.5749295353889465, "learning_rate": 1.085732856678583e-07, "logits/chosen": -19.145191192626953, "logits/rejected": -18.81198501586914, "logps/chosen": -343.8308410644531, "logps/rejected": -254.9501953125, "loss": 0.6358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.779268741607666, "rewards/margins": 1.1723920106887817, "rewards/rejected": 2.6068763732910156, "step": 68750 }, { "epoch": 3.1923487627095035, "grad_norm": 45.02954864501953, "learning_rate": 1.0854542922141231e-07, "logits/chosen": -17.868106842041016, "logits/rejected": -17.760822296142578, "logps/chosen": -414.46014404296875, "logps/rejected": -391.9861755371094, "loss": 0.4905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.597914218902588, "rewards/margins": 1.1713883876800537, "rewards/rejected": 2.426525592803955, "step": 68760 }, { "epoch": 3.1928130368169367, "grad_norm": 0.2414298802614212, "learning_rate": 1.0851757277496634e-07, "logits/chosen": -19.472177505493164, "logits/rejected": -17.125097274780273, "logps/chosen": -443.423095703125, "logps/rejected": -203.51547241210938, "loss": 0.2934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.6148152351379395, "rewards/margins": 2.865234851837158, "rewards/rejected": 1.7495803833007812, "step": 68770 }, { "epoch": 3.19327731092437, "grad_norm": 93.96807098388672, "learning_rate": 1.0848971632852034e-07, "logits/chosen": -19.144996643066406, "logits/rejected": -18.43606185913086, "logps/chosen": -314.42816162109375, "logps/rejected": -244.1540069580078, "loss": 0.3636, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3028976917266846, "rewards/margins": 1.3563686609268188, "rewards/rejected": 0.9465287923812866, "step": 68780 }, { "epoch": 3.1937415850318027, "grad_norm": 24.886085510253906, "learning_rate": 1.0846185988207437e-07, "logits/chosen": -19.803232192993164, 
"logits/rejected": -19.043825149536133, "logps/chosen": -378.9710998535156, "logps/rejected": -324.3677978515625, "loss": 0.5159, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.73994517326355, "rewards/margins": 1.0871843099594116, "rewards/rejected": 2.6527609825134277, "step": 68790 }, { "epoch": 3.194205859139236, "grad_norm": 98.0804672241211, "learning_rate": 1.0843400343562838e-07, "logits/chosen": -18.868587493896484, "logits/rejected": -18.228368759155273, "logps/chosen": -376.9639892578125, "logps/rejected": -334.98138427734375, "loss": 0.4309, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5251686573028564, "rewards/margins": 1.2591289281845093, "rewards/rejected": 2.2660393714904785, "step": 68800 }, { "epoch": 3.1946701332466687, "grad_norm": 69.703125, "learning_rate": 1.0840614698918241e-07, "logits/chosen": -20.259876251220703, "logits/rejected": -19.784191131591797, "logps/chosen": -477.8180236816406, "logps/rejected": -392.42156982421875, "loss": 0.5537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.6282172203063965, "rewards/margins": 1.7448676824569702, "rewards/rejected": 2.883349657058716, "step": 68810 }, { "epoch": 3.195134407354102, "grad_norm": 120.25617980957031, "learning_rate": 1.0837829054273643e-07, "logits/chosen": -18.70970344543457, "logits/rejected": -18.32357406616211, "logps/chosen": -305.708740234375, "logps/rejected": -327.41400146484375, "loss": 0.7413, "rewards/accuracies": 0.5, "rewards/chosen": 3.063781261444092, "rewards/margins": 0.688963770866394, "rewards/rejected": 2.3748176097869873, "step": 68820 }, { "epoch": 3.1955986814615347, "grad_norm": 0.5355536937713623, "learning_rate": 1.0835043409629045e-07, "logits/chosen": -19.091339111328125, "logits/rejected": -18.697938919067383, "logps/chosen": -358.615234375, "logps/rejected": -338.961669921875, "loss": 0.5448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.269794464111328, "rewards/margins": 
1.0446195602416992, "rewards/rejected": 2.225175142288208, "step": 68830 }, { "epoch": 3.196062955568968, "grad_norm": 0.1822485476732254, "learning_rate": 1.0832257764984446e-07, "logits/chosen": -19.226512908935547, "logits/rejected": -18.022144317626953, "logps/chosen": -446.17352294921875, "logps/rejected": -308.3402404785156, "loss": 0.3741, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7204322814941406, "rewards/margins": 2.0683090686798096, "rewards/rejected": 1.6521230936050415, "step": 68840 }, { "epoch": 3.196527229676401, "grad_norm": 3.146785259246826, "learning_rate": 1.0829472120339848e-07, "logits/chosen": -17.87919807434082, "logits/rejected": -17.990442276000977, "logps/chosen": -250.0990753173828, "logps/rejected": -252.8284149169922, "loss": 1.8287, "rewards/accuracies": 0.5, "rewards/chosen": 1.4312946796417236, "rewards/margins": -0.039181899279356, "rewards/rejected": 1.470476508140564, "step": 68850 }, { "epoch": 3.196991503783834, "grad_norm": 271.9958801269531, "learning_rate": 1.082668647569525e-07, "logits/chosen": -18.864734649658203, "logits/rejected": -18.7367000579834, "logps/chosen": -373.4006042480469, "logps/rejected": -304.6708068847656, "loss": 1.6694, "rewards/accuracies": 0.5, "rewards/chosen": 2.7772409915924072, "rewards/margins": 0.2639096677303314, "rewards/rejected": 2.513331651687622, "step": 68860 }, { "epoch": 3.197455777891267, "grad_norm": 9.653165817260742, "learning_rate": 1.0823900831050653e-07, "logits/chosen": -19.31565284729004, "logits/rejected": -18.26068115234375, "logps/chosen": -442.0917053222656, "logps/rejected": -363.34490966796875, "loss": 0.5912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.907576322555542, "rewards/margins": 2.1574432849884033, "rewards/rejected": 1.7501329183578491, "step": 68870 }, { "epoch": 3.1979200519987, "grad_norm": 99.22888946533203, "learning_rate": 1.0821115186406054e-07, "logits/chosen": -19.137990951538086, "logits/rejected": 
-18.237585067749023, "logps/chosen": -325.89801025390625, "logps/rejected": -335.4219055175781, "loss": 0.6209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6710638999938965, "rewards/margins": 0.5011471509933472, "rewards/rejected": 2.1699166297912598, "step": 68880 }, { "epoch": 3.198384326106133, "grad_norm": 15.527257919311523, "learning_rate": 1.0818329541761457e-07, "logits/chosen": -19.355911254882812, "logits/rejected": -17.9823055267334, "logps/chosen": -364.3329162597656, "logps/rejected": -269.1629333496094, "loss": 0.304, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5189285278320312, "rewards/margins": 1.5046241283416748, "rewards/rejected": 2.0143041610717773, "step": 68890 }, { "epoch": 3.198848600213566, "grad_norm": 93.16936492919922, "learning_rate": 1.0815543897116857e-07, "logits/chosen": -19.213916778564453, "logits/rejected": -18.55756187438965, "logps/chosen": -322.6864318847656, "logps/rejected": -332.9350891113281, "loss": 0.6117, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0611863136291504, "rewards/margins": 1.2253572940826416, "rewards/rejected": 1.8358291387557983, "step": 68900 }, { "epoch": 3.199312874320999, "grad_norm": 36.784446716308594, "learning_rate": 1.081275825247226e-07, "logits/chosen": -19.42931365966797, "logits/rejected": -19.234451293945312, "logps/chosen": -377.1589050292969, "logps/rejected": -359.0081481933594, "loss": 0.9433, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6009726524353027, "rewards/margins": 0.31866157054901123, "rewards/rejected": 2.282310962677002, "step": 68910 }, { "epoch": 3.1997771484284323, "grad_norm": 29.741790771484375, "learning_rate": 1.0809972607827661e-07, "logits/chosen": -19.157588958740234, "logits/rejected": -19.425094604492188, "logps/chosen": -386.7475280761719, "logps/rejected": -365.7440490722656, "loss": 0.7012, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.915393352508545, "rewards/margins": 
0.48483580350875854, "rewards/rejected": 2.4305572509765625, "step": 68920 }, { "epoch": 3.200241422535865, "grad_norm": 39.672054290771484, "learning_rate": 1.0807186963183063e-07, "logits/chosen": -18.60148811340332, "logits/rejected": -18.926815032958984, "logps/chosen": -353.517333984375, "logps/rejected": -364.7646484375, "loss": 0.7655, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.485379457473755, "rewards/margins": 0.0571320541203022, "rewards/rejected": 3.4282474517822266, "step": 68930 }, { "epoch": 3.2007056966432983, "grad_norm": 67.97308349609375, "learning_rate": 1.0804401318538464e-07, "logits/chosen": -19.43064308166504, "logits/rejected": -18.039352416992188, "logps/chosen": -403.9501953125, "logps/rejected": -294.4334411621094, "loss": 0.6668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.155151844024658, "rewards/margins": 0.9949308633804321, "rewards/rejected": 2.1602210998535156, "step": 68940 }, { "epoch": 3.201169970750731, "grad_norm": 96.93046569824219, "learning_rate": 1.0801615673893866e-07, "logits/chosen": -19.45864486694336, "logits/rejected": -18.113224029541016, "logps/chosen": -333.28643798828125, "logps/rejected": -282.90008544921875, "loss": 0.3947, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0919089317321777, "rewards/margins": 1.2911465167999268, "rewards/rejected": 1.8007625341415405, "step": 68950 }, { "epoch": 3.2016342448581643, "grad_norm": 44.68054962158203, "learning_rate": 1.0798830029249268e-07, "logits/chosen": -19.125070571899414, "logits/rejected": -17.52935218811035, "logps/chosen": -335.9415588378906, "logps/rejected": -219.32022094726562, "loss": 0.3896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.601879358291626, "rewards/margins": 2.1835436820983887, "rewards/rejected": 0.41833561658859253, "step": 68960 }, { "epoch": 3.2020985189655975, "grad_norm": 98.00470733642578, "learning_rate": 1.079604438460467e-07, "logits/chosen": -19.46190071105957, 
"logits/rejected": -18.59726905822754, "logps/chosen": -383.24566650390625, "logps/rejected": -304.1975402832031, "loss": 0.6855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6833138465881348, "rewards/margins": 1.1240615844726562, "rewards/rejected": 2.5592520236968994, "step": 68970 }, { "epoch": 3.2025627930730303, "grad_norm": 224.86361694335938, "learning_rate": 1.0793258739960073e-07, "logits/chosen": -18.527156829833984, "logits/rejected": -17.73933219909668, "logps/chosen": -355.0860290527344, "logps/rejected": -299.802490234375, "loss": 0.6655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.029582977294922, "rewards/margins": 1.8810529708862305, "rewards/rejected": 1.1485300064086914, "step": 68980 }, { "epoch": 3.2030270671804635, "grad_norm": 15.122784614562988, "learning_rate": 1.0790473095315473e-07, "logits/chosen": -19.441509246826172, "logits/rejected": -18.99730682373047, "logps/chosen": -406.5562744140625, "logps/rejected": -401.772705078125, "loss": 0.5797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.444418430328369, "rewards/margins": 1.0604066848754883, "rewards/rejected": 3.384011745452881, "step": 68990 }, { "epoch": 3.2034913412878963, "grad_norm": 17.706768035888672, "learning_rate": 1.0787687450670875e-07, "logits/chosen": -18.55861473083496, "logits/rejected": -18.625926971435547, "logps/chosen": -361.7641906738281, "logps/rejected": -346.6371765136719, "loss": 0.8285, "rewards/accuracies": 0.5, "rewards/chosen": 2.634084939956665, "rewards/margins": 0.5767990350723267, "rewards/rejected": 2.057285785675049, "step": 69000 }, { "epoch": 3.2039556153953295, "grad_norm": 26.513715744018555, "learning_rate": 1.0784901806026277e-07, "logits/chosen": -18.911834716796875, "logits/rejected": -18.308902740478516, "logps/chosen": -289.00994873046875, "logps/rejected": -272.75799560546875, "loss": 0.7789, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0584957599639893, "rewards/margins": 
0.460477352142334, "rewards/rejected": 2.598018169403076, "step": 69010 }, { "epoch": 3.2044198895027622, "grad_norm": 28.971227645874023, "learning_rate": 1.078211616138168e-07, "logits/chosen": -19.366741180419922, "logits/rejected": -19.00448989868164, "logps/chosen": -346.91754150390625, "logps/rejected": -311.56439208984375, "loss": 0.7289, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.041001796722412, "rewards/margins": 0.947891354560852, "rewards/rejected": 2.0931103229522705, "step": 69020 }, { "epoch": 3.2048841636101955, "grad_norm": 2.3403632640838623, "learning_rate": 1.0779330516737081e-07, "logits/chosen": -19.702665328979492, "logits/rejected": -17.624298095703125, "logps/chosen": -543.8665771484375, "logps/rejected": -337.7707214355469, "loss": 0.5685, "rewards/accuracies": 0.5, "rewards/chosen": 4.277596473693848, "rewards/margins": 2.786970615386963, "rewards/rejected": 1.4906256198883057, "step": 69030 }, { "epoch": 3.2053484377176287, "grad_norm": 2.726757287979126, "learning_rate": 1.0776544872092484e-07, "logits/chosen": -19.617725372314453, "logits/rejected": -18.516420364379883, "logps/chosen": -454.6189880371094, "logps/rejected": -342.0045471191406, "loss": 0.5244, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.082424163818359, "rewards/margins": 1.5492032766342163, "rewards/rejected": 2.5332210063934326, "step": 69040 }, { "epoch": 3.2058127118250614, "grad_norm": 171.05044555664062, "learning_rate": 1.0773759227447884e-07, "logits/chosen": -19.555774688720703, "logits/rejected": -19.209503173828125, "logps/chosen": -483.843505859375, "logps/rejected": -415.1918029785156, "loss": 0.8579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.156566619873047, "rewards/margins": 0.25701671838760376, "rewards/rejected": 3.899549961090088, "step": 69050 }, { "epoch": 3.2062769859324947, "grad_norm": 0.2630479633808136, "learning_rate": 1.0770973582803287e-07, "logits/chosen": -17.61928939819336, 
"logits/rejected": -17.97587776184082, "logps/chosen": -337.3296813964844, "logps/rejected": -324.72412109375, "loss": 0.9982, "rewards/accuracies": 0.5, "rewards/chosen": 2.808040142059326, "rewards/margins": 0.7129874229431152, "rewards/rejected": 2.09505295753479, "step": 69060 }, { "epoch": 3.2067412600399274, "grad_norm": 78.23817443847656, "learning_rate": 1.0768187938158688e-07, "logits/chosen": -19.478960037231445, "logits/rejected": -18.748559951782227, "logps/chosen": -501.82177734375, "logps/rejected": -444.08642578125, "loss": 0.5692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.449715614318848, "rewards/margins": 1.2341070175170898, "rewards/rejected": 3.215608596801758, "step": 69070 }, { "epoch": 3.2072055341473606, "grad_norm": 77.96617889404297, "learning_rate": 1.0765402293514091e-07, "logits/chosen": -19.33981704711914, "logits/rejected": -18.284406661987305, "logps/chosen": -417.61126708984375, "logps/rejected": -362.6151123046875, "loss": 0.4662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8420722484588623, "rewards/margins": 1.673424482345581, "rewards/rejected": 2.168647527694702, "step": 69080 }, { "epoch": 3.207669808254794, "grad_norm": 237.44187927246094, "learning_rate": 1.0762616648869492e-07, "logits/chosen": -20.09033203125, "logits/rejected": -19.56743812561035, "logps/chosen": -426.29718017578125, "logps/rejected": -357.76715087890625, "loss": 0.7513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.172407150268555, "rewards/margins": 0.840445876121521, "rewards/rejected": 3.3319613933563232, "step": 69090 }, { "epoch": 3.2081340823622266, "grad_norm": 2.2392170429229736, "learning_rate": 1.0759831004224895e-07, "logits/chosen": -19.92290496826172, "logits/rejected": -19.514272689819336, "logps/chosen": -344.4991760253906, "logps/rejected": -272.3873291015625, "loss": 1.1809, "rewards/accuracies": 0.5, "rewards/chosen": 3.042928457260132, "rewards/margins": 0.695798933506012, 
"rewards/rejected": 2.3471293449401855, "step": 69100 }, { "epoch": 3.20859835646966, "grad_norm": 88.58257293701172, "learning_rate": 1.0757045359580295e-07, "logits/chosen": -19.55675506591797, "logits/rejected": -19.449222564697266, "logps/chosen": -419.0635681152344, "logps/rejected": -386.9974060058594, "loss": 0.6796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.441899538040161, "rewards/margins": 0.6106189489364624, "rewards/rejected": 2.8312809467315674, "step": 69110 }, { "epoch": 3.2090626305770926, "grad_norm": 188.05380249023438, "learning_rate": 1.0754259714935697e-07, "logits/chosen": -19.726848602294922, "logits/rejected": -18.768659591674805, "logps/chosen": -349.2107849121094, "logps/rejected": -256.1064758300781, "loss": 0.4408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9161508083343506, "rewards/margins": 1.6786565780639648, "rewards/rejected": 1.2374942302703857, "step": 69120 }, { "epoch": 3.209526904684526, "grad_norm": 45.25035858154297, "learning_rate": 1.07514740702911e-07, "logits/chosen": -18.691753387451172, "logits/rejected": -19.080387115478516, "logps/chosen": -338.17449951171875, "logps/rejected": -353.36846923828125, "loss": 1.032, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.237717866897583, "rewards/margins": 0.12189595401287079, "rewards/rejected": 3.1158223152160645, "step": 69130 }, { "epoch": 3.2099911787919586, "grad_norm": 37.99463653564453, "learning_rate": 1.0748688425646501e-07, "logits/chosen": -19.098848342895508, "logits/rejected": -18.64322280883789, "logps/chosen": -418.818115234375, "logps/rejected": -319.91064453125, "loss": 0.1514, "rewards/accuracies": 1.0, "rewards/chosen": 4.5311055183410645, "rewards/margins": 2.5309715270996094, "rewards/rejected": 2.000133752822876, "step": 69140 }, { "epoch": 3.210455452899392, "grad_norm": 128.19515991210938, "learning_rate": 1.0745902781001903e-07, "logits/chosen": -18.538894653320312, "logits/rejected": 
-18.50613021850586, "logps/chosen": -356.3554992675781, "logps/rejected": -335.35943603515625, "loss": 1.06, "rewards/accuracies": 0.5, "rewards/chosen": 4.001162528991699, "rewards/margins": 0.2631841003894806, "rewards/rejected": 3.73797869682312, "step": 69150 }, { "epoch": 3.210919727006825, "grad_norm": 189.68324279785156, "learning_rate": 1.0743117136357304e-07, "logits/chosen": -19.10749626159668, "logits/rejected": -19.096923828125, "logps/chosen": -407.8134460449219, "logps/rejected": -321.8838806152344, "loss": 0.5746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.399954795837402, "rewards/margins": 1.463247299194336, "rewards/rejected": 2.9367079734802246, "step": 69160 }, { "epoch": 3.211384001114258, "grad_norm": 29.061887741088867, "learning_rate": 1.0740331491712707e-07, "logits/chosen": -18.691049575805664, "logits/rejected": -18.762540817260742, "logps/chosen": -311.83587646484375, "logps/rejected": -279.915771484375, "loss": 1.4787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8656938076019287, "rewards/margins": 0.31869444251060486, "rewards/rejected": 2.546999454498291, "step": 69170 }, { "epoch": 3.211848275221691, "grad_norm": 2.5832746028900146, "learning_rate": 1.0737545847068108e-07, "logits/chosen": -19.079463958740234, "logits/rejected": -17.952991485595703, "logps/chosen": -485.9195861816406, "logps/rejected": -399.99188232421875, "loss": 0.6824, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.021363258361816, "rewards/margins": 2.0123190879821777, "rewards/rejected": 3.0090439319610596, "step": 69180 }, { "epoch": 3.212312549329124, "grad_norm": 48.43466567993164, "learning_rate": 1.0734760202423511e-07, "logits/chosen": -18.972579956054688, "logits/rejected": -19.351659774780273, "logps/chosen": -325.00250244140625, "logps/rejected": -404.8710021972656, "loss": 1.578, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.8119406700134277, "rewards/margins": -0.776168704032898, 
"rewards/rejected": 3.588109254837036, "step": 69190 }, { "epoch": 3.212776823436557, "grad_norm": 137.5272216796875, "learning_rate": 1.0731974557778911e-07, "logits/chosen": -19.326921463012695, "logits/rejected": -17.663347244262695, "logps/chosen": -357.05322265625, "logps/rejected": -302.9633483886719, "loss": 0.4383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.432921886444092, "rewards/margins": 1.6346172094345093, "rewards/rejected": 1.7983042001724243, "step": 69200 }, { "epoch": 3.21324109754399, "grad_norm": 209.74664306640625, "learning_rate": 1.0729188913134314e-07, "logits/chosen": -18.873851776123047, "logits/rejected": -18.227108001708984, "logps/chosen": -322.8796691894531, "logps/rejected": -237.08724975585938, "loss": 0.5763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6720259189605713, "rewards/margins": 1.296574354171753, "rewards/rejected": 1.3754513263702393, "step": 69210 }, { "epoch": 3.213705371651423, "grad_norm": 3.8238046169281006, "learning_rate": 1.0726403268489715e-07, "logits/chosen": -18.596153259277344, "logits/rejected": -19.075275421142578, "logps/chosen": -325.95263671875, "logps/rejected": -379.98736572265625, "loss": 0.8035, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8072643280029297, "rewards/margins": 0.6755639314651489, "rewards/rejected": 3.131700038909912, "step": 69220 }, { "epoch": 3.214169645758856, "grad_norm": 4.421635150909424, "learning_rate": 1.0723617623845118e-07, "logits/chosen": -19.817583084106445, "logits/rejected": -18.750165939331055, "logps/chosen": -448.9111328125, "logps/rejected": -413.3729553222656, "loss": 0.2293, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.727629661560059, "rewards/margins": 2.2010676860809326, "rewards/rejected": 2.526561975479126, "step": 69230 }, { "epoch": 3.214633919866289, "grad_norm": 61.033321380615234, "learning_rate": 1.072083197920052e-07, "logits/chosen": -19.723079681396484, "logits/rejected": 
-19.375869750976562, "logps/chosen": -385.0792236328125, "logps/rejected": -378.78289794921875, "loss": 1.0987, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.412384033203125, "rewards/margins": 0.7094932794570923, "rewards/rejected": 3.702890396118164, "step": 69240 }, { "epoch": 3.215098193973722, "grad_norm": 51.13780212402344, "learning_rate": 1.0718046334555922e-07, "logits/chosen": -19.02382469177246, "logits/rejected": -18.291053771972656, "logps/chosen": -453.82647705078125, "logps/rejected": -404.757080078125, "loss": 0.3435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.589989423751831, "rewards/margins": 1.441955804824829, "rewards/rejected": 2.148033618927002, "step": 69250 }, { "epoch": 3.215562468081155, "grad_norm": 8.218316078186035, "learning_rate": 1.0715260689911322e-07, "logits/chosen": -18.68701171875, "logits/rejected": -17.61897087097168, "logps/chosen": -425.15045166015625, "logps/rejected": -308.2095031738281, "loss": 0.437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.886824131011963, "rewards/margins": 1.3896342515945435, "rewards/rejected": 1.4971897602081299, "step": 69260 }, { "epoch": 3.216026742188588, "grad_norm": 203.6115264892578, "learning_rate": 1.0712475045266725e-07, "logits/chosen": -19.478801727294922, "logits/rejected": -19.048370361328125, "logps/chosen": -332.3717041015625, "logps/rejected": -272.8890075683594, "loss": 0.7821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5495760440826416, "rewards/margins": 0.9449039697647095, "rewards/rejected": 0.6046720147132874, "step": 69270 }, { "epoch": 3.216491016296021, "grad_norm": 223.41896057128906, "learning_rate": 1.0709689400622127e-07, "logits/chosen": -19.59417724609375, "logits/rejected": -18.596492767333984, "logps/chosen": -416.52960205078125, "logps/rejected": -381.57305908203125, "loss": 0.5679, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.294398069381714, "rewards/margins": 
1.3133469820022583, "rewards/rejected": 1.9810512065887451, "step": 69280 }, { "epoch": 3.216955290403454, "grad_norm": 250.44529724121094, "learning_rate": 1.070690375597753e-07, "logits/chosen": -19.657167434692383, "logits/rejected": -19.388286590576172, "logps/chosen": -399.5148010253906, "logps/rejected": -350.4669494628906, "loss": 1.2624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.175293922424316, "rewards/margins": 0.3513657748699188, "rewards/rejected": 3.8239283561706543, "step": 69290 }, { "epoch": 3.2174195645108874, "grad_norm": 189.3461151123047, "learning_rate": 1.0704118111332931e-07, "logits/chosen": -19.53466796875, "logits/rejected": -19.463016510009766, "logps/chosen": -534.3348388671875, "logps/rejected": -515.599609375, "loss": 0.9518, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 4.427146911621094, "rewards/margins": -0.06001415103673935, "rewards/rejected": 4.487161159515381, "step": 69300 }, { "epoch": 3.21788383861832, "grad_norm": 56.3260612487793, "learning_rate": 1.0701332466688334e-07, "logits/chosen": -18.431625366210938, "logits/rejected": -17.326879501342773, "logps/chosen": -444.30718994140625, "logps/rejected": -298.0523376464844, "loss": 0.6071, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.096324920654297, "rewards/margins": 1.4521615505218506, "rewards/rejected": 1.6441634893417358, "step": 69310 }, { "epoch": 3.2183481127257534, "grad_norm": 11.838216781616211, "learning_rate": 1.0698546822043734e-07, "logits/chosen": -19.752899169921875, "logits/rejected": -19.514461517333984, "logps/chosen": -401.5388488769531, "logps/rejected": -363.6671447753906, "loss": 0.7635, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.251791477203369, "rewards/margins": 1.099157691001892, "rewards/rejected": 2.1526336669921875, "step": 69320 }, { "epoch": 3.218812386833186, "grad_norm": 27.38123321533203, "learning_rate": 1.0695761177399135e-07, "logits/chosen": -19.11450958251953, 
"logits/rejected": -18.338720321655273, "logps/chosen": -439.4353942871094, "logps/rejected": -305.7360534667969, "loss": 0.4565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.115744113922119, "rewards/margins": 2.2663819789886475, "rewards/rejected": 1.8493621349334717, "step": 69330 }, { "epoch": 3.2192766609406194, "grad_norm": 157.868408203125, "learning_rate": 1.0692975532754538e-07, "logits/chosen": -18.881412506103516, "logits/rejected": -18.742534637451172, "logps/chosen": -364.01776123046875, "logps/rejected": -456.1548767089844, "loss": 1.0459, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1537935733795166, "rewards/margins": 0.6684833765029907, "rewards/rejected": 2.4853100776672363, "step": 69340 }, { "epoch": 3.219740935048052, "grad_norm": 30.91042709350586, "learning_rate": 1.069018988810994e-07, "logits/chosen": -19.100032806396484, "logits/rejected": -17.53390884399414, "logps/chosen": -302.14422607421875, "logps/rejected": -226.14993286132812, "loss": 0.5693, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.862776517868042, "rewards/margins": 1.9447593688964844, "rewards/rejected": 0.9180175065994263, "step": 69350 }, { "epoch": 3.2202052091554854, "grad_norm": 81.22314453125, "learning_rate": 1.0687404243465341e-07, "logits/chosen": -19.237850189208984, "logits/rejected": -18.581836700439453, "logps/chosen": -436.36834716796875, "logps/rejected": -386.9908142089844, "loss": 1.1657, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.8899497985839844, "rewards/margins": 0.09822654724121094, "rewards/rejected": 3.7917227745056152, "step": 69360 }, { "epoch": 3.2206694832629186, "grad_norm": 236.96115112304688, "learning_rate": 1.0684618598820742e-07, "logits/chosen": -18.989816665649414, "logits/rejected": -19.19301986694336, "logps/chosen": -345.322021484375, "logps/rejected": -255.04849243164062, "loss": 0.5515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3222739696502686, 
"rewards/margins": 1.2699459791183472, "rewards/rejected": 2.052328109741211, "step": 69370 }, { "epoch": 3.2211337573703513, "grad_norm": 0.5911391973495483, "learning_rate": 1.0681832954176145e-07, "logits/chosen": -20.38414192199707, "logits/rejected": -17.973806381225586, "logps/chosen": -549.7188110351562, "logps/rejected": -316.6253356933594, "loss": 0.4527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.733384609222412, "rewards/margins": 2.7204816341400146, "rewards/rejected": 2.0129029750823975, "step": 69380 }, { "epoch": 3.2215980314777846, "grad_norm": 14.145158767700195, "learning_rate": 1.0679047309531547e-07, "logits/chosen": -18.16266632080078, "logits/rejected": -17.148677825927734, "logps/chosen": -388.3573913574219, "logps/rejected": -286.1988220214844, "loss": 0.7218, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3408188819885254, "rewards/margins": 0.9241873025894165, "rewards/rejected": 2.4166312217712402, "step": 69390 }, { "epoch": 3.2220623055852173, "grad_norm": 36.724449157714844, "learning_rate": 1.067626166488695e-07, "logits/chosen": -19.68368911743164, "logits/rejected": -18.817628860473633, "logps/chosen": -379.66168212890625, "logps/rejected": -280.1814880371094, "loss": 0.8159, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7156991958618164, "rewards/margins": 0.2774803042411804, "rewards/rejected": 2.438218832015991, "step": 69400 }, { "epoch": 3.2225265796926506, "grad_norm": 14.343390464782715, "learning_rate": 1.067347602024235e-07, "logits/chosen": -19.498126983642578, "logits/rejected": -17.960678100585938, "logps/chosen": -381.91107177734375, "logps/rejected": -310.66326904296875, "loss": 0.5435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.289632797241211, "rewards/margins": 1.698683738708496, "rewards/rejected": 1.5909491777420044, "step": 69410 }, { "epoch": 3.2229908538000838, "grad_norm": 72.62676239013672, "learning_rate": 1.0670690375597752e-07, 
"logits/chosen": -18.66642189025879, "logits/rejected": -18.025531768798828, "logps/chosen": -360.9048767089844, "logps/rejected": -318.9615173339844, "loss": 0.6851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.488661050796509, "rewards/margins": 0.4809403419494629, "rewards/rejected": 2.007720708847046, "step": 69420 }, { "epoch": 3.2234551279075165, "grad_norm": 109.87336730957031, "learning_rate": 1.0667904730953154e-07, "logits/chosen": -19.020286560058594, "logits/rejected": -17.77580451965332, "logps/chosen": -428.92010498046875, "logps/rejected": -279.31829833984375, "loss": 0.9243, "rewards/accuracies": 0.5, "rewards/chosen": 3.026789903640747, "rewards/margins": 0.9327966570854187, "rewards/rejected": 2.0939931869506836, "step": 69430 }, { "epoch": 3.2239194020149498, "grad_norm": 2.9722225666046143, "learning_rate": 1.0665119086308557e-07, "logits/chosen": -18.509021759033203, "logits/rejected": -17.01047706604004, "logps/chosen": -508.3888244628906, "logps/rejected": -347.08551025390625, "loss": 0.33, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.085786819458008, "rewards/margins": 2.262648105621338, "rewards/rejected": 1.823138952255249, "step": 69440 }, { "epoch": 3.2243836761223825, "grad_norm": 43.5007209777832, "learning_rate": 1.0662333441663958e-07, "logits/chosen": -18.869352340698242, "logits/rejected": -17.506351470947266, "logps/chosen": -338.9702453613281, "logps/rejected": -223.90933227539062, "loss": 0.2619, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.607591152191162, "rewards/margins": 2.6793925762176514, "rewards/rejected": 0.9281988143920898, "step": 69450 }, { "epoch": 3.2248479502298157, "grad_norm": 16.9998779296875, "learning_rate": 1.0659547797019361e-07, "logits/chosen": -18.06989288330078, "logits/rejected": -17.87751579284668, "logps/chosen": -417.1971740722656, "logps/rejected": -372.27606201171875, "loss": 0.7462, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.914036512374878, "rewards/margins": 1.7464599609375, "rewards/rejected": 2.167576313018799, "step": 69460 }, { "epoch": 3.2253122243372485, "grad_norm": 37.99250793457031, "learning_rate": 1.0656762152374761e-07, "logits/chosen": -19.375654220581055, "logits/rejected": -18.988889694213867, "logps/chosen": -385.4646911621094, "logps/rejected": -359.0550537109375, "loss": 0.9273, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5276527404785156, "rewards/margins": 0.4500236511230469, "rewards/rejected": 3.0776290893554688, "step": 69470 }, { "epoch": 3.2257764984446817, "grad_norm": 7.378598213195801, "learning_rate": 1.0653976507730164e-07, "logits/chosen": -18.469852447509766, "logits/rejected": -17.823144912719727, "logps/chosen": -459.43768310546875, "logps/rejected": -339.50921630859375, "loss": 0.5803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.652353763580322, "rewards/margins": 1.9932701587677002, "rewards/rejected": 2.6590828895568848, "step": 69480 }, { "epoch": 3.226240772552115, "grad_norm": 246.79444885253906, "learning_rate": 1.0651190863085565e-07, "logits/chosen": -18.723430633544922, "logits/rejected": -18.38943862915039, "logps/chosen": -381.9895935058594, "logps/rejected": -338.2250061035156, "loss": 1.3424, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.796201229095459, "rewards/margins": -0.04408721998333931, "rewards/rejected": 2.8402884006500244, "step": 69490 }, { "epoch": 3.2267050466595477, "grad_norm": 107.3350830078125, "learning_rate": 1.0648405218440968e-07, "logits/chosen": -19.105398178100586, "logits/rejected": -18.61389923095703, "logps/chosen": -436.97064208984375, "logps/rejected": -460.359375, "loss": 0.9131, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6579482555389404, "rewards/margins": 0.18274331092834473, "rewards/rejected": 3.4752049446105957, "step": 69500 }, { "epoch": 3.227169320766981, "grad_norm": 205.51193237304688, "learning_rate": 1.064561957379637e-07, 
"logits/chosen": -20.364009857177734, "logits/rejected": -19.595088958740234, "logps/chosen": -356.6108703613281, "logps/rejected": -358.3470458984375, "loss": 0.7329, "rewards/accuracies": 0.5, "rewards/chosen": 3.9929840564727783, "rewards/margins": 0.9030762910842896, "rewards/rejected": 3.0899081230163574, "step": 69510 }, { "epoch": 3.2276335948744137, "grad_norm": 75.58719635009766, "learning_rate": 1.064283392915177e-07, "logits/chosen": -20.04548454284668, "logits/rejected": -19.975072860717773, "logps/chosen": -548.1502685546875, "logps/rejected": -491.52667236328125, "loss": 0.8256, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.880262851715088, "rewards/margins": 0.48731571435928345, "rewards/rejected": 4.392947196960449, "step": 69520 }, { "epoch": 3.228097868981847, "grad_norm": 66.98633575439453, "learning_rate": 1.0640048284507172e-07, "logits/chosen": -18.436843872070312, "logits/rejected": -17.071392059326172, "logps/chosen": -462.429931640625, "logps/rejected": -298.09210205078125, "loss": 0.6461, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.904592752456665, "rewards/margins": 1.4503319263458252, "rewards/rejected": 1.4542605876922607, "step": 69530 }, { "epoch": 3.22856214308928, "grad_norm": 150.99432373046875, "learning_rate": 1.0637262639862574e-07, "logits/chosen": -18.377552032470703, "logits/rejected": -18.987045288085938, "logps/chosen": -300.5263366699219, "logps/rejected": -345.7371826171875, "loss": 1.3761, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.2053093910217285, "rewards/margins": -0.6470606923103333, "rewards/rejected": 2.852370262145996, "step": 69540 }, { "epoch": 3.229026417196713, "grad_norm": 68.16758728027344, "learning_rate": 1.0634476995217977e-07, "logits/chosen": -19.169498443603516, "logits/rejected": -18.403202056884766, "logps/chosen": -537.7251586914062, "logps/rejected": -348.1578369140625, "loss": 0.5272, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
4.203400611877441, "rewards/margins": 0.9285176396369934, "rewards/rejected": 3.2748827934265137, "step": 69550 }, { "epoch": 3.229490691304146, "grad_norm": 47.31800079345703, "learning_rate": 1.0631691350573378e-07, "logits/chosen": -18.76998519897461, "logits/rejected": -18.450092315673828, "logps/chosen": -472.9183654785156, "logps/rejected": -454.2705993652344, "loss": 0.3676, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.383819103240967, "rewards/margins": 1.1129263639450073, "rewards/rejected": 3.27089262008667, "step": 69560 }, { "epoch": 3.229954965411579, "grad_norm": 0.1635613590478897, "learning_rate": 1.062890570592878e-07, "logits/chosen": -20.07404136657715, "logits/rejected": -16.960514068603516, "logps/chosen": -490.533447265625, "logps/rejected": -239.72085571289062, "loss": 0.1051, "rewards/accuracies": 1.0, "rewards/chosen": 4.765036582946777, "rewards/margins": 3.829725980758667, "rewards/rejected": 0.9353100061416626, "step": 69570 }, { "epoch": 3.230419239519012, "grad_norm": 1.4010504484176636, "learning_rate": 1.0626120061284181e-07, "logits/chosen": -20.21145248413086, "logits/rejected": -19.22574806213379, "logps/chosen": -422.1468811035156, "logps/rejected": -324.2276306152344, "loss": 0.3601, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1507039070129395, "rewards/margins": 2.2289204597473145, "rewards/rejected": 1.921783208847046, "step": 69580 }, { "epoch": 3.230883513626445, "grad_norm": 119.4255142211914, "learning_rate": 1.0623334416639584e-07, "logits/chosen": -19.51764488220215, "logits/rejected": -18.262889862060547, "logps/chosen": -392.3125915527344, "logps/rejected": -296.66790771484375, "loss": 0.2737, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.346715927124023, "rewards/margins": 2.0128173828125, "rewards/rejected": 2.3338983058929443, "step": 69590 }, { "epoch": 3.231347787733878, "grad_norm": 13.00200080871582, "learning_rate": 1.0620548771994985e-07, "logits/chosen": 
-18.946949005126953, "logits/rejected": -18.211563110351562, "logps/chosen": -448.7265625, "logps/rejected": -393.09600830078125, "loss": 0.587, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.013970375061035, "rewards/margins": 1.5688941478729248, "rewards/rejected": 2.4450767040252686, "step": 69600 }, { "epoch": 3.2318120618413113, "grad_norm": 45.77345657348633, "learning_rate": 1.0617763127350388e-07, "logits/chosen": -19.971603393554688, "logits/rejected": -19.41228485107422, "logps/chosen": -366.7820739746094, "logps/rejected": -283.7728271484375, "loss": 0.64, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8366539478302, "rewards/margins": 1.5593117475509644, "rewards/rejected": 2.277341604232788, "step": 69610 }, { "epoch": 3.232276335948744, "grad_norm": 3.077106475830078, "learning_rate": 1.0614977482705788e-07, "logits/chosen": -19.670490264892578, "logits/rejected": -19.608760833740234, "logps/chosen": -448.6778259277344, "logps/rejected": -345.4847106933594, "loss": 0.5284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.696507692337036, "rewards/margins": 1.0108373165130615, "rewards/rejected": 2.6856706142425537, "step": 69620 }, { "epoch": 3.2327406100561773, "grad_norm": 9.596525192260742, "learning_rate": 1.0612191838061191e-07, "logits/chosen": -19.254369735717773, "logits/rejected": -18.378215789794922, "logps/chosen": -431.50164794921875, "logps/rejected": -368.19061279296875, "loss": 0.9268, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.374338150024414, "rewards/margins": 1.6815941333770752, "rewards/rejected": 3.6927437782287598, "step": 69630 }, { "epoch": 3.23320488416361, "grad_norm": 18.524234771728516, "learning_rate": 1.0609406193416592e-07, "logits/chosen": -19.552053451538086, "logits/rejected": -17.90248680114746, "logps/chosen": -286.6897277832031, "logps/rejected": -149.41781616210938, "loss": 0.5949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
2.4083876609802246, "rewards/margins": 1.6646859645843506, "rewards/rejected": 0.7437017560005188, "step": 69640 }, { "epoch": 3.2336691582710433, "grad_norm": 99.20742797851562, "learning_rate": 1.0606620548771995e-07, "logits/chosen": -18.86611557006836, "logits/rejected": -18.6084041595459, "logps/chosen": -430.7552795410156, "logps/rejected": -395.5133361816406, "loss": 0.7944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3602778911590576, "rewards/margins": 0.3233826756477356, "rewards/rejected": 3.036895275115967, "step": 69650 }, { "epoch": 3.234133432378476, "grad_norm": 33.21940612792969, "learning_rate": 1.0603834904127397e-07, "logits/chosen": -17.856801986694336, "logits/rejected": -17.29831314086914, "logps/chosen": -378.7519226074219, "logps/rejected": -267.5098571777344, "loss": 0.4116, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.564589262008667, "rewards/margins": 2.006634473800659, "rewards/rejected": 1.5579547882080078, "step": 69660 }, { "epoch": 3.2345977064859093, "grad_norm": 1.1782197952270508, "learning_rate": 1.0601049259482799e-07, "logits/chosen": -19.143054962158203, "logits/rejected": -17.450735092163086, "logps/chosen": -473.08953857421875, "logps/rejected": -320.55181884765625, "loss": 0.3077, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.224177360534668, "rewards/margins": 2.8200244903564453, "rewards/rejected": 2.4041528701782227, "step": 69670 }, { "epoch": 3.2350619805933425, "grad_norm": 170.00506591796875, "learning_rate": 1.05982636148382e-07, "logits/chosen": -19.09555435180664, "logits/rejected": -18.764936447143555, "logps/chosen": -305.8172607421875, "logps/rejected": -314.38433837890625, "loss": 0.621, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.824617862701416, "rewards/margins": 0.8016040921211243, "rewards/rejected": 2.0230135917663574, "step": 69680 }, { "epoch": 3.2355262547007753, "grad_norm": 31.688501358032227, "learning_rate": 
1.0595477970193602e-07, "logits/chosen": -18.875438690185547, "logits/rejected": -18.819957733154297, "logps/chosen": -477.21368408203125, "logps/rejected": -469.8904724121094, "loss": 0.7519, "rewards/accuracies": 0.5, "rewards/chosen": 3.4788315296173096, "rewards/margins": 0.513719916343689, "rewards/rejected": 2.9651119709014893, "step": 69690 }, { "epoch": 3.2359905288082085, "grad_norm": 14.268047332763672, "learning_rate": 1.0592692325549004e-07, "logits/chosen": -19.12966537475586, "logits/rejected": -18.287538528442383, "logps/chosen": -264.0706481933594, "logps/rejected": -216.5811004638672, "loss": 1.1566, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.103217363357544, "rewards/margins": 0.6784408688545227, "rewards/rejected": 1.424776554107666, "step": 69700 }, { "epoch": 3.2364548029156412, "grad_norm": 154.76136779785156, "learning_rate": 1.0589906680904406e-07, "logits/chosen": -19.00891876220703, "logits/rejected": -18.09465980529785, "logps/chosen": -438.761962890625, "logps/rejected": -364.94500732421875, "loss": 0.5153, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7110276222229004, "rewards/margins": 1.2420703172683716, "rewards/rejected": 2.46895694732666, "step": 69710 }, { "epoch": 3.2369190770230745, "grad_norm": 223.5733184814453, "learning_rate": 1.0587121036259808e-07, "logits/chosen": -18.524789810180664, "logits/rejected": -18.098377227783203, "logps/chosen": -344.76202392578125, "logps/rejected": -299.43603515625, "loss": 0.7782, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5795085430145264, "rewards/margins": 0.3413090705871582, "rewards/rejected": 2.2381997108459473, "step": 69720 }, { "epoch": 3.2373833511305072, "grad_norm": 96.76322937011719, "learning_rate": 1.0584335391615208e-07, "logits/chosen": -19.265892028808594, "logits/rejected": -18.68155288696289, "logps/chosen": -368.55157470703125, "logps/rejected": -384.89007568359375, "loss": 0.892, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.7958765029907227, "rewards/margins": 0.48795515298843384, "rewards/rejected": 2.3079216480255127, "step": 69730 }, { "epoch": 3.2378476252379405, "grad_norm": 15.381884574890137, "learning_rate": 1.0581549746970611e-07, "logits/chosen": -18.846908569335938, "logits/rejected": -18.399192810058594, "logps/chosen": -380.6123962402344, "logps/rejected": -291.5940246582031, "loss": 0.8683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1740002632141113, "rewards/margins": 0.6972125768661499, "rewards/rejected": 2.476787567138672, "step": 69740 }, { "epoch": 3.2383118993453737, "grad_norm": 0.9511375427246094, "learning_rate": 1.0578764102326012e-07, "logits/chosen": -20.031885147094727, "logits/rejected": -18.48923683166504, "logps/chosen": -508.7610778808594, "logps/rejected": -376.06671142578125, "loss": 0.7464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.645674228668213, "rewards/margins": 1.1244304180145264, "rewards/rejected": 3.5212433338165283, "step": 69750 }, { "epoch": 3.2387761734528064, "grad_norm": 96.94093322753906, "learning_rate": 1.0575978457681415e-07, "logits/chosen": -19.24130630493164, "logits/rejected": -18.33998680114746, "logps/chosen": -365.9488220214844, "logps/rejected": -283.916015625, "loss": 0.5874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.010540723800659, "rewards/margins": 1.8459850549697876, "rewards/rejected": 1.1645554304122925, "step": 69760 }, { "epoch": 3.2392404475602397, "grad_norm": 217.62374877929688, "learning_rate": 1.0573192813036816e-07, "logits/chosen": -19.272682189941406, "logits/rejected": -18.65019416809082, "logps/chosen": -373.98541259765625, "logps/rejected": -283.32012939453125, "loss": 1.2374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8744704723358154, "rewards/margins": 0.7308250665664673, "rewards/rejected": 3.1436450481414795, "step": 69770 }, { "epoch": 3.2397047216676724, "grad_norm": 35.6781120300293, 
"learning_rate": 1.0570407168392218e-07, "logits/chosen": -20.1910400390625, "logits/rejected": -19.30253028869629, "logps/chosen": -381.81231689453125, "logps/rejected": -250.1500701904297, "loss": 0.3252, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8837814331054688, "rewards/margins": 1.7590538263320923, "rewards/rejected": 2.124727725982666, "step": 69780 }, { "epoch": 3.2401689957751056, "grad_norm": 1.166260004043579, "learning_rate": 1.056762152374762e-07, "logits/chosen": -18.892715454101562, "logits/rejected": -18.14133071899414, "logps/chosen": -400.7304992675781, "logps/rejected": -285.62762451171875, "loss": 0.8535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.358752489089966, "rewards/margins": 1.1651229858398438, "rewards/rejected": 2.193629741668701, "step": 69790 }, { "epoch": 3.240633269882539, "grad_norm": 26.218521118164062, "learning_rate": 1.0564835879103022e-07, "logits/chosen": -19.04890251159668, "logits/rejected": -17.897457122802734, "logps/chosen": -368.12896728515625, "logps/rejected": -273.927490234375, "loss": 0.4005, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8657774925231934, "rewards/margins": 1.7407585382461548, "rewards/rejected": 2.125019073486328, "step": 69800 }, { "epoch": 3.2410975439899716, "grad_norm": 223.1648712158203, "learning_rate": 1.0562050234458424e-07, "logits/chosen": -19.210071563720703, "logits/rejected": -17.93918800354004, "logps/chosen": -478.0230407714844, "logps/rejected": -337.09625244140625, "loss": 0.4943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.73612642288208, "rewards/margins": 2.65822172164917, "rewards/rejected": 2.0779049396514893, "step": 69810 }, { "epoch": 3.241561818097405, "grad_norm": 278.1164245605469, "learning_rate": 1.0559264589813826e-07, "logits/chosen": -19.561307907104492, "logits/rejected": -19.515369415283203, "logps/chosen": -506.52935791015625, "logps/rejected": -433.902099609375, "loss": 0.5086, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.049900531768799, "rewards/margins": 1.4081566333770752, "rewards/rejected": 2.6417438983917236, "step": 69820 }, { "epoch": 3.2420260922048376, "grad_norm": 5.828030109405518, "learning_rate": 1.0556478945169227e-07, "logits/chosen": -17.882966995239258, "logits/rejected": -18.036592483520508, "logps/chosen": -340.58367919921875, "logps/rejected": -330.4532775878906, "loss": 0.8968, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0264229774475098, "rewards/margins": 0.6806092858314514, "rewards/rejected": 2.345813274383545, "step": 69830 }, { "epoch": 3.242490366312271, "grad_norm": 61.417633056640625, "learning_rate": 1.0553693300524629e-07, "logits/chosen": -19.168764114379883, "logits/rejected": -18.03154945373535, "logps/chosen": -406.710205078125, "logps/rejected": -265.3391418457031, "loss": 0.4675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3593735694885254, "rewards/margins": 1.4884397983551025, "rewards/rejected": 1.8709337711334229, "step": 69840 }, { "epoch": 3.2429546404197036, "grad_norm": 3.9596643447875977, "learning_rate": 1.0550907655880031e-07, "logits/chosen": -18.3209171295166, "logits/rejected": -18.015583038330078, "logps/chosen": -355.6203308105469, "logps/rejected": -268.3731384277344, "loss": 0.7907, "rewards/accuracies": 0.5, "rewards/chosen": 3.125737190246582, "rewards/margins": 0.6317949295043945, "rewards/rejected": 2.4939422607421875, "step": 69850 }, { "epoch": 3.243418914527137, "grad_norm": 180.74813842773438, "learning_rate": 1.0548122011235434e-07, "logits/chosen": -18.56997299194336, "logits/rejected": -18.313016891479492, "logps/chosen": -380.38568115234375, "logps/rejected": -382.4376525878906, "loss": 0.8914, "rewards/accuracies": 0.5, "rewards/chosen": 3.658283233642578, "rewards/margins": 0.2111508846282959, "rewards/rejected": 3.4471325874328613, "step": 69860 }, { "epoch": 3.24388318863457, "grad_norm": 149.09881591796875, 
"learning_rate": 1.0545336366590835e-07, "logits/chosen": -19.248104095458984, "logits/rejected": -19.18307876586914, "logps/chosen": -390.3692932128906, "logps/rejected": -364.71630859375, "loss": 1.1879, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.400128364562988, "rewards/margins": 0.7062414288520813, "rewards/rejected": 3.6938862800598145, "step": 69870 }, { "epoch": 3.244347462742003, "grad_norm": 72.7493667602539, "learning_rate": 1.0542550721946238e-07, "logits/chosen": -19.970027923583984, "logits/rejected": -18.55754852294922, "logps/chosen": -467.20465087890625, "logps/rejected": -352.61376953125, "loss": 0.5795, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6108036041259766, "rewards/margins": 0.9474911689758301, "rewards/rejected": 2.6633124351501465, "step": 69880 }, { "epoch": 3.244811736849436, "grad_norm": 61.641544342041016, "learning_rate": 1.0539765077301638e-07, "logits/chosen": -18.691879272460938, "logits/rejected": -17.89096450805664, "logps/chosen": -323.37396240234375, "logps/rejected": -250.309814453125, "loss": 0.5426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.516584873199463, "rewards/margins": 1.4722827672958374, "rewards/rejected": 2.044301986694336, "step": 69890 }, { "epoch": 3.245276010956869, "grad_norm": 33.73451232910156, "learning_rate": 1.053697943265704e-07, "logits/chosen": -18.205551147460938, "logits/rejected": -18.36031723022461, "logps/chosen": -418.1773986816406, "logps/rejected": -327.05865478515625, "loss": 0.3994, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.585569381713867, "rewards/margins": 1.2691653966903687, "rewards/rejected": 2.316403865814209, "step": 69900 }, { "epoch": 3.245740285064302, "grad_norm": 10.111549377441406, "learning_rate": 1.0534193788012442e-07, "logits/chosen": -18.352901458740234, "logits/rejected": -17.222631454467773, "logps/chosen": -283.18133544921875, "logps/rejected": -175.9834747314453, "loss": 0.4441, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1508877277374268, "rewards/margins": 2.096349000930786, "rewards/rejected": 0.054538846015930176, "step": 69910 }, { "epoch": 3.2462045591717352, "grad_norm": 5.175336837768555, "learning_rate": 1.0531408143367844e-07, "logits/chosen": -18.321678161621094, "logits/rejected": -17.505374908447266, "logps/chosen": -358.14007568359375, "logps/rejected": -258.15460205078125, "loss": 0.4536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.996915817260742, "rewards/margins": 1.6404619216918945, "rewards/rejected": 1.3564538955688477, "step": 69920 }, { "epoch": 3.246668833279168, "grad_norm": 225.1630401611328, "learning_rate": 1.0528622498723246e-07, "logits/chosen": -18.8566837310791, "logits/rejected": -18.025484085083008, "logps/chosen": -377.908203125, "logps/rejected": -272.11822509765625, "loss": 0.5976, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3818275928497314, "rewards/margins": 1.5718414783477783, "rewards/rejected": 1.809985876083374, "step": 69930 }, { "epoch": 3.247133107386601, "grad_norm": 23.210494995117188, "learning_rate": 1.0525836854078646e-07, "logits/chosen": -19.832664489746094, "logits/rejected": -18.69668960571289, "logps/chosen": -494.7154846191406, "logps/rejected": -382.60101318359375, "loss": 0.5656, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.171296119689941, "rewards/margins": 1.3350529670715332, "rewards/rejected": 2.83624267578125, "step": 69940 }, { "epoch": 3.247597381494034, "grad_norm": 32.17026901245117, "learning_rate": 1.0523051209434049e-07, "logits/chosen": -18.941038131713867, "logits/rejected": -18.33110237121582, "logps/chosen": -383.67987060546875, "logps/rejected": -312.87371826171875, "loss": 0.958, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3793978691101074, "rewards/margins": 1.301349401473999, "rewards/rejected": 2.0780487060546875, "step": 69950 }, { "epoch": 3.248061655601467, "grad_norm": 
55.13179397583008, "learning_rate": 1.0520265564789451e-07, "logits/chosen": -18.957500457763672, "logits/rejected": -17.689516067504883, "logps/chosen": -390.12786865234375, "logps/rejected": -221.46542358398438, "loss": 0.6193, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.590259075164795, "rewards/margins": 1.452987790107727, "rewards/rejected": 1.1372716426849365, "step": 69960 }, { "epoch": 3.2485259297089, "grad_norm": 53.6980094909668, "learning_rate": 1.0517479920144853e-07, "logits/chosen": -18.84722137451172, "logits/rejected": -18.076648712158203, "logps/chosen": -433.4554138183594, "logps/rejected": -291.6053161621094, "loss": 0.6242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.062623023986816, "rewards/margins": 2.2838215827941895, "rewards/rejected": 1.7788015604019165, "step": 69970 }, { "epoch": 3.248990203816333, "grad_norm": 2.2111988067626953, "learning_rate": 1.0514694275500255e-07, "logits/chosen": -18.310623168945312, "logits/rejected": -17.4406795501709, "logps/chosen": -409.1028747558594, "logps/rejected": -288.3639831542969, "loss": 0.4659, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0587852001190186, "rewards/margins": 1.3054529428482056, "rewards/rejected": 1.7533323764801025, "step": 69980 }, { "epoch": 3.2494544779237664, "grad_norm": 2.436224937438965, "learning_rate": 1.0511908630855656e-07, "logits/chosen": -19.63759994506836, "logits/rejected": -18.742630004882812, "logps/chosen": -336.3674621582031, "logps/rejected": -266.3961486816406, "loss": 0.2146, "rewards/accuracies": 1.0, "rewards/chosen": 3.7422726154327393, "rewards/margins": 1.9445021152496338, "rewards/rejected": 1.7977702617645264, "step": 69990 }, { "epoch": 3.249918752031199, "grad_norm": 43.628997802734375, "learning_rate": 1.0509122986211058e-07, "logits/chosen": -18.424837112426758, "logits/rejected": -17.859619140625, "logps/chosen": -261.2109375, "logps/rejected": -234.1092071533203, "loss": 0.3925, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0422637462615967, "rewards/margins": 1.1146401166915894, "rewards/rejected": 0.9276237487792969, "step": 70000 }, { "epoch": 3.2503830261386324, "grad_norm": 216.3043212890625, "learning_rate": 1.050633734156646e-07, "logits/chosen": -17.823326110839844, "logits/rejected": -17.901893615722656, "logps/chosen": -300.1159973144531, "logps/rejected": -271.01513671875, "loss": 1.4997, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.3950939178466797, "rewards/margins": -0.7521415948867798, "rewards/rejected": 2.14723539352417, "step": 70010 }, { "epoch": 3.250847300246065, "grad_norm": 135.89805603027344, "learning_rate": 1.0503551696921862e-07, "logits/chosen": -18.997066497802734, "logits/rejected": -18.364328384399414, "logps/chosen": -454.94903564453125, "logps/rejected": -380.8934020996094, "loss": 0.717, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.876154899597168, "rewards/margins": 1.4025453329086304, "rewards/rejected": 3.473609209060669, "step": 70020 }, { "epoch": 3.2513115743534984, "grad_norm": 5.082739353179932, "learning_rate": 1.0500766052277265e-07, "logits/chosen": -18.4752197265625, "logits/rejected": -18.449710845947266, "logps/chosen": -321.45733642578125, "logps/rejected": -320.283447265625, "loss": 0.9669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.940783977508545, "rewards/margins": 0.3813283443450928, "rewards/rejected": 2.559455394744873, "step": 70030 }, { "epoch": 3.251775848460931, "grad_norm": 46.27705764770508, "learning_rate": 1.0497980407632665e-07, "logits/chosen": -18.776796340942383, "logits/rejected": -18.247587203979492, "logps/chosen": -378.9250183105469, "logps/rejected": -273.8658752441406, "loss": 0.8598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.384206771850586, "rewards/margins": 0.9679377675056458, "rewards/rejected": 3.4162685871124268, "step": 70040 }, { "epoch": 3.2522401225683644, "grad_norm": 
32.48563003540039, "learning_rate": 1.0495194762988068e-07, "logits/chosen": -18.933639526367188, "logits/rejected": -19.493885040283203, "logps/chosen": -346.95220947265625, "logps/rejected": -331.08245849609375, "loss": 1.1698, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.149437665939331, "rewards/margins": -0.41410574316978455, "rewards/rejected": 2.5635430812835693, "step": 70050 }, { "epoch": 3.2527043966757976, "grad_norm": 223.97059631347656, "learning_rate": 1.0492409118343469e-07, "logits/chosen": -19.958572387695312, "logits/rejected": -19.037755966186523, "logps/chosen": -465.00262451171875, "logps/rejected": -331.62921142578125, "loss": 0.3679, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.937544822692871, "rewards/margins": 1.6864410638809204, "rewards/rejected": 3.2511038780212402, "step": 70060 }, { "epoch": 3.2531686707832304, "grad_norm": 6.20524787902832, "learning_rate": 1.0489623473698872e-07, "logits/chosen": -18.697917938232422, "logits/rejected": -18.327199935913086, "logps/chosen": -397.64508056640625, "logps/rejected": -430.9856872558594, "loss": 0.7468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0371387004852295, "rewards/margins": 0.3415710926055908, "rewards/rejected": 2.6955676078796387, "step": 70070 }, { "epoch": 3.2536329448906636, "grad_norm": 0.503424882888794, "learning_rate": 1.0486837829054273e-07, "logits/chosen": -18.886850357055664, "logits/rejected": -18.216156005859375, "logps/chosen": -369.55511474609375, "logps/rejected": -334.73175048828125, "loss": 0.6301, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2798004150390625, "rewards/margins": 0.9391925930976868, "rewards/rejected": 2.3406074047088623, "step": 70080 }, { "epoch": 3.2540972189980963, "grad_norm": 116.33808898925781, "learning_rate": 1.0484052184409676e-07, "logits/chosen": -18.23100471496582, "logits/rejected": -17.759952545166016, "logps/chosen": -300.7984619140625, "logps/rejected": 
-285.35467529296875, "loss": 0.8534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7904984951019287, "rewards/margins": 1.015188455581665, "rewards/rejected": 1.7753101587295532, "step": 70090 }, { "epoch": 3.2545614931055296, "grad_norm": 30.9730281829834, "learning_rate": 1.0481266539765076e-07, "logits/chosen": -18.777851104736328, "logits/rejected": -18.6947078704834, "logps/chosen": -320.3904113769531, "logps/rejected": -335.03338623046875, "loss": 1.118, "rewards/accuracies": 0.5, "rewards/chosen": 3.1554551124572754, "rewards/margins": 0.2229938507080078, "rewards/rejected": 2.9324612617492676, "step": 70100 }, { "epoch": 3.2550257672129623, "grad_norm": 146.4351806640625, "learning_rate": 1.0478480895120478e-07, "logits/chosen": -18.657970428466797, "logits/rejected": -18.316410064697266, "logps/chosen": -493.107177734375, "logps/rejected": -352.00457763671875, "loss": 0.7688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4377243518829346, "rewards/margins": 1.9979747533798218, "rewards/rejected": 1.439749836921692, "step": 70110 }, { "epoch": 3.2554900413203955, "grad_norm": 8.418732643127441, "learning_rate": 1.047569525047588e-07, "logits/chosen": -18.246021270751953, "logits/rejected": -16.41547966003418, "logps/chosen": -409.2180480957031, "logps/rejected": -217.375, "loss": 0.2215, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.72493052482605, "rewards/margins": 2.784546375274658, "rewards/rejected": 0.9403842091560364, "step": 70120 }, { "epoch": 3.2559543154278288, "grad_norm": 238.05209350585938, "learning_rate": 1.0472909605831282e-07, "logits/chosen": -19.670501708984375, "logits/rejected": -19.270462036132812, "logps/chosen": -445.418212890625, "logps/rejected": -450.80267333984375, "loss": 0.7346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.189211845397949, "rewards/margins": 1.4967668056488037, "rewards/rejected": 3.6924450397491455, "step": 70130 }, { "epoch": 3.2564185895352615, 
"grad_norm": 103.74564361572266, "learning_rate": 1.0470123961186685e-07, "logits/chosen": -18.701854705810547, "logits/rejected": -18.537357330322266, "logps/chosen": -371.29681396484375, "logps/rejected": -315.58660888671875, "loss": 0.7685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.009030818939209, "rewards/margins": 1.135948896408081, "rewards/rejected": 2.873082399368286, "step": 70140 }, { "epoch": 3.2568828636426947, "grad_norm": 6.050167560577393, "learning_rate": 1.0467338316542085e-07, "logits/chosen": -18.750690460205078, "logits/rejected": -18.17410659790039, "logps/chosen": -342.83514404296875, "logps/rejected": -292.0443115234375, "loss": 1.0846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.282158613204956, "rewards/margins": 1.4613059759140015, "rewards/rejected": 1.8208526372909546, "step": 70150 }, { "epoch": 3.2573471377501275, "grad_norm": 44.471805572509766, "learning_rate": 1.0464552671897488e-07, "logits/chosen": -19.299598693847656, "logits/rejected": -17.416290283203125, "logps/chosen": -489.9564514160156, "logps/rejected": -322.0596923828125, "loss": 0.2245, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.021564483642578, "rewards/margins": 3.2935314178466797, "rewards/rejected": 1.7280333042144775, "step": 70160 }, { "epoch": 3.2578114118575607, "grad_norm": 88.46265411376953, "learning_rate": 1.0461767027252889e-07, "logits/chosen": -18.914323806762695, "logits/rejected": -18.21385955810547, "logps/chosen": -390.2199401855469, "logps/rejected": -334.4919128417969, "loss": 1.2349, "rewards/accuracies": 0.5, "rewards/chosen": 4.223462104797363, "rewards/margins": 0.5914398431777954, "rewards/rejected": 3.6320221424102783, "step": 70170 }, { "epoch": 3.2582756859649935, "grad_norm": 0.552673876285553, "learning_rate": 1.0458981382608292e-07, "logits/chosen": -19.896957397460938, "logits/rejected": -18.523441314697266, "logps/chosen": -384.0121154785156, "logps/rejected": -308.2586364746094, 
"loss": 0.6521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.556544065475464, "rewards/margins": 1.330612063407898, "rewards/rejected": 2.2259321212768555, "step": 70180 }, { "epoch": 3.2587399600724267, "grad_norm": 0.8204490542411804, "learning_rate": 1.0456195737963693e-07, "logits/chosen": -19.851415634155273, "logits/rejected": -17.81022834777832, "logps/chosen": -509.31939697265625, "logps/rejected": -334.18206787109375, "loss": 0.5473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.549076080322266, "rewards/margins": 1.9697411060333252, "rewards/rejected": 2.5793347358703613, "step": 70190 }, { "epoch": 3.25920423417986, "grad_norm": 58.1274528503418, "learning_rate": 1.0453410093319095e-07, "logits/chosen": -19.52965545654297, "logits/rejected": -19.105884552001953, "logps/chosen": -481.4747009277344, "logps/rejected": -414.5963439941406, "loss": 0.7008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.697111129760742, "rewards/margins": 0.9784933924674988, "rewards/rejected": 2.7186179161071777, "step": 70200 }, { "epoch": 3.2596685082872927, "grad_norm": 50.495147705078125, "learning_rate": 1.0450624448674496e-07, "logits/chosen": -20.017887115478516, "logits/rejected": -19.844467163085938, "logps/chosen": -387.43646240234375, "logps/rejected": -352.5088195800781, "loss": 0.8549, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8990538120269775, "rewards/margins": 0.42049336433410645, "rewards/rejected": 2.478560209274292, "step": 70210 }, { "epoch": 3.260132782394726, "grad_norm": 2.00830340385437, "learning_rate": 1.0447838804029899e-07, "logits/chosen": -17.958261489868164, "logits/rejected": -17.648279190063477, "logps/chosen": -398.30145263671875, "logps/rejected": -355.14068603515625, "loss": 0.8608, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.29512357711792, "rewards/margins": 1.2798449993133545, "rewards/rejected": 2.0152783393859863, "step": 70220 }, { "epoch": 
3.2605970565021587, "grad_norm": 90.2759017944336, "learning_rate": 1.04450531593853e-07, "logits/chosen": -19.03575897216797, "logits/rejected": -18.280986785888672, "logps/chosen": -430.17083740234375, "logps/rejected": -379.17413330078125, "loss": 0.5561, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7182605266571045, "rewards/margins": 1.1148632764816284, "rewards/rejected": 2.603397846221924, "step": 70230 }, { "epoch": 3.261061330609592, "grad_norm": 22.107013702392578, "learning_rate": 1.0442267514740703e-07, "logits/chosen": -19.682092666625977, "logits/rejected": -18.764904022216797, "logps/chosen": -356.65496826171875, "logps/rejected": -280.4919128417969, "loss": 0.3683, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.658604145050049, "rewards/margins": 1.5818361043930054, "rewards/rejected": 2.0767674446105957, "step": 70240 }, { "epoch": 3.2615256047170247, "grad_norm": 61.923866271972656, "learning_rate": 1.0439481870096103e-07, "logits/chosen": -20.1149845123291, "logits/rejected": -19.74546241760254, "logps/chosen": -454.57293701171875, "logps/rejected": -374.753662109375, "loss": 0.4605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.142956733703613, "rewards/margins": 1.4839032888412476, "rewards/rejected": 3.659053325653076, "step": 70250 }, { "epoch": 3.261989878824458, "grad_norm": 175.67994689941406, "learning_rate": 1.0436696225451506e-07, "logits/chosen": -19.54697036743164, "logits/rejected": -18.09959602355957, "logps/chosen": -513.9287109375, "logps/rejected": -321.4893798828125, "loss": 0.4537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.120577812194824, "rewards/margins": 2.3452603816986084, "rewards/rejected": 2.775317430496216, "step": 70260 }, { "epoch": 3.262454152931891, "grad_norm": 46.12060546875, "learning_rate": 1.0433910580806908e-07, "logits/chosen": -20.262310028076172, "logits/rejected": -19.394386291503906, "logps/chosen": -429.6376037597656, "logps/rejected": 
-421.21441650390625, "loss": 0.5374, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8397598266601562, "rewards/margins": 1.0629456043243408, "rewards/rejected": 2.7768144607543945, "step": 70270 }, { "epoch": 3.262918427039324, "grad_norm": 79.21919250488281, "learning_rate": 1.043112493616231e-07, "logits/chosen": -18.742816925048828, "logits/rejected": -17.99742889404297, "logps/chosen": -372.6468811035156, "logps/rejected": -304.77716064453125, "loss": 0.5436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2884578704833984, "rewards/margins": 0.7974140048027039, "rewards/rejected": 2.491044044494629, "step": 70280 }, { "epoch": 3.263382701146757, "grad_norm": 31.72481346130371, "learning_rate": 1.0428339291517712e-07, "logits/chosen": -19.06534194946289, "logits/rejected": -18.960033416748047, "logps/chosen": -390.54791259765625, "logps/rejected": -399.6741638183594, "loss": 0.9381, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6393649578094482, "rewards/margins": 0.5461322069168091, "rewards/rejected": 3.093233346939087, "step": 70290 }, { "epoch": 3.2638469752541903, "grad_norm": 20.832433700561523, "learning_rate": 1.0425553646873115e-07, "logits/chosen": -18.157018661499023, "logits/rejected": -17.841899871826172, "logps/chosen": -459.962890625, "logps/rejected": -446.28814697265625, "loss": 1.6197, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.048568248748779, "rewards/margins": -0.01689019240438938, "rewards/rejected": 4.065458297729492, "step": 70300 }, { "epoch": 3.264311249361623, "grad_norm": 6.667510032653809, "learning_rate": 1.0422768002228515e-07, "logits/chosen": -19.310335159301758, "logits/rejected": -17.82994270324707, "logps/chosen": -467.95306396484375, "logps/rejected": -326.1168518066406, "loss": 0.7293, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.416079998016357, "rewards/margins": 1.9821679592132568, "rewards/rejected": 2.4339122772216797, "step": 70310 }, { 
"epoch": 3.2647755234690563, "grad_norm": 8.030513763427734, "learning_rate": 1.0419982357583916e-07, "logits/chosen": -19.224971771240234, "logits/rejected": -17.854917526245117, "logps/chosen": -478.20611572265625, "logps/rejected": -335.7425842285156, "loss": 0.3599, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.606449604034424, "rewards/margins": 2.4330830574035645, "rewards/rejected": 2.1733665466308594, "step": 70320 }, { "epoch": 3.265239797576489, "grad_norm": 108.21146392822266, "learning_rate": 1.0417196712939319e-07, "logits/chosen": -18.720447540283203, "logits/rejected": -19.651369094848633, "logps/chosen": -397.0846862792969, "logps/rejected": -441.828369140625, "loss": 1.6496, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7235920429229736, "rewards/margins": -0.9191194772720337, "rewards/rejected": 4.642711639404297, "step": 70330 }, { "epoch": 3.2657040716839223, "grad_norm": 54.10569763183594, "learning_rate": 1.041441106829472e-07, "logits/chosen": -19.493106842041016, "logits/rejected": -18.890230178833008, "logps/chosen": -550.5899658203125, "logps/rejected": -465.8975524902344, "loss": 0.9848, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.789759874343872, "rewards/margins": -0.04218139499425888, "rewards/rejected": 3.8319411277770996, "step": 70340 }, { "epoch": 3.266168345791355, "grad_norm": 137.4677734375, "learning_rate": 1.0411625423650123e-07, "logits/chosen": -18.637432098388672, "logits/rejected": -17.21324348449707, "logps/chosen": -298.17364501953125, "logps/rejected": -222.67074584960938, "loss": 0.8622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.735463857650757, "rewards/margins": 1.5398035049438477, "rewards/rejected": 1.1956603527069092, "step": 70350 }, { "epoch": 3.2666326198987883, "grad_norm": 79.8627700805664, "learning_rate": 1.0408839779005523e-07, "logits/chosen": -18.82499885559082, "logits/rejected": -18.412029266357422, "logps/chosen": -439.39654541015625, 
"logps/rejected": -398.90020751953125, "loss": 0.6358, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6795997619628906, "rewards/margins": 0.8112798929214478, "rewards/rejected": 2.8683199882507324, "step": 70360 }, { "epoch": 3.2670968940062215, "grad_norm": 8.572884559631348, "learning_rate": 1.0406054134360926e-07, "logits/chosen": -19.752338409423828, "logits/rejected": -18.686525344848633, "logps/chosen": -374.41796875, "logps/rejected": -282.2861633300781, "loss": 0.8064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.674217939376831, "rewards/margins": 1.554559588432312, "rewards/rejected": 2.1196582317352295, "step": 70370 }, { "epoch": 3.2675611681136543, "grad_norm": 12.562915802001953, "learning_rate": 1.0403268489716328e-07, "logits/chosen": -18.662311553955078, "logits/rejected": -17.920181274414062, "logps/chosen": -484.043212890625, "logps/rejected": -368.41082763671875, "loss": 0.3916, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.773468017578125, "rewards/margins": 1.9313020706176758, "rewards/rejected": 1.8421663045883179, "step": 70380 }, { "epoch": 3.2680254422210875, "grad_norm": 103.4307861328125, "learning_rate": 1.040048284507173e-07, "logits/chosen": -17.791976928710938, "logits/rejected": -17.48416519165039, "logps/chosen": -411.91900634765625, "logps/rejected": -394.68621826171875, "loss": 0.7534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.005603075027466, "rewards/margins": 0.6002796292304993, "rewards/rejected": 2.4053235054016113, "step": 70390 }, { "epoch": 3.2684897163285203, "grad_norm": 6.464208602905273, "learning_rate": 1.0397697200427132e-07, "logits/chosen": -19.512208938598633, "logits/rejected": -18.84170913696289, "logps/chosen": -442.27081298828125, "logps/rejected": -321.79730224609375, "loss": 0.4502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6986358165740967, "rewards/margins": 1.2675567865371704, "rewards/rejected": 2.431079149246216, 
"step": 70400 }, { "epoch": 3.2689539904359535, "grad_norm": 3.048581838607788, "learning_rate": 1.0394911555782533e-07, "logits/chosen": -19.88052749633789, "logits/rejected": -18.453384399414062, "logps/chosen": -376.02154541015625, "logps/rejected": -205.96609497070312, "loss": 0.3986, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6960091590881348, "rewards/margins": 2.4199564456939697, "rewards/rejected": 1.276052713394165, "step": 70410 }, { "epoch": 3.2694182645433862, "grad_norm": 7.234139919281006, "learning_rate": 1.0392125911137935e-07, "logits/chosen": -20.287899017333984, "logits/rejected": -19.57871437072754, "logps/chosen": -428.4408264160156, "logps/rejected": -377.70574951171875, "loss": 1.0215, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6201579570770264, "rewards/margins": 1.0832126140594482, "rewards/rejected": 2.536945343017578, "step": 70420 }, { "epoch": 3.2698825386508195, "grad_norm": 71.56073760986328, "learning_rate": 1.0389340266493338e-07, "logits/chosen": -19.19186782836914, "logits/rejected": -18.860858917236328, "logps/chosen": -425.6900329589844, "logps/rejected": -493.66497802734375, "loss": 1.2586, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4482998847961426, "rewards/margins": 0.5996931195259094, "rewards/rejected": 2.848606824874878, "step": 70430 }, { "epoch": 3.2703468127582527, "grad_norm": 134.0673828125, "learning_rate": 1.0386554621848739e-07, "logits/chosen": -19.66599464416504, "logits/rejected": -18.760238647460938, "logps/chosen": -464.84210205078125, "logps/rejected": -411.78851318359375, "loss": 0.3878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.333893775939941, "rewards/margins": 1.3812296390533447, "rewards/rejected": 2.952664375305176, "step": 70440 }, { "epoch": 3.2708110868656854, "grad_norm": 62.39621353149414, "learning_rate": 1.0383768977204142e-07, "logits/chosen": -18.550979614257812, "logits/rejected": -17.53814697265625, "logps/chosen": 
-473.4974060058594, "logps/rejected": -304.20086669921875, "loss": 0.4275, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.002851486206055, "rewards/margins": 2.2481939792633057, "rewards/rejected": 1.754657506942749, "step": 70450 }, { "epoch": 3.2712753609731187, "grad_norm": 0.2045062631368637, "learning_rate": 1.0380983332559542e-07, "logits/chosen": -18.546085357666016, "logits/rejected": -18.328166961669922, "logps/chosen": -394.6247253417969, "logps/rejected": -361.5572814941406, "loss": 1.4386, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.607455015182495, "rewards/margins": 0.9444962739944458, "rewards/rejected": 2.662958860397339, "step": 70460 }, { "epoch": 3.2717396350805514, "grad_norm": 225.45343017578125, "learning_rate": 1.0378197687914945e-07, "logits/chosen": -18.345714569091797, "logits/rejected": -17.828197479248047, "logps/chosen": -384.81927490234375, "logps/rejected": -403.6499328613281, "loss": 0.9835, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0810658931732178, "rewards/margins": 0.5227788686752319, "rewards/rejected": 2.5582871437072754, "step": 70470 }, { "epoch": 3.2722039091879846, "grad_norm": 164.08375549316406, "learning_rate": 1.0375412043270346e-07, "logits/chosen": -18.410303115844727, "logits/rejected": -17.877166748046875, "logps/chosen": -395.3091735839844, "logps/rejected": -401.4183654785156, "loss": 0.6447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5592544078826904, "rewards/margins": 1.148409128189087, "rewards/rejected": 2.4108452796936035, "step": 70480 }, { "epoch": 3.2726681832954174, "grad_norm": 15.535387992858887, "learning_rate": 1.0372626398625749e-07, "logits/chosen": -18.671497344970703, "logits/rejected": -17.824966430664062, "logps/chosen": -491.42266845703125, "logps/rejected": -360.5012512207031, "loss": 0.8638, "rewards/accuracies": 0.5, "rewards/chosen": 3.865868091583252, "rewards/margins": 0.9156137704849243, "rewards/rejected": 
2.950254440307617, "step": 70490 }, { "epoch": 3.2731324574028506, "grad_norm": 21.22980499267578, "learning_rate": 1.036984075398115e-07, "logits/chosen": -19.464519500732422, "logits/rejected": -19.076068878173828, "logps/chosen": -345.2139892578125, "logps/rejected": -287.9571533203125, "loss": 0.7772, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.254063844680786, "rewards/margins": 0.861504077911377, "rewards/rejected": 2.39255952835083, "step": 70500 }, { "epoch": 3.273596731510284, "grad_norm": 5.275608539581299, "learning_rate": 1.036705510933655e-07, "logits/chosen": -20.152729034423828, "logits/rejected": -18.272947311401367, "logps/chosen": -374.7831115722656, "logps/rejected": -320.91131591796875, "loss": 0.2748, "rewards/accuracies": 1.0, "rewards/chosen": 4.237022876739502, "rewards/margins": 1.7749011516571045, "rewards/rejected": 2.4621217250823975, "step": 70510 }, { "epoch": 3.2740610056177166, "grad_norm": 70.40060424804688, "learning_rate": 1.0364269464691953e-07, "logits/chosen": -19.01868438720703, "logits/rejected": -17.761978149414062, "logps/chosen": -387.0950622558594, "logps/rejected": -203.8762664794922, "loss": 0.3693, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.007908344268799, "rewards/margins": 1.6946977376937866, "rewards/rejected": 1.3132102489471436, "step": 70520 }, { "epoch": 3.27452527972515, "grad_norm": 90.63370513916016, "learning_rate": 1.0361483820047355e-07, "logits/chosen": -19.799375534057617, "logits/rejected": -17.793567657470703, "logps/chosen": -406.4062194824219, "logps/rejected": -193.19479370117188, "loss": 0.2299, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9324917793273926, "rewards/margins": 2.489297389984131, "rewards/rejected": 0.44319432973861694, "step": 70530 }, { "epoch": 3.2749895538325826, "grad_norm": 164.20506286621094, "learning_rate": 1.0358698175402758e-07, "logits/chosen": -18.28115463256836, "logits/rejected": -17.439476013183594, "logps/chosen": 
-348.2051696777344, "logps/rejected": -269.3438415527344, "loss": 0.5935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.909770965576172, "rewards/margins": 1.5605826377868652, "rewards/rejected": 1.3491883277893066, "step": 70540 }, { "epoch": 3.275453827940016, "grad_norm": 21.144081115722656, "learning_rate": 1.0355912530758159e-07, "logits/chosen": -19.288677215576172, "logits/rejected": -18.412981033325195, "logps/chosen": -355.077880859375, "logps/rejected": -318.0503845214844, "loss": 0.5359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.592672824859619, "rewards/margins": 1.2604515552520752, "rewards/rejected": 2.3322207927703857, "step": 70550 }, { "epoch": 3.2759181020474486, "grad_norm": 248.06195068359375, "learning_rate": 1.0353126886113562e-07, "logits/chosen": -19.147777557373047, "logits/rejected": -17.977188110351562, "logps/chosen": -417.41064453125, "logps/rejected": -377.1527404785156, "loss": 0.931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.296159267425537, "rewards/margins": 2.0391883850097656, "rewards/rejected": 2.2569706439971924, "step": 70560 }, { "epoch": 3.276382376154882, "grad_norm": 201.82763671875, "learning_rate": 1.0350341241468962e-07, "logits/chosen": -18.56759262084961, "logits/rejected": -18.27103614807129, "logps/chosen": -436.9707946777344, "logps/rejected": -376.53350830078125, "loss": 0.9852, "rewards/accuracies": 0.5, "rewards/chosen": 3.543529510498047, "rewards/margins": 0.6414356231689453, "rewards/rejected": 2.9020938873291016, "step": 70570 }, { "epoch": 3.276846650262315, "grad_norm": 10.509529113769531, "learning_rate": 1.0347555596824365e-07, "logits/chosen": -18.949260711669922, "logits/rejected": -18.107946395874023, "logps/chosen": -567.4537353515625, "logps/rejected": -406.58697509765625, "loss": 0.4162, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.515984058380127, "rewards/margins": 1.866567850112915, "rewards/rejected": 2.649416446685791, 
"step": 70580 }, { "epoch": 3.277310924369748, "grad_norm": 177.13096618652344, "learning_rate": 1.0344769952179766e-07, "logits/chosen": -18.568906784057617, "logits/rejected": -17.863178253173828, "logps/chosen": -459.31280517578125, "logps/rejected": -370.1894836425781, "loss": 0.7323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.032200813293457, "rewards/margins": 2.256340742111206, "rewards/rejected": 2.77586030960083, "step": 70590 }, { "epoch": 3.277775198477181, "grad_norm": 5.196664810180664, "learning_rate": 1.0341984307535169e-07, "logits/chosen": -19.027050018310547, "logits/rejected": -18.410783767700195, "logps/chosen": -387.42333984375, "logps/rejected": -355.87591552734375, "loss": 0.7857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.659925937652588, "rewards/margins": 0.7906055450439453, "rewards/rejected": 2.8693201541900635, "step": 70600 }, { "epoch": 3.278239472584614, "grad_norm": 1.603796124458313, "learning_rate": 1.033919866289057e-07, "logits/chosen": -18.72867202758789, "logits/rejected": -18.72135353088379, "logps/chosen": -305.00830078125, "logps/rejected": -285.1385192871094, "loss": 0.6068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9809515476226807, "rewards/margins": 1.0127818584442139, "rewards/rejected": 1.9681694507598877, "step": 70610 }, { "epoch": 3.278703746692047, "grad_norm": 197.5193328857422, "learning_rate": 1.0336413018245972e-07, "logits/chosen": -18.931224822998047, "logits/rejected": -17.603239059448242, "logps/chosen": -388.41827392578125, "logps/rejected": -317.1604309082031, "loss": 0.5177, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4809703826904297, "rewards/margins": 1.6493648290634155, "rewards/rejected": 1.831605315208435, "step": 70620 }, { "epoch": 3.2791680207994798, "grad_norm": 49.17784118652344, "learning_rate": 1.0333627373601373e-07, "logits/chosen": -18.9157657623291, "logits/rejected": -18.764074325561523, "logps/chosen": 
-379.0233154296875, "logps/rejected": -430.81170654296875, "loss": 0.9994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.858189105987549, "rewards/margins": 0.353817880153656, "rewards/rejected": 2.504371404647827, "step": 70630 }, { "epoch": 3.279632294906913, "grad_norm": 21.457626342773438, "learning_rate": 1.0330841728956776e-07, "logits/chosen": -19.612722396850586, "logits/rejected": -19.16872787475586, "logps/chosen": -445.1246643066406, "logps/rejected": -320.4237365722656, "loss": 0.6074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.560405731201172, "rewards/margins": 1.5343329906463623, "rewards/rejected": 2.0260722637176514, "step": 70640 }, { "epoch": 3.280096569014346, "grad_norm": 43.548770904541016, "learning_rate": 1.0328056084312177e-07, "logits/chosen": -19.46519660949707, "logits/rejected": -18.29825782775879, "logps/chosen": -439.328857421875, "logps/rejected": -360.0563049316406, "loss": 0.5935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.879025459289551, "rewards/margins": 2.03080153465271, "rewards/rejected": 2.84822416305542, "step": 70650 }, { "epoch": 3.280560843121779, "grad_norm": 14.726387977600098, "learning_rate": 1.032527043966758e-07, "logits/chosen": -18.763904571533203, "logits/rejected": -18.667903900146484, "logps/chosen": -394.50567626953125, "logps/rejected": -358.43548583984375, "loss": 0.7653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7589545249938965, "rewards/margins": 1.4572043418884277, "rewards/rejected": 2.3017501831054688, "step": 70660 }, { "epoch": 3.281025117229212, "grad_norm": 100.5416030883789, "learning_rate": 1.032248479502298e-07, "logits/chosen": -18.98379898071289, "logits/rejected": -17.540302276611328, "logps/chosen": -312.80584716796875, "logps/rejected": -295.7792053222656, "loss": 0.5117, "rewards/accuracies": 0.5, "rewards/chosen": 2.7052953243255615, "rewards/margins": 1.1607816219329834, "rewards/rejected": 1.5445139408111572, 
"step": 70670 }, { "epoch": 3.281489391336645, "grad_norm": 19.45890235900879, "learning_rate": 1.0319699150378383e-07, "logits/chosen": -19.18960952758789, "logits/rejected": -18.828523635864258, "logps/chosen": -430.0548400878906, "logps/rejected": -386.34100341796875, "loss": 0.4653, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.492879390716553, "rewards/margins": 1.0684967041015625, "rewards/rejected": 3.4243826866149902, "step": 70680 }, { "epoch": 3.281953665444078, "grad_norm": 70.28260803222656, "learning_rate": 1.0316913505733785e-07, "logits/chosen": -19.45062828063965, "logits/rejected": -18.706806182861328, "logps/chosen": -329.92987060546875, "logps/rejected": -228.1240234375, "loss": 0.4658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9906015396118164, "rewards/margins": 1.6532576084136963, "rewards/rejected": 1.3373435735702515, "step": 70690 }, { "epoch": 3.2824179395515114, "grad_norm": 1.7105300426483154, "learning_rate": 1.0314127861089187e-07, "logits/chosen": -19.287277221679688, "logits/rejected": -18.456134796142578, "logps/chosen": -375.75408935546875, "logps/rejected": -378.10003662109375, "loss": 0.6119, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3694446086883545, "rewards/margins": 0.65337073802948, "rewards/rejected": 2.716073989868164, "step": 70700 }, { "epoch": 3.282882213658944, "grad_norm": 0.10486171394586563, "learning_rate": 1.031162078090905e-07, "logits/chosen": -18.671045303344727, "logits/rejected": -18.231128692626953, "logps/chosen": -409.0764465332031, "logps/rejected": -316.1470947265625, "loss": 0.7576, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.624924421310425, "rewards/margins": 1.2767069339752197, "rewards/rejected": 2.348217725753784, "step": 70710 }, { "epoch": 3.2833464877663774, "grad_norm": 11.644367218017578, "learning_rate": 1.030883513626445e-07, "logits/chosen": -20.312885284423828, "logits/rejected": -18.66311264038086, "logps/chosen": 
-457.16925048828125, "logps/rejected": -385.3401184082031, "loss": 0.2961, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.506793022155762, "rewards/margins": 2.3137781620025635, "rewards/rejected": 3.1930150985717773, "step": 70720 }, { "epoch": 3.28381076187381, "grad_norm": 12.825325012207031, "learning_rate": 1.0306049491619852e-07, "logits/chosen": -19.11734962463379, "logits/rejected": -18.5262508392334, "logps/chosen": -374.95794677734375, "logps/rejected": -307.47076416015625, "loss": 0.6922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.009603500366211, "rewards/margins": 0.5333831310272217, "rewards/rejected": 2.4762203693389893, "step": 70730 }, { "epoch": 3.2842750359812434, "grad_norm": 10.328058242797852, "learning_rate": 1.0303263846975254e-07, "logits/chosen": -20.450950622558594, "logits/rejected": -19.489416122436523, "logps/chosen": -385.87933349609375, "logps/rejected": -295.8192138671875, "loss": 0.7988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.643650531768799, "rewards/margins": 0.7611972689628601, "rewards/rejected": 1.8824535608291626, "step": 70740 }, { "epoch": 3.2847393100886766, "grad_norm": 78.64061737060547, "learning_rate": 1.0300478202330657e-07, "logits/chosen": -19.451860427856445, "logits/rejected": -18.242183685302734, "logps/chosen": -404.7405700683594, "logps/rejected": -341.84478759765625, "loss": 0.5005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.136249303817749, "rewards/margins": 1.3994033336639404, "rewards/rejected": 1.7368457317352295, "step": 70750 }, { "epoch": 3.2852035841961094, "grad_norm": 9.8020601272583, "learning_rate": 1.0297692557686057e-07, "logits/chosen": -18.369224548339844, "logits/rejected": -18.269351959228516, "logps/chosen": -377.9062194824219, "logps/rejected": -409.64703369140625, "loss": 0.9451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0030009746551514, "rewards/margins": 0.2976318895816803, "rewards/rejected": 
2.705369234085083, "step": 70760 }, { "epoch": 3.2856678583035426, "grad_norm": 130.24658203125, "learning_rate": 1.0294906913041458e-07, "logits/chosen": -19.545846939086914, "logits/rejected": -19.082752227783203, "logps/chosen": -510.753662109375, "logps/rejected": -421.80645751953125, "loss": 0.8612, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9753661155700684, "rewards/margins": 0.648086667060852, "rewards/rejected": 3.327279567718506, "step": 70770 }, { "epoch": 3.2861321324109753, "grad_norm": 63.13420486450195, "learning_rate": 1.0292121268396861e-07, "logits/chosen": -19.14852523803711, "logits/rejected": -18.33460807800293, "logps/chosen": -528.1425170898438, "logps/rejected": -456.0245666503906, "loss": 0.6335, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.514532089233398, "rewards/margins": 0.7110744118690491, "rewards/rejected": 3.803457736968994, "step": 70780 }, { "epoch": 3.2865964065184086, "grad_norm": 32.28990936279297, "learning_rate": 1.0289335623752262e-07, "logits/chosen": -19.059743881225586, "logits/rejected": -17.56020164489746, "logps/chosen": -361.669189453125, "logps/rejected": -291.93267822265625, "loss": 0.3664, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.134596824645996, "rewards/margins": 2.3939406871795654, "rewards/rejected": 1.7406564950942993, "step": 70790 }, { "epoch": 3.2870606806258413, "grad_norm": 137.6610107421875, "learning_rate": 1.0286828543572126e-07, "logits/chosen": -19.0198974609375, "logits/rejected": -18.378143310546875, "logps/chosen": -429.2037048339844, "logps/rejected": -397.0796813964844, "loss": 0.8535, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5970497131347656, "rewards/margins": 0.4293034076690674, "rewards/rejected": 3.1677463054656982, "step": 70800 }, { "epoch": 3.2875249547332746, "grad_norm": 6.285725116729736, "learning_rate": 1.0284042898927526e-07, "logits/chosen": -18.575672149658203, "logits/rejected": -17.545291900634766, 
"logps/chosen": -413.7364196777344, "logps/rejected": -301.71783447265625, "loss": 0.3525, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.307598114013672, "rewards/margins": 1.8268591165542603, "rewards/rejected": 1.4807393550872803, "step": 70810 }, { "epoch": 3.2879892288407078, "grad_norm": 18.787044525146484, "learning_rate": 1.0281257254282929e-07, "logits/chosen": -20.050262451171875, "logits/rejected": -19.032276153564453, "logps/chosen": -480.83367919921875, "logps/rejected": -408.2081298828125, "loss": 0.382, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.54709005355835, "rewards/margins": 1.0602134466171265, "rewards/rejected": 3.4868767261505127, "step": 70820 }, { "epoch": 3.2884535029481405, "grad_norm": 113.09648132324219, "learning_rate": 1.027847160963833e-07, "logits/chosen": -19.831621170043945, "logits/rejected": -18.140840530395508, "logps/chosen": -400.3568420410156, "logps/rejected": -249.892333984375, "loss": 0.5067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.748321056365967, "rewards/margins": 1.7761449813842773, "rewards/rejected": 1.972176194190979, "step": 70830 }, { "epoch": 3.2889177770555738, "grad_norm": 27.303213119506836, "learning_rate": 1.0275685964993732e-07, "logits/chosen": -18.809322357177734, "logits/rejected": -18.7929744720459, "logps/chosen": -423.011474609375, "logps/rejected": -421.2586975097656, "loss": 1.5935, "rewards/accuracies": 0.5, "rewards/chosen": 3.3130977153778076, "rewards/margins": -0.17749996483325958, "rewards/rejected": 3.4905974864959717, "step": 70840 }, { "epoch": 3.2893820511630065, "grad_norm": 46.80707931518555, "learning_rate": 1.0272900320349133e-07, "logits/chosen": -19.032306671142578, "logits/rejected": -18.540395736694336, "logps/chosen": -363.29107666015625, "logps/rejected": -357.0434875488281, "loss": 0.5499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7646431922912598, "rewards/margins": 0.8692053556442261, "rewards/rejected": 
2.895437717437744, "step": 70850 }, { "epoch": 3.2898463252704397, "grad_norm": 7.885986804962158, "learning_rate": 1.0270114675704535e-07, "logits/chosen": -19.0770320892334, "logits/rejected": -17.43514633178711, "logps/chosen": -345.6285705566406, "logps/rejected": -176.76771545410156, "loss": 0.2356, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.821307897567749, "rewards/margins": 2.665219783782959, "rewards/rejected": 0.15608815848827362, "step": 70860 }, { "epoch": 3.2903105993778725, "grad_norm": 62.87030792236328, "learning_rate": 1.0267329031059937e-07, "logits/chosen": -18.767330169677734, "logits/rejected": -17.447526931762695, "logps/chosen": -521.1925659179688, "logps/rejected": -413.8885192871094, "loss": 0.8387, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.915859699249268, "rewards/margins": 1.503488302230835, "rewards/rejected": 3.4123713970184326, "step": 70870 }, { "epoch": 3.2907748734853057, "grad_norm": 86.0749282836914, "learning_rate": 1.0264543386415339e-07, "logits/chosen": -18.447555541992188, "logits/rejected": -18.370512008666992, "logps/chosen": -371.2369384765625, "logps/rejected": -332.40447998046875, "loss": 0.4186, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1845669746398926, "rewards/margins": 0.7593642473220825, "rewards/rejected": 1.4252026081085205, "step": 70880 }, { "epoch": 3.291239147592739, "grad_norm": 175.62069702148438, "learning_rate": 1.0261757741770742e-07, "logits/chosen": -18.364421844482422, "logits/rejected": -18.371028900146484, "logps/chosen": -435.3038024902344, "logps/rejected": -415.046630859375, "loss": 0.7941, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5799338817596436, "rewards/margins": 0.7592010498046875, "rewards/rejected": 2.820732831954956, "step": 70890 }, { "epoch": 3.2917034217001717, "grad_norm": 63.63411331176758, "learning_rate": 1.0258972097126142e-07, "logits/chosen": -18.968791961669922, "logits/rejected": 
-19.113567352294922, "logps/chosen": -355.9587097167969, "logps/rejected": -335.51934814453125, "loss": 0.7478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6173224449157715, "rewards/margins": 0.5889961123466492, "rewards/rejected": 3.0283262729644775, "step": 70900 }, { "epoch": 3.292167695807605, "grad_norm": 39.86606979370117, "learning_rate": 1.0256186452481545e-07, "logits/chosen": -18.343332290649414, "logits/rejected": -17.862197875976562, "logps/chosen": -435.6294860839844, "logps/rejected": -354.2265625, "loss": 0.5, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3804306983947754, "rewards/margins": 0.7814850807189941, "rewards/rejected": 2.598945379257202, "step": 70910 }, { "epoch": 3.2926319699150377, "grad_norm": 59.06352233886719, "learning_rate": 1.0253400807836946e-07, "logits/chosen": -19.68818473815918, "logits/rejected": -18.774959564208984, "logps/chosen": -553.5108032226562, "logps/rejected": -477.27203369140625, "loss": 0.7414, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.024252891540527, "rewards/margins": 0.4571908414363861, "rewards/rejected": 3.5670619010925293, "step": 70920 }, { "epoch": 3.293096244022471, "grad_norm": 204.82431030273438, "learning_rate": 1.0250615163192349e-07, "logits/chosen": -19.05010414123535, "logits/rejected": -18.81346321105957, "logps/chosen": -384.29254150390625, "logps/rejected": -339.37249755859375, "loss": 1.2953, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.800915479660034, "rewards/margins": 0.11870662122964859, "rewards/rejected": 2.682208776473999, "step": 70930 }, { "epoch": 3.2935605181299037, "grad_norm": 45.531131744384766, "learning_rate": 1.024782951854775e-07, "logits/chosen": -18.324085235595703, "logits/rejected": -18.44685173034668, "logps/chosen": -356.5144958496094, "logps/rejected": -275.93878173828125, "loss": 1.064, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.177292585372925, "rewards/margins": 
1.0678107738494873, "rewards/rejected": 2.1094822883605957, "step": 70940 }, { "epoch": 3.294024792237337, "grad_norm": 99.73876190185547, "learning_rate": 1.0245043873903153e-07, "logits/chosen": -19.11014175415039, "logits/rejected": -18.881572723388672, "logps/chosen": -329.27703857421875, "logps/rejected": -293.43218994140625, "loss": 0.4276, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.75500226020813, "rewards/margins": 1.6096160411834717, "rewards/rejected": 2.145386219024658, "step": 70950 }, { "epoch": 3.29448906634477, "grad_norm": 5.941989421844482, "learning_rate": 1.0242258229258553e-07, "logits/chosen": -18.12226104736328, "logits/rejected": -17.866085052490234, "logps/chosen": -324.4366760253906, "logps/rejected": -218.05508422851562, "loss": 0.6695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7742761373519897, "rewards/margins": 1.0897873640060425, "rewards/rejected": 0.6844887137413025, "step": 70960 }, { "epoch": 3.294953340452203, "grad_norm": 114.37227630615234, "learning_rate": 1.0239472584613956e-07, "logits/chosen": -18.470123291015625, "logits/rejected": -17.31242561340332, "logps/chosen": -401.79620361328125, "logps/rejected": -353.61309814453125, "loss": 0.9562, "rewards/accuracies": 0.5, "rewards/chosen": 3.3230247497558594, "rewards/margins": 1.3635762929916382, "rewards/rejected": 1.9594484567642212, "step": 70970 }, { "epoch": 3.295417614559636, "grad_norm": 72.57808685302734, "learning_rate": 1.0236686939969357e-07, "logits/chosen": -19.79617691040039, "logits/rejected": -18.791278839111328, "logps/chosen": -326.2255859375, "logps/rejected": -300.6629333496094, "loss": 0.5228, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8127601146698, "rewards/margins": 1.1553988456726074, "rewards/rejected": 2.6573615074157715, "step": 70980 }, { "epoch": 3.295881888667069, "grad_norm": 18.319639205932617, "learning_rate": 1.023390129532476e-07, "logits/chosen": -19.262462615966797, "logits/rejected": 
-18.567218780517578, "logps/chosen": -445.12432861328125, "logps/rejected": -309.13946533203125, "loss": 0.4859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.162705421447754, "rewards/margins": 2.261030673980713, "rewards/rejected": 2.901674747467041, "step": 70990 }, { "epoch": 3.296346162774502, "grad_norm": 0.07840240001678467, "learning_rate": 1.0231115650680162e-07, "logits/chosen": -18.86854362487793, "logits/rejected": -17.891536712646484, "logps/chosen": -443.79296875, "logps/rejected": -337.27252197265625, "loss": 0.828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.193876266479492, "rewards/margins": 0.8655575513839722, "rewards/rejected": 3.3283188343048096, "step": 71000 }, { "epoch": 3.296810436881935, "grad_norm": 0.8107624053955078, "learning_rate": 1.0228330006035564e-07, "logits/chosen": -19.268659591674805, "logits/rejected": -18.8795166015625, "logps/chosen": -418.7701721191406, "logps/rejected": -347.9844055175781, "loss": 1.2049, "rewards/accuracies": 0.5, "rewards/chosen": 2.7170660495758057, "rewards/margins": -0.019521046429872513, "rewards/rejected": 2.7365870475769043, "step": 71010 }, { "epoch": 3.297274710989368, "grad_norm": 22.794937133789062, "learning_rate": 1.0225544361390964e-07, "logits/chosen": -18.81393814086914, "logits/rejected": -18.647436141967773, "logps/chosen": -439.20684814453125, "logps/rejected": -394.42926025390625, "loss": 0.774, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.357906818389893, "rewards/margins": 1.2471354007720947, "rewards/rejected": 3.1107711791992188, "step": 71020 }, { "epoch": 3.2977389850968013, "grad_norm": 4.179852485656738, "learning_rate": 1.0222758716746366e-07, "logits/chosen": -18.078861236572266, "logits/rejected": -17.89267921447754, "logps/chosen": -297.2530517578125, "logps/rejected": -276.91693115234375, "loss": 0.5999, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5819339752197266, "rewards/margins": 1.4031617641448975, 
"rewards/rejected": 1.17877197265625, "step": 71030 }, { "epoch": 3.298203259204234, "grad_norm": 60.26611328125, "learning_rate": 1.0219973072101769e-07, "logits/chosen": -17.76354217529297, "logits/rejected": -18.205730438232422, "logps/chosen": -304.9295349121094, "logps/rejected": -328.71929931640625, "loss": 1.3886, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.9874624013900757, "rewards/margins": -0.6630023717880249, "rewards/rejected": 2.6504645347595215, "step": 71040 }, { "epoch": 3.2986675333116673, "grad_norm": 51.07445526123047, "learning_rate": 1.021718742745717e-07, "logits/chosen": -19.03972816467285, "logits/rejected": -18.095457077026367, "logps/chosen": -430.23370361328125, "logps/rejected": -353.49322509765625, "loss": 0.6365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.894606113433838, "rewards/margins": 1.1531906127929688, "rewards/rejected": 2.741415023803711, "step": 71050 }, { "epoch": 3.2991318074191, "grad_norm": 20.635486602783203, "learning_rate": 1.0214401782812572e-07, "logits/chosen": -18.925973892211914, "logits/rejected": -18.204479217529297, "logps/chosen": -377.93695068359375, "logps/rejected": -310.83453369140625, "loss": 0.7316, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1401450634002686, "rewards/margins": 0.7333295345306396, "rewards/rejected": 2.406815767288208, "step": 71060 }, { "epoch": 3.2995960815265333, "grad_norm": 309.1621398925781, "learning_rate": 1.0211616138167973e-07, "logits/chosen": -19.153722763061523, "logits/rejected": -17.9510440826416, "logps/chosen": -271.5283203125, "logps/rejected": -257.48480224609375, "loss": 1.428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.916320323944092, "rewards/margins": 0.5901298522949219, "rewards/rejected": 2.32619047164917, "step": 71070 }, { "epoch": 3.300060355633966, "grad_norm": 118.69034576416016, "learning_rate": 1.0208830493523376e-07, "logits/chosen": -18.419567108154297, "logits/rejected": 
-17.868961334228516, "logps/chosen": -335.7073669433594, "logps/rejected": -282.5052490234375, "loss": 0.4262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2443244457244873, "rewards/margins": 1.3794176578521729, "rewards/rejected": 1.864906668663025, "step": 71080 }, { "epoch": 3.3005246297413993, "grad_norm": 0.8692331910133362, "learning_rate": 1.0206044848878777e-07, "logits/chosen": -19.872150421142578, "logits/rejected": -19.63094139099121, "logps/chosen": -418.294921875, "logps/rejected": -373.30865478515625, "loss": 0.6049, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.194506645202637, "rewards/margins": 1.4608821868896484, "rewards/rejected": 2.7336244583129883, "step": 71090 }, { "epoch": 3.3009889038488325, "grad_norm": 32.146461486816406, "learning_rate": 1.020325920423418e-07, "logits/chosen": -19.695667266845703, "logits/rejected": -18.655529022216797, "logps/chosen": -407.7132263183594, "logps/rejected": -301.2298889160156, "loss": 0.5285, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5829951763153076, "rewards/margins": 0.8687809109687805, "rewards/rejected": 2.714214563369751, "step": 71100 }, { "epoch": 3.3014531779562653, "grad_norm": 85.34375, "learning_rate": 1.020047355958958e-07, "logits/chosen": -19.022266387939453, "logits/rejected": -17.926286697387695, "logps/chosen": -433.93841552734375, "logps/rejected": -258.3888854980469, "loss": 0.602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.499452590942383, "rewards/margins": 2.1715140342712402, "rewards/rejected": 2.3279383182525635, "step": 71110 }, { "epoch": 3.3019174520636985, "grad_norm": 0.015026232227683067, "learning_rate": 1.0197687914944983e-07, "logits/chosen": -18.601987838745117, "logits/rejected": -18.214462280273438, "logps/chosen": -476.0267639160156, "logps/rejected": -380.5105285644531, "loss": 0.5449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.022582530975342, "rewards/margins": 
2.0965027809143066, "rewards/rejected": 1.9260801076889038, "step": 71120 }, { "epoch": 3.3023817261711317, "grad_norm": 73.05934143066406, "learning_rate": 1.0194902270300384e-07, "logits/chosen": -18.632200241088867, "logits/rejected": -18.36768913269043, "logps/chosen": -519.7797241210938, "logps/rejected": -408.02557373046875, "loss": 0.8677, "rewards/accuracies": 0.5, "rewards/chosen": 3.477402925491333, "rewards/margins": 0.6494826674461365, "rewards/rejected": 2.8279201984405518, "step": 71130 }, { "epoch": 3.3028460002785645, "grad_norm": 16.43124008178711, "learning_rate": 1.0192116625655787e-07, "logits/chosen": -18.190793991088867, "logits/rejected": -18.060161590576172, "logps/chosen": -356.7544250488281, "logps/rejected": -406.50347900390625, "loss": 1.4234, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9166924953460693, "rewards/margins": -0.591079592704773, "rewards/rejected": 3.5077719688415527, "step": 71140 }, { "epoch": 3.3033102743859977, "grad_norm": 76.08421325683594, "learning_rate": 1.0189330981011189e-07, "logits/chosen": -20.172224044799805, "logits/rejected": -18.629436492919922, "logps/chosen": -393.20538330078125, "logps/rejected": -287.9227294921875, "loss": 0.2335, "rewards/accuracies": 1.0, "rewards/chosen": 4.606759071350098, "rewards/margins": 2.104266405105591, "rewards/rejected": 2.502492904663086, "step": 71150 }, { "epoch": 3.3037745484934304, "grad_norm": 0.6560456156730652, "learning_rate": 1.0186545336366591e-07, "logits/chosen": -18.59025001525879, "logits/rejected": -16.807552337646484, "logps/chosen": -460.01177978515625, "logps/rejected": -191.30014038085938, "loss": 0.3894, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3177599906921387, "rewards/margins": 2.6118323802948, "rewards/rejected": 0.7059279680252075, "step": 71160 }, { "epoch": 3.3042388226008637, "grad_norm": 77.07840728759766, "learning_rate": 1.0183759691721992e-07, "logits/chosen": -18.455636978149414, "logits/rejected": 
-17.875, "logps/chosen": -473.7808532714844, "logps/rejected": -290.9644470214844, "loss": 0.1545, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.846378803253174, "rewards/margins": 2.795229196548462, "rewards/rejected": 2.051149845123291, "step": 71170 }, { "epoch": 3.3047030967082964, "grad_norm": 148.21080017089844, "learning_rate": 1.0180974047077394e-07, "logits/chosen": -19.109128952026367, "logits/rejected": -18.434181213378906, "logps/chosen": -448.89056396484375, "logps/rejected": -373.90716552734375, "loss": 0.8757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.030261039733887, "rewards/margins": 0.8230462074279785, "rewards/rejected": 3.2072150707244873, "step": 71180 }, { "epoch": 3.3051673708157296, "grad_norm": 36.177825927734375, "learning_rate": 1.0178188402432796e-07, "logits/chosen": -19.114797592163086, "logits/rejected": -18.00613021850586, "logps/chosen": -395.3860778808594, "logps/rejected": -342.4740295410156, "loss": 0.2661, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.949864625930786, "rewards/margins": 1.7840230464935303, "rewards/rejected": 2.165841817855835, "step": 71190 }, { "epoch": 3.305631644923163, "grad_norm": 41.240501403808594, "learning_rate": 1.0175402757788199e-07, "logits/chosen": -18.645479202270508, "logits/rejected": -17.76014518737793, "logps/chosen": -448.1273498535156, "logps/rejected": -350.6096496582031, "loss": 1.1365, "rewards/accuracies": 0.5, "rewards/chosen": 3.3193726539611816, "rewards/margins": 0.5237249135971069, "rewards/rejected": 2.7956480979919434, "step": 71200 }, { "epoch": 3.3060959190305956, "grad_norm": 231.8562469482422, "learning_rate": 1.01726171131436e-07, "logits/chosen": -17.813337326049805, "logits/rejected": -18.22866439819336, "logps/chosen": -361.05828857421875, "logps/rejected": -405.7526550292969, "loss": 1.2057, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6114068031311035, "rewards/margins": 0.09148535132408142, 
"rewards/rejected": 2.519921064376831, "step": 71210 }, { "epoch": 3.306560193138029, "grad_norm": 4.775619029998779, "learning_rate": 1.0169831468499e-07, "logits/chosen": -19.030271530151367, "logits/rejected": -17.857585906982422, "logps/chosen": -415.33807373046875, "logps/rejected": -275.36907958984375, "loss": 0.6069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.935842990875244, "rewards/margins": 2.3656954765319824, "rewards/rejected": 1.5701478719711304, "step": 71220 }, { "epoch": 3.3070244672454616, "grad_norm": 4.348057270050049, "learning_rate": 1.0167045823854403e-07, "logits/chosen": -18.667316436767578, "logits/rejected": -18.075145721435547, "logps/chosen": -377.0538635253906, "logps/rejected": -306.6073303222656, "loss": 0.6037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.825385808944702, "rewards/margins": 1.4133878946304321, "rewards/rejected": 1.4119977951049805, "step": 71230 }, { "epoch": 3.307488741352895, "grad_norm": 15.090291976928711, "learning_rate": 1.0164260179209804e-07, "logits/chosen": -19.210721969604492, "logits/rejected": -18.149810791015625, "logps/chosen": -416.54962158203125, "logps/rejected": -328.8879089355469, "loss": 0.5019, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4828219413757324, "rewards/margins": 1.1168715953826904, "rewards/rejected": 1.365950345993042, "step": 71240 }, { "epoch": 3.3079530154603276, "grad_norm": 18.511484146118164, "learning_rate": 1.0161474534565207e-07, "logits/chosen": -18.720073699951172, "logits/rejected": -17.71071434020996, "logps/chosen": -400.7139587402344, "logps/rejected": -358.8282775878906, "loss": 0.7012, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8749701976776123, "rewards/margins": 1.4475016593933105, "rewards/rejected": 2.4274682998657227, "step": 71250 }, { "epoch": 3.308417289567761, "grad_norm": 2.7553787231445312, "learning_rate": 1.0158688889920609e-07, "logits/chosen": -19.629486083984375, 
"logits/rejected": -18.67458724975586, "logps/chosen": -437.84619140625, "logps/rejected": -315.9048156738281, "loss": 0.7139, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.14984130859375, "rewards/margins": 1.4239225387573242, "rewards/rejected": 2.7259182929992676, "step": 71260 }, { "epoch": 3.308881563675194, "grad_norm": 51.89985656738281, "learning_rate": 1.015590324527601e-07, "logits/chosen": -18.44155502319336, "logits/rejected": -18.570541381835938, "logps/chosen": -295.4994201660156, "logps/rejected": -271.8892517089844, "loss": 1.0593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.723590612411499, "rewards/margins": -0.06279094517230988, "rewards/rejected": 1.786381483078003, "step": 71270 }, { "epoch": 3.309345837782627, "grad_norm": 275.018310546875, "learning_rate": 1.0153117600631412e-07, "logits/chosen": -20.245040893554688, "logits/rejected": -18.40227508544922, "logps/chosen": -438.8717346191406, "logps/rejected": -340.5378112792969, "loss": 0.7771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.259079456329346, "rewards/margins": 1.6648452281951904, "rewards/rejected": 2.5942342281341553, "step": 71280 }, { "epoch": 3.30981011189006, "grad_norm": 1.3059077262878418, "learning_rate": 1.0150331955986814e-07, "logits/chosen": -19.32659912109375, "logits/rejected": -18.87694549560547, "logps/chosen": -467.9178771972656, "logps/rejected": -376.6673278808594, "loss": 0.8945, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.6970672607421875, "rewards/margins": 1.6200546026229858, "rewards/rejected": 3.077012300491333, "step": 71290 }, { "epoch": 3.310274385997493, "grad_norm": 35.06759262084961, "learning_rate": 1.0147546311342216e-07, "logits/chosen": -19.616283416748047, "logits/rejected": -17.920167922973633, "logps/chosen": -424.89227294921875, "logps/rejected": -355.594970703125, "loss": 0.5004, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8779735565185547, 
"rewards/margins": 2.1526684761047363, "rewards/rejected": 1.7253048419952393, "step": 71300 }, { "epoch": 3.310738660104926, "grad_norm": 14.311615943908691, "learning_rate": 1.0144760666697619e-07, "logits/chosen": -18.408294677734375, "logits/rejected": -17.464839935302734, "logps/chosen": -371.9515686035156, "logps/rejected": -288.3111267089844, "loss": 0.4237, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0108699798583984, "rewards/margins": 1.7291336059570312, "rewards/rejected": 1.2817366123199463, "step": 71310 }, { "epoch": 3.311202934212359, "grad_norm": 51.961708068847656, "learning_rate": 1.0141975022053019e-07, "logits/chosen": -18.570514678955078, "logits/rejected": -17.70669174194336, "logps/chosen": -371.97247314453125, "logps/rejected": -320.8475646972656, "loss": 1.0124, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.692739963531494, "rewards/margins": 2.1283650398254395, "rewards/rejected": 1.5643750429153442, "step": 71320 }, { "epoch": 3.311667208319792, "grad_norm": 131.279052734375, "learning_rate": 1.0139189377408421e-07, "logits/chosen": -18.576879501342773, "logits/rejected": -18.391456604003906, "logps/chosen": -550.5662841796875, "logps/rejected": -445.15582275390625, "loss": 0.7571, "rewards/accuracies": 0.5, "rewards/chosen": 3.5956578254699707, "rewards/margins": 0.32485347986221313, "rewards/rejected": 3.2708046436309814, "step": 71330 }, { "epoch": 3.312131482427225, "grad_norm": 15.476202964782715, "learning_rate": 1.0136403732763823e-07, "logits/chosen": -19.550399780273438, "logits/rejected": -18.649803161621094, "logps/chosen": -437.03436279296875, "logps/rejected": -349.6248779296875, "loss": 0.4899, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.295371055603027, "rewards/margins": 1.8552939891815186, "rewards/rejected": 2.4400768280029297, "step": 71340 }, { "epoch": 3.312595756534658, "grad_norm": 237.9364471435547, "learning_rate": 1.0133618088119226e-07, "logits/chosen": 
-19.106975555419922, "logits/rejected": -17.9575252532959, "logps/chosen": -370.3546142578125, "logps/rejected": -330.4137268066406, "loss": 1.2055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.141005516052246, "rewards/margins": 0.9708030819892883, "rewards/rejected": 2.1702024936676025, "step": 71350 }, { "epoch": 3.313060030642091, "grad_norm": 22.7717227935791, "learning_rate": 1.0130832443474627e-07, "logits/chosen": -19.699565887451172, "logits/rejected": -19.300718307495117, "logps/chosen": -432.95135498046875, "logps/rejected": -377.8271179199219, "loss": 0.7686, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.719414472579956, "rewards/margins": 0.8725293874740601, "rewards/rejected": 2.8468849658966064, "step": 71360 }, { "epoch": 3.313524304749524, "grad_norm": 36.07128143310547, "learning_rate": 1.012804679883003e-07, "logits/chosen": -19.269699096679688, "logits/rejected": -18.408658981323242, "logps/chosen": -473.68634033203125, "logps/rejected": -401.7463073730469, "loss": 0.7671, "rewards/accuracies": 0.5, "rewards/chosen": 4.451201438903809, "rewards/margins": 0.8064815402030945, "rewards/rejected": 3.6447200775146484, "step": 71370 }, { "epoch": 3.313988578856957, "grad_norm": 18.73240852355957, "learning_rate": 1.012526115418543e-07, "logits/chosen": -18.7683162689209, "logits/rejected": -18.345264434814453, "logps/chosen": -321.27423095703125, "logps/rejected": -300.4017028808594, "loss": 0.6861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.868393898010254, "rewards/margins": 0.8026592135429382, "rewards/rejected": 2.065734624862671, "step": 71380 }, { "epoch": 3.31445285296439, "grad_norm": 61.466129302978516, "learning_rate": 1.0122475509540833e-07, "logits/chosen": -19.15618324279785, "logits/rejected": -17.831188201904297, "logps/chosen": -469.68414306640625, "logps/rejected": -357.04400634765625, "loss": 0.3355, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.206521034240723, 
"rewards/margins": 2.548022508621216, "rewards/rejected": 1.658498764038086, "step": 71390 }, { "epoch": 3.314917127071823, "grad_norm": 54.21510314941406, "learning_rate": 1.0119689864896234e-07, "logits/chosen": -18.21847915649414, "logits/rejected": -17.184673309326172, "logps/chosen": -308.2291259765625, "logps/rejected": -230.9650421142578, "loss": 0.4686, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1819489002227783, "rewards/margins": 1.9649921655654907, "rewards/rejected": 1.216956615447998, "step": 71400 }, { "epoch": 3.3153814011792564, "grad_norm": 58.34537124633789, "learning_rate": 1.0116904220251637e-07, "logits/chosen": -18.724552154541016, "logits/rejected": -18.121681213378906, "logps/chosen": -349.88970947265625, "logps/rejected": -349.12615966796875, "loss": 0.8615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.581348419189453, "rewards/margins": 0.4957534670829773, "rewards/rejected": 2.08559513092041, "step": 71410 }, { "epoch": 3.315845675286689, "grad_norm": 75.90985107421875, "learning_rate": 1.0114118575607039e-07, "logits/chosen": -18.99521827697754, "logits/rejected": -17.68222999572754, "logps/chosen": -593.23974609375, "logps/rejected": -423.8560485839844, "loss": 0.6695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.733950614929199, "rewards/margins": 1.5470435619354248, "rewards/rejected": 3.1869070529937744, "step": 71420 }, { "epoch": 3.3163099493941224, "grad_norm": 24.121402740478516, "learning_rate": 1.0111332930962439e-07, "logits/chosen": -18.44430923461914, "logits/rejected": -17.52450180053711, "logps/chosen": -415.0934143066406, "logps/rejected": -330.30706787109375, "loss": 0.9282, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9137046337127686, "rewards/margins": 1.4790805578231812, "rewards/rejected": 2.434624195098877, "step": 71430 }, { "epoch": 3.316774223501555, "grad_norm": 43.696285247802734, "learning_rate": 1.0108547286317841e-07, "logits/chosen": 
-19.577150344848633, "logits/rejected": -18.092369079589844, "logps/chosen": -374.84503173828125, "logps/rejected": -363.95953369140625, "loss": 0.4748, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.919449806213379, "rewards/margins": 1.750243902206421, "rewards/rejected": 3.169206142425537, "step": 71440 }, { "epoch": 3.3172384976089884, "grad_norm": 132.29727172851562, "learning_rate": 1.0105761641673243e-07, "logits/chosen": -18.852903366088867, "logits/rejected": -18.13100814819336, "logps/chosen": -370.0221862792969, "logps/rejected": -331.78851318359375, "loss": 0.515, "rewards/accuracies": 0.5, "rewards/chosen": 3.3091349601745605, "rewards/margins": 1.338335633277893, "rewards/rejected": 1.9707996845245361, "step": 71450 }, { "epoch": 3.317702771716421, "grad_norm": 160.7612762451172, "learning_rate": 1.0102975997028646e-07, "logits/chosen": -18.890850067138672, "logits/rejected": -17.869834899902344, "logps/chosen": -452.7760314941406, "logps/rejected": -354.75701904296875, "loss": 0.472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.4906697273254395, "rewards/margins": 1.6738862991333008, "rewards/rejected": 2.8167836666107178, "step": 71460 }, { "epoch": 3.3181670458238544, "grad_norm": 22.221017837524414, "learning_rate": 1.0100190352384047e-07, "logits/chosen": -19.009607315063477, "logits/rejected": -18.168563842773438, "logps/chosen": -350.2771301269531, "logps/rejected": -288.512939453125, "loss": 0.7079, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.326608180999756, "rewards/margins": 0.9872954487800598, "rewards/rejected": 2.33931303024292, "step": 71470 }, { "epoch": 3.3186313199312876, "grad_norm": 97.19329833984375, "learning_rate": 1.0097404707739449e-07, "logits/chosen": -19.220600128173828, "logits/rejected": -18.23971176147461, "logps/chosen": -311.4030456542969, "logps/rejected": -242.79232788085938, "loss": 0.9952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.935634136199951, 
"rewards/margins": 0.8319503664970398, "rewards/rejected": 2.1036839485168457, "step": 71480 }, { "epoch": 3.3190955940387203, "grad_norm": 24.20535659790039, "learning_rate": 1.009461906309485e-07, "logits/chosen": -19.324060440063477, "logits/rejected": -18.248050689697266, "logps/chosen": -384.7802734375, "logps/rejected": -262.3035583496094, "loss": 0.6688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.233902931213379, "rewards/margins": 1.753291368484497, "rewards/rejected": 2.480612277984619, "step": 71490 }, { "epoch": 3.3195598681461536, "grad_norm": 280.2445983886719, "learning_rate": 1.0091833418450253e-07, "logits/chosen": -19.543928146362305, "logits/rejected": -19.35856819152832, "logps/chosen": -419.51519775390625, "logps/rejected": -393.5540466308594, "loss": 0.6871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9285075664520264, "rewards/margins": 0.925828754901886, "rewards/rejected": 3.002678871154785, "step": 71500 }, { "epoch": 3.3200241422535863, "grad_norm": 71.85516357421875, "learning_rate": 1.0089047773805654e-07, "logits/chosen": -19.268932342529297, "logits/rejected": -18.41655921936035, "logps/chosen": -358.8231201171875, "logps/rejected": -289.4400634765625, "loss": 0.235, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.262914180755615, "rewards/margins": 3.0669567584991455, "rewards/rejected": 1.1959574222564697, "step": 71510 }, { "epoch": 3.3204884163610195, "grad_norm": 45.29810333251953, "learning_rate": 1.0086262129161057e-07, "logits/chosen": -18.3177433013916, "logits/rejected": -18.036823272705078, "logps/chosen": -415.094482421875, "logps/rejected": -352.8443603515625, "loss": 0.8576, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.261559247970581, "rewards/margins": 0.7027010917663574, "rewards/rejected": 2.5588583946228027, "step": 71520 }, { "epoch": 3.3209526904684528, "grad_norm": 23.5275821685791, "learning_rate": 1.0083476484516457e-07, "logits/chosen": 
-19.068389892578125, "logits/rejected": -18.799442291259766, "logps/chosen": -433.4092712402344, "logps/rejected": -345.14996337890625, "loss": 0.7988, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8996002674102783, "rewards/margins": 0.6013395190238953, "rewards/rejected": 2.298260450363159, "step": 71530 }, { "epoch": 3.3214169645758855, "grad_norm": 61.43917465209961, "learning_rate": 1.008069083987186e-07, "logits/chosen": -18.31339454650879, "logits/rejected": -17.709823608398438, "logps/chosen": -293.448486328125, "logps/rejected": -173.67872619628906, "loss": 0.6013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1966545581817627, "rewards/margins": 1.0497846603393555, "rewards/rejected": 1.1468697786331177, "step": 71540 }, { "epoch": 3.3218812386833187, "grad_norm": 203.23838806152344, "learning_rate": 1.0077905195227261e-07, "logits/chosen": -18.731826782226562, "logits/rejected": -18.406436920166016, "logps/chosen": -508.3885803222656, "logps/rejected": -420.9347229003906, "loss": 0.7985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.710226535797119, "rewards/margins": 1.191262125968933, "rewards/rejected": 3.5189647674560547, "step": 71550 }, { "epoch": 3.3223455127907515, "grad_norm": 0.5268242955207825, "learning_rate": 1.0075119550582664e-07, "logits/chosen": -19.762256622314453, "logits/rejected": -18.686115264892578, "logps/chosen": -512.85302734375, "logps/rejected": -377.93011474609375, "loss": 0.4678, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.8420090675354, "rewards/margins": 2.476392984390259, "rewards/rejected": 2.3656163215637207, "step": 71560 }, { "epoch": 3.3228097868981847, "grad_norm": 34.90898132324219, "learning_rate": 1.0072333905938066e-07, "logits/chosen": -18.88982391357422, "logits/rejected": -18.010053634643555, "logps/chosen": -323.7625427246094, "logps/rejected": -192.189697265625, "loss": 0.6341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
2.8970634937286377, "rewards/margins": 1.5740598440170288, "rewards/rejected": 1.3230035305023193, "step": 71570 }, { "epoch": 3.323274061005618, "grad_norm": 89.037353515625, "learning_rate": 1.0069548261293468e-07, "logits/chosen": -19.886486053466797, "logits/rejected": -19.775686264038086, "logps/chosen": -403.8605041503906, "logps/rejected": -356.83282470703125, "loss": 1.1095, "rewards/accuracies": 0.5, "rewards/chosen": 3.591418504714966, "rewards/margins": -0.15180650353431702, "rewards/rejected": 3.743225574493408, "step": 71580 }, { "epoch": 3.3237383351130507, "grad_norm": 308.2440185546875, "learning_rate": 1.0066762616648869e-07, "logits/chosen": -18.25137710571289, "logits/rejected": -18.206090927124023, "logps/chosen": -320.7008972167969, "logps/rejected": -308.5061950683594, "loss": 1.4409, "rewards/accuracies": 0.5, "rewards/chosen": 2.079005718231201, "rewards/margins": 0.07268822193145752, "rewards/rejected": 2.0063178539276123, "step": 71590 }, { "epoch": 3.324202609220484, "grad_norm": 69.90925598144531, "learning_rate": 1.0063976972004271e-07, "logits/chosen": -19.05341148376465, "logits/rejected": -17.70558738708496, "logps/chosen": -506.62127685546875, "logps/rejected": -371.07147216796875, "loss": 0.8286, "rewards/accuracies": 0.5, "rewards/chosen": 3.628861904144287, "rewards/margins": 0.9852511286735535, "rewards/rejected": 2.643611192703247, "step": 71600 }, { "epoch": 3.3246668833279167, "grad_norm": 70.49044036865234, "learning_rate": 1.0061191327359673e-07, "logits/chosen": -19.72463607788086, "logits/rejected": -18.679811477661133, "logps/chosen": -401.47821044921875, "logps/rejected": -314.11383056640625, "loss": 0.7766, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7730135917663574, "rewards/margins": 1.300991177558899, "rewards/rejected": 2.472022771835327, "step": 71610 }, { "epoch": 3.32513115743535, "grad_norm": 100.0000991821289, "learning_rate": 1.0058405682715074e-07, "logits/chosen": -18.141521453857422, 
"logits/rejected": -18.046716690063477, "logps/chosen": -447.39422607421875, "logps/rejected": -397.59112548828125, "loss": 0.6802, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1949079036712646, "rewards/margins": 1.002137541770935, "rewards/rejected": 2.19277024269104, "step": 71620 }, { "epoch": 3.3255954315427827, "grad_norm": 78.30315399169922, "learning_rate": 1.0055620038070477e-07, "logits/chosen": -18.428630828857422, "logits/rejected": -17.820079803466797, "logps/chosen": -398.3193359375, "logps/rejected": -284.4143371582031, "loss": 0.4532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6201319694519043, "rewards/margins": 1.50992751121521, "rewards/rejected": 1.1102042198181152, "step": 71630 }, { "epoch": 3.326059705650216, "grad_norm": 65.02151489257812, "learning_rate": 1.0052834393425877e-07, "logits/chosen": -19.986608505249023, "logits/rejected": -18.887271881103516, "logps/chosen": -458.903564453125, "logps/rejected": -421.0318298339844, "loss": 0.7435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.576817035675049, "rewards/margins": 0.5635353326797485, "rewards/rejected": 4.01328182220459, "step": 71640 }, { "epoch": 3.326523979757649, "grad_norm": 141.869140625, "learning_rate": 1.005004874878128e-07, "logits/chosen": -18.698476791381836, "logits/rejected": -18.199838638305664, "logps/chosen": -387.71356201171875, "logps/rejected": -318.9467468261719, "loss": 0.7206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0291197299957275, "rewards/margins": 1.047256588935852, "rewards/rejected": 1.981863260269165, "step": 71650 }, { "epoch": 3.326988253865082, "grad_norm": 129.25579833984375, "learning_rate": 1.0047263104136681e-07, "logits/chosen": -18.969985961914062, "logits/rejected": -18.565534591674805, "logps/chosen": -358.92437744140625, "logps/rejected": -328.2550048828125, "loss": 0.8656, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.073906898498535, "rewards/margins": 
0.5762117505073547, "rewards/rejected": 2.4976954460144043, "step": 71660 }, { "epoch": 3.327452527972515, "grad_norm": 80.86051177978516, "learning_rate": 1.0044477459492084e-07, "logits/chosen": -19.340835571289062, "logits/rejected": -17.819393157958984, "logps/chosen": -432.77264404296875, "logps/rejected": -337.7789306640625, "loss": 0.3317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.724602699279785, "rewards/margins": 2.337679147720337, "rewards/rejected": 2.3869235515594482, "step": 71670 }, { "epoch": 3.327916802079948, "grad_norm": 34.406009674072266, "learning_rate": 1.0041691814847486e-07, "logits/chosen": -18.50879669189453, "logits/rejected": -17.81382942199707, "logps/chosen": -344.67913818359375, "logps/rejected": -303.0875549316406, "loss": 0.8724, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.014587879180908, "rewards/margins": 0.990867018699646, "rewards/rejected": 2.0237202644348145, "step": 71680 }, { "epoch": 3.328381076187381, "grad_norm": 0.6475875973701477, "learning_rate": 1.0038906170202887e-07, "logits/chosen": -18.43593406677246, "logits/rejected": -18.00960350036621, "logps/chosen": -436.5927734375, "logps/rejected": -428.88433837890625, "loss": 0.5374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.126124858856201, "rewards/margins": 1.1854512691497803, "rewards/rejected": 2.940674304962158, "step": 71690 }, { "epoch": 3.328845350294814, "grad_norm": 16.241214752197266, "learning_rate": 1.0036120525558288e-07, "logits/chosen": -18.328847885131836, "logits/rejected": -17.99985122680664, "logps/chosen": -487.9557189941406, "logps/rejected": -406.1889343261719, "loss": 0.9212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.041355848312378, "rewards/margins": 0.07297936826944351, "rewards/rejected": 2.968376398086548, "step": 71700 }, { "epoch": 3.329309624402247, "grad_norm": 102.15958404541016, "learning_rate": 1.0033334880913691e-07, "logits/chosen": -18.899873733520508, 
"logits/rejected": -17.911441802978516, "logps/chosen": -523.9593505859375, "logps/rejected": -354.46038818359375, "loss": 0.6375, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.047741889953613, "rewards/margins": 1.677513837814331, "rewards/rejected": 3.3702285289764404, "step": 71710 }, { "epoch": 3.3297738985096803, "grad_norm": 124.99374389648438, "learning_rate": 1.0030549236269093e-07, "logits/chosen": -18.737285614013672, "logits/rejected": -17.912534713745117, "logps/chosen": -525.7153930664062, "logps/rejected": -591.3316650390625, "loss": 0.2229, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.3749589920043945, "rewards/margins": 2.929898738861084, "rewards/rejected": 2.4450604915618896, "step": 71720 }, { "epoch": 3.330238172617113, "grad_norm": 115.42462921142578, "learning_rate": 1.0027763591624495e-07, "logits/chosen": -18.724668502807617, "logits/rejected": -18.61433219909668, "logps/chosen": -320.5332946777344, "logps/rejected": -325.0771179199219, "loss": 0.9024, "rewards/accuracies": 0.5, "rewards/chosen": 2.4882631301879883, "rewards/margins": 0.2891383767127991, "rewards/rejected": 2.199124813079834, "step": 71730 }, { "epoch": 3.3307024467245463, "grad_norm": 103.15202331542969, "learning_rate": 1.0024977946979896e-07, "logits/chosen": -20.05712890625, "logits/rejected": -19.839786529541016, "logps/chosen": -366.16119384765625, "logps/rejected": -378.9330749511719, "loss": 0.5359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3815219402313232, "rewards/margins": 0.7040753364562988, "rewards/rejected": 2.6774468421936035, "step": 71740 }, { "epoch": 3.331166720831979, "grad_norm": 392.2733459472656, "learning_rate": 1.0022192302335298e-07, "logits/chosen": -18.79197883605957, "logits/rejected": -19.694726943969727, "logps/chosen": -347.7102966308594, "logps/rejected": -327.38873291015625, "loss": 1.4228, "rewards/accuracies": 0.5, "rewards/chosen": 2.755284070968628, "rewards/margins": 
-0.030103158205747604, "rewards/rejected": 2.7853875160217285, "step": 71750 }, { "epoch": 3.3316309949394123, "grad_norm": 73.50032806396484, "learning_rate": 1.00194066576907e-07, "logits/chosen": -18.504465103149414, "logits/rejected": -18.22565460205078, "logps/chosen": -471.433837890625, "logps/rejected": -350.38775634765625, "loss": 0.9564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.633124828338623, "rewards/margins": 0.6100979447364807, "rewards/rejected": 4.023026943206787, "step": 71760 }, { "epoch": 3.332095269046845, "grad_norm": 10.506657600402832, "learning_rate": 1.0016621013046103e-07, "logits/chosen": -18.857059478759766, "logits/rejected": -18.501068115234375, "logps/chosen": -303.94525146484375, "logps/rejected": -305.93902587890625, "loss": 0.9459, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.747673511505127, "rewards/margins": 0.29982027411460876, "rewards/rejected": 2.4478535652160645, "step": 71770 }, { "epoch": 3.3325595431542783, "grad_norm": 90.66739654541016, "learning_rate": 1.0013835368401504e-07, "logits/chosen": -18.544414520263672, "logits/rejected": -17.797164916992188, "logps/chosen": -462.25982666015625, "logps/rejected": -305.5306701660156, "loss": 0.6662, "rewards/accuracies": 0.5, "rewards/chosen": 3.8661608695983887, "rewards/margins": 1.8186298608779907, "rewards/rejected": 2.0475311279296875, "step": 71780 }, { "epoch": 3.3330238172617115, "grad_norm": 0.8049350380897522, "learning_rate": 1.0011049723756907e-07, "logits/chosen": -18.900623321533203, "logits/rejected": -18.37515640258789, "logps/chosen": -408.02215576171875, "logps/rejected": -412.80126953125, "loss": 1.4832, "rewards/accuracies": 0.5, "rewards/chosen": 3.418898105621338, "rewards/margins": -0.03984994813799858, "rewards/rejected": 3.458747386932373, "step": 71790 }, { "epoch": 3.3334880913691443, "grad_norm": 43.860836029052734, "learning_rate": 1.0008264079112307e-07, "logits/chosen": -19.051910400390625, 
"logits/rejected": -17.936464309692383, "logps/chosen": -429.31854248046875, "logps/rejected": -342.3219299316406, "loss": 0.6163, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.304697513580322, "rewards/margins": 1.729790449142456, "rewards/rejected": 2.574906587600708, "step": 71800 }, { "epoch": 3.3339523654765775, "grad_norm": 12.857888221740723, "learning_rate": 1.0005478434467708e-07, "logits/chosen": -19.310022354125977, "logits/rejected": -18.687946319580078, "logps/chosen": -344.63385009765625, "logps/rejected": -307.66754150390625, "loss": 1.0907, "rewards/accuracies": 0.5, "rewards/chosen": 3.4280426502227783, "rewards/margins": 0.6375614404678345, "rewards/rejected": 2.7904810905456543, "step": 71810 }, { "epoch": 3.3344166395840102, "grad_norm": 252.20469665527344, "learning_rate": 1.0002692789823111e-07, "logits/chosen": -19.523855209350586, "logits/rejected": -18.673160552978516, "logps/chosen": -346.2252502441406, "logps/rejected": -267.8957214355469, "loss": 1.1189, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9532110691070557, "rewards/margins": 0.6719566583633423, "rewards/rejected": 1.281254529953003, "step": 71820 }, { "epoch": 3.3348809136914435, "grad_norm": 0.0033317923080176115, "learning_rate": 9.999907145178513e-08, "logits/chosen": -19.87193489074707, "logits/rejected": -17.298555374145508, "logps/chosen": -422.84967041015625, "logps/rejected": -213.51895141601562, "loss": 0.2943, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.266260623931885, "rewards/margins": 3.9668517112731934, "rewards/rejected": 0.299408495426178, "step": 71830 }, { "epoch": 3.3353451877988762, "grad_norm": 153.10617065429688, "learning_rate": 9.997121500533915e-08, "logits/chosen": -18.978290557861328, "logits/rejected": -18.99697494506836, "logps/chosen": -457.55889892578125, "logps/rejected": -426.3958435058594, "loss": 0.7398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.25112247467041, 
"rewards/margins": 0.5112727284431458, "rewards/rejected": 3.73984956741333, "step": 71840 }, { "epoch": 3.3358094619063094, "grad_norm": 218.67454528808594, "learning_rate": 9.994335855889316e-08, "logits/chosen": -18.93701171875, "logits/rejected": -19.669876098632812, "logps/chosen": -335.7503662109375, "logps/rejected": -403.4429626464844, "loss": 1.9646, "rewards/accuracies": 0.5, "rewards/chosen": 3.1704955101013184, "rewards/margins": -0.37141865491867065, "rewards/rejected": 3.5419139862060547, "step": 71850 }, { "epoch": 3.3362737360137427, "grad_norm": 47.81862258911133, "learning_rate": 9.991550211244718e-08, "logits/chosen": -18.302833557128906, "logits/rejected": -17.275014877319336, "logps/chosen": -454.67840576171875, "logps/rejected": -275.93023681640625, "loss": 0.6935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0618813037872314, "rewards/margins": 1.4674898386001587, "rewards/rejected": 1.5943915843963623, "step": 71860 }, { "epoch": 3.3367380101211754, "grad_norm": 131.3820343017578, "learning_rate": 9.98876456660012e-08, "logits/chosen": -18.567916870117188, "logits/rejected": -19.106813430786133, "logps/chosen": -310.57763671875, "logps/rejected": -313.0829162597656, "loss": 1.3545, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.591128945350647, "rewards/margins": -0.7836669683456421, "rewards/rejected": 2.374795436859131, "step": 71870 }, { "epoch": 3.3372022842286087, "grad_norm": 88.38215637207031, "learning_rate": 9.985978921955523e-08, "logits/chosen": -19.111244201660156, "logits/rejected": -17.94656753540039, "logps/chosen": -342.567138671875, "logps/rejected": -260.0830993652344, "loss": 0.5559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.446976900100708, "rewards/margins": 1.9819486141204834, "rewards/rejected": 1.4650282859802246, "step": 71880 }, { "epoch": 3.3376665583360414, "grad_norm": 10.31025505065918, "learning_rate": 9.983193277310924e-08, "logits/chosen": 
-18.12249183654785, "logits/rejected": -17.857511520385742, "logps/chosen": -343.31597900390625, "logps/rejected": -291.3519592285156, "loss": 1.2573, "rewards/accuracies": 0.5, "rewards/chosen": 2.0275449752807617, "rewards/margins": 0.22603020071983337, "rewards/rejected": 1.8015146255493164, "step": 71890 }, { "epoch": 3.3381308324434746, "grad_norm": 85.30870819091797, "learning_rate": 9.980407632666325e-08, "logits/chosen": -19.272335052490234, "logits/rejected": -19.062376022338867, "logps/chosen": -332.3017578125, "logps/rejected": -270.22930908203125, "loss": 0.4395, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.362112522125244, "rewards/margins": 1.4204957485198975, "rewards/rejected": 1.9416167736053467, "step": 71900 }, { "epoch": 3.3385951065509074, "grad_norm": 14.540034294128418, "learning_rate": 9.977621988021727e-08, "logits/chosen": -19.374263763427734, "logits/rejected": -18.512191772460938, "logps/chosen": -411.8799743652344, "logps/rejected": -366.44317626953125, "loss": 0.5186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8165409564971924, "rewards/margins": 0.8322008848190308, "rewards/rejected": 2.984340190887451, "step": 71910 }, { "epoch": 3.3390593806583406, "grad_norm": 0.09853267669677734, "learning_rate": 9.97483634337713e-08, "logits/chosen": -20.519079208374023, "logits/rejected": -18.698286056518555, "logps/chosen": -621.5692138671875, "logps/rejected": -390.8329772949219, "loss": 0.1649, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.26613712310791, "rewards/margins": 3.011406421661377, "rewards/rejected": 2.254730701446533, "step": 71920 }, { "epoch": 3.339523654765774, "grad_norm": 134.14036560058594, "learning_rate": 9.972050698732531e-08, "logits/chosen": -19.426517486572266, "logits/rejected": -18.692106246948242, "logps/chosen": -311.5465393066406, "logps/rejected": -355.8406982421875, "loss": 1.2507, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5755772590637207, 
"rewards/margins": 0.04140181466937065, "rewards/rejected": 3.5341758728027344, "step": 71930 }, { "epoch": 3.3399879288732066, "grad_norm": 4.312527656555176, "learning_rate": 9.969265054087934e-08, "logits/chosen": -18.13364601135254, "logits/rejected": -17.880765914916992, "logps/chosen": -324.0753479003906, "logps/rejected": -256.0826110839844, "loss": 0.6235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8310835361480713, "rewards/margins": 0.9233630895614624, "rewards/rejected": 1.9077202081680298, "step": 71940 }, { "epoch": 3.34045220298064, "grad_norm": 115.040283203125, "learning_rate": 9.966479409443334e-08, "logits/chosen": -19.451454162597656, "logits/rejected": -18.669118881225586, "logps/chosen": -393.45733642578125, "logps/rejected": -426.990234375, "loss": 1.0569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3922176361083984, "rewards/margins": 0.16952252388000488, "rewards/rejected": 3.2226951122283936, "step": 71950 }, { "epoch": 3.340916477088073, "grad_norm": 4.2893757820129395, "learning_rate": 9.963693764798737e-08, "logits/chosen": -19.201671600341797, "logits/rejected": -18.308719635009766, "logps/chosen": -337.89105224609375, "logps/rejected": -264.63055419921875, "loss": 0.5061, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3601176738739014, "rewards/margins": 1.1535414457321167, "rewards/rejected": 2.206575870513916, "step": 71960 }, { "epoch": 3.341380751195506, "grad_norm": 90.33283233642578, "learning_rate": 9.960908120154138e-08, "logits/chosen": -18.982206344604492, "logits/rejected": -18.999223709106445, "logps/chosen": -346.3295593261719, "logps/rejected": -333.63494873046875, "loss": 0.9356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.890028238296509, "rewards/margins": 0.21168629825115204, "rewards/rejected": 2.678341865539551, "step": 71970 }, { "epoch": 3.341845025302939, "grad_norm": 22.55891227722168, "learning_rate": 9.958122475509541e-08, "logits/chosen": 
-18.35788345336914, "logits/rejected": -17.723384857177734, "logps/chosen": -457.464599609375, "logps/rejected": -455.57305908203125, "loss": 0.5521, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.259077548980713, "rewards/margins": 0.6768902540206909, "rewards/rejected": 2.5821874141693115, "step": 71980 }, { "epoch": 3.342309299410372, "grad_norm": 204.5424041748047, "learning_rate": 9.955336830864943e-08, "logits/chosen": -18.477909088134766, "logits/rejected": -17.67703628540039, "logps/chosen": -419.6568908691406, "logps/rejected": -373.6871032714844, "loss": 1.2357, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1580944061279297, "rewards/margins": -0.07733702659606934, "rewards/rejected": 3.23543119430542, "step": 71990 }, { "epoch": 3.342773573517805, "grad_norm": 56.9965705871582, "learning_rate": 9.952551186220345e-08, "logits/chosen": -19.30908966064453, "logits/rejected": -17.937923431396484, "logps/chosen": -423.4178161621094, "logps/rejected": -237.5466766357422, "loss": 0.1596, "rewards/accuracies": 1.0, "rewards/chosen": 4.579130172729492, "rewards/margins": 2.9334068298339844, "rewards/rejected": 1.6457237005233765, "step": 72000 }, { "epoch": 3.343237847625238, "grad_norm": 4.809239864349365, "learning_rate": 9.949765541575745e-08, "logits/chosen": -18.877994537353516, "logits/rejected": -18.28357696533203, "logps/chosen": -371.3690490722656, "logps/rejected": -380.95819091796875, "loss": 0.9948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5207343101501465, "rewards/margins": 0.783894419670105, "rewards/rejected": 1.7368398904800415, "step": 72010 }, { "epoch": 3.343702121732671, "grad_norm": 193.08982849121094, "learning_rate": 9.946979896931147e-08, "logits/chosen": -18.623477935791016, "logits/rejected": -18.372642517089844, "logps/chosen": -384.1056213378906, "logps/rejected": -339.0370178222656, "loss": 0.9093, "rewards/accuracies": 0.5, "rewards/chosen": 3.3209197521209717, "rewards/margins": 
0.5902344584465027, "rewards/rejected": 2.730685234069824, "step": 72020 }, { "epoch": 3.3441663958401042, "grad_norm": 10.540830612182617, "learning_rate": 9.94419425228655e-08, "logits/chosen": -19.715892791748047, "logits/rejected": -18.91847801208496, "logps/chosen": -377.0096740722656, "logps/rejected": -333.1470947265625, "loss": 0.2595, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.487179756164551, "rewards/margins": 1.9592052698135376, "rewards/rejected": 2.527974843978882, "step": 72030 }, { "epoch": 3.344630669947537, "grad_norm": 1.6894186735153198, "learning_rate": 9.941408607641951e-08, "logits/chosen": -19.42148208618164, "logits/rejected": -18.876916885375977, "logps/chosen": -382.7690124511719, "logps/rejected": -316.4713134765625, "loss": 0.6994, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1916327476501465, "rewards/margins": 1.7946962118148804, "rewards/rejected": 2.3969366550445557, "step": 72040 }, { "epoch": 3.34509494405497, "grad_norm": 27.42177391052246, "learning_rate": 9.938622962997354e-08, "logits/chosen": -19.197021484375, "logits/rejected": -18.392227172851562, "logps/chosen": -334.09478759765625, "logps/rejected": -277.40069580078125, "loss": 0.4581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.375377655029297, "rewards/margins": 1.0253673791885376, "rewards/rejected": 1.3500101566314697, "step": 72050 }, { "epoch": 3.345559218162403, "grad_norm": 97.04052734375, "learning_rate": 9.935837318352754e-08, "logits/chosen": -18.581554412841797, "logits/rejected": -17.448925018310547, "logps/chosen": -401.08001708984375, "logps/rejected": -293.6026306152344, "loss": 0.3478, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.368685245513916, "rewards/margins": 1.714765191078186, "rewards/rejected": 1.6539199352264404, "step": 72060 }, { "epoch": 3.346023492269836, "grad_norm": 146.98397827148438, "learning_rate": 9.933051673708157e-08, "logits/chosen": -19.571834564208984, 
"logits/rejected": -18.02752685546875, "logps/chosen": -477.7247619628906, "logps/rejected": -269.4834899902344, "loss": 0.4134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.455883026123047, "rewards/margins": 1.7745850086212158, "rewards/rejected": 2.6812987327575684, "step": 72070 }, { "epoch": 3.346487766377269, "grad_norm": 15.060066223144531, "learning_rate": 9.930266029063558e-08, "logits/chosen": -17.502796173095703, "logits/rejected": -17.791126251220703, "logps/chosen": -384.9289245605469, "logps/rejected": -427.39208984375, "loss": 0.8894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.90885591506958, "rewards/margins": 0.26388123631477356, "rewards/rejected": 2.644974708557129, "step": 72080 }, { "epoch": 3.346952040484702, "grad_norm": 1.3848813772201538, "learning_rate": 9.927480384418961e-08, "logits/chosen": -18.865785598754883, "logits/rejected": -18.53462028503418, "logps/chosen": -331.11669921875, "logps/rejected": -278.9400329589844, "loss": 0.8455, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6138839721679688, "rewards/margins": 1.1619585752487183, "rewards/rejected": 1.4519253969192505, "step": 72090 }, { "epoch": 3.3474163145921354, "grad_norm": 132.87278747558594, "learning_rate": 9.924694739774363e-08, "logits/chosen": -18.371688842773438, "logits/rejected": -17.552419662475586, "logps/chosen": -411.3907775878906, "logps/rejected": -319.8428955078125, "loss": 0.7383, "rewards/accuracies": 0.5, "rewards/chosen": 3.064005136489868, "rewards/margins": 0.9211921691894531, "rewards/rejected": 2.142812728881836, "step": 72100 }, { "epoch": 3.347880588699568, "grad_norm": 19.865543365478516, "learning_rate": 9.921909095129764e-08, "logits/chosen": -18.569454193115234, "logits/rejected": -18.12531852722168, "logps/chosen": -372.7322692871094, "logps/rejected": -263.7293395996094, "loss": 1.1047, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5127453804016113, "rewards/margins": 
0.508305013179779, "rewards/rejected": 2.0044403076171875, "step": 72110 }, { "epoch": 3.3483448628070014, "grad_norm": 21.295394897460938, "learning_rate": 9.919123450485165e-08, "logits/chosen": -18.503414154052734, "logits/rejected": -17.466121673583984, "logps/chosen": -413.4789123535156, "logps/rejected": -303.61114501953125, "loss": 0.3802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0276429653167725, "rewards/margins": 1.1836357116699219, "rewards/rejected": 1.844007134437561, "step": 72120 }, { "epoch": 3.348809136914434, "grad_norm": 25.126794815063477, "learning_rate": 9.916337805840568e-08, "logits/chosen": -18.911970138549805, "logits/rejected": -17.598758697509766, "logps/chosen": -554.21484375, "logps/rejected": -375.04241943359375, "loss": 0.2629, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.5262064933776855, "rewards/margins": 3.110327959060669, "rewards/rejected": 2.4158787727355957, "step": 72130 }, { "epoch": 3.3492734110218674, "grad_norm": 68.4475326538086, "learning_rate": 9.91355216119597e-08, "logits/chosen": -19.031387329101562, "logits/rejected": -17.51174545288086, "logps/chosen": -508.8832092285156, "logps/rejected": -254.9633331298828, "loss": 0.2291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.870980262756348, "rewards/margins": 4.038870811462402, "rewards/rejected": 1.8321088552474976, "step": 72140 }, { "epoch": 3.3497376851293, "grad_norm": 16.688730239868164, "learning_rate": 9.910766516551372e-08, "logits/chosen": -18.940019607543945, "logits/rejected": -18.152721405029297, "logps/chosen": -336.3533935546875, "logps/rejected": -380.07733154296875, "loss": 0.6682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3406155109405518, "rewards/margins": 1.0337424278259277, "rewards/rejected": 2.306873321533203, "step": 72150 }, { "epoch": 3.3502019592367334, "grad_norm": 16.4787654876709, "learning_rate": 9.907980871906773e-08, "logits/chosen": -18.205028533935547, 
"logits/rejected": -17.643768310546875, "logps/chosen": -441.89410400390625, "logps/rejected": -392.3468933105469, "loss": 0.6219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3608806133270264, "rewards/margins": 0.6737518310546875, "rewards/rejected": 2.687129259109497, "step": 72160 }, { "epoch": 3.3506662333441666, "grad_norm": 209.36477661132812, "learning_rate": 9.905195227262175e-08, "logits/chosen": -19.38375473022461, "logits/rejected": -18.57634925842285, "logps/chosen": -408.703369140625, "logps/rejected": -287.169677734375, "loss": 0.8037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8287570476531982, "rewards/margins": 0.9240023493766785, "rewards/rejected": 1.9047552347183228, "step": 72170 }, { "epoch": 3.3511305074515993, "grad_norm": 263.7728576660156, "learning_rate": 9.902409582617577e-08, "logits/chosen": -19.053295135498047, "logits/rejected": -19.37923812866211, "logps/chosen": -398.4046630859375, "logps/rejected": -373.1358642578125, "loss": 1.4741, "rewards/accuracies": 0.5, "rewards/chosen": 3.7880280017852783, "rewards/margins": 0.253528892993927, "rewards/rejected": 3.534499406814575, "step": 72180 }, { "epoch": 3.3515947815590326, "grad_norm": 8.674002647399902, "learning_rate": 9.89962393797298e-08, "logits/chosen": -19.131214141845703, "logits/rejected": -18.536502838134766, "logps/chosen": -399.1288146972656, "logps/rejected": -358.17498779296875, "loss": 0.306, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6774139404296875, "rewards/margins": 1.327605962753296, "rewards/rejected": 2.3498079776763916, "step": 72190 }, { "epoch": 3.3520590556664653, "grad_norm": 143.4425048828125, "learning_rate": 9.896838293328381e-08, "logits/chosen": -19.535343170166016, "logits/rejected": -19.593502044677734, "logps/chosen": -313.6127624511719, "logps/rejected": -370.4710388183594, "loss": 1.706, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.102114200592041, "rewards/margins": 
-1.2648171186447144, "rewards/rejected": 3.366931200027466, "step": 72200 }, { "epoch": 3.3525233297738986, "grad_norm": 64.61016082763672, "learning_rate": 9.894052648683781e-08, "logits/chosen": -18.932666778564453, "logits/rejected": -17.652812957763672, "logps/chosen": -345.34942626953125, "logps/rejected": -234.99612426757812, "loss": 0.3208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2809040546417236, "rewards/margins": 1.8563034534454346, "rewards/rejected": 1.4246004819869995, "step": 72210 }, { "epoch": 3.3529876038813313, "grad_norm": 196.04380798339844, "learning_rate": 9.891267004039184e-08, "logits/chosen": -18.92986488342285, "logits/rejected": -17.58349609375, "logps/chosen": -390.35870361328125, "logps/rejected": -308.3227233886719, "loss": 0.8341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.338079452514648, "rewards/margins": 1.900382399559021, "rewards/rejected": 2.437697172164917, "step": 72220 }, { "epoch": 3.3534518779887645, "grad_norm": 9.092473030090332, "learning_rate": 9.888481359394585e-08, "logits/chosen": -19.655275344848633, "logits/rejected": -18.522390365600586, "logps/chosen": -282.8641357421875, "logps/rejected": -183.94837951660156, "loss": 0.5367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6317765712738037, "rewards/margins": 1.4526780843734741, "rewards/rejected": 1.1790984869003296, "step": 72230 }, { "epoch": 3.3539161520961978, "grad_norm": 35.634033203125, "learning_rate": 9.885695714749988e-08, "logits/chosen": -18.744876861572266, "logits/rejected": -16.72309684753418, "logps/chosen": -351.2095947265625, "logps/rejected": -211.8171844482422, "loss": 0.5068, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.431398391723633, "rewards/margins": 1.948590636253357, "rewards/rejected": 1.482808232307434, "step": 72240 }, { "epoch": 3.3543804262036305, "grad_norm": 196.65084838867188, "learning_rate": 9.88291007010539e-08, "logits/chosen": -18.783519744873047, 
"logits/rejected": -17.392322540283203, "logps/chosen": -362.07489013671875, "logps/rejected": -299.4393005371094, "loss": 0.6906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.357011079788208, "rewards/margins": 1.0604581832885742, "rewards/rejected": 2.296552896499634, "step": 72250 }, { "epoch": 3.3548447003110637, "grad_norm": 1.2468613386154175, "learning_rate": 9.880124425460792e-08, "logits/chosen": -18.683441162109375, "logits/rejected": -17.9205379486084, "logps/chosen": -308.4988098144531, "logps/rejected": -277.3351745605469, "loss": 0.7357, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.174043893814087, "rewards/margins": 2.1552319526672363, "rewards/rejected": 1.0188119411468506, "step": 72260 }, { "epoch": 3.3553089744184965, "grad_norm": 93.76444244384766, "learning_rate": 9.877338780816193e-08, "logits/chosen": -18.339872360229492, "logits/rejected": -18.654773712158203, "logps/chosen": -356.2802429199219, "logps/rejected": -356.8199157714844, "loss": 2.1847, "rewards/accuracies": 0.5, "rewards/chosen": 3.331235885620117, "rewards/margins": -0.9364570379257202, "rewards/rejected": 4.267693042755127, "step": 72270 }, { "epoch": 3.3557732485259297, "grad_norm": 25.87236976623535, "learning_rate": 9.874553136171595e-08, "logits/chosen": -19.371688842773438, "logits/rejected": -18.393217086791992, "logps/chosen": -420.0547790527344, "logps/rejected": -258.78289794921875, "loss": 0.7814, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1496760845184326, "rewards/margins": 1.229241132736206, "rewards/rejected": 1.9204351902008057, "step": 72280 }, { "epoch": 3.3562375226333625, "grad_norm": 30.767250061035156, "learning_rate": 9.871767491526997e-08, "logits/chosen": -19.143810272216797, "logits/rejected": -18.463977813720703, "logps/chosen": -362.7807922363281, "logps/rejected": -334.20819091796875, "loss": 1.0856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3772525787353516, "rewards/margins": 
0.12870058417320251, "rewards/rejected": 3.248551607131958, "step": 72290 }, { "epoch": 3.3567017967407957, "grad_norm": 13.211141586303711, "learning_rate": 9.8689818468824e-08, "logits/chosen": -18.405139923095703, "logits/rejected": -17.930286407470703, "logps/chosen": -300.63360595703125, "logps/rejected": -287.95562744140625, "loss": 1.0, "rewards/accuracies": 0.5, "rewards/chosen": 2.5582709312438965, "rewards/margins": 0.8460995554924011, "rewards/rejected": 1.712170958518982, "step": 72300 }, { "epoch": 3.357166070848229, "grad_norm": 58.091590881347656, "learning_rate": 9.866196202237801e-08, "logits/chosen": -19.16427230834961, "logits/rejected": -18.310516357421875, "logps/chosen": -361.9111328125, "logps/rejected": -289.0115661621094, "loss": 0.6886, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1471357345581055, "rewards/margins": 2.4604544639587402, "rewards/rejected": 1.6866813898086548, "step": 72310 }, { "epoch": 3.3576303449556617, "grad_norm": 150.01016235351562, "learning_rate": 9.863410557593202e-08, "logits/chosen": -20.55784797668457, "logits/rejected": -18.213768005371094, "logps/chosen": -470.0069885253906, "logps/rejected": -314.21826171875, "loss": 0.3455, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.091068744659424, "rewards/margins": 1.9372705221176147, "rewards/rejected": 2.1537981033325195, "step": 72320 }, { "epoch": 3.358094619063095, "grad_norm": 26.084362030029297, "learning_rate": 9.860624912948604e-08, "logits/chosen": -20.067378997802734, "logits/rejected": -19.242801666259766, "logps/chosen": -392.4037780761719, "logps/rejected": -350.5959777832031, "loss": 0.2194, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9237453937530518, "rewards/margins": 2.1148152351379395, "rewards/rejected": 1.8089301586151123, "step": 72330 }, { "epoch": 3.3585588931705277, "grad_norm": 110.38185119628906, "learning_rate": 9.857839268304007e-08, "logits/chosen": -17.44269561767578, "logits/rejected": 
-17.590686798095703, "logps/chosen": -301.121337890625, "logps/rejected": -271.23077392578125, "loss": 1.0545, "rewards/accuracies": 0.5, "rewards/chosen": 2.067815065383911, "rewards/margins": 0.534534752368927, "rewards/rejected": 1.5332801342010498, "step": 72340 }, { "epoch": 3.359023167277961, "grad_norm": 150.9545440673828, "learning_rate": 9.855053623659408e-08, "logits/chosen": -19.59140968322754, "logits/rejected": -18.59734344482422, "logps/chosen": -286.5852355957031, "logps/rejected": -252.36196899414062, "loss": 0.3711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.390502452850342, "rewards/margins": 1.4534809589385986, "rewards/rejected": 1.9370216131210327, "step": 72350 }, { "epoch": 3.359487441385394, "grad_norm": 49.97846603393555, "learning_rate": 9.852267979014811e-08, "logits/chosen": -19.77457046508789, "logits/rejected": -18.930919647216797, "logps/chosen": -515.1959228515625, "logps/rejected": -426.1871032714844, "loss": 0.4093, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3753461837768555, "rewards/margins": 1.5648494958877563, "rewards/rejected": 2.8104968070983887, "step": 72360 }, { "epoch": 3.359951715492827, "grad_norm": 8.352258682250977, "learning_rate": 9.849482334370211e-08, "logits/chosen": -18.890451431274414, "logits/rejected": -18.461414337158203, "logps/chosen": -497.49560546875, "logps/rejected": -433.6434631347656, "loss": 1.0874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8117687702178955, "rewards/margins": -0.15905997157096863, "rewards/rejected": 3.9708285331726074, "step": 72370 }, { "epoch": 3.36041598960026, "grad_norm": 196.80795288085938, "learning_rate": 9.846696689725614e-08, "logits/chosen": -19.252567291259766, "logits/rejected": -18.92469596862793, "logps/chosen": -355.92462158203125, "logps/rejected": -407.85333251953125, "loss": 1.4569, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7671990394592285, "rewards/margins": 0.23042026162147522, 
"rewards/rejected": 2.5367789268493652, "step": 72380 }, { "epoch": 3.360880263707693, "grad_norm": 31.05265998840332, "learning_rate": 9.843911045081015e-08, "logits/chosen": -18.801374435424805, "logits/rejected": -17.137584686279297, "logps/chosen": -461.03265380859375, "logps/rejected": -336.8980407714844, "loss": 0.4674, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.257399082183838, "rewards/margins": 1.9497826099395752, "rewards/rejected": 2.3076159954071045, "step": 72390 }, { "epoch": 3.361344537815126, "grad_norm": 29.60405158996582, "learning_rate": 9.841125400436418e-08, "logits/chosen": -18.667028427124023, "logits/rejected": -18.933856964111328, "logps/chosen": -307.5411682128906, "logps/rejected": -255.61923217773438, "loss": 1.109, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5183074474334717, "rewards/margins": 0.4137503504753113, "rewards/rejected": 1.1045572757720947, "step": 72400 }, { "epoch": 3.3618088119225593, "grad_norm": 26.773300170898438, "learning_rate": 9.83833975579182e-08, "logits/chosen": -18.902145385742188, "logits/rejected": -18.14938735961914, "logps/chosen": -440.1998596191406, "logps/rejected": -349.9210510253906, "loss": 0.5446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4078869819641113, "rewards/margins": 0.8956430554389954, "rewards/rejected": 2.5122439861297607, "step": 72410 }, { "epoch": 3.362273086029992, "grad_norm": 1.288732886314392, "learning_rate": 9.83555411114722e-08, "logits/chosen": -19.330379486083984, "logits/rejected": -18.243762969970703, "logps/chosen": -352.5302734375, "logps/rejected": -265.2492980957031, "loss": 1.1326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.68115234375, "rewards/margins": 0.813430666923523, "rewards/rejected": 1.8677217960357666, "step": 72420 }, { "epoch": 3.3627373601374253, "grad_norm": 10.100232124328613, "learning_rate": 9.832768466502622e-08, "logits/chosen": -18.305309295654297, "logits/rejected": 
-17.90822982788086, "logps/chosen": -417.6083984375, "logps/rejected": -294.98077392578125, "loss": 0.4875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.572603225708008, "rewards/margins": 1.2772022485733032, "rewards/rejected": 2.295401096343994, "step": 72430 }, { "epoch": 3.363201634244858, "grad_norm": 79.24004364013672, "learning_rate": 9.829982821858024e-08, "logits/chosen": -18.888919830322266, "logits/rejected": -18.336910247802734, "logps/chosen": -248.3231201171875, "logps/rejected": -180.27035522460938, "loss": 0.4086, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7855132818222046, "rewards/margins": 1.1321276426315308, "rewards/rejected": 0.6533856987953186, "step": 72440 }, { "epoch": 3.3636659083522913, "grad_norm": 15.545601844787598, "learning_rate": 9.827197177213427e-08, "logits/chosen": -18.67868995666504, "logits/rejected": -17.595687866210938, "logps/chosen": -411.60406494140625, "logps/rejected": -291.2713928222656, "loss": 0.9057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.4534807205200195, "rewards/margins": 2.2082650661468506, "rewards/rejected": 2.2452163696289062, "step": 72450 }, { "epoch": 3.364130182459724, "grad_norm": 37.491355895996094, "learning_rate": 9.824411532568828e-08, "logits/chosen": -19.94016456604004, "logits/rejected": -19.58384895324707, "logps/chosen": -354.2375793457031, "logps/rejected": -353.81085205078125, "loss": 0.5708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.377859592437744, "rewards/margins": 0.5074886083602905, "rewards/rejected": 1.870370626449585, "step": 72460 }, { "epoch": 3.3645944565671573, "grad_norm": 276.6352844238281, "learning_rate": 9.821625887924231e-08, "logits/chosen": -19.7336368560791, "logits/rejected": -18.807876586914062, "logps/chosen": -400.6422424316406, "logps/rejected": -310.75445556640625, "loss": 0.7924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.465266466140747, "rewards/margins": 
1.0222753286361694, "rewards/rejected": 2.442990779876709, "step": 72470 }, { "epoch": 3.3650587306745905, "grad_norm": 0.8977177739143372, "learning_rate": 9.818840243279631e-08, "logits/chosen": -18.247488021850586, "logits/rejected": -17.781362533569336, "logps/chosen": -378.5950012207031, "logps/rejected": -334.23736572265625, "loss": 0.5203, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9621901512145996, "rewards/margins": 1.3501735925674438, "rewards/rejected": 1.6120164394378662, "step": 72480 }, { "epoch": 3.3655230047820233, "grad_norm": 149.60098266601562, "learning_rate": 9.816054598635034e-08, "logits/chosen": -19.00759506225586, "logits/rejected": -18.308170318603516, "logps/chosen": -437.42669677734375, "logps/rejected": -276.9337463378906, "loss": 1.1587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.070268154144287, "rewards/margins": 1.3066784143447876, "rewards/rejected": 1.763589859008789, "step": 72490 }, { "epoch": 3.3659872788894565, "grad_norm": 42.06071090698242, "learning_rate": 9.813268953990435e-08, "logits/chosen": -19.099536895751953, "logits/rejected": -18.355892181396484, "logps/chosen": -429.17193603515625, "logps/rejected": -307.07342529296875, "loss": 0.3567, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.445025444030762, "rewards/margins": 1.7398332357406616, "rewards/rejected": 2.7051918506622314, "step": 72500 }, { "epoch": 3.3664515529968893, "grad_norm": 9.971328735351562, "learning_rate": 9.810483309345838e-08, "logits/chosen": -18.665512084960938, "logits/rejected": -18.505733489990234, "logps/chosen": -435.1400451660156, "logps/rejected": -355.31121826171875, "loss": 0.6051, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.438638925552368, "rewards/margins": 0.6212866902351379, "rewards/rejected": 1.817352294921875, "step": 72510 }, { "epoch": 3.3669158271043225, "grad_norm": 37.864627838134766, "learning_rate": 9.80769766470124e-08, "logits/chosen": 
-18.84369468688965, "logits/rejected": -17.50567054748535, "logps/chosen": -365.052978515625, "logps/rejected": -227.9322509765625, "loss": 0.4078, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.472001552581787, "rewards/margins": 1.685482382774353, "rewards/rejected": 1.7865194082260132, "step": 72520 }, { "epoch": 3.3673801012117552, "grad_norm": null, "learning_rate": 9.8051905845211e-08, "logits/chosen": -19.452320098876953, "logits/rejected": -19.185142517089844, "logps/chosen": -353.66033935546875, "logps/rejected": -275.33587646484375, "loss": 0.9883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.633948802947998, "rewards/margins": 0.3699762523174286, "rewards/rejected": 2.263972043991089, "step": 72530 }, { "epoch": 3.3678443753191885, "grad_norm": 0.06562060117721558, "learning_rate": 9.802404939876503e-08, "logits/chosen": -18.790401458740234, "logits/rejected": -17.604267120361328, "logps/chosen": -364.80914306640625, "logps/rejected": -282.2091979980469, "loss": 0.5551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3448688983917236, "rewards/margins": 1.7994598150253296, "rewards/rejected": 1.545408844947815, "step": 72540 }, { "epoch": 3.3683086494266217, "grad_norm": 30.62858772277832, "learning_rate": 9.799619295231904e-08, "logits/chosen": -18.991724014282227, "logits/rejected": -19.389589309692383, "logps/chosen": -344.9743347167969, "logps/rejected": -400.7210998535156, "loss": 0.9207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6461021900177, "rewards/margins": -0.08418537676334381, "rewards/rejected": 2.7302873134613037, "step": 72550 }, { "epoch": 3.3687729235340544, "grad_norm": 194.72579956054688, "learning_rate": 9.796833650587307e-08, "logits/chosen": -18.681419372558594, "logits/rejected": -17.22307777404785, "logps/chosen": -450.75421142578125, "logps/rejected": -248.4849090576172, "loss": 0.2575, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.326691150665283, 
"rewards/margins": 2.6791932582855225, "rewards/rejected": 0.6474975943565369, "step": 72560 }, { "epoch": 3.3692371976414877, "grad_norm": 5.941310405731201, "learning_rate": 9.794048005942707e-08, "logits/chosen": -19.270509719848633, "logits/rejected": -18.279382705688477, "logps/chosen": -447.46600341796875, "logps/rejected": -337.17535400390625, "loss": 0.4939, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.240087509155273, "rewards/margins": 1.7415670156478882, "rewards/rejected": 2.4985198974609375, "step": 72570 }, { "epoch": 3.3697014717489204, "grad_norm": 10.237120628356934, "learning_rate": 9.79126236129811e-08, "logits/chosen": -19.471155166625977, "logits/rejected": -18.821826934814453, "logps/chosen": -572.4866333007812, "logps/rejected": -393.141357421875, "loss": 0.4938, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.20786714553833, "rewards/margins": 1.3212988376617432, "rewards/rejected": 2.886568069458008, "step": 72580 }, { "epoch": 3.3701657458563536, "grad_norm": 0.13987205922603607, "learning_rate": 9.788476716653512e-08, "logits/chosen": -18.323429107666016, "logits/rejected": -17.720539093017578, "logps/chosen": -315.5709533691406, "logps/rejected": -233.7582550048828, "loss": 0.7303, "rewards/accuracies": 0.5, "rewards/chosen": 2.330610752105713, "rewards/margins": 0.9672381281852722, "rewards/rejected": 1.363372802734375, "step": 72590 }, { "epoch": 3.3706300199637864, "grad_norm": 33.863861083984375, "learning_rate": 9.785691072008914e-08, "logits/chosen": -19.579898834228516, "logits/rejected": -18.970172882080078, "logps/chosen": -411.58612060546875, "logps/rejected": -350.16259765625, "loss": 0.9403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6302356719970703, "rewards/margins": 0.9744078516960144, "rewards/rejected": 2.6558279991149902, "step": 72600 }, { "epoch": 3.3710942940712196, "grad_norm": 41.821659088134766, "learning_rate": 9.782905427364316e-08, "logits/chosen": 
-18.622282028198242, "logits/rejected": -18.348073959350586, "logps/chosen": -295.3499450683594, "logps/rejected": -249.87948608398438, "loss": 0.998, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2155654430389404, "rewards/margins": -0.09915725141763687, "rewards/rejected": 2.314723014831543, "step": 72610 }, { "epoch": 3.371558568178653, "grad_norm": 101.89410400390625, "learning_rate": 9.780119782719719e-08, "logits/chosen": -20.076648712158203, "logits/rejected": -18.85901641845703, "logps/chosen": -308.1904296875, "logps/rejected": -258.9354553222656, "loss": 0.3893, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.560771942138672, "rewards/margins": 1.5121475458145142, "rewards/rejected": 1.0486242771148682, "step": 72620 }, { "epoch": 3.3720228422860856, "grad_norm": 77.07460021972656, "learning_rate": 9.777334138075119e-08, "logits/chosen": -19.213075637817383, "logits/rejected": -17.761144638061523, "logps/chosen": -393.2305908203125, "logps/rejected": -267.89532470703125, "loss": 0.3092, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.528366804122925, "rewards/margins": 1.475377082824707, "rewards/rejected": 2.0529894828796387, "step": 72630 }, { "epoch": 3.372487116393519, "grad_norm": 37.69697189331055, "learning_rate": 9.774548493430522e-08, "logits/chosen": -19.908245086669922, "logits/rejected": -18.545196533203125, "logps/chosen": -511.5892639160156, "logps/rejected": -282.5910949707031, "loss": 0.214, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.720179557800293, "rewards/margins": 2.6288414001464844, "rewards/rejected": 2.091338634490967, "step": 72640 }, { "epoch": 3.3729513905009516, "grad_norm": 65.5052261352539, "learning_rate": 9.771762848785923e-08, "logits/chosen": -18.763545989990234, "logits/rejected": -17.844806671142578, "logps/chosen": -409.4071044921875, "logps/rejected": -326.7162780761719, "loss": 0.563, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.616931438446045, "rewards/margins": 1.3080604076385498, "rewards/rejected": 2.308871269226074, "step": 72650 }, { "epoch": 3.373415664608385, "grad_norm": 92.56776428222656, "learning_rate": 9.768977204141326e-08, "logits/chosen": -19.42232894897461, "logits/rejected": -18.643468856811523, "logps/chosen": -510.14727783203125, "logps/rejected": -445.29815673828125, "loss": 0.7023, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.888625144958496, "rewards/margins": 1.385157823562622, "rewards/rejected": 3.503467559814453, "step": 72660 }, { "epoch": 3.3738799387158176, "grad_norm": 0.10010214895009995, "learning_rate": 9.766191559496726e-08, "logits/chosen": -18.504467010498047, "logits/rejected": -16.96224594116211, "logps/chosen": -440.0684509277344, "logps/rejected": -255.5538787841797, "loss": 0.3713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.849617958068848, "rewards/margins": 2.7098052501678467, "rewards/rejected": 2.13981294631958, "step": 72670 }, { "epoch": 3.374344212823251, "grad_norm": 86.13211822509766, "learning_rate": 9.763405914852127e-08, "logits/chosen": -20.559711456298828, "logits/rejected": -19.223459243774414, "logps/chosen": -369.01080322265625, "logps/rejected": -260.80230712890625, "loss": 0.3023, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.318042755126953, "rewards/margins": 2.0029842853546143, "rewards/rejected": 2.3150582313537598, "step": 72680 }, { "epoch": 3.374808486930684, "grad_norm": 26.109342575073242, "learning_rate": 9.76062027020753e-08, "logits/chosen": -19.387800216674805, "logits/rejected": -17.964946746826172, "logps/chosen": -406.80108642578125, "logps/rejected": -358.7212829589844, "loss": 0.9094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.27140736579895, "rewards/margins": 1.0165637731552124, "rewards/rejected": 2.2548439502716064, "step": 72690 }, { "epoch": 3.375272761038117, "grad_norm": 80.06388854980469, "learning_rate": 9.757834625562932e-08, 
"logits/chosen": -20.76360321044922, "logits/rejected": -19.546239852905273, "logps/chosen": -442.03350830078125, "logps/rejected": -345.3486022949219, "loss": 0.5172, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9515209197998047, "rewards/margins": 1.2697703838348389, "rewards/rejected": 2.6817500591278076, "step": 72700 }, { "epoch": 3.37573703514555, "grad_norm": 39.7845344543457, "learning_rate": 9.755048980918334e-08, "logits/chosen": -18.930583953857422, "logits/rejected": -18.342151641845703, "logps/chosen": -346.0556335449219, "logps/rejected": -294.5552062988281, "loss": 0.7896, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.981297254562378, "rewards/margins": 0.5640989542007446, "rewards/rejected": 2.417198419570923, "step": 72710 }, { "epoch": 3.376201309252983, "grad_norm": 8.119011878967285, "learning_rate": 9.752263336273734e-08, "logits/chosen": -18.420143127441406, "logits/rejected": -18.543323516845703, "logps/chosen": -340.3724365234375, "logps/rejected": -323.58245849609375, "loss": 1.4743, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.602377414703369, "rewards/margins": 0.5849235653877258, "rewards/rejected": 3.017453908920288, "step": 72720 }, { "epoch": 3.376665583360416, "grad_norm": 121.30413055419922, "learning_rate": 9.749477691629137e-08, "logits/chosen": -19.641494750976562, "logits/rejected": -18.233505249023438, "logps/chosen": -384.4045104980469, "logps/rejected": -329.53564453125, "loss": 0.5476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.912163496017456, "rewards/margins": 1.458747386932373, "rewards/rejected": 2.453416109085083, "step": 72730 }, { "epoch": 3.3771298574678488, "grad_norm": 10.90884017944336, "learning_rate": 9.746692046984539e-08, "logits/chosen": -18.054872512817383, "logits/rejected": -17.668642044067383, "logps/chosen": -326.35430908203125, "logps/rejected": -367.3974609375, "loss": 0.8409, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.117154598236084, "rewards/margins": 0.8913484811782837, "rewards/rejected": 2.22580623626709, "step": 72740 }, { "epoch": 3.377594131575282, "grad_norm": 58.97932434082031, "learning_rate": 9.743906402339941e-08, "logits/chosen": -19.098125457763672, "logits/rejected": -18.297277450561523, "logps/chosen": -377.7398681640625, "logps/rejected": -290.97088623046875, "loss": 0.3383, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2699074745178223, "rewards/margins": 2.0712637901306152, "rewards/rejected": 1.1986433267593384, "step": 72750 }, { "epoch": 3.378058405682715, "grad_norm": 16.68819236755371, "learning_rate": 9.741120757695343e-08, "logits/chosen": -18.42494773864746, "logits/rejected": -17.91554832458496, "logps/chosen": -361.7477722167969, "logps/rejected": -273.7433166503906, "loss": 0.2845, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.185392379760742, "rewards/margins": 1.9170730113983154, "rewards/rejected": 1.2683196067810059, "step": 72760 }, { "epoch": 3.378522679790148, "grad_norm": 5.442253112792969, "learning_rate": 9.738335113050746e-08, "logits/chosen": -19.512561798095703, "logits/rejected": -18.75009536743164, "logps/chosen": -467.38348388671875, "logps/rejected": -327.3497619628906, "loss": 0.6156, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8245067596435547, "rewards/margins": 1.7088887691497803, "rewards/rejected": 2.1156179904937744, "step": 72770 }, { "epoch": 3.378986953897581, "grad_norm": 46.15222930908203, "learning_rate": 9.735549468406146e-08, "logits/chosen": -19.153091430664062, "logits/rejected": -18.336732864379883, "logps/chosen": -382.9482116699219, "logps/rejected": -331.99505615234375, "loss": 0.6196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5908875465393066, "rewards/margins": 1.704338788986206, "rewards/rejected": 1.886548638343811, "step": 72780 }, { "epoch": 3.3794512280050144, "grad_norm": 54.07160949707031, "learning_rate": 9.732763823761549e-08, 
"logits/chosen": -18.84741973876953, "logits/rejected": -18.70962905883789, "logps/chosen": -370.4106140136719, "logps/rejected": -359.6181945800781, "loss": 0.8871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.319185733795166, "rewards/margins": 0.21281257271766663, "rewards/rejected": 3.1063730716705322, "step": 72790 }, { "epoch": 3.379915502112447, "grad_norm": 27.03325653076172, "learning_rate": 9.72997817911695e-08, "logits/chosen": -19.443988800048828, "logits/rejected": -18.474594116210938, "logps/chosen": -482.24658203125, "logps/rejected": -359.45428466796875, "loss": 0.5791, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6426429748535156, "rewards/margins": 0.9503129720687866, "rewards/rejected": 2.6923301219940186, "step": 72800 }, { "epoch": 3.3803797762198804, "grad_norm": 6.184692859649658, "learning_rate": 9.727192534472353e-08, "logits/chosen": -20.062572479248047, "logits/rejected": -18.902217864990234, "logps/chosen": -485.0464782714844, "logps/rejected": -406.60284423828125, "loss": 0.6726, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.30150032043457, "rewards/margins": 1.4115374088287354, "rewards/rejected": 2.889962911605835, "step": 72810 }, { "epoch": 3.380844050327313, "grad_norm": 24.507410049438477, "learning_rate": 9.724406889827754e-08, "logits/chosen": -18.145702362060547, "logits/rejected": -18.069013595581055, "logps/chosen": -309.92462158203125, "logps/rejected": -312.0125427246094, "loss": 1.0087, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.681530714035034, "rewards/margins": 0.6212888956069946, "rewards/rejected": 2.060242176055908, "step": 72820 }, { "epoch": 3.3813083244347464, "grad_norm": 80.34004974365234, "learning_rate": 9.721621245183157e-08, "logits/chosen": -19.62274169921875, "logits/rejected": -18.475505828857422, "logps/chosen": -452.4605407714844, "logps/rejected": -322.5341796875, "loss": 0.4366, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 5.4640421867370605, "rewards/margins": 2.3835830688476562, "rewards/rejected": 3.0804591178894043, "step": 72830 }, { "epoch": 3.381772598542179, "grad_norm": 40.445926666259766, "learning_rate": 9.718835600538557e-08, "logits/chosen": -18.602773666381836, "logits/rejected": -18.086986541748047, "logps/chosen": -375.9253234863281, "logps/rejected": -419.68414306640625, "loss": 1.0628, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.123353481292725, "rewards/margins": 0.4166967272758484, "rewards/rejected": 3.7066569328308105, "step": 72840 }, { "epoch": 3.3822368726496124, "grad_norm": 9.939579010009766, "learning_rate": 9.71604995589396e-08, "logits/chosen": -20.0037899017334, "logits/rejected": -18.74040412902832, "logps/chosen": -483.6107482910156, "logps/rejected": -354.8504333496094, "loss": 0.2887, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.1132707595825195, "rewards/margins": 1.772849440574646, "rewards/rejected": 3.340421199798584, "step": 72850 }, { "epoch": 3.3827011467570456, "grad_norm": 7.793989658355713, "learning_rate": 9.713264311249361e-08, "logits/chosen": -18.63410186767578, "logits/rejected": -18.525691986083984, "logps/chosen": -349.49749755859375, "logps/rejected": -375.09625244140625, "loss": 1.1806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.959287166595459, "rewards/margins": 0.558355450630188, "rewards/rejected": 2.4009320735931396, "step": 72860 }, { "epoch": 3.3831654208644784, "grad_norm": 58.19415283203125, "learning_rate": 9.710478666604763e-08, "logits/chosen": -18.190446853637695, "logits/rejected": -17.690380096435547, "logps/chosen": -379.3421325683594, "logps/rejected": -299.9880676269531, "loss": 0.3495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.007660388946533, "rewards/margins": 1.8238239288330078, "rewards/rejected": 1.183836817741394, "step": 72870 }, { "epoch": 3.3836296949719116, "grad_norm": 38.4318733215332, "learning_rate": 
9.707693021960164e-08, "logits/chosen": -18.785717010498047, "logits/rejected": -17.76259994506836, "logps/chosen": -458.1038513183594, "logps/rejected": -341.39752197265625, "loss": 0.4413, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2122459411621094, "rewards/margins": 1.6586272716522217, "rewards/rejected": 1.5536185503005981, "step": 72880 }, { "epoch": 3.3840939690793443, "grad_norm": 0.01222316361963749, "learning_rate": 9.704907377315566e-08, "logits/chosen": -19.21006202697754, "logits/rejected": -17.87618064880371, "logps/chosen": -398.69970703125, "logps/rejected": -341.5782470703125, "loss": 0.4612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.174290657043457, "rewards/margins": 1.8148977756500244, "rewards/rejected": 2.3593926429748535, "step": 72890 }, { "epoch": 3.3845582431867776, "grad_norm": 297.69189453125, "learning_rate": 9.702121732670969e-08, "logits/chosen": -18.39809799194336, "logits/rejected": -18.96139907836914, "logps/chosen": -392.45013427734375, "logps/rejected": -393.439453125, "loss": 1.2903, "rewards/accuracies": 0.5, "rewards/chosen": 3.137195348739624, "rewards/margins": -0.296896368265152, "rewards/rejected": 3.434091567993164, "step": 72900 }, { "epoch": 3.3850225172942103, "grad_norm": 40.841522216796875, "learning_rate": 9.69933608802637e-08, "logits/chosen": -18.457538604736328, "logits/rejected": -17.355133056640625, "logps/chosen": -350.47686767578125, "logps/rejected": -246.3976593017578, "loss": 0.2944, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1873621940612793, "rewards/margins": 2.3892571926116943, "rewards/rejected": 0.798105001449585, "step": 72910 }, { "epoch": 3.3854867914016435, "grad_norm": 55.84516525268555, "learning_rate": 9.696550443381773e-08, "logits/chosen": -18.73653793334961, "logits/rejected": -18.064245223999023, "logps/chosen": -354.49774169921875, "logps/rejected": -310.5274963378906, "loss": 0.6909, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.9920554161071777, "rewards/margins": 0.9908693432807922, "rewards/rejected": 2.001185894012451, "step": 72920 }, { "epoch": 3.3859510655090768, "grad_norm": 122.86774444580078, "learning_rate": 9.693764798737173e-08, "logits/chosen": -18.404502868652344, "logits/rejected": -18.01572036743164, "logps/chosen": -356.93988037109375, "logps/rejected": -272.0690002441406, "loss": 0.9769, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.099148273468018, "rewards/margins": 1.6761764287948608, "rewards/rejected": 2.422971725463867, "step": 72930 }, { "epoch": 3.3864153396165095, "grad_norm": 41.84946060180664, "learning_rate": 9.690979154092576e-08, "logits/chosen": -19.278764724731445, "logits/rejected": -18.437471389770508, "logps/chosen": -513.034912109375, "logps/rejected": -398.3659973144531, "loss": 0.5484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.562390327453613, "rewards/margins": 1.5625110864639282, "rewards/rejected": 2.999880075454712, "step": 72940 }, { "epoch": 3.3868796137239428, "grad_norm": 49.367881774902344, "learning_rate": 9.688193509447977e-08, "logits/chosen": -18.877235412597656, "logits/rejected": -18.67129898071289, "logps/chosen": -406.15557861328125, "logps/rejected": -385.80535888671875, "loss": 0.5967, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9993162155151367, "rewards/margins": 0.6473494172096252, "rewards/rejected": 2.351966619491577, "step": 72950 }, { "epoch": 3.3873438878313755, "grad_norm": 6.8236002922058105, "learning_rate": 9.68540786480338e-08, "logits/chosen": -18.409992218017578, "logits/rejected": -18.55373764038086, "logps/chosen": -483.98931884765625, "logps/rejected": -449.7933654785156, "loss": 0.6573, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.5473408699035645, "rewards/margins": 1.4591271877288818, "rewards/rejected": 4.088213920593262, "step": 72960 }, { "epoch": 3.3878081619388087, "grad_norm": 169.50375366210938, "learning_rate": 
9.682622220158781e-08, "logits/chosen": -18.736743927001953, "logits/rejected": -17.5432186126709, "logps/chosen": -497.67572021484375, "logps/rejected": -377.5145263671875, "loss": 0.8885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.1368088722229, "rewards/margins": 1.9565509557724, "rewards/rejected": 2.180257558822632, "step": 72970 }, { "epoch": 3.3882724360462415, "grad_norm": 277.164794921875, "learning_rate": 9.679836575514184e-08, "logits/chosen": -17.79159927368164, "logits/rejected": -18.208391189575195, "logps/chosen": -407.7497863769531, "logps/rejected": -371.2906494140625, "loss": 1.0677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9926624298095703, "rewards/margins": 1.2544782161712646, "rewards/rejected": 2.7381839752197266, "step": 72980 }, { "epoch": 3.3887367101536747, "grad_norm": 41.7565803527832, "learning_rate": 9.677050930869584e-08, "logits/chosen": -18.155521392822266, "logits/rejected": -18.016849517822266, "logps/chosen": -345.57293701171875, "logps/rejected": -293.7979431152344, "loss": 0.7797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5716640949249268, "rewards/margins": 1.198015809059143, "rewards/rejected": 1.373647928237915, "step": 72990 }, { "epoch": 3.389200984261108, "grad_norm": 82.35938262939453, "learning_rate": 9.674265286224987e-08, "logits/chosen": -18.923091888427734, "logits/rejected": -17.768253326416016, "logps/chosen": -480.55255126953125, "logps/rejected": -353.3081359863281, "loss": 0.7938, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5389609336853027, "rewards/margins": 1.2080204486846924, "rewards/rejected": 2.3309407234191895, "step": 73000 }, { "epoch": 3.3896652583685407, "grad_norm": 141.4737548828125, "learning_rate": 9.671479641580389e-08, "logits/chosen": -19.561124801635742, "logits/rejected": -19.135005950927734, "logps/chosen": -456.484130859375, "logps/rejected": -385.98138427734375, "loss": 0.4774, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 5.402413368225098, "rewards/margins": 1.4992334842681885, "rewards/rejected": 3.90317964553833, "step": 73010 }, { "epoch": 3.390129532475974, "grad_norm": 9.737262725830078, "learning_rate": 9.668693996935791e-08, "logits/chosen": -18.26922035217285, "logits/rejected": -17.124168395996094, "logps/chosen": -387.5157775878906, "logps/rejected": -266.6254577636719, "loss": 0.1822, "rewards/accuracies": 1.0, "rewards/chosen": 3.7745747566223145, "rewards/margins": 2.1748733520507812, "rewards/rejected": 1.599701166152954, "step": 73020 }, { "epoch": 3.3905938065834067, "grad_norm": 23.0153751373291, "learning_rate": 9.665908352291193e-08, "logits/chosen": -18.875459671020508, "logits/rejected": -18.231210708618164, "logps/chosen": -395.15570068359375, "logps/rejected": -328.09930419921875, "loss": 0.9473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.230544328689575, "rewards/margins": 0.911822497844696, "rewards/rejected": 2.3187220096588135, "step": 73030 }, { "epoch": 3.39105808069084, "grad_norm": 108.1813735961914, "learning_rate": 9.663122707646596e-08, "logits/chosen": -18.80430793762207, "logits/rejected": -18.194473266601562, "logps/chosen": -356.545654296875, "logps/rejected": -282.3373718261719, "loss": 0.6576, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1106526851654053, "rewards/margins": 1.5793687105178833, "rewards/rejected": 1.5312840938568115, "step": 73040 }, { "epoch": 3.3915223547982727, "grad_norm": 26.635398864746094, "learning_rate": 9.660337063001996e-08, "logits/chosen": -19.45853042602539, "logits/rejected": -18.34108543395996, "logps/chosen": -424.4712829589844, "logps/rejected": -300.1932067871094, "loss": 0.3349, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2287545204162598, "rewards/margins": 1.0954437255859375, "rewards/rejected": 2.1333110332489014, "step": 73050 }, { "epoch": 3.391986628905706, "grad_norm": 3.409355401992798, "learning_rate": 
9.657551418357397e-08, "logits/chosen": -19.357397079467773, "logits/rejected": -18.70437240600586, "logps/chosen": -446.323486328125, "logps/rejected": -403.1991271972656, "loss": 0.5392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.254286766052246, "rewards/margins": 1.532209038734436, "rewards/rejected": 2.7220773696899414, "step": 73060 }, { "epoch": 3.392450903013139, "grad_norm": 53.580650329589844, "learning_rate": 9.6547657737128e-08, "logits/chosen": -19.907367706298828, "logits/rejected": -18.780778884887695, "logps/chosen": -301.2123107910156, "logps/rejected": -266.16552734375, "loss": 0.6059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6244261264801025, "rewards/margins": 1.206886887550354, "rewards/rejected": 1.417539358139038, "step": 73070 }, { "epoch": 3.392915177120572, "grad_norm": 143.63845825195312, "learning_rate": 9.651980129068201e-08, "logits/chosen": -20.364694595336914, "logits/rejected": -19.34331512451172, "logps/chosen": -442.572021484375, "logps/rejected": -339.1083984375, "loss": 0.5713, "rewards/accuracies": 0.5, "rewards/chosen": 4.5865397453308105, "rewards/margins": 1.499106764793396, "rewards/rejected": 3.087432861328125, "step": 73080 }, { "epoch": 3.393379451228005, "grad_norm": 249.1678466796875, "learning_rate": 9.649194484423603e-08, "logits/chosen": -19.008525848388672, "logits/rejected": -18.64571762084961, "logps/chosen": -351.6445617675781, "logps/rejected": -389.95159912109375, "loss": 1.0153, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.632817268371582, "rewards/margins": 0.23131532967090607, "rewards/rejected": 2.4015016555786133, "step": 73090 }, { "epoch": 3.393843725335438, "grad_norm": 0.7927237153053284, "learning_rate": 9.646408839779004e-08, "logits/chosen": -18.712766647338867, "logits/rejected": -17.953134536743164, "logps/chosen": -407.0962219238281, "logps/rejected": -290.70062255859375, "loss": 0.3082, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 2.7848825454711914, "rewards/margins": 1.7366453409194946, "rewards/rejected": 1.0482372045516968, "step": 73100 }, { "epoch": 3.394307999442871, "grad_norm": 7.742391109466553, "learning_rate": 9.643623195134407e-08, "logits/chosen": -19.222288131713867, "logits/rejected": -17.951847076416016, "logps/chosen": -438.18682861328125, "logps/rejected": -305.6899108886719, "loss": 0.7672, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.801352500915527, "rewards/margins": 2.1594271659851074, "rewards/rejected": 2.641925573348999, "step": 73110 }, { "epoch": 3.394772273550304, "grad_norm": 22.040374755859375, "learning_rate": 9.640837550489808e-08, "logits/chosen": -18.49675178527832, "logits/rejected": -18.216495513916016, "logps/chosen": -429.1910095214844, "logps/rejected": -308.30078125, "loss": 1.041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.082684516906738, "rewards/margins": 1.4721449613571167, "rewards/rejected": 2.6105399131774902, "step": 73120 }, { "epoch": 3.395236547657737, "grad_norm": 1.65468168258667, "learning_rate": 9.638051905845211e-08, "logits/chosen": -19.615814208984375, "logits/rejected": -18.942317962646484, "logps/chosen": -420.86553955078125, "logps/rejected": -294.9613037109375, "loss": 0.387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4910881519317627, "rewards/margins": 1.42795729637146, "rewards/rejected": 2.0631303787231445, "step": 73130 }, { "epoch": 3.3957008217651703, "grad_norm": 15.493868827819824, "learning_rate": 9.635266261200611e-08, "logits/chosen": -19.27163314819336, "logits/rejected": -19.064172744750977, "logps/chosen": -446.3037109375, "logps/rejected": -396.6205139160156, "loss": 0.7599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.373556613922119, "rewards/margins": 0.5222827792167664, "rewards/rejected": 3.851273775100708, "step": 73140 }, { "epoch": 3.396165095872603, "grad_norm": 10.788830757141113, "learning_rate": 
9.632480616556014e-08, "logits/chosen": -19.461349487304688, "logits/rejected": -17.62518882751465, "logps/chosen": -462.229736328125, "logps/rejected": -241.036865234375, "loss": 0.3851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.878237247467041, "rewards/margins": 2.389791965484619, "rewards/rejected": 1.488445520401001, "step": 73150 }, { "epoch": 3.3966293699800363, "grad_norm": 77.23614501953125, "learning_rate": 9.629694971911416e-08, "logits/chosen": -18.456287384033203, "logits/rejected": -18.078655242919922, "logps/chosen": -361.5171203613281, "logps/rejected": -319.0900573730469, "loss": 0.4228, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.512044906616211, "rewards/margins": 1.2940692901611328, "rewards/rejected": 2.217975616455078, "step": 73160 }, { "epoch": 3.397093644087469, "grad_norm": 0.763015866279602, "learning_rate": 9.626909327266818e-08, "logits/chosen": -18.674898147583008, "logits/rejected": -18.337982177734375, "logps/chosen": -360.3914794921875, "logps/rejected": -271.5555419921875, "loss": 0.4226, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5010509490966797, "rewards/margins": 1.5396881103515625, "rewards/rejected": 1.9613628387451172, "step": 73170 }, { "epoch": 3.3975579181949023, "grad_norm": 123.54024505615234, "learning_rate": 9.62412368262222e-08, "logits/chosen": -18.262765884399414, "logits/rejected": -18.548940658569336, "logps/chosen": -344.10382080078125, "logps/rejected": -336.31646728515625, "loss": 0.9302, "rewards/accuracies": 0.5, "rewards/chosen": 2.965857982635498, "rewards/margins": -0.019518796354532242, "rewards/rejected": 2.9853768348693848, "step": 73180 }, { "epoch": 3.3980221923023355, "grad_norm": 54.24452209472656, "learning_rate": 9.621338037977623e-08, "logits/chosen": -19.583402633666992, "logits/rejected": -19.249378204345703, "logps/chosen": -418.17266845703125, "logps/rejected": -344.1181640625, "loss": 0.8676, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 4.064000129699707, "rewards/margins": 1.099311113357544, "rewards/rejected": 2.964688777923584, "step": 73190 }, { "epoch": 3.3984864664097683, "grad_norm": 6.963249206542969, "learning_rate": 9.618552393333023e-08, "logits/chosen": -19.205066680908203, "logits/rejected": -18.40780258178711, "logps/chosen": -316.5052795410156, "logps/rejected": -244.29049682617188, "loss": 0.5711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6055214405059814, "rewards/margins": 1.1530312299728394, "rewards/rejected": 1.4524898529052734, "step": 73200 }, { "epoch": 3.3989507405172015, "grad_norm": 0.29616832733154297, "learning_rate": 9.615766748688426e-08, "logits/chosen": -19.568002700805664, "logits/rejected": -18.492259979248047, "logps/chosen": -325.5697937011719, "logps/rejected": -285.4354248046875, "loss": 0.8635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.625473737716675, "rewards/margins": 0.7154949307441711, "rewards/rejected": 1.9099788665771484, "step": 73210 }, { "epoch": 3.3994150146246342, "grad_norm": 0.11578952521085739, "learning_rate": 9.612981104043827e-08, "logits/chosen": -19.15428352355957, "logits/rejected": -18.03488540649414, "logps/chosen": -453.38775634765625, "logps/rejected": -325.8689270019531, "loss": 0.7901, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8527324199676514, "rewards/margins": 1.7630847692489624, "rewards/rejected": 2.0896480083465576, "step": 73220 }, { "epoch": 3.3998792887320675, "grad_norm": 147.26876831054688, "learning_rate": 9.61019545939923e-08, "logits/chosen": -18.904285430908203, "logits/rejected": -18.86968421936035, "logps/chosen": -428.46826171875, "logps/rejected": -352.9455261230469, "loss": 0.6115, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.340007305145264, "rewards/margins": 1.450634241104126, "rewards/rejected": 2.8893730640411377, "step": 73230 }, { "epoch": 3.4003435628395007, "grad_norm": 47.876983642578125, "learning_rate": 
9.607409814754631e-08, "logits/chosen": -18.84415054321289, "logits/rejected": -18.01719093322754, "logps/chosen": -558.19775390625, "logps/rejected": -351.20574951171875, "loss": 0.2801, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.524905204772949, "rewards/margins": 2.484741449356079, "rewards/rejected": 2.0401644706726074, "step": 73240 }, { "epoch": 3.4008078369469334, "grad_norm": 130.6534881591797, "learning_rate": 9.604624170110034e-08, "logits/chosen": -18.21231460571289, "logits/rejected": -17.74091339111328, "logps/chosen": -347.88677978515625, "logps/rejected": -285.55145263671875, "loss": 0.8847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.028870105743408, "rewards/margins": 1.1646918058395386, "rewards/rejected": 1.8641780614852905, "step": 73250 }, { "epoch": 3.4012721110543667, "grad_norm": 61.2845344543457, "learning_rate": 9.601838525465434e-08, "logits/chosen": -19.608814239501953, "logits/rejected": -18.619142532348633, "logps/chosen": -420.81927490234375, "logps/rejected": -290.32293701171875, "loss": 0.3044, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.218541145324707, "rewards/margins": 2.521512985229492, "rewards/rejected": 1.6970279216766357, "step": 73260 }, { "epoch": 3.4017363851617994, "grad_norm": 19.030786514282227, "learning_rate": 9.599052880820836e-08, "logits/chosen": -18.64834213256836, "logits/rejected": -17.515323638916016, "logps/chosen": -340.015380859375, "logps/rejected": -230.3793487548828, "loss": 0.484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4098708629608154, "rewards/margins": 1.0931330919265747, "rewards/rejected": 1.3167381286621094, "step": 73270 }, { "epoch": 3.4022006592692327, "grad_norm": 1.97889244556427, "learning_rate": 9.596267236176238e-08, "logits/chosen": -18.811304092407227, "logits/rejected": -18.115251541137695, "logps/chosen": -391.45513916015625, "logps/rejected": -320.6355895996094, "loss": 0.6301, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.3450825214385986, "rewards/margins": 1.017458200454712, "rewards/rejected": 2.3276238441467285, "step": 73280 }, { "epoch": 3.4026649333766654, "grad_norm": 90.57068634033203, "learning_rate": 9.59348159153164e-08, "logits/chosen": -18.83660125732422, "logits/rejected": -18.38608169555664, "logps/chosen": -433.7955627441406, "logps/rejected": -331.7798767089844, "loss": 0.6754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3829116821289062, "rewards/margins": 0.6053749918937683, "rewards/rejected": 2.777536630630493, "step": 73290 }, { "epoch": 3.4031292074840986, "grad_norm": 16.209218978881836, "learning_rate": 9.590695946887041e-08, "logits/chosen": -18.90555191040039, "logits/rejected": -18.32008934020996, "logps/chosen": -323.82965087890625, "logps/rejected": -325.01220703125, "loss": 1.0665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8953213691711426, "rewards/margins": 0.8235618472099304, "rewards/rejected": 2.0717592239379883, "step": 73300 }, { "epoch": 3.403593481591532, "grad_norm": 34.36738967895508, "learning_rate": 9.587910302242443e-08, "logits/chosen": -17.71830940246582, "logits/rejected": -17.888092041015625, "logps/chosen": -294.837158203125, "logps/rejected": -294.22210693359375, "loss": 0.9097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2305476665496826, "rewards/margins": 0.18469054996967316, "rewards/rejected": 2.0458571910858154, "step": 73310 }, { "epoch": 3.4040577556989646, "grad_norm": 118.38207244873047, "learning_rate": 9.585124657597845e-08, "logits/chosen": -18.73501205444336, "logits/rejected": -18.431180953979492, "logps/chosen": -291.62060546875, "logps/rejected": -226.32894897460938, "loss": 0.8779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.353959560394287, "rewards/margins": 1.052107334136963, "rewards/rejected": 1.3018522262573242, "step": 73320 }, { "epoch": 3.404522029806398, "grad_norm": 14.664664268493652, 
"learning_rate": 9.582339012953247e-08, "logits/chosen": -19.854219436645508, "logits/rejected": -19.07845687866211, "logps/chosen": -514.2218017578125, "logps/rejected": -374.56585693359375, "loss": 0.3998, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.759557247161865, "rewards/margins": 2.178743839263916, "rewards/rejected": 2.580812931060791, "step": 73330 }, { "epoch": 3.4049863039138306, "grad_norm": 0.2686227262020111, "learning_rate": 9.57955336830865e-08, "logits/chosen": -18.542844772338867, "logits/rejected": -17.201589584350586, "logps/chosen": -517.7425537109375, "logps/rejected": -361.13848876953125, "loss": 0.3341, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.865040302276611, "rewards/margins": 2.4639389514923096, "rewards/rejected": 2.40110182762146, "step": 73340 }, { "epoch": 3.405450578021264, "grad_norm": 5.692905902862549, "learning_rate": 9.57676772366405e-08, "logits/chosen": -19.344728469848633, "logits/rejected": -18.207754135131836, "logps/chosen": -361.339111328125, "logps/rejected": -311.97509765625, "loss": 0.672, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.254591941833496, "rewards/margins": 1.5113227367401123, "rewards/rejected": 2.743269443511963, "step": 73350 }, { "epoch": 3.4059148521286966, "grad_norm": 164.2137908935547, "learning_rate": 9.573982079019453e-08, "logits/chosen": -19.31402015686035, "logits/rejected": -18.84962272644043, "logps/chosen": -337.6562805175781, "logps/rejected": -279.90533447265625, "loss": 0.4669, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4618239402770996, "rewards/margins": 1.5530030727386475, "rewards/rejected": 1.9088207483291626, "step": 73360 }, { "epoch": 3.40637912623613, "grad_norm": 20.35358428955078, "learning_rate": 9.571196434374854e-08, "logits/chosen": -20.019166946411133, "logits/rejected": -19.250579833984375, "logps/chosen": -430.13507080078125, "logps/rejected": -340.0008239746094, "loss": 0.1949, "rewards/accuracies": 
1.0, "rewards/chosen": 5.273393154144287, "rewards/margins": 2.506511688232422, "rewards/rejected": 2.7668814659118652, "step": 73370 }, { "epoch": 3.406843400343563, "grad_norm": 2.8690567016601562, "learning_rate": 9.568410789730257e-08, "logits/chosen": -19.521976470947266, "logits/rejected": -18.50888442993164, "logps/chosen": -354.5380554199219, "logps/rejected": -259.3017883300781, "loss": 0.5914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.142977237701416, "rewards/margins": 0.9534417986869812, "rewards/rejected": 1.18953537940979, "step": 73380 }, { "epoch": 3.407307674450996, "grad_norm": 11.130080223083496, "learning_rate": 9.565625145085658e-08, "logits/chosen": -18.08552360534668, "logits/rejected": -17.60426139831543, "logps/chosen": -389.44305419921875, "logps/rejected": -337.1923828125, "loss": 0.6715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.782606601715088, "rewards/margins": 1.1749927997589111, "rewards/rejected": 3.6076138019561768, "step": 73390 }, { "epoch": 3.407771948558429, "grad_norm": 54.31007766723633, "learning_rate": 9.562839500441061e-08, "logits/chosen": -19.742935180664062, "logits/rejected": -19.10331916809082, "logps/chosen": -361.03363037109375, "logps/rejected": -292.33465576171875, "loss": 0.479, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.962409257888794, "rewards/margins": 1.3158719539642334, "rewards/rejected": 1.64653742313385, "step": 73400 }, { "epoch": 3.408236222665862, "grad_norm": 3.2581419944763184, "learning_rate": 9.560053855796461e-08, "logits/chosen": -18.948848724365234, "logits/rejected": -18.145191192626953, "logps/chosen": -375.3451232910156, "logps/rejected": -241.44937133789062, "loss": 0.8196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6832923889160156, "rewards/margins": 1.4830373525619507, "rewards/rejected": 2.2002546787261963, "step": 73410 }, { "epoch": 3.408700496773295, "grad_norm": 164.28347778320312, "learning_rate": 
9.557268211151864e-08, "logits/chosen": -18.890445709228516, "logits/rejected": -18.013669967651367, "logps/chosen": -391.67999267578125, "logps/rejected": -321.5133056640625, "loss": 0.2998, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.736940383911133, "rewards/margins": 2.0067126750946045, "rewards/rejected": 1.730228066444397, "step": 73420 }, { "epoch": 3.409164770880728, "grad_norm": 81.82637023925781, "learning_rate": 9.554482566507265e-08, "logits/chosen": -18.17519760131836, "logits/rejected": -18.482147216796875, "logps/chosen": -394.34881591796875, "logps/rejected": -454.1238708496094, "loss": 1.5525, "rewards/accuracies": 0.5, "rewards/chosen": 2.4857070446014404, "rewards/margins": -0.6489994525909424, "rewards/rejected": 3.134706974029541, "step": 73430 }, { "epoch": 3.409629044988161, "grad_norm": 32.25173568725586, "learning_rate": 9.551696921862668e-08, "logits/chosen": -18.607318878173828, "logits/rejected": -17.95779037475586, "logps/chosen": -397.4571228027344, "logps/rejected": -307.8142395019531, "loss": 0.4172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.727973699569702, "rewards/margins": 1.527954339981079, "rewards/rejected": 1.2000194787979126, "step": 73440 }, { "epoch": 3.410093319095594, "grad_norm": 210.2046661376953, "learning_rate": 9.54891127721807e-08, "logits/chosen": -18.297727584838867, "logits/rejected": -18.260833740234375, "logps/chosen": -312.42242431640625, "logps/rejected": -298.33270263671875, "loss": 0.8421, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5139670372009277, "rewards/margins": 0.7687468528747559, "rewards/rejected": 1.745220422744751, "step": 73450 }, { "epoch": 3.410557593203027, "grad_norm": 89.55500793457031, "learning_rate": 9.54612563257347e-08, "logits/chosen": -19.291107177734375, "logits/rejected": -19.273910522460938, "logps/chosen": -359.20721435546875, "logps/rejected": -366.8023681640625, "loss": 0.6787, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 4.323406219482422, "rewards/margins": 0.8579046130180359, "rewards/rejected": 3.4655017852783203, "step": 73460 }, { "epoch": 3.41102186731046, "grad_norm": 111.65399169921875, "learning_rate": 9.543339987928873e-08, "logits/chosen": -20.367420196533203, "logits/rejected": -19.85944938659668, "logps/chosen": -439.8160705566406, "logps/rejected": -291.4198913574219, "loss": 0.5558, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.361438751220703, "rewards/margins": 1.4385722875595093, "rewards/rejected": 1.9228661060333252, "step": 73470 }, { "epoch": 3.411486141417893, "grad_norm": 195.28384399414062, "learning_rate": 9.540554343284274e-08, "logits/chosen": -19.704940795898438, "logits/rejected": -18.78549575805664, "logps/chosen": -368.08380126953125, "logps/rejected": -326.73321533203125, "loss": 0.4971, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.677459716796875, "rewards/margins": 1.3144654035568237, "rewards/rejected": 2.3629939556121826, "step": 73480 }, { "epoch": 3.411950415525326, "grad_norm": 217.25656127929688, "learning_rate": 9.537768698639677e-08, "logits/chosen": -18.481592178344727, "logits/rejected": -18.350648880004883, "logps/chosen": -416.15380859375, "logps/rejected": -369.56536865234375, "loss": 0.9916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.927791118621826, "rewards/margins": 0.08553914725780487, "rewards/rejected": 2.842252016067505, "step": 73490 }, { "epoch": 3.412414689632759, "grad_norm": 17.262725830078125, "learning_rate": 9.534983053995078e-08, "logits/chosen": -19.995288848876953, "logits/rejected": -19.534950256347656, "logps/chosen": -310.1794738769531, "logps/rejected": -327.08245849609375, "loss": 1.2807, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6233630180358887, "rewards/margins": 0.23838531970977783, "rewards/rejected": 2.3849778175354004, "step": 73500 }, { "epoch": 3.412878963740192, "grad_norm": 249.46556091308594, "learning_rate": 
9.53219740935048e-08, "logits/chosen": -19.47098159790039, "logits/rejected": -19.295154571533203, "logps/chosen": -434.4072265625, "logps/rejected": -451.73956298828125, "loss": 1.0028, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.127049446105957, "rewards/margins": 0.44984301924705505, "rewards/rejected": 3.677206516265869, "step": 73510 }, { "epoch": 3.4133432378476254, "grad_norm": 16.653331756591797, "learning_rate": 9.529411764705881e-08, "logits/chosen": -19.328908920288086, "logits/rejected": -18.32174301147461, "logps/chosen": -473.2803649902344, "logps/rejected": -347.7075500488281, "loss": 0.3739, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.8298540115356445, "rewards/margins": 2.5202317237854004, "rewards/rejected": 2.3096230030059814, "step": 73520 }, { "epoch": 3.413807511955058, "grad_norm": 28.152278900146484, "learning_rate": 9.526626120061284e-08, "logits/chosen": -19.454782485961914, "logits/rejected": -18.658618927001953, "logps/chosen": -436.3798828125, "logps/rejected": -321.48907470703125, "loss": 0.4587, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.265690565109253, "rewards/margins": 1.5114909410476685, "rewards/rejected": 1.7541996240615845, "step": 73530 }, { "epoch": 3.4142717860624914, "grad_norm": 76.38368225097656, "learning_rate": 9.523840475416685e-08, "logits/chosen": -19.164085388183594, "logits/rejected": -18.346477508544922, "logps/chosen": -375.46875, "logps/rejected": -295.48699951171875, "loss": 0.7214, "rewards/accuracies": 0.5, "rewards/chosen": 2.594703197479248, "rewards/margins": 0.564689040184021, "rewards/rejected": 2.0300140380859375, "step": 73540 }, { "epoch": 3.414736060169924, "grad_norm": 2.4238669872283936, "learning_rate": 9.521054830772088e-08, "logits/chosen": -19.257343292236328, "logits/rejected": -18.717296600341797, "logps/chosen": -402.5129699707031, "logps/rejected": -338.6608581542969, "loss": 0.9138, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 3.6820228099823, "rewards/margins": 1.0342295169830322, "rewards/rejected": 2.6477935314178467, "step": 73550 }, { "epoch": 3.4152003342773574, "grad_norm": 195.66937255859375, "learning_rate": 9.518269186127488e-08, "logits/chosen": -18.996097564697266, "logits/rejected": -19.702146530151367, "logps/chosen": -354.598876953125, "logps/rejected": -374.6372375488281, "loss": 1.2205, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.999253273010254, "rewards/margins": 0.030660057440400124, "rewards/rejected": 2.9685935974121094, "step": 73560 }, { "epoch": 3.41566460838479, "grad_norm": 112.93394470214844, "learning_rate": 9.515483541482891e-08, "logits/chosen": -18.590368270874023, "logits/rejected": -18.622339248657227, "logps/chosen": -292.81268310546875, "logps/rejected": -295.19488525390625, "loss": 0.8596, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6042282581329346, "rewards/margins": 0.44715404510498047, "rewards/rejected": 2.157073974609375, "step": 73570 }, { "epoch": 3.4161288824922234, "grad_norm": 7.761539936065674, "learning_rate": 9.512697896838293e-08, "logits/chosen": -19.785442352294922, "logits/rejected": -17.810762405395508, "logps/chosen": -278.96466064453125, "logps/rejected": -195.76174926757812, "loss": 0.2458, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8017706871032715, "rewards/margins": 2.216071367263794, "rewards/rejected": 0.5856992602348328, "step": 73580 }, { "epoch": 3.4165931565996566, "grad_norm": 102.4103012084961, "learning_rate": 9.509912252193695e-08, "logits/chosen": -19.66811752319336, "logits/rejected": -18.679655075073242, "logps/chosen": -459.96588134765625, "logps/rejected": -284.16229248046875, "loss": 0.6956, "rewards/accuracies": 0.5, "rewards/chosen": 3.371826171875, "rewards/margins": 0.8777586817741394, "rewards/rejected": 2.4940671920776367, "step": 73590 }, { "epoch": 3.4170574307070893, "grad_norm": 70.07958984375, "learning_rate": 
9.507126607549097e-08, "logits/chosen": -20.131671905517578, "logits/rejected": -18.61798858642578, "logps/chosen": -373.82244873046875, "logps/rejected": -278.88861083984375, "loss": 0.3985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.199911594390869, "rewards/margins": 2.4686129093170166, "rewards/rejected": 1.7312986850738525, "step": 73600 }, { "epoch": 3.4175217048145226, "grad_norm": 2.8334453105926514, "learning_rate": 9.5043409629045e-08, "logits/chosen": -20.025480270385742, "logits/rejected": -19.06351661682129, "logps/chosen": -345.5538635253906, "logps/rejected": -297.062744140625, "loss": 0.5874, "rewards/accuracies": 0.5, "rewards/chosen": 2.865705966949463, "rewards/margins": 1.293673038482666, "rewards/rejected": 1.5720332860946655, "step": 73610 }, { "epoch": 3.4179859789219558, "grad_norm": 53.09263229370117, "learning_rate": 9.5015553182599e-08, "logits/chosen": -18.313570022583008, "logits/rejected": -17.502384185791016, "logps/chosen": -300.1444396972656, "logps/rejected": -240.0353546142578, "loss": 0.6327, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9286272525787354, "rewards/margins": 1.0145509243011475, "rewards/rejected": 0.9140761494636536, "step": 73620 }, { "epoch": 3.4184502530293885, "grad_norm": 0.02826497331261635, "learning_rate": 9.498769673615302e-08, "logits/chosen": -18.77988052368164, "logits/rejected": -17.146282196044922, "logps/chosen": -425.34356689453125, "logps/rejected": -278.484619140625, "loss": 0.2747, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.767239809036255, "rewards/margins": 2.3046326637268066, "rewards/rejected": 1.4626071453094482, "step": 73630 }, { "epoch": 3.4189145271368218, "grad_norm": 23.90345001220703, "learning_rate": 9.495984028970704e-08, "logits/chosen": -19.60602378845215, "logits/rejected": -18.063629150390625, "logps/chosen": -483.6407775878906, "logps/rejected": -322.4224548339844, "loss": 0.5295, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 4.639983177185059, "rewards/margins": 1.7995914220809937, "rewards/rejected": 2.8403921127319336, "step": 73640 }, { "epoch": 3.4193788012442545, "grad_norm": 132.74569702148438, "learning_rate": 9.493198384326107e-08, "logits/chosen": -19.88352394104004, "logits/rejected": -18.879873275756836, "logps/chosen": -474.18670654296875, "logps/rejected": -382.7395935058594, "loss": 0.4708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.641438961029053, "rewards/margins": 1.408180832862854, "rewards/rejected": 3.23325777053833, "step": 73650 }, { "epoch": 3.4198430753516877, "grad_norm": 24.17684555053711, "learning_rate": 9.490412739681508e-08, "logits/chosen": -19.39445686340332, "logits/rejected": -19.040321350097656, "logps/chosen": -408.66925048828125, "logps/rejected": -360.7250061035156, "loss": 0.5064, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.39038348197937, "rewards/margins": 1.1273680925369263, "rewards/rejected": 2.2630155086517334, "step": 73660 }, { "epoch": 3.4203073494591205, "grad_norm": 58.20598602294922, "learning_rate": 9.487627095036908e-08, "logits/chosen": -19.7376766204834, "logits/rejected": -18.805038452148438, "logps/chosen": -428.23333740234375, "logps/rejected": -330.7357177734375, "loss": 0.536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.472330093383789, "rewards/margins": 1.0486879348754883, "rewards/rejected": 3.42364239692688, "step": 73670 }, { "epoch": 3.4207716235665537, "grad_norm": 211.51095581054688, "learning_rate": 9.484841450392311e-08, "logits/chosen": -18.814634323120117, "logits/rejected": -19.817195892333984, "logps/chosen": -345.3880920410156, "logps/rejected": -459.48162841796875, "loss": 1.5327, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.199547290802002, "rewards/margins": -0.5605889558792114, "rewards/rejected": 3.7601356506347656, "step": 73680 }, { "epoch": 3.421235897673987, "grad_norm": 4.13339900970459, "learning_rate": 
9.482055805747713e-08, "logits/chosen": -19.31801986694336, "logits/rejected": -17.843914031982422, "logps/chosen": -518.2064208984375, "logps/rejected": -353.35211181640625, "loss": 0.2427, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.397218704223633, "rewards/margins": 2.306516170501709, "rewards/rejected": 2.090702533721924, "step": 73690 }, { "epoch": 3.4217001717814197, "grad_norm": 155.60533142089844, "learning_rate": 9.479270161103115e-08, "logits/chosen": -18.641260147094727, "logits/rejected": -18.443424224853516, "logps/chosen": -387.04144287109375, "logps/rejected": -420.07305908203125, "loss": 1.042, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6550300121307373, "rewards/margins": 0.4314054548740387, "rewards/rejected": 3.2236247062683105, "step": 73700 }, { "epoch": 3.422164445888853, "grad_norm": 2.3690552711486816, "learning_rate": 9.476484516458517e-08, "logits/chosen": -19.723867416381836, "logits/rejected": -19.322818756103516, "logps/chosen": -428.9859924316406, "logps/rejected": -354.8067626953125, "loss": 0.2551, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.06111478805542, "rewards/margins": 1.7229000329971313, "rewards/rejected": 3.33821439743042, "step": 73710 }, { "epoch": 3.4226287199962857, "grad_norm": 1.0521231889724731, "learning_rate": 9.473698871813918e-08, "logits/chosen": -20.446374893188477, "logits/rejected": -18.482908248901367, "logps/chosen": -272.49114990234375, "logps/rejected": -202.75234985351562, "loss": 0.6072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.202836751937866, "rewards/margins": 1.431714653968811, "rewards/rejected": 0.7711222767829895, "step": 73720 }, { "epoch": 3.423092994103719, "grad_norm": 87.89297485351562, "learning_rate": 9.47091322716932e-08, "logits/chosen": -18.305036544799805, "logits/rejected": -18.210067749023438, "logps/chosen": -467.77117919921875, "logps/rejected": -422.5514221191406, "loss": 1.1069, "rewards/accuracies": 0.5, 
"rewards/chosen": 3.0421242713928223, "rewards/margins": 0.20954561233520508, "rewards/rejected": 2.832578659057617, "step": 73730 }, { "epoch": 3.4235572682111517, "grad_norm": 106.14781188964844, "learning_rate": 9.468127582524722e-08, "logits/chosen": -17.749860763549805, "logits/rejected": -17.72806739807129, "logps/chosen": -354.8742980957031, "logps/rejected": -360.8799743652344, "loss": 1.4331, "rewards/accuracies": 0.5, "rewards/chosen": 2.505171775817871, "rewards/margins": 0.31626805663108826, "rewards/rejected": 2.188903331756592, "step": 73740 }, { "epoch": 3.424021542318585, "grad_norm": 13.562076568603516, "learning_rate": 9.465341937880124e-08, "logits/chosen": -19.861337661743164, "logits/rejected": -19.016845703125, "logps/chosen": -369.1011047363281, "logps/rejected": -310.8079833984375, "loss": 0.3795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3826770782470703, "rewards/margins": 1.583016037940979, "rewards/rejected": 1.7996610403060913, "step": 73750 }, { "epoch": 3.424485816426018, "grad_norm": 0.9350749850273132, "learning_rate": 9.462556293235527e-08, "logits/chosen": -18.68227195739746, "logits/rejected": -17.374238967895508, "logps/chosen": -466.5420837402344, "logps/rejected": -274.0148620605469, "loss": 0.2892, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.3940911293029785, "rewards/margins": 2.745047092437744, "rewards/rejected": 1.6490437984466553, "step": 73760 }, { "epoch": 3.424950090533451, "grad_norm": 3.635408401489258, "learning_rate": 9.459770648590927e-08, "logits/chosen": -20.076536178588867, "logits/rejected": -18.684682846069336, "logps/chosen": -437.833740234375, "logps/rejected": -340.19476318359375, "loss": 0.3689, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.44780969619751, "rewards/margins": 1.7109874486923218, "rewards/rejected": 2.7368226051330566, "step": 73770 }, { "epoch": 3.425414364640884, "grad_norm": 157.22792053222656, "learning_rate": 9.45698500394633e-08, 
"logits/chosen": -18.794567108154297, "logits/rejected": -17.585309982299805, "logps/chosen": -325.549560546875, "logps/rejected": -216.79660034179688, "loss": 0.6077, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.345597743988037, "rewards/margins": 1.7748905420303345, "rewards/rejected": 1.5707075595855713, "step": 73780 }, { "epoch": 3.425878638748317, "grad_norm": 258.0377197265625, "learning_rate": 9.454199359301731e-08, "logits/chosen": -19.06212615966797, "logits/rejected": -18.541400909423828, "logps/chosen": -376.37213134765625, "logps/rejected": -389.1819763183594, "loss": 1.0638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6469593048095703, "rewards/margins": 1.3993942737579346, "rewards/rejected": 2.247565269470215, "step": 73790 }, { "epoch": 3.42634291285575, "grad_norm": 29.15030288696289, "learning_rate": 9.451413714657134e-08, "logits/chosen": -19.625608444213867, "logits/rejected": -18.56458854675293, "logps/chosen": -425.3042907714844, "logps/rejected": -374.1268615722656, "loss": 1.2767, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.229285717010498, "rewards/margins": 1.066317081451416, "rewards/rejected": 3.162968635559082, "step": 73800 }, { "epoch": 3.426807186963183, "grad_norm": 78.50345611572266, "learning_rate": 9.448628070012535e-08, "logits/chosen": -18.482328414916992, "logits/rejected": -17.71343994140625, "logps/chosen": -356.79461669921875, "logps/rejected": -273.4908447265625, "loss": 0.501, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8982443809509277, "rewards/margins": 1.475669503211975, "rewards/rejected": 1.4225749969482422, "step": 73810 }, { "epoch": 3.427271461070616, "grad_norm": 7.630611896514893, "learning_rate": 9.445842425367938e-08, "logits/chosen": -18.224323272705078, "logits/rejected": -18.188182830810547, "logps/chosen": -335.1031188964844, "logps/rejected": -318.00579833984375, "loss": 0.3816, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 2.5268607139587402, "rewards/margins": 1.4507191181182861, "rewards/rejected": 1.0761417150497437, "step": 73820 }, { "epoch": 3.4277357351780493, "grad_norm": 45.70285415649414, "learning_rate": 9.443056780723338e-08, "logits/chosen": -18.211668014526367, "logits/rejected": -18.32895851135254, "logps/chosen": -307.046142578125, "logps/rejected": -285.26324462890625, "loss": 0.6835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.40653133392334, "rewards/margins": 0.4950936436653137, "rewards/rejected": 1.911437749862671, "step": 73830 }, { "epoch": 3.428200009285482, "grad_norm": 0.012191691435873508, "learning_rate": 9.440271136078741e-08, "logits/chosen": -19.295522689819336, "logits/rejected": -19.086193084716797, "logps/chosen": -504.7052307128906, "logps/rejected": -459.79998779296875, "loss": 1.2343, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.209289073944092, "rewards/margins": 0.7061083912849426, "rewards/rejected": 3.503180742263794, "step": 73840 }, { "epoch": 3.4286642833929153, "grad_norm": 198.69552612304688, "learning_rate": 9.437485491434142e-08, "logits/chosen": -18.91765785217285, "logits/rejected": -18.97571563720703, "logps/chosen": -424.92352294921875, "logps/rejected": -387.42333984375, "loss": 0.6766, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.399996280670166, "rewards/margins": 0.6004346013069153, "rewards/rejected": 2.7995619773864746, "step": 73850 }, { "epoch": 3.429128557500348, "grad_norm": 19.943927764892578, "learning_rate": 9.434699846789544e-08, "logits/chosen": -20.28263282775879, "logits/rejected": -19.591188430786133, "logps/chosen": -453.93170166015625, "logps/rejected": -411.373046875, "loss": 1.6049, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.012110233306885, "rewards/margins": 0.6256572008132935, "rewards/rejected": 3.3864529132843018, "step": 73860 }, { "epoch": 3.4295928316077813, "grad_norm": 174.7500762939453, "learning_rate": 
9.431914202144947e-08, "logits/chosen": -18.792156219482422, "logits/rejected": -18.21098518371582, "logps/chosen": -370.1923828125, "logps/rejected": -329.2443542480469, "loss": 0.381, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.301237106323242, "rewards/margins": 1.4925416707992554, "rewards/rejected": 1.8086954355239868, "step": 73870 }, { "epoch": 3.430057105715214, "grad_norm": 54.18781661987305, "learning_rate": 9.429128557500347e-08, "logits/chosen": -19.94775390625, "logits/rejected": -18.977317810058594, "logps/chosen": -349.3534240722656, "logps/rejected": -301.4094543457031, "loss": 0.5029, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.128829002380371, "rewards/margins": 1.4087839126586914, "rewards/rejected": 2.7200450897216797, "step": 73880 }, { "epoch": 3.4305213798226473, "grad_norm": 117.22978973388672, "learning_rate": 9.42634291285575e-08, "logits/chosen": -19.055383682250977, "logits/rejected": -18.880107879638672, "logps/chosen": -395.0293273925781, "logps/rejected": -371.48846435546875, "loss": 0.9602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.398378849029541, "rewards/margins": -0.04325588792562485, "rewards/rejected": 3.4416351318359375, "step": 73890 }, { "epoch": 3.4309856539300805, "grad_norm": 4.925571918487549, "learning_rate": 9.423557268211151e-08, "logits/chosen": -19.00787925720215, "logits/rejected": -19.345767974853516, "logps/chosen": -311.490478515625, "logps/rejected": -331.4751892089844, "loss": 1.5922, "rewards/accuracies": 0.5, "rewards/chosen": 2.719573736190796, "rewards/margins": -0.6601251363754272, "rewards/rejected": 3.379699230194092, "step": 73900 }, { "epoch": 3.4314499280375133, "grad_norm": 29.039987564086914, "learning_rate": 9.420771623566554e-08, "logits/chosen": -18.522724151611328, "logits/rejected": -17.37057876586914, "logps/chosen": -347.4947204589844, "logps/rejected": -241.37594604492188, "loss": 0.8022, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.7982871532440186, "rewards/margins": 1.294420838356018, "rewards/rejected": 1.50386643409729, "step": 73910 }, { "epoch": 3.4319142021449465, "grad_norm": 17.272815704345703, "learning_rate": 9.417985978921955e-08, "logits/chosen": -18.852758407592773, "logits/rejected": -18.71234130859375, "logps/chosen": -445.57550048828125, "logps/rejected": -428.09893798828125, "loss": 0.762, "rewards/accuracies": 0.5, "rewards/chosen": 3.390393018722534, "rewards/margins": 0.5306535959243774, "rewards/rejected": 2.859739303588867, "step": 73920 }, { "epoch": 3.4323784762523792, "grad_norm": 146.04220581054688, "learning_rate": 9.415200334277357e-08, "logits/chosen": -18.517099380493164, "logits/rejected": -18.20250701904297, "logps/chosen": -387.30938720703125, "logps/rejected": -363.2154235839844, "loss": 1.0231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6890151500701904, "rewards/margins": 0.9583522081375122, "rewards/rejected": 2.7306630611419678, "step": 73930 }, { "epoch": 3.4328427503598125, "grad_norm": 3.2545695304870605, "learning_rate": 9.412414689632758e-08, "logits/chosen": -19.629487991333008, "logits/rejected": -18.68455696105957, "logps/chosen": -350.38232421875, "logps/rejected": -281.66986083984375, "loss": 1.1932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.985485076904297, "rewards/margins": 0.8470395803451538, "rewards/rejected": 2.1384453773498535, "step": 73940 }, { "epoch": 3.4333070244672452, "grad_norm": 80.7566909790039, "learning_rate": 9.409629044988161e-08, "logits/chosen": -19.400373458862305, "logits/rejected": -18.301284790039062, "logps/chosen": -486.48895263671875, "logps/rejected": -407.8582763671875, "loss": 0.2747, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.76658296585083, "rewards/margins": 2.7443087100982666, "rewards/rejected": 2.0222744941711426, "step": 73950 }, { "epoch": 3.4337712985746784, "grad_norm": 8.995538711547852, "learning_rate": 
9.406843400343562e-08, "logits/chosen": -18.915233612060547, "logits/rejected": -18.595439910888672, "logps/chosen": -318.97943115234375, "logps/rejected": -267.50482177734375, "loss": 0.4318, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5801491737365723, "rewards/margins": 1.419938087463379, "rewards/rejected": 1.1602110862731934, "step": 73960 }, { "epoch": 3.4342355726821117, "grad_norm": 128.56468200683594, "learning_rate": 9.404057755698965e-08, "logits/chosen": -18.723560333251953, "logits/rejected": -18.252470016479492, "logps/chosen": -395.50726318359375, "logps/rejected": -305.79656982421875, "loss": 0.7697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9975860118865967, "rewards/margins": 1.5407593250274658, "rewards/rejected": 2.45682692527771, "step": 73970 }, { "epoch": 3.4346998467895444, "grad_norm": 5.5395941734313965, "learning_rate": 9.401272111054365e-08, "logits/chosen": -19.537710189819336, "logits/rejected": -17.509061813354492, "logps/chosen": -407.4866943359375, "logps/rejected": -263.7843322753906, "loss": 0.4258, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1901516914367676, "rewards/margins": 1.7166459560394287, "rewards/rejected": 1.4735054969787598, "step": 73980 }, { "epoch": 3.4351641208969776, "grad_norm": 0.21154768764972687, "learning_rate": 9.398486466409768e-08, "logits/chosen": -20.048282623291016, "logits/rejected": -19.044780731201172, "logps/chosen": -389.42156982421875, "logps/rejected": -336.0132751464844, "loss": 1.0834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.330812454223633, "rewards/margins": 1.8759349584579468, "rewards/rejected": 2.4548776149749756, "step": 73990 }, { "epoch": 3.4356283950044104, "grad_norm": 1.3552675247192383, "learning_rate": 9.39570082176517e-08, "logits/chosen": -19.33482551574707, "logits/rejected": -17.719085693359375, "logps/chosen": -457.2340393066406, "logps/rejected": -346.5441589355469, "loss": 0.3641, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.626593828201294, "rewards/margins": 1.9112741947174072, "rewards/rejected": 1.7153196334838867, "step": 74000 }, { "epoch": 3.4360926691118436, "grad_norm": 100.79584503173828, "learning_rate": 9.392915177120572e-08, "logits/chosen": -19.049448013305664, "logits/rejected": -18.352161407470703, "logps/chosen": -391.1406555175781, "logps/rejected": -296.9171142578125, "loss": 0.7295, "rewards/accuracies": 0.5, "rewards/chosen": 3.7683727741241455, "rewards/margins": 1.354328989982605, "rewards/rejected": 2.41404390335083, "step": 74010 }, { "epoch": 3.436556943219277, "grad_norm": 6.747596263885498, "learning_rate": 9.390129532475974e-08, "logits/chosen": -18.7728328704834, "logits/rejected": -17.30849838256836, "logps/chosen": -331.84588623046875, "logps/rejected": -198.87295532226562, "loss": 0.2429, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0842607021331787, "rewards/margins": 2.27184796333313, "rewards/rejected": 0.8124127388000488, "step": 74020 }, { "epoch": 3.4370212173267096, "grad_norm": 65.54119110107422, "learning_rate": 9.387343887831377e-08, "logits/chosen": -19.49201202392578, "logits/rejected": -17.52313804626465, "logps/chosen": -318.3334655761719, "logps/rejected": -201.48440551757812, "loss": 0.2994, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.867541790008545, "rewards/margins": 2.474229335784912, "rewards/rejected": 1.393312692642212, "step": 74030 }, { "epoch": 3.437485491434143, "grad_norm": 53.1525764465332, "learning_rate": 9.384558243186777e-08, "logits/chosen": -18.988605499267578, "logits/rejected": -18.440540313720703, "logps/chosen": -310.46856689453125, "logps/rejected": -293.41583251953125, "loss": 0.7432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.820756435394287, "rewards/margins": 0.9415286779403687, "rewards/rejected": 1.879227638244629, "step": 74040 }, { "epoch": 3.4379497655415756, "grad_norm": 183.50828552246094, 
"learning_rate": 9.381772598542178e-08, "logits/chosen": -19.660192489624023, "logits/rejected": -18.848308563232422, "logps/chosen": -512.4551391601562, "logps/rejected": -368.7392883300781, "loss": 0.4729, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.438739776611328, "rewards/margins": 1.741876244544983, "rewards/rejected": 2.6968631744384766, "step": 74050 }, { "epoch": 3.438414039649009, "grad_norm": 188.4214324951172, "learning_rate": 9.378986953897581e-08, "logits/chosen": -19.328420639038086, "logits/rejected": -18.50958251953125, "logps/chosen": -428.8777770996094, "logps/rejected": -288.2640075683594, "loss": 0.2812, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.462006568908691, "rewards/margins": 2.6829497814178467, "rewards/rejected": 1.7790567874908447, "step": 74060 }, { "epoch": 3.438878313756442, "grad_norm": 8.059873580932617, "learning_rate": 9.376201309252982e-08, "logits/chosen": -18.625263214111328, "logits/rejected": -17.979036331176758, "logps/chosen": -467.53778076171875, "logps/rejected": -378.3341064453125, "loss": 0.4639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.919460773468018, "rewards/margins": 1.6656322479248047, "rewards/rejected": 3.253828525543213, "step": 74070 }, { "epoch": 3.439342587863875, "grad_norm": 119.97134399414062, "learning_rate": 9.373415664608385e-08, "logits/chosen": -18.206867218017578, "logits/rejected": -17.259302139282227, "logps/chosen": -406.72076416015625, "logps/rejected": -281.04132080078125, "loss": 0.5559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.991168737411499, "rewards/margins": 1.3039228916168213, "rewards/rejected": 1.6872456073760986, "step": 74080 }, { "epoch": 3.439806861971308, "grad_norm": 1.4539649486541748, "learning_rate": 9.370630019963785e-08, "logits/chosen": -18.491748809814453, "logits/rejected": -17.911989212036133, "logps/chosen": -380.4777526855469, "logps/rejected": -251.1915740966797, "loss": 0.8785, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6138312816619873, "rewards/margins": 1.1771281957626343, "rewards/rejected": 1.4367027282714844, "step": 74090 }, { "epoch": 3.440271136078741, "grad_norm": 126.31580352783203, "learning_rate": 9.367844375319188e-08, "logits/chosen": -19.01290512084961, "logits/rejected": -18.189977645874023, "logps/chosen": -302.2361755371094, "logps/rejected": -288.9549560546875, "loss": 0.3707, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1686320304870605, "rewards/margins": 1.7139301300048828, "rewards/rejected": 1.4547021389007568, "step": 74100 }, { "epoch": 3.440735410186174, "grad_norm": 171.02642822265625, "learning_rate": 9.36505873067459e-08, "logits/chosen": -20.215953826904297, "logits/rejected": -19.180086135864258, "logps/chosen": -542.6358642578125, "logps/rejected": -454.2431640625, "loss": 0.7032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.413177013397217, "rewards/margins": 0.993847668170929, "rewards/rejected": 4.419329643249512, "step": 74110 }, { "epoch": 3.441199684293607, "grad_norm": 208.67218017578125, "learning_rate": 9.362273086029992e-08, "logits/chosen": -19.527816772460938, "logits/rejected": -18.611576080322266, "logps/chosen": -404.57769775390625, "logps/rejected": -263.402587890625, "loss": 0.4404, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.05478835105896, "rewards/margins": 1.3281975984573364, "rewards/rejected": 1.7265907526016235, "step": 74120 }, { "epoch": 3.44166395840104, "grad_norm": 95.01172637939453, "learning_rate": 9.359487441385394e-08, "logits/chosen": -18.612821578979492, "logits/rejected": -18.43716049194336, "logps/chosen": -345.70904541015625, "logps/rejected": -278.5690002441406, "loss": 0.6983, "rewards/accuracies": 0.5, "rewards/chosen": 3.0172548294067383, "rewards/margins": 0.5797361135482788, "rewards/rejected": 2.437518835067749, "step": 74130 }, { "epoch": 3.442128232508473, "grad_norm": 18.67140007019043, 
"learning_rate": 9.356701796740795e-08, "logits/chosen": -18.2614688873291, "logits/rejected": -17.46735191345215, "logps/chosen": -325.0166320800781, "logps/rejected": -321.379150390625, "loss": 0.8201, "rewards/accuracies": 0.5, "rewards/chosen": 3.027094602584839, "rewards/margins": 0.8185675740242004, "rewards/rejected": 2.208527088165283, "step": 74140 }, { "epoch": 3.442592506615906, "grad_norm": 0.20539765059947968, "learning_rate": 9.353916152096197e-08, "logits/chosen": -19.760841369628906, "logits/rejected": -18.01897430419922, "logps/chosen": -420.9561462402344, "logps/rejected": -227.99728393554688, "loss": 0.2949, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.018038749694824, "rewards/margins": 2.7862446308135986, "rewards/rejected": 2.2317943572998047, "step": 74150 }, { "epoch": 3.443056780723339, "grad_norm": 152.08506774902344, "learning_rate": 9.3511305074516e-08, "logits/chosen": -18.760944366455078, "logits/rejected": -18.29894256591797, "logps/chosen": -297.7372741699219, "logps/rejected": -291.4822692871094, "loss": 0.9323, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.0725502967834473, "rewards/margins": 0.18831516802310944, "rewards/rejected": 1.8842350244522095, "step": 74160 }, { "epoch": 3.443521054830772, "grad_norm": 105.48572540283203, "learning_rate": 9.348344862807001e-08, "logits/chosen": -19.073040008544922, "logits/rejected": -17.933712005615234, "logps/chosen": -487.88116455078125, "logps/rejected": -429.5726013183594, "loss": 0.6673, "rewards/accuracies": 0.5, "rewards/chosen": 4.187829971313477, "rewards/margins": 0.9623907804489136, "rewards/rejected": 3.2254390716552734, "step": 74170 }, { "epoch": 3.443985328938205, "grad_norm": 88.21624755859375, "learning_rate": 9.345559218162404e-08, "logits/chosen": -19.683408737182617, "logits/rejected": -19.295442581176758, "logps/chosen": -369.3779602050781, "logps/rejected": -323.6872253417969, "loss": 0.8171, "rewards/accuracies": 0.5, 
"rewards/chosen": 2.7799887657165527, "rewards/margins": 0.30833208560943604, "rewards/rejected": 2.4716567993164062, "step": 74180 }, { "epoch": 3.444449603045638, "grad_norm": 127.10293579101562, "learning_rate": 9.342773573517804e-08, "logits/chosen": -19.008525848388672, "logits/rejected": -19.725025177001953, "logps/chosen": -358.6422424316406, "logps/rejected": -324.4117431640625, "loss": 0.6883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9724087715148926, "rewards/margins": 0.3628832697868347, "rewards/rejected": 2.609525203704834, "step": 74190 }, { "epoch": 3.444913877153071, "grad_norm": 38.61890411376953, "learning_rate": 9.339987928873207e-08, "logits/chosen": -19.346832275390625, "logits/rejected": -18.458171844482422, "logps/chosen": -390.2603759765625, "logps/rejected": -343.65863037109375, "loss": 0.9481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.19547438621521, "rewards/margins": 0.7679033279418945, "rewards/rejected": 1.427571177482605, "step": 74200 }, { "epoch": 3.4453781512605044, "grad_norm": 95.40910339355469, "learning_rate": 9.337202284228608e-08, "logits/chosen": -18.783416748046875, "logits/rejected": -17.58144760131836, "logps/chosen": -344.9085998535156, "logps/rejected": -256.02459716796875, "loss": 0.4111, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6889312267303467, "rewards/margins": 1.6793100833892822, "rewards/rejected": 1.009621024131775, "step": 74210 }, { "epoch": 3.445842425367937, "grad_norm": 71.72891998291016, "learning_rate": 9.334416639584011e-08, "logits/chosen": -18.06167984008789, "logits/rejected": -17.733707427978516, "logps/chosen": -424.61273193359375, "logps/rejected": -291.68743896484375, "loss": 0.6212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8550400733947754, "rewards/margins": 1.108176827430725, "rewards/rejected": 1.7468633651733398, "step": 74220 }, { "epoch": 3.4463066994753704, "grad_norm": 116.90753936767578, "learning_rate": 
9.331630994939412e-08, "logits/chosen": -19.55624771118164, "logits/rejected": -18.015575408935547, "logps/chosen": -383.9329833984375, "logps/rejected": -247.7571258544922, "loss": 0.4779, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4527087211608887, "rewards/margins": 2.200819492340088, "rewards/rejected": 1.251889944076538, "step": 74230 }, { "epoch": 3.446770973582803, "grad_norm": 0.07923218607902527, "learning_rate": 9.328845350294815e-08, "logits/chosen": -18.686283111572266, "logits/rejected": -18.673437118530273, "logps/chosen": -367.063720703125, "logps/rejected": -285.7481689453125, "loss": 0.9838, "rewards/accuracies": 0.5, "rewards/chosen": 3.6514601707458496, "rewards/margins": 1.3753533363342285, "rewards/rejected": 2.276106834411621, "step": 74240 }, { "epoch": 3.4472352476902364, "grad_norm": 121.41365051269531, "learning_rate": 9.326059705650215e-08, "logits/chosen": -19.007902145385742, "logits/rejected": -19.417470932006836, "logps/chosen": -401.69268798828125, "logps/rejected": -356.9443359375, "loss": 1.3627, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.151355266571045, "rewards/margins": 0.1283588856458664, "rewards/rejected": 3.022996187210083, "step": 74250 }, { "epoch": 3.447699521797669, "grad_norm": 0.025412462651729584, "learning_rate": 9.323274061005617e-08, "logits/chosen": -19.127788543701172, "logits/rejected": -18.067611694335938, "logps/chosen": -498.4158630371094, "logps/rejected": -287.4089660644531, "loss": 0.39, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4009785652160645, "rewards/margins": 1.736505150794983, "rewards/rejected": 1.6644738912582397, "step": 74260 }, { "epoch": 3.4481637959051024, "grad_norm": 81.1839370727539, "learning_rate": 9.320488416361019e-08, "logits/chosen": -19.064889907836914, "logits/rejected": -19.20313262939453, "logps/chosen": -344.0506896972656, "logps/rejected": -349.03692626953125, "loss": 0.6141, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.8567757606506348, "rewards/margins": 0.7100415229797363, "rewards/rejected": 2.1467342376708984, "step": 74270 }, { "epoch": 3.4486280700125356, "grad_norm": 95.91375732421875, "learning_rate": 9.317702771716421e-08, "logits/chosen": -19.1572265625, "logits/rejected": -18.505414962768555, "logps/chosen": -341.86285400390625, "logps/rejected": -273.2860412597656, "loss": 0.6073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0175094604492188, "rewards/margins": 0.9982402920722961, "rewards/rejected": 2.0192689895629883, "step": 74280 }, { "epoch": 3.4490923441199683, "grad_norm": 1.753122329711914, "learning_rate": 9.314917127071824e-08, "logits/chosen": -18.911319732666016, "logits/rejected": -19.398435592651367, "logps/chosen": -401.4509582519531, "logps/rejected": -419.87762451171875, "loss": 1.4858, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.319080352783203, "rewards/margins": 0.5540453195571899, "rewards/rejected": 2.7650349140167236, "step": 74290 }, { "epoch": 3.4495566182274016, "grad_norm": 1.5624407529830933, "learning_rate": 9.312131482427224e-08, "logits/chosen": -19.315662384033203, "logits/rejected": -18.032861709594727, "logps/chosen": -379.907470703125, "logps/rejected": -320.1049499511719, "loss": 0.959, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6357855796813965, "rewards/margins": 1.3762539625167847, "rewards/rejected": 2.2595314979553223, "step": 74300 }, { "epoch": 3.4500208923348343, "grad_norm": 0.06344562023878098, "learning_rate": 9.309345837782626e-08, "logits/chosen": -19.748016357421875, "logits/rejected": -18.60610580444336, "logps/chosen": -482.65130615234375, "logps/rejected": -251.8733673095703, "loss": 0.4336, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.090620994567871, "rewards/margins": 2.39620304107666, "rewards/rejected": 1.69441819190979, "step": 74310 }, { "epoch": 3.4504851664422675, "grad_norm": 1.5405998229980469, "learning_rate": 
9.306560193138028e-08, "logits/chosen": -19.143535614013672, "logits/rejected": -18.117284774780273, "logps/chosen": -306.94061279296875, "logps/rejected": -244.5983428955078, "loss": 0.4415, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4590651988983154, "rewards/margins": 1.6064351797103882, "rewards/rejected": 0.8526299595832825, "step": 74320 }, { "epoch": 3.4509494405497003, "grad_norm": 8.39452838897705, "learning_rate": 9.303774548493431e-08, "logits/chosen": -19.309345245361328, "logits/rejected": -20.1970157623291, "logps/chosen": -346.4014587402344, "logps/rejected": -351.5401306152344, "loss": 0.5024, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.926265001296997, "rewards/margins": 1.057250738143921, "rewards/rejected": 1.8690143823623657, "step": 74330 }, { "epoch": 3.4514137146571335, "grad_norm": 27.68160629272461, "learning_rate": 9.300988903848832e-08, "logits/chosen": -19.59088134765625, "logits/rejected": -18.930011749267578, "logps/chosen": -338.9551086425781, "logps/rejected": -279.619384765625, "loss": 0.3345, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.344831943511963, "rewards/margins": 1.8116830587387085, "rewards/rejected": 1.5331486463546753, "step": 74340 }, { "epoch": 3.4518779887645668, "grad_norm": 41.833675384521484, "learning_rate": 9.298203259204234e-08, "logits/chosen": -19.229103088378906, "logits/rejected": -18.35385513305664, "logps/chosen": -435.6153869628906, "logps/rejected": -368.5764465332031, "loss": 0.5006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8170928955078125, "rewards/margins": 0.9009129405021667, "rewards/rejected": 1.916179895401001, "step": 74350 }, { "epoch": 3.4523422628719995, "grad_norm": 0.6436622142791748, "learning_rate": 9.295417614559635e-08, "logits/chosen": -19.237796783447266, "logits/rejected": -18.637813568115234, "logps/chosen": -382.6900634765625, "logps/rejected": -355.25775146484375, "loss": 0.8367, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.285027027130127, "rewards/margins": 0.5884426832199097, "rewards/rejected": 2.6965837478637695, "step": 74360 }, { "epoch": 3.4528065369794327, "grad_norm": 0.11704933643341064, "learning_rate": 9.292631969915038e-08, "logits/chosen": -18.80521011352539, "logits/rejected": -19.07522201538086, "logps/chosen": -286.96160888671875, "logps/rejected": -277.85955810546875, "loss": 1.0977, "rewards/accuracies": 0.5, "rewards/chosen": 2.0760486125946045, "rewards/margins": 0.38414302468299866, "rewards/rejected": 1.6919057369232178, "step": 74370 }, { "epoch": 3.4532708110868655, "grad_norm": 61.566768646240234, "learning_rate": 9.289846325270439e-08, "logits/chosen": -20.106220245361328, "logits/rejected": -20.03529930114746, "logps/chosen": -386.730224609375, "logps/rejected": -428.87164306640625, "loss": 0.7077, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.492713451385498, "rewards/margins": 0.3967738747596741, "rewards/rejected": 3.0959396362304688, "step": 74380 }, { "epoch": 3.4537350851942987, "grad_norm": 67.84628295898438, "learning_rate": 9.287060680625842e-08, "logits/chosen": -18.558040618896484, "logits/rejected": -18.26467514038086, "logps/chosen": -403.5227966308594, "logps/rejected": -319.7576599121094, "loss": 0.4004, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.25266695022583, "rewards/margins": 1.279292106628418, "rewards/rejected": 1.9733749628067017, "step": 74390 }, { "epoch": 3.4541993593017315, "grad_norm": 53.17753601074219, "learning_rate": 9.284275035981242e-08, "logits/chosen": -20.056806564331055, "logits/rejected": -19.396106719970703, "logps/chosen": -334.294189453125, "logps/rejected": -346.2164611816406, "loss": 1.5096, "rewards/accuracies": 0.5, "rewards/chosen": 3.385776996612549, "rewards/margins": -0.09703101962804794, "rewards/rejected": 3.4828078746795654, "step": 74400 }, { "epoch": 3.4546636334091647, "grad_norm": 125.39067840576172, "learning_rate": 
9.281489391336645e-08, "logits/chosen": -19.29995346069336, "logits/rejected": -18.765655517578125, "logps/chosen": -424.06787109375, "logps/rejected": -424.44891357421875, "loss": 0.5585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.354554176330566, "rewards/margins": 0.8428612947463989, "rewards/rejected": 3.511692762374878, "step": 74410 }, { "epoch": 3.455127907516598, "grad_norm": 83.78225708007812, "learning_rate": 9.278703746692046e-08, "logits/chosen": -18.750167846679688, "logits/rejected": -18.377132415771484, "logps/chosen": -353.4281311035156, "logps/rejected": -294.7929992675781, "loss": 1.1423, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.759023666381836, "rewards/margins": 0.3829914629459381, "rewards/rejected": 2.3760321140289307, "step": 74420 }, { "epoch": 3.4555921816240307, "grad_norm": 207.81057739257812, "learning_rate": 9.275918102047449e-08, "logits/chosen": -18.624340057373047, "logits/rejected": -17.574726104736328, "logps/chosen": -414.888916015625, "logps/rejected": -337.2777404785156, "loss": 0.9144, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.386717319488525, "rewards/margins": 1.243916392326355, "rewards/rejected": 3.142800807952881, "step": 74430 }, { "epoch": 3.456056455731464, "grad_norm": 83.74312591552734, "learning_rate": 9.273132457402851e-08, "logits/chosen": -19.278705596923828, "logits/rejected": -18.877334594726562, "logps/chosen": -451.4512634277344, "logps/rejected": -345.9875183105469, "loss": 0.9003, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.041689395904541, "rewards/margins": 0.43244847655296326, "rewards/rejected": 3.609240770339966, "step": 74440 }, { "epoch": 3.456520729838897, "grad_norm": 28.420482635498047, "learning_rate": 9.270346812758251e-08, "logits/chosen": -20.13827133178711, "logits/rejected": -18.332983016967773, "logps/chosen": -416.00579833984375, "logps/rejected": -234.44271850585938, "loss": 0.5355, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.9415483474731445, "rewards/margins": 1.648630142211914, "rewards/rejected": 1.2929182052612305, "step": 74450 }, { "epoch": 3.45698500394633, "grad_norm": 124.69477081298828, "learning_rate": 9.267561168113654e-08, "logits/chosen": -19.91054344177246, "logits/rejected": -19.235475540161133, "logps/chosen": -458.64727783203125, "logps/rejected": -440.42901611328125, "loss": 0.9733, "rewards/accuracies": 0.5, "rewards/chosen": 3.991699695587158, "rewards/margins": 0.3506903052330017, "rewards/rejected": 3.641009569168091, "step": 74460 }, { "epoch": 3.457449278053763, "grad_norm": 126.6221923828125, "learning_rate": 9.264775523469055e-08, "logits/chosen": -19.376806259155273, "logits/rejected": -19.79153823852539, "logps/chosen": -351.531005859375, "logps/rejected": -400.76556396484375, "loss": 0.7977, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.357686996459961, "rewards/margins": 0.0928221195936203, "rewards/rejected": 3.2648651599884033, "step": 74470 }, { "epoch": 3.457913552161196, "grad_norm": 71.99860382080078, "learning_rate": 9.261989878824458e-08, "logits/chosen": -18.826915740966797, "logits/rejected": -18.509599685668945, "logps/chosen": -360.817626953125, "logps/rejected": -296.47247314453125, "loss": 0.7769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2407500743865967, "rewards/margins": 1.122031569480896, "rewards/rejected": 2.118718385696411, "step": 74480 }, { "epoch": 3.458377826268629, "grad_norm": 13.07359504699707, "learning_rate": 9.259204234179859e-08, "logits/chosen": -19.18213653564453, "logits/rejected": -18.436954498291016, "logps/chosen": -430.94482421875, "logps/rejected": -348.22418212890625, "loss": 0.8646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.169219017028809, "rewards/margins": 1.9604575634002686, "rewards/rejected": 2.208761692047119, "step": 74490 }, { "epoch": 3.458842100376062, "grad_norm": 38.03693771362305, "learning_rate": 
9.256418589535262e-08, "logits/chosen": -18.816091537475586, "logits/rejected": -18.706096649169922, "logps/chosen": -334.36029052734375, "logps/rejected": -279.9197082519531, "loss": 0.6379, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.817673683166504, "rewards/margins": 0.774196207523346, "rewards/rejected": 2.0434775352478027, "step": 74500 }, { "epoch": 3.459306374483495, "grad_norm": 33.79656982421875, "learning_rate": 9.253632944890662e-08, "logits/chosen": -18.586135864257812, "logits/rejected": -17.89739227294922, "logps/chosen": -317.0135803222656, "logps/rejected": -259.1365966796875, "loss": 0.3793, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.575582265853882, "rewards/margins": 1.5160748958587646, "rewards/rejected": 1.0595074892044067, "step": 74510 }, { "epoch": 3.4597706485909283, "grad_norm": 67.39246368408203, "learning_rate": 9.250847300246065e-08, "logits/chosen": -19.084396362304688, "logits/rejected": -18.565773010253906, "logps/chosen": -321.9279479980469, "logps/rejected": -244.5867919921875, "loss": 0.5194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8336141109466553, "rewards/margins": 1.2569175958633423, "rewards/rejected": 0.576696515083313, "step": 74520 }, { "epoch": 3.460234922698361, "grad_norm": 47.61465835571289, "learning_rate": 9.248061655601466e-08, "logits/chosen": -20.303909301757812, "logits/rejected": -18.846683502197266, "logps/chosen": -490.2216796875, "logps/rejected": -308.29327392578125, "loss": 0.3549, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.119342803955078, "rewards/margins": 1.605290412902832, "rewards/rejected": 2.514052152633667, "step": 74530 }, { "epoch": 3.4606991968057943, "grad_norm": 155.31686401367188, "learning_rate": 9.245276010956869e-08, "logits/chosen": -19.405576705932617, "logits/rejected": -18.85725975036621, "logps/chosen": -473.94122314453125, "logps/rejected": -412.1494140625, "loss": 0.8205, "rewards/accuracies": 0.5, 
"rewards/chosen": 4.4148736000061035, "rewards/margins": 1.539402961730957, "rewards/rejected": 2.8754706382751465, "step": 74540 }, { "epoch": 3.461163470913227, "grad_norm": 48.130943298339844, "learning_rate": 9.24249036631227e-08, "logits/chosen": -18.68285369873047, "logits/rejected": -17.684078216552734, "logps/chosen": -388.80810546875, "logps/rejected": -246.2344207763672, "loss": 0.6255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4896749258041382, "rewards/margins": 1.0796579122543335, "rewards/rejected": 0.4100169241428375, "step": 74550 }, { "epoch": 3.4616277450206603, "grad_norm": 147.946044921875, "learning_rate": 9.239704721667672e-08, "logits/chosen": -19.027042388916016, "logits/rejected": -17.879838943481445, "logps/chosen": -442.438720703125, "logps/rejected": -347.48895263671875, "loss": 0.5878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.05831241607666, "rewards/margins": 1.2138348817825317, "rewards/rejected": 2.844477415084839, "step": 74560 }, { "epoch": 3.462092019128093, "grad_norm": 49.71983337402344, "learning_rate": 9.236919077023074e-08, "logits/chosen": -18.429248809814453, "logits/rejected": -17.760162353515625, "logps/chosen": -250.5312957763672, "logps/rejected": -215.94558715820312, "loss": 0.5993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1995139122009277, "rewards/margins": 1.1283258199691772, "rewards/rejected": 1.0711880922317505, "step": 74570 }, { "epoch": 3.4625562932355263, "grad_norm": 118.6739501953125, "learning_rate": 9.234133432378476e-08, "logits/chosen": -20.558862686157227, "logits/rejected": -20.284971237182617, "logps/chosen": -401.897705078125, "logps/rejected": -328.81927490234375, "loss": 1.084, "rewards/accuracies": 0.5, "rewards/chosen": 3.9368178844451904, "rewards/margins": 0.2400999516248703, "rewards/rejected": 3.6967177391052246, "step": 74580 }, { "epoch": 3.4630205673429595, "grad_norm": 4.194049835205078, "learning_rate": 9.231347787733878e-08, 
"logits/chosen": -19.089603424072266, "logits/rejected": -18.148479461669922, "logps/chosen": -443.59722900390625, "logps/rejected": -383.2342834472656, "loss": 0.5662, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3823142051696777, "rewards/margins": 1.0915005207061768, "rewards/rejected": 2.29081392288208, "step": 74590 }, { "epoch": 3.4634848414503923, "grad_norm": 6.89976692199707, "learning_rate": 9.22856214308928e-08, "logits/chosen": -19.109712600708008, "logits/rejected": -18.582813262939453, "logps/chosen": -460.7032165527344, "logps/rejected": -380.27838134765625, "loss": 0.8208, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.032271862030029, "rewards/margins": 1.299459457397461, "rewards/rejected": 2.7328128814697266, "step": 74600 }, { "epoch": 3.4639491155578255, "grad_norm": 114.98762512207031, "learning_rate": 9.225776498444681e-08, "logits/chosen": -19.688114166259766, "logits/rejected": -18.990890502929688, "logps/chosen": -485.65234375, "logps/rejected": -390.0999450683594, "loss": 0.5614, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.130759239196777, "rewards/margins": 1.5779085159301758, "rewards/rejected": 4.55285120010376, "step": 74610 }, { "epoch": 3.4644133896652582, "grad_norm": 39.80052947998047, "learning_rate": 9.222990853800083e-08, "logits/chosen": -18.865276336669922, "logits/rejected": -17.80124282836914, "logps/chosen": -382.1434631347656, "logps/rejected": -271.5838317871094, "loss": 0.4496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.048916339874268, "rewards/margins": 1.9812170267105103, "rewards/rejected": 2.0676987171173096, "step": 74620 }, { "epoch": 3.4648776637726915, "grad_norm": 32.59629440307617, "learning_rate": 9.220205209155485e-08, "logits/chosen": -19.76274871826172, "logits/rejected": -17.682811737060547, "logps/chosen": -398.87347412109375, "logps/rejected": -239.9914093017578, "loss": 0.4213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
3.7085647583007812, "rewards/margins": 2.347677707672119, "rewards/rejected": 1.3608872890472412, "step": 74630 }, { "epoch": 3.4653419378801242, "grad_norm": 156.3350830078125, "learning_rate": 9.217419564510886e-08, "logits/chosen": -19.018381118774414, "logits/rejected": -17.99848747253418, "logps/chosen": -443.17364501953125, "logps/rejected": -407.9122314453125, "loss": 0.5369, "rewards/accuracies": 0.5, "rewards/chosen": 4.0538787841796875, "rewards/margins": 1.6952892541885376, "rewards/rejected": 2.3585894107818604, "step": 74640 }, { "epoch": 3.4658062119875575, "grad_norm": 108.4023208618164, "learning_rate": 9.214633919866289e-08, "logits/chosen": -19.23899269104004, "logits/rejected": -18.880084991455078, "logps/chosen": -390.3743591308594, "logps/rejected": -436.6681213378906, "loss": 0.8243, "rewards/accuracies": 0.5, "rewards/chosen": 3.831923007965088, "rewards/margins": 0.3749990463256836, "rewards/rejected": 3.456923723220825, "step": 74650 }, { "epoch": 3.4662704860949907, "grad_norm": 11.47994327545166, "learning_rate": 9.211848275221689e-08, "logits/chosen": -20.005043029785156, "logits/rejected": -18.844284057617188, "logps/chosen": -444.2877502441406, "logps/rejected": -375.032958984375, "loss": 0.2815, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.710860252380371, "rewards/margins": 2.107725143432617, "rewards/rejected": 2.603135108947754, "step": 74660 }, { "epoch": 3.4667347602024234, "grad_norm": 7.245418071746826, "learning_rate": 9.209062630577092e-08, "logits/chosen": -18.799060821533203, "logits/rejected": -17.707332611083984, "logps/chosen": -379.18597412109375, "logps/rejected": -247.25839233398438, "loss": 0.2043, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8956642150878906, "rewards/margins": 2.17155385017395, "rewards/rejected": 0.7241103649139404, "step": 74670 }, { "epoch": 3.4671990343098567, "grad_norm": 146.65663146972656, "learning_rate": 9.206276985932493e-08, "logits/chosen": 
-18.6256103515625, "logits/rejected": -17.630983352661133, "logps/chosen": -353.36199951171875, "logps/rejected": -296.5311279296875, "loss": 0.9189, "rewards/accuracies": 0.5, "rewards/chosen": 2.652622699737549, "rewards/margins": 1.2418501377105713, "rewards/rejected": 1.4107725620269775, "step": 74680 }, { "epoch": 3.4676633084172894, "grad_norm": 78.4721450805664, "learning_rate": 9.203491341287896e-08, "logits/chosen": -18.597084045410156, "logits/rejected": -18.420608520507812, "logps/chosen": -454.30303955078125, "logps/rejected": -440.42486572265625, "loss": 1.288, "rewards/accuracies": 0.5, "rewards/chosen": 4.832618236541748, "rewards/margins": 0.3490990698337555, "rewards/rejected": 4.483519554138184, "step": 74690 }, { "epoch": 3.4681275825247226, "grad_norm": 69.22921752929688, "learning_rate": 9.200705696643298e-08, "logits/chosen": -19.595596313476562, "logits/rejected": -18.970375061035156, "logps/chosen": -489.6319274902344, "logps/rejected": -399.641357421875, "loss": 0.7514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.42185115814209, "rewards/margins": 1.7677568197250366, "rewards/rejected": 3.6540935039520264, "step": 74700 }, { "epoch": 3.4685918566321554, "grad_norm": 57.19674301147461, "learning_rate": 9.1979200519987e-08, "logits/chosen": -20.3116512298584, "logits/rejected": -19.818424224853516, "logps/chosen": -399.6385192871094, "logps/rejected": -311.49896240234375, "loss": 0.5096, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0502095222473145, "rewards/margins": 1.4499280452728271, "rewards/rejected": 2.6002814769744873, "step": 74710 }, { "epoch": 3.4690561307395886, "grad_norm": 13.093520164489746, "learning_rate": 9.1951344073541e-08, "logits/chosen": -19.05691146850586, "logits/rejected": -18.447298049926758, "logps/chosen": -337.7619934082031, "logps/rejected": -302.1657409667969, "loss": 0.7242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8013129234313965, "rewards/margins": 
1.3150628805160522, "rewards/rejected": 1.4862498044967651, "step": 74720 }, { "epoch": 3.469520404847022, "grad_norm": 47.04738998413086, "learning_rate": 9.192348762709503e-08, "logits/chosen": -19.104156494140625, "logits/rejected": -18.49854850769043, "logps/chosen": -290.40850830078125, "logps/rejected": -216.9909210205078, "loss": 0.4074, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.59199857711792, "rewards/margins": 2.033334255218506, "rewards/rejected": 0.5586642026901245, "step": 74730 }, { "epoch": 3.4699846789544546, "grad_norm": 88.97879028320312, "learning_rate": 9.189563118064905e-08, "logits/chosen": -19.72336769104004, "logits/rejected": -19.71506690979004, "logps/chosen": -363.7353515625, "logps/rejected": -301.96514892578125, "loss": 0.8199, "rewards/accuracies": 0.5, "rewards/chosen": 2.233177661895752, "rewards/margins": 0.22835561633110046, "rewards/rejected": 2.004822015762329, "step": 74740 }, { "epoch": 3.470448953061888, "grad_norm": 20.72673225402832, "learning_rate": 9.186777473420308e-08, "logits/chosen": -19.176843643188477, "logits/rejected": -18.442058563232422, "logps/chosen": -373.3928527832031, "logps/rejected": -335.09503173828125, "loss": 0.8719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6461844444274902, "rewards/margins": 0.7160639762878418, "rewards/rejected": 1.9301207065582275, "step": 74750 }, { "epoch": 3.4709132271693206, "grad_norm": 50.15724182128906, "learning_rate": 9.183991828775709e-08, "logits/chosen": -18.612060546875, "logits/rejected": -17.771333694458008, "logps/chosen": -402.4416809082031, "logps/rejected": -315.875, "loss": 0.8414, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5754318237304688, "rewards/margins": 1.177332878112793, "rewards/rejected": 2.398099184036255, "step": 74760 }, { "epoch": 3.471377501276754, "grad_norm": 12.355846405029297, "learning_rate": 9.18120618413111e-08, "logits/chosen": -18.67520523071289, "logits/rejected": 
-18.50449562072754, "logps/chosen": -383.54205322265625, "logps/rejected": -398.62860107421875, "loss": 0.9012, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.237513780593872, "rewards/margins": 0.18328535556793213, "rewards/rejected": 3.0542283058166504, "step": 74770 }, { "epoch": 3.4718417753841866, "grad_norm": 44.275550842285156, "learning_rate": 9.178420539486512e-08, "logits/chosen": -19.448930740356445, "logits/rejected": -18.093610763549805, "logps/chosen": -508.15179443359375, "logps/rejected": -336.13214111328125, "loss": 0.269, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.670910358428955, "rewards/margins": 2.0401949882507324, "rewards/rejected": 2.6307144165039062, "step": 74780 }, { "epoch": 3.47230604949162, "grad_norm": 10.087406158447266, "learning_rate": 9.175634894841915e-08, "logits/chosen": -18.648677825927734, "logits/rejected": -18.207836151123047, "logps/chosen": -341.1672058105469, "logps/rejected": -364.0514831542969, "loss": 0.6943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9743456840515137, "rewards/margins": 0.8947290182113647, "rewards/rejected": 2.0796167850494385, "step": 74790 }, { "epoch": 3.472770323599053, "grad_norm": 118.92341613769531, "learning_rate": 9.172849250197316e-08, "logits/chosen": -18.629257202148438, "logits/rejected": -18.50704002380371, "logps/chosen": -364.27581787109375, "logps/rejected": -293.27850341796875, "loss": 0.7972, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.573668956756592, "rewards/margins": 0.844174861907959, "rewards/rejected": 1.7294940948486328, "step": 74800 }, { "epoch": 3.473234597706486, "grad_norm": 76.97209930419922, "learning_rate": 9.170063605552719e-08, "logits/chosen": -19.034473419189453, "logits/rejected": -18.686216354370117, "logps/chosen": -366.9972839355469, "logps/rejected": -361.83123779296875, "loss": 1.3633, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.869694948196411, "rewards/margins": 
-0.7043235898017883, "rewards/rejected": 3.5740184783935547, "step": 74810 }, { "epoch": 3.473698871813919, "grad_norm": 36.20160675048828, "learning_rate": 9.167277960908119e-08, "logits/chosen": -19.054262161254883, "logits/rejected": -18.460575103759766, "logps/chosen": -403.8781433105469, "logps/rejected": -301.9667053222656, "loss": 0.7137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6007771492004395, "rewards/margins": 0.7244430184364319, "rewards/rejected": 1.8763344287872314, "step": 74820 }, { "epoch": 3.474163145921352, "grad_norm": 101.3587875366211, "learning_rate": 9.164492316263522e-08, "logits/chosen": -19.736835479736328, "logits/rejected": -19.161861419677734, "logps/chosen": -353.4239196777344, "logps/rejected": -340.2351379394531, "loss": 0.5982, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.723675489425659, "rewards/margins": 1.1494033336639404, "rewards/rejected": 2.5742721557617188, "step": 74830 }, { "epoch": 3.474627420028785, "grad_norm": 37.59006118774414, "learning_rate": 9.161706671618923e-08, "logits/chosen": -18.579307556152344, "logits/rejected": -18.417232513427734, "logps/chosen": -323.45269775390625, "logps/rejected": -229.31533813476562, "loss": 0.6234, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8898653984069824, "rewards/margins": 1.3740590810775757, "rewards/rejected": 1.5158063173294067, "step": 74840 }, { "epoch": 3.475091694136218, "grad_norm": 138.88111877441406, "learning_rate": 9.158921026974325e-08, "logits/chosen": -19.8643856048584, "logits/rejected": -18.336101531982422, "logps/chosen": -312.58148193359375, "logps/rejected": -257.0571594238281, "loss": 0.3417, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.993117570877075, "rewards/margins": 1.7836910486221313, "rewards/rejected": 2.2094264030456543, "step": 74850 }, { "epoch": 3.475555968243651, "grad_norm": 40.55459213256836, "learning_rate": 9.156135382329728e-08, "logits/chosen": -19.613506317138672, 
"logits/rejected": -17.77627182006836, "logps/chosen": -365.9371643066406, "logps/rejected": -340.1728820800781, "loss": 0.8037, "rewards/accuracies": 0.5, "rewards/chosen": 4.270091533660889, "rewards/margins": 1.0636961460113525, "rewards/rejected": 3.2063956260681152, "step": 74860 }, { "epoch": 3.476020242351084, "grad_norm": 1.505718469619751, "learning_rate": 9.153349737685128e-08, "logits/chosen": -18.953083038330078, "logits/rejected": -18.331710815429688, "logps/chosen": -388.5157165527344, "logps/rejected": -310.68841552734375, "loss": 0.4863, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.456404685974121, "rewards/margins": 2.4096667766571045, "rewards/rejected": 2.0467379093170166, "step": 74870 }, { "epoch": 3.476484516458517, "grad_norm": 23.776222229003906, "learning_rate": 9.15056409304053e-08, "logits/chosen": -18.871829986572266, "logits/rejected": -18.14346694946289, "logps/chosen": -396.1313171386719, "logps/rejected": -312.430908203125, "loss": 0.5336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.606184482574463, "rewards/margins": 1.8881343603134155, "rewards/rejected": 1.7180497646331787, "step": 74880 }, { "epoch": 3.47694879056595, "grad_norm": 10.926109313964844, "learning_rate": 9.147778448395932e-08, "logits/chosen": -19.281925201416016, "logits/rejected": -18.081737518310547, "logps/chosen": -399.84979248046875, "logps/rejected": -306.6297912597656, "loss": 0.5994, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6557185649871826, "rewards/margins": 1.5744737386703491, "rewards/rejected": 2.081244707107544, "step": 74890 }, { "epoch": 3.4774130646733834, "grad_norm": 15.998577117919922, "learning_rate": 9.144992803751335e-08, "logits/chosen": -18.779888153076172, "logits/rejected": -18.251564025878906, "logps/chosen": -512.6253662109375, "logps/rejected": -384.99041748046875, "loss": 0.1785, "rewards/accuracies": 1.0, "rewards/chosen": 3.9341869354248047, "rewards/margins": 2.0171806812286377, 
"rewards/rejected": 1.9170061349868774, "step": 74900 }, { "epoch": 3.477877338780816, "grad_norm": 0.0895049050450325, "learning_rate": 9.142207159106736e-08, "logits/chosen": -20.10260009765625, "logits/rejected": -18.49521255493164, "logps/chosen": -465.36297607421875, "logps/rejected": -303.17987060546875, "loss": 0.4614, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.574762344360352, "rewards/margins": 2.2882144451141357, "rewards/rejected": 2.286548137664795, "step": 74910 }, { "epoch": 3.4783416128882494, "grad_norm": 60.42621612548828, "learning_rate": 9.139421514462139e-08, "logits/chosen": -18.953731536865234, "logits/rejected": -18.558242797851562, "logps/chosen": -411.408203125, "logps/rejected": -333.82867431640625, "loss": 0.5643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.056796073913574, "rewards/margins": 1.1597236394882202, "rewards/rejected": 2.8970723152160645, "step": 74920 }, { "epoch": 3.478805886995682, "grad_norm": 44.52885437011719, "learning_rate": 9.136635869817539e-08, "logits/chosen": -18.56528663635254, "logits/rejected": -18.270809173583984, "logps/chosen": -304.3745422363281, "logps/rejected": -265.3501892089844, "loss": 0.4793, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.830406904220581, "rewards/margins": 1.0659348964691162, "rewards/rejected": 1.7644720077514648, "step": 74930 }, { "epoch": 3.4792701611031154, "grad_norm": 63.32749557495117, "learning_rate": 9.133850225172942e-08, "logits/chosen": -18.35539436340332, "logits/rejected": -17.8773250579834, "logps/chosen": -453.07763671875, "logps/rejected": -362.1993103027344, "loss": 1.1138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.895400285720825, "rewards/margins": 1.0930147171020508, "rewards/rejected": 2.8023855686187744, "step": 74940 }, { "epoch": 3.479734435210548, "grad_norm": 65.9891586303711, "learning_rate": 9.131064580528343e-08, "logits/chosen": -19.287137985229492, "logits/rejected": 
-19.442668914794922, "logps/chosen": -502.74981689453125, "logps/rejected": -474.87646484375, "loss": 0.8101, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2824602127075195, "rewards/margins": 0.7828783392906189, "rewards/rejected": 3.4995815753936768, "step": 74950 }, { "epoch": 3.4801987093179814, "grad_norm": 0.35520103573799133, "learning_rate": 9.128278935883746e-08, "logits/chosen": -18.47573471069336, "logits/rejected": -18.284793853759766, "logps/chosen": -382.3809509277344, "logps/rejected": -370.89593505859375, "loss": 2.053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.171414375305176, "rewards/margins": -0.08525103330612183, "rewards/rejected": 3.256664991378784, "step": 74960 }, { "epoch": 3.4806629834254146, "grad_norm": 8.9296293258667, "learning_rate": 9.125493291239148e-08, "logits/chosen": -19.080881118774414, "logits/rejected": -19.052579879760742, "logps/chosen": -308.9226379394531, "logps/rejected": -298.7919616699219, "loss": 1.04, "rewards/accuracies": 0.5, "rewards/chosen": 1.8257477283477783, "rewards/margins": -0.010944962501525879, "rewards/rejected": 1.8366928100585938, "step": 74970 }, { "epoch": 3.4811272575328474, "grad_norm": 167.6014862060547, "learning_rate": 9.122707646594549e-08, "logits/chosen": -18.375171661376953, "logits/rejected": -18.755313873291016, "logps/chosen": -295.39105224609375, "logps/rejected": -359.6271057128906, "loss": 1.0223, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3228278160095215, "rewards/margins": 0.022721003741025925, "rewards/rejected": 2.3001067638397217, "step": 74980 }, { "epoch": 3.4815915316402806, "grad_norm": 107.55245971679688, "learning_rate": 9.11992200194995e-08, "logits/chosen": -18.86675453186035, "logits/rejected": -18.443632125854492, "logps/chosen": -351.63250732421875, "logps/rejected": -306.3460998535156, "loss": 0.8247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.054130792617798, "rewards/margins": 
0.7666023969650269, "rewards/rejected": 2.2875285148620605, "step": 74990 }, { "epoch": 3.4820558057477133, "grad_norm": 57.3116455078125, "learning_rate": 9.117136357305353e-08, "logits/chosen": -19.406496047973633, "logits/rejected": -17.319978713989258, "logps/chosen": -508.4261779785156, "logps/rejected": -325.72943115234375, "loss": 0.22, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.662012577056885, "rewards/margins": 3.0228400230407715, "rewards/rejected": 2.639172315597534, "step": 75000 }, { "epoch": 3.4825200798551466, "grad_norm": 115.45230865478516, "learning_rate": 9.114350712660755e-08, "logits/chosen": -20.08999252319336, "logits/rejected": -19.78362274169922, "logps/chosen": -484.94000244140625, "logps/rejected": -395.2523498535156, "loss": 0.8736, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.806832313537598, "rewards/margins": 0.7671881914138794, "rewards/rejected": 4.039644241333008, "step": 75010 }, { "epoch": 3.4829843539625793, "grad_norm": 179.42562866210938, "learning_rate": 9.111565068016157e-08, "logits/chosen": -19.53168296813965, "logits/rejected": -18.882591247558594, "logps/chosen": -436.00177001953125, "logps/rejected": -404.7894592285156, "loss": 0.4859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5802464485168457, "rewards/margins": 0.8853128552436829, "rewards/rejected": 2.6949336528778076, "step": 75020 }, { "epoch": 3.4834486280700125, "grad_norm": 1.3165148496627808, "learning_rate": 9.108779423371558e-08, "logits/chosen": -19.123306274414062, "logits/rejected": -17.990198135375977, "logps/chosen": -432.52978515625, "logps/rejected": -290.1860046386719, "loss": 0.4706, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.122818946838379, "rewards/margins": 2.5943548679351807, "rewards/rejected": 1.5284638404846191, "step": 75030 }, { "epoch": 3.4839129021774458, "grad_norm": 2.6041810512542725, "learning_rate": 9.105993778726959e-08, "logits/chosen": -19.984338760375977, 
"logits/rejected": -19.05925941467285, "logps/chosen": -397.9967346191406, "logps/rejected": -289.06512451171875, "loss": 0.6327, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.961560010910034, "rewards/margins": 1.6100826263427734, "rewards/rejected": 2.3514771461486816, "step": 75040 }, { "epoch": 3.4843771762848785, "grad_norm": 1.1694631576538086, "learning_rate": 9.103208134082362e-08, "logits/chosen": -20.517841339111328, "logits/rejected": -19.030391693115234, "logps/chosen": -436.0679626464844, "logps/rejected": -293.54730224609375, "loss": 0.3006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.67994499206543, "rewards/margins": 2.823674440383911, "rewards/rejected": 1.8562710285186768, "step": 75050 }, { "epoch": 3.4848414503923117, "grad_norm": 95.19849395751953, "learning_rate": 9.100422489437763e-08, "logits/chosen": -19.660812377929688, "logits/rejected": -19.115901947021484, "logps/chosen": -509.72198486328125, "logps/rejected": -459.02691650390625, "loss": 0.8463, "rewards/accuracies": 0.5, "rewards/chosen": 4.268886089324951, "rewards/margins": 0.6409501433372498, "rewards/rejected": 3.6279358863830566, "step": 75060 }, { "epoch": 3.4853057244997445, "grad_norm": 303.1543273925781, "learning_rate": 9.097636844793166e-08, "logits/chosen": -18.963773727416992, "logits/rejected": -18.14107894897461, "logps/chosen": -519.501708984375, "logps/rejected": -437.33306884765625, "loss": 0.922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.858757972717285, "rewards/margins": 1.5799858570098877, "rewards/rejected": 3.2787718772888184, "step": 75070 }, { "epoch": 3.4857699986071777, "grad_norm": 183.6333770751953, "learning_rate": 9.094851200148566e-08, "logits/chosen": -18.82591438293457, "logits/rejected": -18.052936553955078, "logps/chosen": -431.9241638183594, "logps/rejected": -363.82916259765625, "loss": 1.0994, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.588627576828003, "rewards/margins": 
1.3452221155166626, "rewards/rejected": 2.24340558052063, "step": 75080 }, { "epoch": 3.4862342727146105, "grad_norm": 65.37751770019531, "learning_rate": 9.092065555503969e-08, "logits/chosen": -19.13043212890625, "logits/rejected": -18.37592315673828, "logps/chosen": -373.8257141113281, "logps/rejected": -301.7623596191406, "loss": 0.4857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0103635787963867, "rewards/margins": 0.8966075778007507, "rewards/rejected": 2.113755941390991, "step": 75090 }, { "epoch": 3.4866985468220437, "grad_norm": 41.4888916015625, "learning_rate": 9.08927991085937e-08, "logits/chosen": -18.261104583740234, "logits/rejected": -16.84372329711914, "logps/chosen": -447.9775390625, "logps/rejected": -273.8619079589844, "loss": 0.9183, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3196792602539062, "rewards/margins": 1.9549520015716553, "rewards/rejected": 1.3647269010543823, "step": 75100 }, { "epoch": 3.487162820929477, "grad_norm": 202.9356231689453, "learning_rate": 9.086494266214773e-08, "logits/chosen": -19.02578353881836, "logits/rejected": -18.50303840637207, "logps/chosen": -432.6897888183594, "logps/rejected": -394.82513427734375, "loss": 1.0001, "rewards/accuracies": 0.5, "rewards/chosen": 3.3772029876708984, "rewards/margins": 0.3738863170146942, "rewards/rejected": 3.003316640853882, "step": 75110 }, { "epoch": 3.4876270950369097, "grad_norm": 2.7043168544769287, "learning_rate": 9.083708621570175e-08, "logits/chosen": -20.289905548095703, "logits/rejected": -18.249958038330078, "logps/chosen": -441.97747802734375, "logps/rejected": -326.12493896484375, "loss": 0.1636, "rewards/accuracies": 1.0, "rewards/chosen": 4.867795944213867, "rewards/margins": 2.3288424015045166, "rewards/rejected": 2.5389530658721924, "step": 75120 }, { "epoch": 3.488091369144343, "grad_norm": 292.9767761230469, "learning_rate": 9.080922976925577e-08, "logits/chosen": -20.800081253051758, "logits/rejected": 
-19.570758819580078, "logps/chosen": -417.454345703125, "logps/rejected": -335.77294921875, "loss": 0.9283, "rewards/accuracies": 0.5, "rewards/chosen": 4.057323455810547, "rewards/margins": 0.7890671491622925, "rewards/rejected": 3.2682559490203857, "step": 75130 }, { "epoch": 3.4885556432517757, "grad_norm": 76.4549789428711, "learning_rate": 9.078137332280978e-08, "logits/chosen": -18.240856170654297, "logits/rejected": -17.75021743774414, "logps/chosen": -299.4783630371094, "logps/rejected": -275.24298095703125, "loss": 1.3329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.041210651397705, "rewards/margins": 0.575440526008606, "rewards/rejected": 2.4657700061798096, "step": 75140 }, { "epoch": 3.489019917359209, "grad_norm": 4.405886173248291, "learning_rate": 9.07535168763638e-08, "logits/chosen": -19.054296493530273, "logits/rejected": -17.798717498779297, "logps/chosen": -386.45562744140625, "logps/rejected": -312.19775390625, "loss": 0.5974, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.159247398376465, "rewards/margins": 1.6532455682754517, "rewards/rejected": 1.5060017108917236, "step": 75150 }, { "epoch": 3.4894841914666417, "grad_norm": 1.3563745021820068, "learning_rate": 9.072566042991782e-08, "logits/chosen": -18.893077850341797, "logits/rejected": -17.550186157226562, "logps/chosen": -369.59283447265625, "logps/rejected": -237.3253936767578, "loss": 0.2312, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.2514448165893555, "rewards/margins": 2.624131679534912, "rewards/rejected": 1.627312421798706, "step": 75160 }, { "epoch": 3.489948465574075, "grad_norm": 122.47232055664062, "learning_rate": 9.069780398347185e-08, "logits/chosen": -18.73911476135254, "logits/rejected": -17.721834182739258, "logps/chosen": -395.48040771484375, "logps/rejected": -268.19586181640625, "loss": 0.5549, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2929813861846924, "rewards/margins": 1.96096932888031, 
"rewards/rejected": 1.3320118188858032, "step": 75170 }, { "epoch": 3.490412739681508, "grad_norm": 56.5315055847168, "learning_rate": 9.066994753702586e-08, "logits/chosen": -19.07517433166504, "logits/rejected": -17.642284393310547, "logps/chosen": -474.77032470703125, "logps/rejected": -270.76385498046875, "loss": 0.2676, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.901007652282715, "rewards/margins": 2.730067491531372, "rewards/rejected": 2.170940399169922, "step": 75180 }, { "epoch": 3.490877013788941, "grad_norm": 71.31944274902344, "learning_rate": 9.064209109057987e-08, "logits/chosen": -18.764728546142578, "logits/rejected": -17.844104766845703, "logps/chosen": -432.29571533203125, "logps/rejected": -342.53521728515625, "loss": 0.3911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.154923439025879, "rewards/margins": 1.8283460140228271, "rewards/rejected": 2.326577663421631, "step": 75190 }, { "epoch": 3.491341287896374, "grad_norm": 20.714479446411133, "learning_rate": 9.061423464413389e-08, "logits/chosen": -18.958179473876953, "logits/rejected": -18.46787452697754, "logps/chosen": -449.95599365234375, "logps/rejected": -411.28570556640625, "loss": 0.8708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5526492595672607, "rewards/margins": 1.2168471813201904, "rewards/rejected": 2.3358020782470703, "step": 75200 }, { "epoch": 3.491805562003807, "grad_norm": 246.114990234375, "learning_rate": 9.058637819768792e-08, "logits/chosen": -19.645652770996094, "logits/rejected": -19.090967178344727, "logps/chosen": -403.11541748046875, "logps/rejected": -373.20709228515625, "loss": 0.5847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.48734974861145, "rewards/margins": 0.7777423858642578, "rewards/rejected": 2.7096071243286133, "step": 75210 }, { "epoch": 3.49226983611124, "grad_norm": 291.6455383300781, "learning_rate": 9.055852175124193e-08, "logits/chosen": -21.565488815307617, "logits/rejected": 
-19.47881507873535, "logps/chosen": -400.49676513671875, "logps/rejected": -366.2923889160156, "loss": 0.5744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.825105667114258, "rewards/margins": 1.0220022201538086, "rewards/rejected": 2.803102970123291, "step": 75220 }, { "epoch": 3.492734110218673, "grad_norm": 37.95609664916992, "learning_rate": 9.053066530479596e-08, "logits/chosen": -18.568317413330078, "logits/rejected": -17.650943756103516, "logps/chosen": -577.0823974609375, "logps/rejected": -372.4156799316406, "loss": 0.544, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.644026517868042, "rewards/margins": 1.4003403186798096, "rewards/rejected": 2.2436869144439697, "step": 75230 }, { "epoch": 3.493198384326106, "grad_norm": 33.221588134765625, "learning_rate": 9.050280885834996e-08, "logits/chosen": -18.837284088134766, "logits/rejected": -18.087913513183594, "logps/chosen": -341.5160217285156, "logps/rejected": -254.88363647460938, "loss": 0.2913, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.964754104614258, "rewards/margins": 1.6623947620391846, "rewards/rejected": 1.3023592233657837, "step": 75240 }, { "epoch": 3.4936626584335393, "grad_norm": 178.41323852539062, "learning_rate": 9.047495241190398e-08, "logits/chosen": -18.721166610717773, "logits/rejected": -19.042081832885742, "logps/chosen": -347.76019287109375, "logps/rejected": -365.3157043457031, "loss": 1.8109, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3186416625976562, "rewards/margins": -1.165846347808838, "rewards/rejected": 3.484488010406494, "step": 75250 }, { "epoch": 3.494126932540972, "grad_norm": 169.5991973876953, "learning_rate": 9.0447095965458e-08, "logits/chosen": -19.818798065185547, "logits/rejected": -18.58774757385254, "logps/chosen": -220.5245819091797, "logps/rejected": -206.5916290283203, "loss": 0.5396, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1372368335723877, "rewards/margins": 
0.7763179540634155, "rewards/rejected": 1.3609187602996826, "step": 75260 }, { "epoch": 3.4945912066484053, "grad_norm": 73.395263671875, "learning_rate": 9.041923951901202e-08, "logits/chosen": -18.823904037475586, "logits/rejected": -18.089160919189453, "logps/chosen": -456.64447021484375, "logps/rejected": -381.25115966796875, "loss": 0.9467, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.890577554702759, "rewards/margins": 0.9062501788139343, "rewards/rejected": 1.9843275547027588, "step": 75270 }, { "epoch": 3.4950554807558385, "grad_norm": 19.471290588378906, "learning_rate": 9.039138307256605e-08, "logits/chosen": -19.21236801147461, "logits/rejected": -19.3237247467041, "logps/chosen": -373.61492919921875, "logps/rejected": -422.0011291503906, "loss": 0.8252, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.887510299682617, "rewards/margins": 0.33936959505081177, "rewards/rejected": 2.54814076423645, "step": 75280 }, { "epoch": 3.4955197548632713, "grad_norm": 50.3163948059082, "learning_rate": 9.036352662612005e-08, "logits/chosen": -18.836387634277344, "logits/rejected": -18.56197738647461, "logps/chosen": -365.6360168457031, "logps/rejected": -337.9533386230469, "loss": 1.1866, "rewards/accuracies": 0.5, "rewards/chosen": 2.9330856800079346, "rewards/margins": 0.23934109508991241, "rewards/rejected": 2.693744659423828, "step": 75290 }, { "epoch": 3.4959840289707045, "grad_norm": 60.9925537109375, "learning_rate": 9.033567017967407e-08, "logits/chosen": -19.64150047302246, "logits/rejected": -19.578813552856445, "logps/chosen": -512.5613403320312, "logps/rejected": -510.6544494628906, "loss": 1.4495, "rewards/accuracies": 0.5, "rewards/chosen": 4.400454044342041, "rewards/margins": -0.12707142531871796, "rewards/rejected": 4.527525424957275, "step": 75300 }, { "epoch": 3.4964483030781373, "grad_norm": 41.18557357788086, "learning_rate": 9.030781373322809e-08, "logits/chosen": -21.521991729736328, "logits/rejected": 
-20.11467170715332, "logps/chosen": -373.46759033203125, "logps/rejected": -351.07012939453125, "loss": 0.7213, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8098175525665283, "rewards/margins": 0.5145767331123352, "rewards/rejected": 3.295240879058838, "step": 75310 }, { "epoch": 3.4969125771855705, "grad_norm": 53.9564094543457, "learning_rate": 9.027995728678212e-08, "logits/chosen": -19.603120803833008, "logits/rejected": -18.255435943603516, "logps/chosen": -353.3028564453125, "logps/rejected": -245.22848510742188, "loss": 0.935, "rewards/accuracies": 0.5, "rewards/chosen": 3.7075672149658203, "rewards/margins": 1.7854681015014648, "rewards/rejected": 1.9220987558364868, "step": 75320 }, { "epoch": 3.4973768512930032, "grad_norm": 106.98287963867188, "learning_rate": 9.025210084033613e-08, "logits/chosen": -18.25539207458496, "logits/rejected": -17.945323944091797, "logps/chosen": -310.3114929199219, "logps/rejected": -265.49200439453125, "loss": 0.9478, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7106354236602783, "rewards/margins": -0.13546086847782135, "rewards/rejected": 1.846096396446228, "step": 75330 }, { "epoch": 3.4978411254004365, "grad_norm": 88.43853759765625, "learning_rate": 9.022424439389016e-08, "logits/chosen": -19.109926223754883, "logits/rejected": -18.489093780517578, "logps/chosen": -349.43865966796875, "logps/rejected": -304.6729431152344, "loss": 0.6055, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9688382148742676, "rewards/margins": 1.0924537181854248, "rewards/rejected": 1.8763847351074219, "step": 75340 }, { "epoch": 3.4983053995078697, "grad_norm": 92.72421264648438, "learning_rate": 9.019638794744416e-08, "logits/chosen": -19.408039093017578, "logits/rejected": -18.498443603515625, "logps/chosen": -318.5009765625, "logps/rejected": -255.07705688476562, "loss": 0.8193, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4358134269714355, "rewards/margins": 0.7565475702285767, 
"rewards/rejected": 1.6792659759521484, "step": 75350 }, { "epoch": 3.4987696736153024, "grad_norm": 32.63978576660156, "learning_rate": 9.016853150099819e-08, "logits/chosen": -19.160175323486328, "logits/rejected": -18.635690689086914, "logps/chosen": -345.630126953125, "logps/rejected": -325.98931884765625, "loss": 0.5937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.114387035369873, "rewards/margins": 0.645740270614624, "rewards/rejected": 2.468647003173828, "step": 75360 }, { "epoch": 3.4992339477227357, "grad_norm": 30.620203018188477, "learning_rate": 9.01406750545522e-08, "logits/chosen": -19.591617584228516, "logits/rejected": -18.471782684326172, "logps/chosen": -428.45001220703125, "logps/rejected": -279.9345397949219, "loss": 0.4406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7688488960266113, "rewards/margins": 1.7321889400482178, "rewards/rejected": 2.0366597175598145, "step": 75370 }, { "epoch": 3.4996982218301684, "grad_norm": 58.88489532470703, "learning_rate": 9.011281860810623e-08, "logits/chosen": -19.660099029541016, "logits/rejected": -17.85623550415039, "logps/chosen": -378.0914001464844, "logps/rejected": -294.224853515625, "loss": 0.5277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.24650764465332, "rewards/margins": 2.1330020427703857, "rewards/rejected": 2.1135058403015137, "step": 75380 }, { "epoch": 3.5001624959376016, "grad_norm": 5.765925407409668, "learning_rate": 9.008496216166024e-08, "logits/chosen": -19.479595184326172, "logits/rejected": -18.552139282226562, "logps/chosen": -401.2421569824219, "logps/rejected": -329.56317138671875, "loss": 0.4822, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.034759044647217, "rewards/margins": 2.503668785095215, "rewards/rejected": 1.531090497970581, "step": 75390 }, { "epoch": 3.5006267700450344, "grad_norm": 85.47014617919922, "learning_rate": 9.005710571521426e-08, "logits/chosen": -19.04727554321289, "logits/rejected": 
-18.248504638671875, "logps/chosen": -371.065185546875, "logps/rejected": -277.6927795410156, "loss": 0.3285, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0608537197113037, "rewards/margins": 1.6767905950546265, "rewards/rejected": 1.3840631246566772, "step": 75400 }, { "epoch": 3.5010910441524676, "grad_norm": 130.58412170410156, "learning_rate": 9.002924926876827e-08, "logits/chosen": -20.388919830322266, "logits/rejected": -19.374013900756836, "logps/chosen": -393.87823486328125, "logps/rejected": -467.90771484375, "loss": 1.3071, "rewards/accuracies": 0.5, "rewards/chosen": 4.054421901702881, "rewards/margins": -0.22294290363788605, "rewards/rejected": 4.277365207672119, "step": 75410 }, { "epoch": 3.501555318259901, "grad_norm": 25.019014358520508, "learning_rate": 9.00013928223223e-08, "logits/chosen": -19.529815673828125, "logits/rejected": -18.337589263916016, "logps/chosen": -418.4295959472656, "logps/rejected": -330.74169921875, "loss": 0.3109, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.233835697174072, "rewards/margins": 1.935325026512146, "rewards/rejected": 2.2985105514526367, "step": 75420 }, { "epoch": 3.5020195923673336, "grad_norm": 46.32356262207031, "learning_rate": 8.997353637587632e-08, "logits/chosen": -19.404247283935547, "logits/rejected": -18.76036834716797, "logps/chosen": -292.38946533203125, "logps/rejected": -229.367431640625, "loss": 0.7225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5360732078552246, "rewards/margins": 0.7612656354904175, "rewards/rejected": 1.7748076915740967, "step": 75430 }, { "epoch": 3.502483866474767, "grad_norm": 190.86129760742188, "learning_rate": 8.994567992943032e-08, "logits/chosen": -18.501079559326172, "logits/rejected": -19.204313278198242, "logps/chosen": -300.8561096191406, "logps/rejected": -373.77960205078125, "loss": 1.2849, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.350874423980713, "rewards/margins": -0.5191607475280762, 
"rewards/rejected": 2.87003493309021, "step": 75440 }, { "epoch": 3.5029481405821996, "grad_norm": 0.9810624122619629, "learning_rate": 8.991782348298435e-08, "logits/chosen": -20.173303604125977, "logits/rejected": -18.366291046142578, "logps/chosen": -374.6872253417969, "logps/rejected": -279.1205749511719, "loss": 0.3105, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9541728496551514, "rewards/margins": 2.156907558441162, "rewards/rejected": 1.7972652912139893, "step": 75450 }, { "epoch": 3.503412414689633, "grad_norm": 49.27532958984375, "learning_rate": 8.988996703653836e-08, "logits/chosen": -19.433080673217773, "logits/rejected": -19.202350616455078, "logps/chosen": -364.45758056640625, "logps/rejected": -321.5481872558594, "loss": 0.7187, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3798980712890625, "rewards/margins": 0.29543301463127136, "rewards/rejected": 2.0844650268554688, "step": 75460 }, { "epoch": 3.5038766887970656, "grad_norm": 2.8590848445892334, "learning_rate": 8.986211059009239e-08, "logits/chosen": -19.272258758544922, "logits/rejected": -17.672122955322266, "logps/chosen": -382.9964599609375, "logps/rejected": -280.72344970703125, "loss": 0.353, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7190845012664795, "rewards/margins": 1.7970142364501953, "rewards/rejected": 1.9220702648162842, "step": 75470 }, { "epoch": 3.504340962904499, "grad_norm": 64.22391510009766, "learning_rate": 8.98342541436464e-08, "logits/chosen": -19.39063262939453, "logits/rejected": -19.169519424438477, "logps/chosen": -468.05975341796875, "logps/rejected": -479.1809997558594, "loss": 0.4153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.587121963500977, "rewards/margins": 1.1088863611221313, "rewards/rejected": 4.478235244750977, "step": 75480 }, { "epoch": 3.504805237011932, "grad_norm": 2.7391867637634277, "learning_rate": 8.980639769720043e-08, "logits/chosen": -18.275646209716797, "logits/rejected": 
-18.905080795288086, "logps/chosen": -443.84210205078125, "logps/rejected": -362.6297912597656, "loss": 0.5353, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.624150276184082, "rewards/margins": 1.9320180416107178, "rewards/rejected": 2.6921324729919434, "step": 75490 }, { "epoch": 3.505269511119365, "grad_norm": 23.914745330810547, "learning_rate": 8.977854125075443e-08, "logits/chosen": -19.755420684814453, "logits/rejected": -18.79264259338379, "logps/chosen": -451.0400390625, "logps/rejected": -305.84185791015625, "loss": 0.4717, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.088789939880371, "rewards/margins": 2.0989623069763184, "rewards/rejected": 1.9898271560668945, "step": 75500 }, { "epoch": 3.505733785226798, "grad_norm": 237.58909606933594, "learning_rate": 8.975068480430846e-08, "logits/chosen": -19.191408157348633, "logits/rejected": -19.016117095947266, "logps/chosen": -396.6573791503906, "logps/rejected": -371.06683349609375, "loss": 1.0479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7232418060302734, "rewards/margins": 0.6627318859100342, "rewards/rejected": 3.06050968170166, "step": 75510 }, { "epoch": 3.506198059334231, "grad_norm": 47.25602722167969, "learning_rate": 8.972282835786247e-08, "logits/chosen": -18.86632537841797, "logits/rejected": -17.74770736694336, "logps/chosen": -336.5406799316406, "logps/rejected": -236.2366180419922, "loss": 0.3578, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6074297428131104, "rewards/margins": 1.9175310134887695, "rewards/rejected": 0.6898987293243408, "step": 75520 }, { "epoch": 3.506662333441664, "grad_norm": 20.979360580444336, "learning_rate": 8.96949719114165e-08, "logits/chosen": -19.09323501586914, "logits/rejected": -18.10311508178711, "logps/chosen": -425.89373779296875, "logps/rejected": -297.99005126953125, "loss": 0.4005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7077553272247314, "rewards/margins": 
1.9533119201660156, "rewards/rejected": 1.7544434070587158, "step": 75530 }, { "epoch": 3.5071266075490968, "grad_norm": 72.78216552734375, "learning_rate": 8.966711546497052e-08, "logits/chosen": -18.26882553100586, "logits/rejected": -17.350854873657227, "logps/chosen": -322.076416015625, "logps/rejected": -224.8119659423828, "loss": 0.6435, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7740695476531982, "rewards/margins": 1.747675895690918, "rewards/rejected": 1.0263941287994385, "step": 75540 }, { "epoch": 3.50759088165653, "grad_norm": 101.53347778320312, "learning_rate": 8.963925901852454e-08, "logits/chosen": -19.201696395874023, "logits/rejected": -18.65293312072754, "logps/chosen": -444.739013671875, "logps/rejected": -361.9396057128906, "loss": 0.476, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0816810131073, "rewards/margins": 1.2280471324920654, "rewards/rejected": 1.8536338806152344, "step": 75550 }, { "epoch": 3.508055155763963, "grad_norm": 108.48125457763672, "learning_rate": 8.961140257207855e-08, "logits/chosen": -19.7380428314209, "logits/rejected": -18.703641891479492, "logps/chosen": -406.182373046875, "logps/rejected": -290.7426452636719, "loss": 0.4894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7663817405700684, "rewards/margins": 1.6468899250030518, "rewards/rejected": 2.1194918155670166, "step": 75560 }, { "epoch": 3.508519429871396, "grad_norm": 55.55358123779297, "learning_rate": 8.958354612563257e-08, "logits/chosen": -19.37287139892578, "logits/rejected": -18.543258666992188, "logps/chosen": -516.9564819335938, "logps/rejected": -456.5564880371094, "loss": 1.3909, "rewards/accuracies": 0.5, "rewards/chosen": 4.753678321838379, "rewards/margins": 0.7918068766593933, "rewards/rejected": 3.961871385574341, "step": 75570 }, { "epoch": 3.508983703978829, "grad_norm": 1.22081458568573, "learning_rate": 8.955568967918659e-08, "logits/chosen": -19.562593460083008, "logits/rejected": 
-18.29072380065918, "logps/chosen": -359.95123291015625, "logps/rejected": -268.4869079589844, "loss": 0.477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0509603023529053, "rewards/margins": 1.7256011962890625, "rewards/rejected": 1.3253591060638428, "step": 75580 }, { "epoch": 3.509447978086262, "grad_norm": 0.04051715135574341, "learning_rate": 8.952783323274062e-08, "logits/chosen": -19.436267852783203, "logits/rejected": -18.051755905151367, "logps/chosen": -461.56768798828125, "logps/rejected": -338.3779602050781, "loss": 0.4222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9042325019836426, "rewards/margins": 2.1697189807891846, "rewards/rejected": 1.734513521194458, "step": 75590 }, { "epoch": 3.509912252193695, "grad_norm": 10.26791763305664, "learning_rate": 8.949997678629463e-08, "logits/chosen": -19.619003295898438, "logits/rejected": -18.355892181396484, "logps/chosen": -505.43115234375, "logps/rejected": -280.46533203125, "loss": 0.1586, "rewards/accuracies": 1.0, "rewards/chosen": 5.57455587387085, "rewards/margins": 3.3565750122070312, "rewards/rejected": 2.2179808616638184, "step": 75600 }, { "epoch": 3.510376526301128, "grad_norm": 44.61009216308594, "learning_rate": 8.947212033984864e-08, "logits/chosen": -20.056270599365234, "logits/rejected": -18.91086196899414, "logps/chosen": -441.51025390625, "logps/rejected": -349.0826110839844, "loss": 0.3983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.175843715667725, "rewards/margins": 1.3737828731536865, "rewards/rejected": 2.802060604095459, "step": 75610 }, { "epoch": 3.510840800408561, "grad_norm": 134.75997924804688, "learning_rate": 8.944426389340266e-08, "logits/chosen": -20.09784507751465, "logits/rejected": -19.860605239868164, "logps/chosen": -371.65631103515625, "logps/rejected": -408.85089111328125, "loss": 0.9314, "rewards/accuracies": 0.5, "rewards/chosen": 2.9677205085754395, "rewards/margins": 0.0806334987282753, "rewards/rejected": 
2.887086868286133, "step": 75620 }, { "epoch": 3.5113050745159944, "grad_norm": 27.897228240966797, "learning_rate": 8.941640744695667e-08, "logits/chosen": -19.72255516052246, "logits/rejected": -18.045568466186523, "logps/chosen": -404.93463134765625, "logps/rejected": -278.0543212890625, "loss": 0.3924, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.506537437438965, "rewards/margins": 2.366637706756592, "rewards/rejected": 2.139899730682373, "step": 75630 }, { "epoch": 3.511769348623427, "grad_norm": 147.5668487548828, "learning_rate": 8.93885510005107e-08, "logits/chosen": -19.347476959228516, "logits/rejected": -18.600910186767578, "logps/chosen": -406.40966796875, "logps/rejected": -346.6222229003906, "loss": 0.3963, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.57474946975708, "rewards/margins": 1.8137023448944092, "rewards/rejected": 1.7610470056533813, "step": 75640 }, { "epoch": 3.5122336227308604, "grad_norm": 0.12274283915758133, "learning_rate": 8.93606945540647e-08, "logits/chosen": -20.161130905151367, "logits/rejected": -19.46392059326172, "logps/chosen": -435.074462890625, "logps/rejected": -353.2099304199219, "loss": 0.5313, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.67714786529541, "rewards/margins": 2.0624401569366455, "rewards/rejected": 2.6147074699401855, "step": 75650 }, { "epoch": 3.5126978968382936, "grad_norm": 149.2752685546875, "learning_rate": 8.933283810761873e-08, "logits/chosen": -18.761028289794922, "logits/rejected": -18.868478775024414, "logps/chosen": -333.1805114746094, "logps/rejected": -346.6177978515625, "loss": 1.2953, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.525635242462158, "rewards/margins": -0.4082562327384949, "rewards/rejected": 2.9338912963867188, "step": 75660 }, { "epoch": 3.5131621709457264, "grad_norm": 23.92789077758789, "learning_rate": 8.930498166117274e-08, "logits/chosen": -19.185222625732422, "logits/rejected": -17.931766510009766, 
"logps/chosen": -442.35711669921875, "logps/rejected": -311.35791015625, "loss": 0.3466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.387097358703613, "rewards/margins": 2.4307594299316406, "rewards/rejected": 1.9563379287719727, "step": 75670 }, { "epoch": 3.513626445053159, "grad_norm": 78.48548126220703, "learning_rate": 8.927712521472677e-08, "logits/chosen": -18.673236846923828, "logits/rejected": -18.765594482421875, "logps/chosen": -294.687255859375, "logps/rejected": -286.0040283203125, "loss": 0.48, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6611995697021484, "rewards/margins": 0.9025014042854309, "rewards/rejected": 0.7586981654167175, "step": 75680 }, { "epoch": 3.5140907191605923, "grad_norm": 3.8360016345977783, "learning_rate": 8.924926876828079e-08, "logits/chosen": -19.24622917175293, "logits/rejected": -18.48349952697754, "logps/chosen": -390.02081298828125, "logps/rejected": -288.1027526855469, "loss": 0.5251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.315030574798584, "rewards/margins": 1.1193426847457886, "rewards/rejected": 2.195688247680664, "step": 75690 }, { "epoch": 3.5145549932680256, "grad_norm": 28.395645141601562, "learning_rate": 8.922141232183481e-08, "logits/chosen": -18.98404312133789, "logits/rejected": -18.66910171508789, "logps/chosen": -374.93756103515625, "logps/rejected": -340.6091613769531, "loss": 1.0628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.852137804031372, "rewards/margins": 0.12732906639575958, "rewards/rejected": 2.724808931350708, "step": 75700 }, { "epoch": 3.5150192673754583, "grad_norm": 322.2734069824219, "learning_rate": 8.919355587538882e-08, "logits/chosen": -18.79056167602539, "logits/rejected": -18.393396377563477, "logps/chosen": -432.857666015625, "logps/rejected": -447.71136474609375, "loss": 0.9239, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7858238220214844, "rewards/margins": 0.4768602252006531, 
"rewards/rejected": 3.3089632987976074, "step": 75710 }, { "epoch": 3.5154835414828915, "grad_norm": 204.6485137939453, "learning_rate": 8.916569942894284e-08, "logits/chosen": -18.62674331665039, "logits/rejected": -17.961315155029297, "logps/chosen": -358.16510009765625, "logps/rejected": -266.26239013671875, "loss": 0.7346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7432525157928467, "rewards/margins": 1.3874595165252686, "rewards/rejected": 2.355792760848999, "step": 75720 }, { "epoch": 3.5159478155903248, "grad_norm": 68.34375, "learning_rate": 8.913784298249686e-08, "logits/chosen": -19.315380096435547, "logits/rejected": -18.433223724365234, "logps/chosen": -349.481689453125, "logps/rejected": -263.50750732421875, "loss": 0.3401, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4877021312713623, "rewards/margins": 1.3315620422363281, "rewards/rejected": 2.156139850616455, "step": 75730 }, { "epoch": 3.5164120896977575, "grad_norm": 47.81211853027344, "learning_rate": 8.910998653605089e-08, "logits/chosen": -19.95584487915039, "logits/rejected": -20.034616470336914, "logps/chosen": -470.8326721191406, "logps/rejected": -451.30120849609375, "loss": 0.4914, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.511548042297363, "rewards/margins": 1.3474609851837158, "rewards/rejected": 3.1640872955322266, "step": 75740 }, { "epoch": 3.5168763638051903, "grad_norm": 199.35643005371094, "learning_rate": 8.90821300896049e-08, "logits/chosen": -19.750446319580078, "logits/rejected": -18.696924209594727, "logps/chosen": -381.095458984375, "logps/rejected": -381.41973876953125, "loss": 0.9848, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3666396141052246, "rewards/margins": 0.7131723761558533, "rewards/rejected": 2.6534676551818848, "step": 75750 }, { "epoch": 3.5173406379126235, "grad_norm": 105.27423858642578, "learning_rate": 8.905427364315893e-08, "logits/chosen": -19.080928802490234, "logits/rejected": 
-18.724197387695312, "logps/chosen": -426.44561767578125, "logps/rejected": -377.87835693359375, "loss": 0.7914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7560698986053467, "rewards/margins": 0.8982227444648743, "rewards/rejected": 2.8578476905822754, "step": 75760 }, { "epoch": 3.5178049120200567, "grad_norm": 74.00476837158203, "learning_rate": 8.902641719671293e-08, "logits/chosen": -19.669170379638672, "logits/rejected": -19.07965087890625, "logps/chosen": -429.86285400390625, "logps/rejected": -379.68023681640625, "loss": 0.9327, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7811279296875, "rewards/margins": 0.7856036424636841, "rewards/rejected": 2.9955246448516846, "step": 75770 }, { "epoch": 3.5182691861274895, "grad_norm": 678.2096557617188, "learning_rate": 8.899856075026696e-08, "logits/chosen": -19.470109939575195, "logits/rejected": -18.516544342041016, "logps/chosen": -460.7483825683594, "logps/rejected": -344.5335998535156, "loss": 0.927, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.38468074798584, "rewards/margins": 1.4674510955810547, "rewards/rejected": 2.9172306060791016, "step": 75780 }, { "epoch": 3.5187334602349227, "grad_norm": 14.495609283447266, "learning_rate": 8.897070430382097e-08, "logits/chosen": -18.791667938232422, "logits/rejected": -18.539669036865234, "logps/chosen": -358.68310546875, "logps/rejected": -319.42291259765625, "loss": 0.5588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0934648513793945, "rewards/margins": 1.0483729839324951, "rewards/rejected": 3.0450916290283203, "step": 75790 }, { "epoch": 3.519197734342356, "grad_norm": 18.444660186767578, "learning_rate": 8.8942847857375e-08, "logits/chosen": -19.71592903137207, "logits/rejected": -17.982311248779297, "logps/chosen": -443.04827880859375, "logps/rejected": -362.1477355957031, "loss": 0.5241, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.541057109832764, "rewards/margins": 
2.150634765625, "rewards/rejected": 2.3904225826263428, "step": 75800 }, { "epoch": 3.5196620084497887, "grad_norm": 35.03835678100586, "learning_rate": 8.891499141092901e-08, "logits/chosen": -18.810932159423828, "logits/rejected": -18.647518157958984, "logps/chosen": -319.8563537597656, "logps/rejected": -310.47100830078125, "loss": 0.7107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5336616039276123, "rewards/margins": 0.3797615170478821, "rewards/rejected": 2.153900384902954, "step": 75810 }, { "epoch": 3.520126282557222, "grad_norm": 59.80253601074219, "learning_rate": 8.888713496448303e-08, "logits/chosen": -18.831687927246094, "logits/rejected": -18.691274642944336, "logps/chosen": -318.368408203125, "logps/rejected": -329.1820068359375, "loss": 0.652, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.457911729812622, "rewards/margins": 0.6164841651916504, "rewards/rejected": 1.8414275646209717, "step": 75820 }, { "epoch": 3.5205905566646547, "grad_norm": 242.24244689941406, "learning_rate": 8.885927851803704e-08, "logits/chosen": -19.150814056396484, "logits/rejected": -18.295011520385742, "logps/chosen": -339.9497375488281, "logps/rejected": -338.244140625, "loss": 1.1742, "rewards/accuracies": 0.5, "rewards/chosen": 2.88940691947937, "rewards/margins": 0.20193564891815186, "rewards/rejected": 2.687471628189087, "step": 75830 }, { "epoch": 3.521054830772088, "grad_norm": 12.45384407043457, "learning_rate": 8.883142207159106e-08, "logits/chosen": -19.12602424621582, "logits/rejected": -17.082881927490234, "logps/chosen": -401.35418701171875, "logps/rejected": -202.4569549560547, "loss": 0.1725, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4434313774108887, "rewards/margins": 2.888518810272217, "rewards/rejected": 0.5549125671386719, "step": 75840 }, { "epoch": 3.5215191048795207, "grad_norm": 90.7849349975586, "learning_rate": 8.880356562514509e-08, "logits/chosen": -19.28421401977539, "logits/rejected": 
-19.123516082763672, "logps/chosen": -347.344482421875, "logps/rejected": -287.7183532714844, "loss": 0.7262, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7329373359680176, "rewards/margins": 1.2350733280181885, "rewards/rejected": 1.4978643655776978, "step": 75850 }, { "epoch": 3.521983378986954, "grad_norm": 30.226598739624023, "learning_rate": 8.877570917869909e-08, "logits/chosen": -18.603656768798828, "logits/rejected": -17.912174224853516, "logps/chosen": -534.3770141601562, "logps/rejected": -333.5465393066406, "loss": 0.3933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.842991590499878, "rewards/margins": 1.9108402729034424, "rewards/rejected": 1.932151436805725, "step": 75860 }, { "epoch": 3.522447653094387, "grad_norm": 113.49352264404297, "learning_rate": 8.874785273225311e-08, "logits/chosen": -19.57815933227539, "logits/rejected": -18.038677215576172, "logps/chosen": -540.5620727539062, "logps/rejected": -391.45660400390625, "loss": 0.7395, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.962749004364014, "rewards/margins": 1.881077766418457, "rewards/rejected": 3.0816712379455566, "step": 75870 }, { "epoch": 3.52291192720182, "grad_norm": 2.0042343139648438, "learning_rate": 8.871999628580713e-08, "logits/chosen": -19.823026657104492, "logits/rejected": -17.3287410736084, "logps/chosen": -363.63140869140625, "logps/rejected": -210.32589721679688, "loss": 0.752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7315757274627686, "rewards/margins": 2.9777884483337402, "rewards/rejected": 0.7537875175476074, "step": 75880 }, { "epoch": 3.523376201309253, "grad_norm": 28.073102951049805, "learning_rate": 8.869213983936116e-08, "logits/chosen": -18.899320602416992, "logits/rejected": -18.07560157775879, "logps/chosen": -380.7845153808594, "logps/rejected": -320.79669189453125, "loss": 0.6028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.822265625, "rewards/margins": 1.115056037902832, 
"rewards/rejected": 1.7072093486785889, "step": 75890 }, { "epoch": 3.523840475416686, "grad_norm": 9.34694766998291, "learning_rate": 8.866428339291517e-08, "logits/chosen": -19.18099021911621, "logits/rejected": -18.02652359008789, "logps/chosen": -345.1844177246094, "logps/rejected": -271.0138244628906, "loss": 0.2238, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.314713478088379, "rewards/margins": 2.2703731060028076, "rewards/rejected": 2.0443410873413086, "step": 75900 }, { "epoch": 3.524304749524119, "grad_norm": 4.545977592468262, "learning_rate": 8.86364269464692e-08, "logits/chosen": -18.618446350097656, "logits/rejected": -18.105257034301758, "logps/chosen": -431.2588806152344, "logps/rejected": -368.77642822265625, "loss": 0.5273, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.894773006439209, "rewards/margins": 1.568426489830017, "rewards/rejected": 2.3263463973999023, "step": 75910 }, { "epoch": 3.524769023631552, "grad_norm": 1.0820810794830322, "learning_rate": 8.86085705000232e-08, "logits/chosen": -18.785348892211914, "logits/rejected": -18.319807052612305, "logps/chosen": -334.15167236328125, "logps/rejected": -316.10711669921875, "loss": 0.5237, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.842006206512451, "rewards/margins": 1.1505000591278076, "rewards/rejected": 1.6915056705474854, "step": 75920 }, { "epoch": 3.525233297738985, "grad_norm": 23.59467315673828, "learning_rate": 8.858071405357723e-08, "logits/chosen": -19.0755672454834, "logits/rejected": -19.08075523376465, "logps/chosen": -426.611083984375, "logps/rejected": -367.6171569824219, "loss": 0.4417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.274202346801758, "rewards/margins": 0.7404239773750305, "rewards/rejected": 2.5337777137756348, "step": 75930 }, { "epoch": 3.5256975718464183, "grad_norm": 13.421492576599121, "learning_rate": 8.855285760713124e-08, "logits/chosen": -19.33181381225586, "logits/rejected": 
-18.79637908935547, "logps/chosen": -287.05877685546875, "logps/rejected": -255.9013214111328, "loss": 0.8328, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1823348999023438, "rewards/margins": 0.4609549045562744, "rewards/rejected": 1.7213798761367798, "step": 75940 }, { "epoch": 3.526161845953851, "grad_norm": 132.13880920410156, "learning_rate": 8.852500116068527e-08, "logits/chosen": -18.714841842651367, "logits/rejected": -17.62411880493164, "logps/chosen": -370.02911376953125, "logps/rejected": -310.38250732421875, "loss": 0.9962, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.103945732116699, "rewards/margins": 1.7750780582427979, "rewards/rejected": 2.3288679122924805, "step": 75950 }, { "epoch": 3.5266261200612843, "grad_norm": 42.273529052734375, "learning_rate": 8.849714471423929e-08, "logits/chosen": -19.389799118041992, "logits/rejected": -19.220539093017578, "logps/chosen": -480.4727478027344, "logps/rejected": -400.25286865234375, "loss": 0.5769, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.010584831237793, "rewards/margins": 0.9266125559806824, "rewards/rejected": 3.083972692489624, "step": 75960 }, { "epoch": 3.527090394168717, "grad_norm": 48.57172775268555, "learning_rate": 8.846928826779331e-08, "logits/chosen": -18.502384185791016, "logits/rejected": -18.086442947387695, "logps/chosen": -403.1617431640625, "logps/rejected": -328.08709716796875, "loss": 0.4377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.240976333618164, "rewards/margins": 1.3000060319900513, "rewards/rejected": 1.9409706592559814, "step": 75970 }, { "epoch": 3.5275546682761503, "grad_norm": 1.3389923572540283, "learning_rate": 8.844143182134731e-08, "logits/chosen": -20.189990997314453, "logits/rejected": -19.498363494873047, "logps/chosen": -385.1390380859375, "logps/rejected": -352.62158203125, "loss": 0.8661, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0642640590667725, "rewards/margins": 
0.5965994000434875, "rewards/rejected": 2.4676644802093506, "step": 75980 }, { "epoch": 3.528018942383583, "grad_norm": 188.39364624023438, "learning_rate": 8.841357537490134e-08, "logits/chosen": -19.55392837524414, "logits/rejected": -18.955808639526367, "logps/chosen": -411.8140563964844, "logps/rejected": -362.03759765625, "loss": 0.6097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.401444911956787, "rewards/margins": 1.4418985843658447, "rewards/rejected": 1.9595463275909424, "step": 75990 }, { "epoch": 3.5284832164910163, "grad_norm": 25.66911506652832, "learning_rate": 8.838571892845536e-08, "logits/chosen": -19.15898895263672, "logits/rejected": -18.450977325439453, "logps/chosen": -405.32452392578125, "logps/rejected": -333.60223388671875, "loss": 0.688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.909339427947998, "rewards/margins": 1.3831946849822998, "rewards/rejected": 2.5261447429656982, "step": 76000 }, { "epoch": 3.5289474905984495, "grad_norm": 85.49140167236328, "learning_rate": 8.835786248200938e-08, "logits/chosen": -19.518409729003906, "logits/rejected": -17.862457275390625, "logps/chosen": -362.47210693359375, "logps/rejected": -257.2397155761719, "loss": 0.3142, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.421316146850586, "rewards/margins": 2.6171741485595703, "rewards/rejected": 1.8041422367095947, "step": 76010 }, { "epoch": 3.5294117647058822, "grad_norm": 26.25547981262207, "learning_rate": 8.83300060355634e-08, "logits/chosen": -19.164281845092773, "logits/rejected": -18.402626037597656, "logps/chosen": -366.8692321777344, "logps/rejected": -320.75677490234375, "loss": 0.6754, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.696596622467041, "rewards/margins": 0.8346354365348816, "rewards/rejected": 1.8619611263275146, "step": 76020 }, { "epoch": 3.5298760388133155, "grad_norm": 21.58609390258789, "learning_rate": 8.83021495891174e-08, "logits/chosen": -18.53764533996582, 
"logits/rejected": -16.941585540771484, "logps/chosen": -344.9931640625, "logps/rejected": -212.79458618164062, "loss": 0.3112, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5487053394317627, "rewards/margins": 3.0552144050598145, "rewards/rejected": 0.49349117279052734, "step": 76030 }, { "epoch": 3.5303403129207487, "grad_norm": 78.90398406982422, "learning_rate": 8.827429314267143e-08, "logits/chosen": -18.517004013061523, "logits/rejected": -17.827302932739258, "logps/chosen": -315.44219970703125, "logps/rejected": -304.93804931640625, "loss": 0.5725, "rewards/accuracies": 0.5, "rewards/chosen": 2.678779125213623, "rewards/margins": 1.359421730041504, "rewards/rejected": 1.3193576335906982, "step": 76040 }, { "epoch": 3.5308045870281815, "grad_norm": 102.08675384521484, "learning_rate": 8.824643669622544e-08, "logits/chosen": -18.814029693603516, "logits/rejected": -17.805858612060547, "logps/chosen": -328.0009765625, "logps/rejected": -310.56072998046875, "loss": 1.117, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7696001529693604, "rewards/margins": 0.05890561267733574, "rewards/rejected": 1.7106945514678955, "step": 76050 }, { "epoch": 3.5312688611356142, "grad_norm": 140.76309204101562, "learning_rate": 8.821858024977947e-08, "logits/chosen": -19.019161224365234, "logits/rejected": -18.684282302856445, "logps/chosen": -463.1083984375, "logps/rejected": -413.565185546875, "loss": 0.8576, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.014575481414795, "rewards/margins": -0.11892100423574448, "rewards/rejected": 3.1334967613220215, "step": 76060 }, { "epoch": 3.5317331352430474, "grad_norm": 112.00142669677734, "learning_rate": 8.819072380333347e-08, "logits/chosen": -20.71207046508789, "logits/rejected": -20.4088077545166, "logps/chosen": -490.26239013671875, "logps/rejected": -456.6083984375, "loss": 0.9012, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.896439075469971, "rewards/margins": 
0.518572211265564, "rewards/rejected": 4.377867221832275, "step": 76070 }, { "epoch": 3.5321974093504807, "grad_norm": 30.637908935546875, "learning_rate": 8.81628673568875e-08, "logits/chosen": -18.946569442749023, "logits/rejected": -18.158262252807617, "logps/chosen": -435.78839111328125, "logps/rejected": -306.1502380371094, "loss": 0.6797, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5317447185516357, "rewards/margins": 1.3198366165161133, "rewards/rejected": 2.2119078636169434, "step": 76080 }, { "epoch": 3.5326616834579134, "grad_norm": 2.2643988132476807, "learning_rate": 8.813501091044151e-08, "logits/chosen": -18.837783813476562, "logits/rejected": -19.32900047302246, "logps/chosen": -398.44171142578125, "logps/rejected": -332.1180114746094, "loss": 0.832, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6252028942108154, "rewards/margins": 0.6584380269050598, "rewards/rejected": 2.9667649269104004, "step": 76090 }, { "epoch": 3.5331259575653466, "grad_norm": 9.898810386657715, "learning_rate": 8.810715446399554e-08, "logits/chosen": -19.97439193725586, "logits/rejected": -18.346559524536133, "logps/chosen": -357.512939453125, "logps/rejected": -260.4722595214844, "loss": 0.331, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2610812187194824, "rewards/margins": 1.6976550817489624, "rewards/rejected": 1.5634263753890991, "step": 76100 }, { "epoch": 3.53359023167278, "grad_norm": 6.602590560913086, "learning_rate": 8.807929801754956e-08, "logits/chosen": -19.43313217163086, "logits/rejected": -18.544963836669922, "logps/chosen": -586.5963134765625, "logps/rejected": -383.499755859375, "loss": 0.2924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.981266975402832, "rewards/margins": 2.4604997634887695, "rewards/rejected": 2.5207672119140625, "step": 76110 }, { "epoch": 3.5340545057802126, "grad_norm": 0.4048521816730499, "learning_rate": 8.805144157110358e-08, "logits/chosen": -18.819509506225586, 
"logits/rejected": -17.55112075805664, "logps/chosen": -466.6722106933594, "logps/rejected": -290.91571044921875, "loss": 0.2894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.143148422241211, "rewards/margins": 3.050421714782715, "rewards/rejected": 1.0927263498306274, "step": 76120 }, { "epoch": 3.5345187798876454, "grad_norm": 5.8316969871521, "learning_rate": 8.802358512465759e-08, "logits/chosen": -19.897438049316406, "logits/rejected": -18.663164138793945, "logps/chosen": -441.04296875, "logps/rejected": -297.5782165527344, "loss": 0.4821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.218681335449219, "rewards/margins": 2.099160671234131, "rewards/rejected": 3.119520425796509, "step": 76130 }, { "epoch": 3.5349830539950786, "grad_norm": 60.73580551147461, "learning_rate": 8.799572867821161e-08, "logits/chosen": -18.85546112060547, "logits/rejected": -18.206043243408203, "logps/chosen": -293.90264892578125, "logps/rejected": -225.99282836914062, "loss": 0.7975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.469135284423828, "rewards/margins": 1.1284115314483643, "rewards/rejected": 1.3407241106033325, "step": 76140 }, { "epoch": 3.535447328102512, "grad_norm": 21.83156967163086, "learning_rate": 8.796787223176563e-08, "logits/chosen": -18.91021156311035, "logits/rejected": -17.374679565429688, "logps/chosen": -442.11053466796875, "logps/rejected": -332.73614501953125, "loss": 0.8314, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.6284637451171875, "rewards/margins": 1.8440462350845337, "rewards/rejected": 2.7844176292419434, "step": 76150 }, { "epoch": 3.5359116022099446, "grad_norm": 7.260635852813721, "learning_rate": 8.794001578531966e-08, "logits/chosen": -19.60831642150879, "logits/rejected": -19.30098533630371, "logps/chosen": -406.48614501953125, "logps/rejected": -363.4556884765625, "loss": 0.5383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.927595853805542, "rewards/margins": 
1.2709147930145264, "rewards/rejected": 2.6566812992095947, "step": 76160 }, { "epoch": 3.536375876317378, "grad_norm": 73.41632843017578, "learning_rate": 8.791215933887367e-08, "logits/chosen": -19.6555118560791, "logits/rejected": -18.491186141967773, "logps/chosen": -443.8639221191406, "logps/rejected": -322.7030334472656, "loss": 0.2693, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.023312091827393, "rewards/margins": 1.9987726211547852, "rewards/rejected": 2.0245394706726074, "step": 76170 }, { "epoch": 3.536840150424811, "grad_norm": 69.36475372314453, "learning_rate": 8.78843028924277e-08, "logits/chosen": -18.42034339904785, "logits/rejected": -17.90891456604004, "logps/chosen": -361.7933654785156, "logps/rejected": -360.47821044921875, "loss": 0.7879, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1241061687469482, "rewards/margins": 0.8006292581558228, "rewards/rejected": 2.323476791381836, "step": 76180 }, { "epoch": 3.537304424532244, "grad_norm": 210.19015502929688, "learning_rate": 8.78564464459817e-08, "logits/chosen": -19.54544448852539, "logits/rejected": -18.435819625854492, "logps/chosen": -391.45001220703125, "logps/rejected": -288.72100830078125, "loss": 1.0457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.941772699356079, "rewards/margins": 0.9942694902420044, "rewards/rejected": 1.9475033283233643, "step": 76190 }, { "epoch": 3.537768698639677, "grad_norm": 8.970175743103027, "learning_rate": 8.782858999953573e-08, "logits/chosen": -19.1762752532959, "logits/rejected": -18.746715545654297, "logps/chosen": -389.531494140625, "logps/rejected": -335.06488037109375, "loss": 1.371, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8771018981933594, "rewards/margins": 0.3926536440849304, "rewards/rejected": 3.484448194503784, "step": 76200 }, { "epoch": 3.53823297274711, "grad_norm": 24.037498474121094, "learning_rate": 8.780073355308974e-08, "logits/chosen": -19.075042724609375, 
"logits/rejected": -18.75986099243164, "logps/chosen": -295.91595458984375, "logps/rejected": -344.11767578125, "loss": 0.9644, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4196667671203613, "rewards/margins": -0.03781602531671524, "rewards/rejected": 2.4574828147888184, "step": 76210 }, { "epoch": 3.538697246854543, "grad_norm": 2.7225944995880127, "learning_rate": 8.777287710664377e-08, "logits/chosen": -20.079172134399414, "logits/rejected": -18.096773147583008, "logps/chosen": -452.4449157714844, "logps/rejected": -281.7330627441406, "loss": 0.528, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.063740253448486, "rewards/margins": 2.4720311164855957, "rewards/rejected": 2.5917086601257324, "step": 76220 }, { "epoch": 3.539161520961976, "grad_norm": 26.583507537841797, "learning_rate": 8.774502066019778e-08, "logits/chosen": -19.13543128967285, "logits/rejected": -18.77883529663086, "logps/chosen": -309.47662353515625, "logps/rejected": -315.5705261230469, "loss": 0.7499, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.103726863861084, "rewards/margins": 0.3263392448425293, "rewards/rejected": 2.7773876190185547, "step": 76230 }, { "epoch": 3.539625795069409, "grad_norm": 0.025513499975204468, "learning_rate": 8.771716421375179e-08, "logits/chosen": -19.160308837890625, "logits/rejected": -17.74574089050293, "logps/chosen": -283.26226806640625, "logps/rejected": -262.8861389160156, "loss": 0.8912, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4819447994232178, "rewards/margins": 1.1504417657852173, "rewards/rejected": 1.33150315284729, "step": 76240 }, { "epoch": 3.540090069176842, "grad_norm": 17.151704788208008, "learning_rate": 8.768930776730581e-08, "logits/chosen": -19.23908233642578, "logits/rejected": -18.03163719177246, "logps/chosen": -343.08319091796875, "logps/rejected": -216.4347381591797, "loss": 0.6281, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2688510417938232, 
"rewards/margins": 1.0647157430648804, "rewards/rejected": 1.2041351795196533, "step": 76250 }, { "epoch": 3.540554343284275, "grad_norm": 49.167842864990234, "learning_rate": 8.766145132085983e-08, "logits/chosen": -19.998857498168945, "logits/rejected": -19.580276489257812, "logps/chosen": -383.2090759277344, "logps/rejected": -349.5926208496094, "loss": 0.4225, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2924227714538574, "rewards/margins": 0.7549511194229126, "rewards/rejected": 2.5374717712402344, "step": 76260 }, { "epoch": 3.541018617391708, "grad_norm": 20.466228485107422, "learning_rate": 8.763359487441386e-08, "logits/chosen": -18.277294158935547, "logits/rejected": -18.619699478149414, "logps/chosen": -343.6729736328125, "logps/rejected": -397.50262451171875, "loss": 1.8903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4535040855407715, "rewards/margins": -0.5845822095870972, "rewards/rejected": 3.038086414337158, "step": 76270 }, { "epoch": 3.541482891499141, "grad_norm": 5.7444844245910645, "learning_rate": 8.760573842796786e-08, "logits/chosen": -19.043346405029297, "logits/rejected": -18.630651473999023, "logps/chosen": -369.5609436035156, "logps/rejected": -337.2251281738281, "loss": 1.428, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.423828125, "rewards/margins": 0.6256221532821655, "rewards/rejected": 1.798206090927124, "step": 76280 }, { "epoch": 3.541947165606574, "grad_norm": 0.2660900950431824, "learning_rate": 8.757788198152188e-08, "logits/chosen": -19.073047637939453, "logits/rejected": -19.450944900512695, "logps/chosen": -361.3366394042969, "logps/rejected": -337.05792236328125, "loss": 1.1029, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3031163215637207, "rewards/margins": 0.8507142066955566, "rewards/rejected": 2.452401876449585, "step": 76290 }, { "epoch": 3.542411439714007, "grad_norm": 59.048892974853516, "learning_rate": 8.75500255350759e-08, "logits/chosen": 
-18.81521224975586, "logits/rejected": -18.90643882751465, "logps/chosen": -299.70623779296875, "logps/rejected": -316.54168701171875, "loss": 1.0653, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.223036527633667, "rewards/margins": -0.2679898738861084, "rewards/rejected": 2.4910266399383545, "step": 76300 }, { "epoch": 3.54287571382144, "grad_norm": 57.35649871826172, "learning_rate": 8.752216908862993e-08, "logits/chosen": -17.48821258544922, "logits/rejected": -17.618896484375, "logps/chosen": -340.5835876464844, "logps/rejected": -394.1103820800781, "loss": 1.1671, "rewards/accuracies": 0.5, "rewards/chosen": 3.3154633045196533, "rewards/margins": 0.021428370848298073, "rewards/rejected": 3.294034481048584, "step": 76310 }, { "epoch": 3.5433399879288734, "grad_norm": 0.01664365455508232, "learning_rate": 8.749431264218394e-08, "logits/chosen": -18.814022064208984, "logits/rejected": -17.504690170288086, "logps/chosen": -460.93060302734375, "logps/rejected": -225.4622039794922, "loss": 1.0967, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.939428329467773, "rewards/margins": 3.5303924083709717, "rewards/rejected": 1.4090361595153809, "step": 76320 }, { "epoch": 3.543804262036306, "grad_norm": 9.769710540771484, "learning_rate": 8.746645619573797e-08, "logits/chosen": -18.847881317138672, "logits/rejected": -17.58650016784668, "logps/chosen": -513.5943603515625, "logps/rejected": -310.6452331542969, "loss": 0.9255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.165152072906494, "rewards/margins": 2.2046005725860596, "rewards/rejected": 1.9605515003204346, "step": 76330 }, { "epoch": 3.5442685361437394, "grad_norm": 201.7935028076172, "learning_rate": 8.743859974929197e-08, "logits/chosen": -20.16030502319336, "logits/rejected": -19.770076751708984, "logps/chosen": -413.78009033203125, "logps/rejected": -407.5079345703125, "loss": 1.1383, "rewards/accuracies": 0.5, "rewards/chosen": 4.490508079528809, "rewards/margins": 
0.8308531045913696, "rewards/rejected": 3.6596546173095703, "step": 76340 }, { "epoch": 3.544732810251172, "grad_norm": 8.980186462402344, "learning_rate": 8.7410743302846e-08, "logits/chosen": -19.034643173217773, "logits/rejected": -18.622634887695312, "logps/chosen": -407.685791015625, "logps/rejected": -385.514892578125, "loss": 0.948, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.137072563171387, "rewards/margins": 0.5055207014083862, "rewards/rejected": 3.631552219390869, "step": 76350 }, { "epoch": 3.5451970843586054, "grad_norm": 159.90748596191406, "learning_rate": 8.738288685640001e-08, "logits/chosen": -19.391441345214844, "logits/rejected": -19.209192276000977, "logps/chosen": -295.7166442871094, "logps/rejected": -338.9818420410156, "loss": 1.5745, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3823862075805664, "rewards/margins": -0.28304699063301086, "rewards/rejected": 2.665433168411255, "step": 76360 }, { "epoch": 3.545661358466038, "grad_norm": 110.0135498046875, "learning_rate": 8.735503040995404e-08, "logits/chosen": -18.274208068847656, "logits/rejected": -18.313732147216797, "logps/chosen": -347.0027770996094, "logps/rejected": -344.42938232421875, "loss": 1.2318, "rewards/accuracies": 0.5, "rewards/chosen": 3.279664993286133, "rewards/margins": 0.4588352143764496, "rewards/rejected": 2.8208298683166504, "step": 76370 }, { "epoch": 3.5461256325734714, "grad_norm": 152.97665405273438, "learning_rate": 8.732717396350805e-08, "logits/chosen": -18.035823822021484, "logits/rejected": -17.15222930908203, "logps/chosen": -444.43731689453125, "logps/rejected": -377.560302734375, "loss": 0.2918, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.032081127166748, "rewards/margins": 1.5772724151611328, "rewards/rejected": 1.4548089504241943, "step": 76380 }, { "epoch": 3.5465899066809046, "grad_norm": 78.30669403076172, "learning_rate": 8.729931751706208e-08, "logits/chosen": -19.831161499023438, 
"logits/rejected": -19.18569564819336, "logps/chosen": -412.115966796875, "logps/rejected": -395.0038757324219, "loss": 0.6801, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.135836601257324, "rewards/margins": 1.1499030590057373, "rewards/rejected": 2.985933780670166, "step": 76390 }, { "epoch": 3.5470541807883373, "grad_norm": 194.3897247314453, "learning_rate": 8.727146107061608e-08, "logits/chosen": -19.337539672851562, "logits/rejected": -19.545040130615234, "logps/chosen": -380.41546630859375, "logps/rejected": -335.7186584472656, "loss": 1.2837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.741241931915283, "rewards/margins": 0.6062588095664978, "rewards/rejected": 3.1349830627441406, "step": 76400 }, { "epoch": 3.5475184548957706, "grad_norm": 13.917216300964355, "learning_rate": 8.724360462417011e-08, "logits/chosen": -19.166975021362305, "logits/rejected": -19.184093475341797, "logps/chosen": -293.96820068359375, "logps/rejected": -328.239501953125, "loss": 0.9689, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.909963607788086, "rewards/margins": 0.2350618839263916, "rewards/rejected": 2.6749017238616943, "step": 76410 }, { "epoch": 3.5479827290032033, "grad_norm": 66.51156616210938, "learning_rate": 8.721574817772413e-08, "logits/chosen": -19.372718811035156, "logits/rejected": -19.118188858032227, "logps/chosen": -387.27227783203125, "logps/rejected": -332.8843688964844, "loss": 0.7702, "rewards/accuracies": 0.5, "rewards/chosen": 2.802903890609741, "rewards/margins": 0.44688087701797485, "rewards/rejected": 2.356022834777832, "step": 76420 }, { "epoch": 3.5484470031106365, "grad_norm": 20.171079635620117, "learning_rate": 8.718789173127814e-08, "logits/chosen": -19.44976806640625, "logits/rejected": -18.955366134643555, "logps/chosen": -437.9056701660156, "logps/rejected": -382.51690673828125, "loss": 0.4843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9588770866394043, "rewards/margins": 
1.278313398361206, "rewards/rejected": 1.6805639266967773, "step": 76430 }, { "epoch": 3.5489112772180693, "grad_norm": 55.492095947265625, "learning_rate": 8.716003528483217e-08, "logits/chosen": -19.551740646362305, "logits/rejected": -17.870145797729492, "logps/chosen": -274.2276306152344, "logps/rejected": -227.1756591796875, "loss": 0.5619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0303149223327637, "rewards/margins": 2.168243408203125, "rewards/rejected": 0.8620718121528625, "step": 76440 }, { "epoch": 3.5493755513255025, "grad_norm": 38.558719635009766, "learning_rate": 8.713217883838617e-08, "logits/chosen": -18.86281394958496, "logits/rejected": -17.723417282104492, "logps/chosen": -407.85406494140625, "logps/rejected": -321.65130615234375, "loss": 0.8602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.165403127670288, "rewards/margins": 1.2766587734222412, "rewards/rejected": 1.8887447118759155, "step": 76450 }, { "epoch": 3.5498398254329357, "grad_norm": 54.478370666503906, "learning_rate": 8.71043223919402e-08, "logits/chosen": -18.717891693115234, "logits/rejected": -18.426921844482422, "logps/chosen": -443.44482421875, "logps/rejected": -300.23065185546875, "loss": 0.9299, "rewards/accuracies": 0.5, "rewards/chosen": 3.941129684448242, "rewards/margins": 0.7728766202926636, "rewards/rejected": 3.168252944946289, "step": 76460 }, { "epoch": 3.5503040995403685, "grad_norm": 4.9186320304870605, "learning_rate": 8.707646594549421e-08, "logits/chosen": -19.82059669494629, "logits/rejected": -18.657310485839844, "logps/chosen": -422.1263122558594, "logps/rejected": -353.6187744140625, "loss": 0.2993, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.6036200523376465, "rewards/margins": 2.462179660797119, "rewards/rejected": 2.1414408683776855, "step": 76470 }, { "epoch": 3.5507683736478017, "grad_norm": 58.68241500854492, "learning_rate": 8.704860949904824e-08, "logits/chosen": -18.938446044921875, 
"logits/rejected": -18.514760971069336, "logps/chosen": -498.21868896484375, "logps/rejected": -414.20068359375, "loss": 0.5223, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.589972972869873, "rewards/margins": 1.8785569667816162, "rewards/rejected": 2.711416006088257, "step": 76480 }, { "epoch": 3.551232647755235, "grad_norm": 1.0241981744766235, "learning_rate": 8.702075305260224e-08, "logits/chosen": -19.257352828979492, "logits/rejected": -18.219043731689453, "logps/chosen": -368.54473876953125, "logps/rejected": -299.2611999511719, "loss": 0.5739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0798707008361816, "rewards/margins": 1.9263197183609009, "rewards/rejected": 1.1535511016845703, "step": 76490 }, { "epoch": 3.5516969218626677, "grad_norm": 141.9710693359375, "learning_rate": 8.699289660615627e-08, "logits/chosen": -19.165176391601562, "logits/rejected": -17.93790054321289, "logps/chosen": -492.0760803222656, "logps/rejected": -381.71917724609375, "loss": 0.3321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.546750068664551, "rewards/margins": 1.888610601425171, "rewards/rejected": 2.658139705657959, "step": 76500 }, { "epoch": 3.5521611959701005, "grad_norm": 22.399002075195312, "learning_rate": 8.696504015971028e-08, "logits/chosen": -19.581607818603516, "logits/rejected": -19.566999435424805, "logps/chosen": -378.5824279785156, "logps/rejected": -423.98016357421875, "loss": 1.0725, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9360413551330566, "rewards/margins": -0.23644287884235382, "rewards/rejected": 3.1724839210510254, "step": 76510 }, { "epoch": 3.5526254700775337, "grad_norm": 0.6890924572944641, "learning_rate": 8.693718371326431e-08, "logits/chosen": -19.77865219116211, "logits/rejected": -18.669353485107422, "logps/chosen": -386.6445007324219, "logps/rejected": -316.97296142578125, "loss": 0.5249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.86393666267395, 
"rewards/margins": 1.5508066415786743, "rewards/rejected": 2.3131296634674072, "step": 76520 }, { "epoch": 3.553089744184967, "grad_norm": 2.2964675426483154, "learning_rate": 8.690932726681833e-08, "logits/chosen": -17.88454818725586, "logits/rejected": -17.900915145874023, "logps/chosen": -385.823974609375, "logps/rejected": -383.2587890625, "loss": 0.5104, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.238410234451294, "rewards/margins": 1.141259789466858, "rewards/rejected": 2.0971500873565674, "step": 76530 }, { "epoch": 3.5535540182923997, "grad_norm": 0.11464417725801468, "learning_rate": 8.688147082037235e-08, "logits/chosen": -19.590059280395508, "logits/rejected": -18.467384338378906, "logps/chosen": -473.86236572265625, "logps/rejected": -354.07208251953125, "loss": 0.407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.301098346710205, "rewards/margins": 1.570168137550354, "rewards/rejected": 2.7309303283691406, "step": 76540 }, { "epoch": 3.554018292399833, "grad_norm": 0.7517320513725281, "learning_rate": 8.685361437392635e-08, "logits/chosen": -18.352426528930664, "logits/rejected": -17.84323501586914, "logps/chosen": -340.8300476074219, "logps/rejected": -293.09442138671875, "loss": 1.0288, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3070836067199707, "rewards/margins": 0.7707967758178711, "rewards/rejected": 2.5362865924835205, "step": 76550 }, { "epoch": 3.554482566507266, "grad_norm": 62.74348068237305, "learning_rate": 8.682575792748038e-08, "logits/chosen": -19.004243850708008, "logits/rejected": -18.48316192626953, "logps/chosen": -416.57110595703125, "logps/rejected": -371.59722900390625, "loss": 0.5085, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4107825756073, "rewards/margins": 0.9613007307052612, "rewards/rejected": 2.44948148727417, "step": 76560 }, { "epoch": 3.554946840614699, "grad_norm": 165.78646850585938, "learning_rate": 8.67979014810344e-08, "logits/chosen": 
-19.943714141845703, "logits/rejected": -19.084123611450195, "logps/chosen": -383.770751953125, "logps/rejected": -369.301025390625, "loss": 0.9138, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.220324993133545, "rewards/margins": 0.8803404569625854, "rewards/rejected": 3.339984893798828, "step": 76570 }, { "epoch": 3.5554111147221317, "grad_norm": 84.74224090576172, "learning_rate": 8.677004503458842e-08, "logits/chosen": -19.659238815307617, "logits/rejected": -19.040395736694336, "logps/chosen": -421.17083740234375, "logps/rejected": -375.755859375, "loss": 0.6891, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.248174667358398, "rewards/margins": 0.5109677910804749, "rewards/rejected": 3.7372069358825684, "step": 76580 }, { "epoch": 3.555875388829565, "grad_norm": 9.914252281188965, "learning_rate": 8.674218858814244e-08, "logits/chosen": -19.20993423461914, "logits/rejected": -18.32613754272461, "logps/chosen": -340.2989196777344, "logps/rejected": -301.2487487792969, "loss": 0.8259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6510491371154785, "rewards/margins": 0.913824737071991, "rewards/rejected": 1.7372243404388428, "step": 76590 }, { "epoch": 3.556339662936998, "grad_norm": 89.19667053222656, "learning_rate": 8.671433214169647e-08, "logits/chosen": -19.162662506103516, "logits/rejected": -18.24311065673828, "logps/chosen": -383.85186767578125, "logps/rejected": -325.0106201171875, "loss": 0.6442, "rewards/accuracies": 0.5, "rewards/chosen": 3.072756290435791, "rewards/margins": 0.8954488635063171, "rewards/rejected": 2.177306890487671, "step": 76600 }, { "epoch": 3.556803937044431, "grad_norm": 129.30313110351562, "learning_rate": 8.668647569525047e-08, "logits/chosen": -18.867883682250977, "logits/rejected": -18.354450225830078, "logps/chosen": -375.9086608886719, "logps/rejected": -333.0613708496094, "loss": 0.8706, "rewards/accuracies": 0.5, "rewards/chosen": 3.4823086261749268, "rewards/margins": 
0.2566918134689331, "rewards/rejected": 3.225616455078125, "step": 76610 }, { "epoch": 3.557268211151864, "grad_norm": 0.7620819211006165, "learning_rate": 8.665861924880448e-08, "logits/chosen": -19.115175247192383, "logits/rejected": -18.88540267944336, "logps/chosen": -420.88323974609375, "logps/rejected": -323.173828125, "loss": 0.5733, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.165459156036377, "rewards/margins": 0.967154860496521, "rewards/rejected": 3.1983039379119873, "step": 76620 }, { "epoch": 3.5577324852592973, "grad_norm": 34.0992317199707, "learning_rate": 8.663076280235851e-08, "logits/chosen": -18.513051986694336, "logits/rejected": -18.233257293701172, "logps/chosen": -364.29864501953125, "logps/rejected": -374.7543029785156, "loss": 1.3066, "rewards/accuracies": 0.5, "rewards/chosen": 3.6066410541534424, "rewards/margins": 0.21662607789039612, "rewards/rejected": 3.390015125274658, "step": 76630 }, { "epoch": 3.55819675936673, "grad_norm": 54.9664192199707, "learning_rate": 8.660290635591253e-08, "logits/chosen": -18.374542236328125, "logits/rejected": -17.635868072509766, "logps/chosen": -335.51287841796875, "logps/rejected": -258.41351318359375, "loss": 0.8117, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.152507781982422, "rewards/margins": 1.658933401107788, "rewards/rejected": 1.493574619293213, "step": 76640 }, { "epoch": 3.5586610334741633, "grad_norm": 89.16069030761719, "learning_rate": 8.657504990946655e-08, "logits/chosen": -18.888904571533203, "logits/rejected": -18.035266876220703, "logps/chosen": -427.35760498046875, "logps/rejected": -365.8262023925781, "loss": 0.3972, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.2978386878967285, "rewards/margins": 2.1371562480926514, "rewards/rejected": 2.1606826782226562, "step": 76650 }, { "epoch": 3.559125307581596, "grad_norm": 60.790985107421875, "learning_rate": 8.654719346302055e-08, "logits/chosen": -17.752056121826172, "logits/rejected": 
-17.495622634887695, "logps/chosen": -402.31158447265625, "logps/rejected": -344.6900634765625, "loss": 0.737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.2512407302856445, "rewards/margins": 1.146460771560669, "rewards/rejected": 3.1047801971435547, "step": 76660 }, { "epoch": 3.5595895816890293, "grad_norm": 151.7025604248047, "learning_rate": 8.651933701657458e-08, "logits/chosen": -19.049257278442383, "logits/rejected": -18.531417846679688, "logps/chosen": -423.97332763671875, "logps/rejected": -309.97186279296875, "loss": 1.012, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.299222946166992, "rewards/margins": 1.2608392238616943, "rewards/rejected": 2.0383834838867188, "step": 76670 }, { "epoch": 3.560053855796462, "grad_norm": 36.8963737487793, "learning_rate": 8.64914805701286e-08, "logits/chosen": -19.06838035583496, "logits/rejected": -17.807004928588867, "logps/chosen": -410.0177307128906, "logps/rejected": -314.1711120605469, "loss": 0.482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.123364448547363, "rewards/margins": 2.265594482421875, "rewards/rejected": 1.8577697277069092, "step": 76680 }, { "epoch": 3.5605181299038953, "grad_norm": 119.64131927490234, "learning_rate": 8.646362412368262e-08, "logits/chosen": -18.150039672851562, "logits/rejected": -17.850749969482422, "logps/chosen": -284.6853332519531, "logps/rejected": -271.09075927734375, "loss": 1.0049, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4251277446746826, "rewards/margins": 0.8458575010299683, "rewards/rejected": 1.5792701244354248, "step": 76690 }, { "epoch": 3.5609824040113285, "grad_norm": 93.67432403564453, "learning_rate": 8.643576767723663e-08, "logits/chosen": -19.478734970092773, "logits/rejected": -18.8561954498291, "logps/chosen": -340.9215393066406, "logps/rejected": -322.72637939453125, "loss": 0.8106, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0063788890838623, "rewards/margins": 
0.9557306170463562, "rewards/rejected": 2.0506482124328613, "step": 76700 }, { "epoch": 3.5614466781187613, "grad_norm": 149.39952087402344, "learning_rate": 8.640791123079065e-08, "logits/chosen": -19.942790985107422, "logits/rejected": -19.265838623046875, "logps/chosen": -377.3114318847656, "logps/rejected": -458.5044860839844, "loss": 1.2197, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6799755096435547, "rewards/margins": -0.39743009209632874, "rewards/rejected": 3.0774054527282715, "step": 76710 }, { "epoch": 3.5619109522261945, "grad_norm": 180.2148895263672, "learning_rate": 8.638005478434467e-08, "logits/chosen": -18.854549407958984, "logits/rejected": -18.31101417541504, "logps/chosen": -377.1327819824219, "logps/rejected": -259.54071044921875, "loss": 0.5701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.184047222137451, "rewards/margins": 1.5238546133041382, "rewards/rejected": 1.6601924896240234, "step": 76720 }, { "epoch": 3.5623752263336272, "grad_norm": 96.59020233154297, "learning_rate": 8.63521983378987e-08, "logits/chosen": -18.618009567260742, "logits/rejected": -18.29371452331543, "logps/chosen": -354.863525390625, "logps/rejected": -320.51348876953125, "loss": 0.6876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.194085121154785, "rewards/margins": 1.1160085201263428, "rewards/rejected": 3.0780763626098633, "step": 76730 }, { "epoch": 3.5628395004410605, "grad_norm": 77.0574722290039, "learning_rate": 8.632434189145271e-08, "logits/chosen": -19.373085021972656, "logits/rejected": -18.80068588256836, "logps/chosen": -489.6434631347656, "logps/rejected": -375.6415710449219, "loss": 0.3578, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.640392303466797, "rewards/margins": 2.8582255840301514, "rewards/rejected": 2.7821669578552246, "step": 76740 }, { "epoch": 3.5633037745484932, "grad_norm": 278.998779296875, "learning_rate": 8.629648544500674e-08, "logits/chosen": 
-18.799549102783203, "logits/rejected": -18.941349029541016, "logps/chosen": -311.4507751464844, "logps/rejected": -339.47503662109375, "loss": 1.6668, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.275038957595825, "rewards/margins": -0.9598361849784851, "rewards/rejected": 3.234875440597534, "step": 76750 }, { "epoch": 3.5637680486559264, "grad_norm": 21.82758903503418, "learning_rate": 8.626862899856074e-08, "logits/chosen": -19.681232452392578, "logits/rejected": -18.438243865966797, "logps/chosen": -372.99176025390625, "logps/rejected": -250.71768188476562, "loss": 0.5678, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.199564456939697, "rewards/margins": 2.4157707691192627, "rewards/rejected": 1.7837932109832764, "step": 76760 }, { "epoch": 3.5642323227633597, "grad_norm": 29.59543800354004, "learning_rate": 8.624077255211477e-08, "logits/chosen": -19.025510787963867, "logits/rejected": -17.895709991455078, "logps/chosen": -356.72515869140625, "logps/rejected": -216.6342315673828, "loss": 0.3362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4100677967071533, "rewards/margins": 2.176945209503174, "rewards/rejected": 1.2331225872039795, "step": 76770 }, { "epoch": 3.5646965968707924, "grad_norm": 185.475341796875, "learning_rate": 8.621291610566878e-08, "logits/chosen": -19.018573760986328, "logits/rejected": -18.77004623413086, "logps/chosen": -419.71563720703125, "logps/rejected": -355.74896240234375, "loss": 0.7663, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.478285312652588, "rewards/margins": 0.9079458117485046, "rewards/rejected": 3.570338726043701, "step": 76780 }, { "epoch": 3.5651608709782256, "grad_norm": 0.3596401810646057, "learning_rate": 8.618505965922281e-08, "logits/chosen": -19.053482055664062, "logits/rejected": -19.407800674438477, "logps/chosen": -322.21990966796875, "logps/rejected": -300.98052978515625, "loss": 1.0719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
4.272177219390869, "rewards/margins": 0.8873590230941772, "rewards/rejected": 3.3848178386688232, "step": 76790 }, { "epoch": 3.5656251450856584, "grad_norm": 173.89532470703125, "learning_rate": 8.615720321277682e-08, "logits/chosen": -18.87038230895996, "logits/rejected": -17.335262298583984, "logps/chosen": -443.633056640625, "logps/rejected": -290.0722351074219, "loss": 0.3322, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7626121044158936, "rewards/margins": 2.8227031230926514, "rewards/rejected": 0.9399086833000183, "step": 76800 }, { "epoch": 3.5660894191930916, "grad_norm": 112.76103973388672, "learning_rate": 8.612934676633085e-08, "logits/chosen": -18.724740982055664, "logits/rejected": -19.039892196655273, "logps/chosen": -340.6340026855469, "logps/rejected": -371.35888671875, "loss": 1.119, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.169076681137085, "rewards/margins": -0.06452806293964386, "rewards/rejected": 3.2336044311523438, "step": 76810 }, { "epoch": 3.5665536933005244, "grad_norm": 77.54911041259766, "learning_rate": 8.610149031988485e-08, "logits/chosen": -19.19684410095215, "logits/rejected": -17.47783851623535, "logps/chosen": -549.5263061523438, "logps/rejected": -383.89349365234375, "loss": 0.3254, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.127479553222656, "rewards/margins": 2.286548137664795, "rewards/rejected": 2.8409318923950195, "step": 76820 }, { "epoch": 3.5670179674079576, "grad_norm": 6.000264644622803, "learning_rate": 8.607363387343887e-08, "logits/chosen": -19.490291595458984, "logits/rejected": -19.0659236907959, "logps/chosen": -425.45526123046875, "logps/rejected": -443.2261657714844, "loss": 0.9734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8865513801574707, "rewards/margins": 0.7533677816390991, "rewards/rejected": 3.1331839561462402, "step": 76830 }, { "epoch": 3.567482241515391, "grad_norm": 150.41891479492188, "learning_rate": 8.60457774269929e-08, 
"logits/chosen": -19.838186264038086, "logits/rejected": -19.330394744873047, "logps/chosen": -362.1529235839844, "logps/rejected": -312.37493896484375, "loss": 0.4403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9491817951202393, "rewards/margins": 1.471864104270935, "rewards/rejected": 2.4773175716400146, "step": 76840 }, { "epoch": 3.5679465156228236, "grad_norm": 284.97137451171875, "learning_rate": 8.601792098054691e-08, "logits/chosen": -18.654146194458008, "logits/rejected": -18.527606964111328, "logps/chosen": -407.28704833984375, "logps/rejected": -457.2509765625, "loss": 1.4744, "rewards/accuracies": 0.5, "rewards/chosen": 5.150591850280762, "rewards/margins": 0.5405029058456421, "rewards/rejected": 4.6100897789001465, "step": 76850 }, { "epoch": 3.568410789730257, "grad_norm": 64.96223449707031, "learning_rate": 8.599006453410094e-08, "logits/chosen": -18.86507225036621, "logits/rejected": -18.60900115966797, "logps/chosen": -362.27130126953125, "logps/rejected": -266.0586853027344, "loss": 0.6129, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.162158727645874, "rewards/margins": 1.050926923751831, "rewards/rejected": 2.111231565475464, "step": 76860 }, { "epoch": 3.56887506383769, "grad_norm": 0.6773722171783447, "learning_rate": 8.596220808765494e-08, "logits/chosen": -20.30672836303711, "logits/rejected": -18.885705947875977, "logps/chosen": -478.00238037109375, "logps/rejected": -304.2058410644531, "loss": 0.1685, "rewards/accuracies": 1.0, "rewards/chosen": 5.795081615447998, "rewards/margins": 3.3007826805114746, "rewards/rejected": 2.4942994117736816, "step": 76870 }, { "epoch": 3.569339337945123, "grad_norm": 44.23284912109375, "learning_rate": 8.593435164120897e-08, "logits/chosen": -19.21746253967285, "logits/rejected": -18.53805923461914, "logps/chosen": -355.74053955078125, "logps/rejected": -332.7435607910156, "loss": 0.5533, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9161629676818848, 
"rewards/margins": 1.3451496362686157, "rewards/rejected": 2.5710132122039795, "step": 76880 }, { "epoch": 3.5698036120525556, "grad_norm": 162.00238037109375, "learning_rate": 8.590649519476298e-08, "logits/chosen": -19.127525329589844, "logits/rejected": -18.816368103027344, "logps/chosen": -403.22406005859375, "logps/rejected": -338.26690673828125, "loss": 0.8383, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7217323780059814, "rewards/margins": 0.6727129220962524, "rewards/rejected": 3.0490193367004395, "step": 76890 }, { "epoch": 3.570267886159989, "grad_norm": 35.2608528137207, "learning_rate": 8.587863874831701e-08, "logits/chosen": -18.231800079345703, "logits/rejected": -18.831228256225586, "logps/chosen": -257.24908447265625, "logps/rejected": -310.61004638671875, "loss": 1.1414, "rewards/accuracies": 0.5, "rewards/chosen": 1.8340476751327515, "rewards/margins": -0.1363258808851242, "rewards/rejected": 1.9703733921051025, "step": 76900 }, { "epoch": 3.570732160267422, "grad_norm": 18.256790161132812, "learning_rate": 8.585078230187101e-08, "logits/chosen": -19.032384872436523, "logits/rejected": -18.524410247802734, "logps/chosen": -389.6106872558594, "logps/rejected": -391.57745361328125, "loss": 0.9235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.239006519317627, "rewards/margins": 0.39261895418167114, "rewards/rejected": 2.8463873863220215, "step": 76910 }, { "epoch": 3.571196434374855, "grad_norm": 89.96991729736328, "learning_rate": 8.582292585542504e-08, "logits/chosen": -18.86482810974121, "logits/rejected": -18.39065933227539, "logps/chosen": -316.62890625, "logps/rejected": -250.8125457763672, "loss": 0.567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.422710418701172, "rewards/margins": 0.9268930554389954, "rewards/rejected": 1.4958174228668213, "step": 76920 }, { "epoch": 3.571660708482288, "grad_norm": 165.31674194335938, "learning_rate": 8.579506940897905e-08, "logits/chosen": 
-20.187557220458984, "logits/rejected": -18.990800857543945, "logps/chosen": -396.97235107421875, "logps/rejected": -363.08453369140625, "loss": 0.4071, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.434993267059326, "rewards/margins": 2.0369045734405518, "rewards/rejected": 2.3980884552001953, "step": 76930 }, { "epoch": 3.572124982589721, "grad_norm": 200.5769805908203, "learning_rate": 8.576721296253308e-08, "logits/chosen": -20.142120361328125, "logits/rejected": -18.222883224487305, "logps/chosen": -449.33782958984375, "logps/rejected": -325.04345703125, "loss": 0.4847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.608628749847412, "rewards/margins": 1.57424795627594, "rewards/rejected": 2.0343809127807617, "step": 76940 }, { "epoch": 3.572589256697154, "grad_norm": 73.605224609375, "learning_rate": 8.57393565160871e-08, "logits/chosen": -18.66780662536621, "logits/rejected": -17.804027557373047, "logps/chosen": -289.5196533203125, "logps/rejected": -247.53125, "loss": 0.7187, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8815808296203613, "rewards/margins": 0.9265674352645874, "rewards/rejected": 1.955013632774353, "step": 76950 }, { "epoch": 3.5730535308045868, "grad_norm": 158.95851135253906, "learning_rate": 8.571150006964112e-08, "logits/chosen": -18.180416107177734, "logits/rejected": -17.90872573852539, "logps/chosen": -333.357666015625, "logps/rejected": -299.6575927734375, "loss": 0.9915, "rewards/accuracies": 0.5, "rewards/chosen": 3.6083035469055176, "rewards/margins": 0.3007902503013611, "rewards/rejected": 3.307513475418091, "step": 76960 }, { "epoch": 3.57351780491202, "grad_norm": 0.12022754549980164, "learning_rate": 8.568364362319512e-08, "logits/chosen": -18.821731567382812, "logits/rejected": -17.018312454223633, "logps/chosen": -498.3680114746094, "logps/rejected": -305.73211669921875, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": 4.610476970672607, "rewards/margins": 
3.4265055656433105, "rewards/rejected": 1.183971643447876, "step": 76970 }, { "epoch": 3.573982079019453, "grad_norm": 95.8665542602539, "learning_rate": 8.565578717674915e-08, "logits/chosen": -20.0318660736084, "logits/rejected": -19.197811126708984, "logps/chosen": -401.7757568359375, "logps/rejected": -309.2984619140625, "loss": 0.4602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5586135387420654, "rewards/margins": 1.425991415977478, "rewards/rejected": 2.1326217651367188, "step": 76980 }, { "epoch": 3.574446353126886, "grad_norm": 85.52146911621094, "learning_rate": 8.562793073030317e-08, "logits/chosen": -18.685317993164062, "logits/rejected": -18.413482666015625, "logps/chosen": -241.6036834716797, "logps/rejected": -223.0069122314453, "loss": 0.7627, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8621203899383545, "rewards/margins": 0.2576705515384674, "rewards/rejected": 1.604449987411499, "step": 76990 }, { "epoch": 3.574910627234319, "grad_norm": 3.4269378185272217, "learning_rate": 8.56000742838572e-08, "logits/chosen": -19.2856502532959, "logits/rejected": -18.158222198486328, "logps/chosen": -404.88531494140625, "logps/rejected": -300.9988708496094, "loss": 0.4339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.539717197418213, "rewards/margins": 2.011298418045044, "rewards/rejected": 2.528419017791748, "step": 77000 }, { "epoch": 3.5753749013417524, "grad_norm": 84.22559356689453, "learning_rate": 8.557221783741121e-08, "logits/chosen": -18.965129852294922, "logits/rejected": -18.63054847717285, "logps/chosen": -312.2408142089844, "logps/rejected": -335.40924072265625, "loss": 0.5714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8273751735687256, "rewards/margins": 0.9818331003189087, "rewards/rejected": 1.845542311668396, "step": 77010 }, { "epoch": 3.575839175449185, "grad_norm": 41.546260833740234, "learning_rate": 8.554436139096521e-08, "logits/chosen": -19.25445556640625, 
"logits/rejected": -18.2875919342041, "logps/chosen": -360.52142333984375, "logps/rejected": -243.61361694335938, "loss": 0.4005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.610356092453003, "rewards/margins": 1.3597018718719482, "rewards/rejected": 1.2506539821624756, "step": 77020 }, { "epoch": 3.5763034495566184, "grad_norm": 232.8157958984375, "learning_rate": 8.551650494451924e-08, "logits/chosen": -18.794784545898438, "logits/rejected": -17.873640060424805, "logps/chosen": -420.45263671875, "logps/rejected": -336.431640625, "loss": 0.8778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8860836029052734, "rewards/margins": 0.8701211810112, "rewards/rejected": 2.0159623622894287, "step": 77030 }, { "epoch": 3.576767723664051, "grad_norm": 4.198354244232178, "learning_rate": 8.548864849807325e-08, "logits/chosen": -19.755157470703125, "logits/rejected": -18.996891021728516, "logps/chosen": -468.6453552246094, "logps/rejected": -457.30889892578125, "loss": 0.7049, "rewards/accuracies": 0.5, "rewards/chosen": 4.425426959991455, "rewards/margins": 0.8581638336181641, "rewards/rejected": 3.567263126373291, "step": 77040 }, { "epoch": 3.5772319977714844, "grad_norm": 13.770305633544922, "learning_rate": 8.546079205162728e-08, "logits/chosen": -19.57793426513672, "logits/rejected": -18.965646743774414, "logps/chosen": -406.2845153808594, "logps/rejected": -376.578125, "loss": 0.8257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5661532878875732, "rewards/margins": 0.523956298828125, "rewards/rejected": 3.042196750640869, "step": 77050 }, { "epoch": 3.577696271878917, "grad_norm": 42.47867202758789, "learning_rate": 8.54329356051813e-08, "logits/chosen": -18.649892807006836, "logits/rejected": -19.14121437072754, "logps/chosen": -360.25103759765625, "logps/rejected": -331.951904296875, "loss": 0.7822, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8827426433563232, "rewards/margins": 0.3245919942855835, 
"rewards/rejected": 2.55815052986145, "step": 77060 }, { "epoch": 3.5781605459863504, "grad_norm": 2.0361721515655518, "learning_rate": 8.540507915873532e-08, "logits/chosen": -19.57969093322754, "logits/rejected": -18.13909912109375, "logps/chosen": -354.5745544433594, "logps/rejected": -283.7837829589844, "loss": 0.287, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9025516510009766, "rewards/margins": 2.365893602371216, "rewards/rejected": 1.5366582870483398, "step": 77070 }, { "epoch": 3.5786248200937836, "grad_norm": 136.984619140625, "learning_rate": 8.537722271228932e-08, "logits/chosen": -20.115585327148438, "logits/rejected": -19.391992568969727, "logps/chosen": -351.9049377441406, "logps/rejected": -282.74310302734375, "loss": 0.5534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.944350004196167, "rewards/margins": 1.1669172048568726, "rewards/rejected": 1.7774327993392944, "step": 77080 }, { "epoch": 3.5790890942012163, "grad_norm": 49.108924865722656, "learning_rate": 8.534936626584335e-08, "logits/chosen": -18.416492462158203, "logits/rejected": -18.77080535888672, "logps/chosen": -230.88357543945312, "logps/rejected": -271.0624694824219, "loss": 1.1979, "rewards/accuracies": 0.5, "rewards/chosen": 1.7026258707046509, "rewards/margins": -0.44087857007980347, "rewards/rejected": 2.1435046195983887, "step": 77090 }, { "epoch": 3.5795533683086496, "grad_norm": 1.268713355064392, "learning_rate": 8.532150981939737e-08, "logits/chosen": -18.28316879272461, "logits/rejected": -17.619693756103516, "logps/chosen": -343.0037841796875, "logps/rejected": -245.7681884765625, "loss": 0.5124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6398584842681885, "rewards/margins": 1.38077974319458, "rewards/rejected": 1.2590787410736084, "step": 77100 }, { "epoch": 3.5800176424160823, "grad_norm": 4.0966715812683105, "learning_rate": 8.52936533729514e-08, "logits/chosen": -18.800479888916016, "logits/rejected": 
-18.13583755493164, "logps/chosen": -314.7762145996094, "logps/rejected": -285.1343994140625, "loss": 0.7126, "rewards/accuracies": 0.5, "rewards/chosen": 2.514308214187622, "rewards/margins": 0.6076334714889526, "rewards/rejected": 1.9066747426986694, "step": 77110 }, { "epoch": 3.5804819165235156, "grad_norm": 37.21462631225586, "learning_rate": 8.52657969265054e-08, "logits/chosen": -19.756542205810547, "logits/rejected": -20.19661521911621, "logps/chosen": -358.202880859375, "logps/rejected": -353.7594909667969, "loss": 0.3867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4347774982452393, "rewards/margins": 1.2825084924697876, "rewards/rejected": 2.152268886566162, "step": 77120 }, { "epoch": 3.5809461906309483, "grad_norm": 24.727657318115234, "learning_rate": 8.523794048005942e-08, "logits/chosen": -19.330137252807617, "logits/rejected": -17.82503890991211, "logps/chosen": -504.31170654296875, "logps/rejected": -361.9094543457031, "loss": 0.3059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.0246453285217285, "rewards/margins": 2.092515468597412, "rewards/rejected": 2.9321300983428955, "step": 77130 }, { "epoch": 3.5814104647383815, "grad_norm": 14.844929695129395, "learning_rate": 8.521008403361344e-08, "logits/chosen": -19.60008430480957, "logits/rejected": -18.095291137695312, "logps/chosen": -475.8312072753906, "logps/rejected": -397.3344421386719, "loss": 0.6445, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.313817024230957, "rewards/margins": 1.5726677179336548, "rewards/rejected": 2.741149425506592, "step": 77140 }, { "epoch": 3.5818747388458148, "grad_norm": 45.58113098144531, "learning_rate": 8.518222758716747e-08, "logits/chosen": -19.70307731628418, "logits/rejected": -19.35879898071289, "logps/chosen": -535.260009765625, "logps/rejected": -459.6561584472656, "loss": 0.4325, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.556617259979248, "rewards/margins": 1.0921179056167603, 
"rewards/rejected": 3.4644992351531982, "step": 77150 }, { "epoch": 3.5823390129532475, "grad_norm": 18.609607696533203, "learning_rate": 8.515437114072148e-08, "logits/chosen": -19.147014617919922, "logits/rejected": -18.260290145874023, "logps/chosen": -330.3340148925781, "logps/rejected": -223.77841186523438, "loss": 0.7644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4646098613739014, "rewards/margins": 1.5673681497573853, "rewards/rejected": 1.8972419500350952, "step": 77160 }, { "epoch": 3.5828032870606807, "grad_norm": 53.35624313354492, "learning_rate": 8.512651469427551e-08, "logits/chosen": -19.32160758972168, "logits/rejected": -18.651630401611328, "logps/chosen": -498.8644104003906, "logps/rejected": -384.294921875, "loss": 0.7319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.254372596740723, "rewards/margins": 1.3152878284454346, "rewards/rejected": 2.939084529876709, "step": 77170 }, { "epoch": 3.5832675611681135, "grad_norm": 123.44584655761719, "learning_rate": 8.509865824782951e-08, "logits/chosen": -19.52750587463379, "logits/rejected": -17.7763671875, "logps/chosen": -537.7024536132812, "logps/rejected": -464.00140380859375, "loss": 0.4161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.621426582336426, "rewards/margins": 1.579921007156372, "rewards/rejected": 3.041505813598633, "step": 77180 }, { "epoch": 3.5837318352755467, "grad_norm": 49.06803894042969, "learning_rate": 8.507080180138354e-08, "logits/chosen": -18.84706687927246, "logits/rejected": -17.804851531982422, "logps/chosen": -274.6578063964844, "logps/rejected": -233.580078125, "loss": 0.6214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0535190105438232, "rewards/margins": 0.5959080457687378, "rewards/rejected": 1.4576109647750854, "step": 77190 }, { "epoch": 3.5841961093829795, "grad_norm": 65.35232543945312, "learning_rate": 8.504294535493755e-08, "logits/chosen": -19.285125732421875, "logits/rejected": 
-19.101978302001953, "logps/chosen": -265.39044189453125, "logps/rejected": -323.7960205078125, "loss": 0.7659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.067033052444458, "rewards/margins": 0.3932657837867737, "rewards/rejected": 1.673767328262329, "step": 77200 }, { "epoch": 3.5846603834904127, "grad_norm": 168.8770294189453, "learning_rate": 8.501508890849158e-08, "logits/chosen": -18.364826202392578, "logits/rejected": -19.058650970458984, "logps/chosen": -344.19110107421875, "logps/rejected": -398.170166015625, "loss": 1.5889, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6842761039733887, "rewards/margins": -0.7518230676651001, "rewards/rejected": 3.4360992908477783, "step": 77210 }, { "epoch": 3.585124657597846, "grad_norm": 6.066861629486084, "learning_rate": 8.49872324620456e-08, "logits/chosen": -18.75143051147461, "logits/rejected": -17.721981048583984, "logps/chosen": -460.61053466796875, "logps/rejected": -356.980224609375, "loss": 0.5485, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.144195318222046, "rewards/margins": 0.7256539463996887, "rewards/rejected": 2.418541431427002, "step": 77220 }, { "epoch": 3.5855889317052787, "grad_norm": 47.617435455322266, "learning_rate": 8.49593760155996e-08, "logits/chosen": -18.815038681030273, "logits/rejected": -18.13556480407715, "logps/chosen": -289.00177001953125, "logps/rejected": -268.8831787109375, "loss": 0.6134, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.015450954437256, "rewards/margins": 0.5479257106781006, "rewards/rejected": 2.4675252437591553, "step": 77230 }, { "epoch": 3.586053205812712, "grad_norm": 51.26380157470703, "learning_rate": 8.493151956915362e-08, "logits/chosen": -18.214736938476562, "logits/rejected": -16.751680374145508, "logps/chosen": -395.7220153808594, "logps/rejected": -224.73495483398438, "loss": 0.1656, "rewards/accuracies": 1.0, "rewards/chosen": 3.8110861778259277, "rewards/margins": 3.437201976776123, 
"rewards/rejected": 0.37388449907302856, "step": 77240 }, { "epoch": 3.5865174799201447, "grad_norm": 311.7914733886719, "learning_rate": 8.490366312270764e-08, "logits/chosen": -19.22330093383789, "logits/rejected": -17.91942596435547, "logps/chosen": -418.67999267578125, "logps/rejected": -252.8165740966797, "loss": 0.5239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.845038652420044, "rewards/margins": 1.5243771076202393, "rewards/rejected": 1.3206614255905151, "step": 77250 }, { "epoch": 3.586981754027578, "grad_norm": 65.03514862060547, "learning_rate": 8.487580667626166e-08, "logits/chosen": -19.330026626586914, "logits/rejected": -19.266080856323242, "logps/chosen": -452.35858154296875, "logps/rejected": -441.38330078125, "loss": 1.4486, "rewards/accuracies": 0.5, "rewards/chosen": 3.7408032417297363, "rewards/margins": -0.0814276710152626, "rewards/rejected": 3.822230577468872, "step": 77260 }, { "epoch": 3.5874460281350107, "grad_norm": 47.28074645996094, "learning_rate": 8.484795022981568e-08, "logits/chosen": -18.36473274230957, "logits/rejected": -18.43044662475586, "logps/chosen": -346.7044677734375, "logps/rejected": -381.222900390625, "loss": 1.4274, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4611706733703613, "rewards/margins": -0.16895398497581482, "rewards/rejected": 3.63012433052063, "step": 77270 }, { "epoch": 3.587910302242444, "grad_norm": 59.23974609375, "learning_rate": 8.482009378336971e-08, "logits/chosen": -18.377931594848633, "logits/rejected": -18.23093032836914, "logps/chosen": -327.5889587402344, "logps/rejected": -261.84405517578125, "loss": 0.4636, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.639768600463867, "rewards/margins": 1.3353891372680664, "rewards/rejected": 1.3043795824050903, "step": 77280 }, { "epoch": 3.588374576349877, "grad_norm": 40.723270416259766, "learning_rate": 8.479223733692371e-08, "logits/chosen": -19.7340145111084, "logits/rejected": -19.302141189575195, 
"logps/chosen": -454.224365234375, "logps/rejected": -408.6576843261719, "loss": 0.5931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7315635681152344, "rewards/margins": 0.6069894433021545, "rewards/rejected": 3.1245739459991455, "step": 77290 }, { "epoch": 3.58883885045731, "grad_norm": 51.433712005615234, "learning_rate": 8.476438089047774e-08, "logits/chosen": -19.537113189697266, "logits/rejected": -19.50101089477539, "logps/chosen": -322.50189208984375, "logps/rejected": -307.26385498046875, "loss": 1.4209, "rewards/accuracies": 0.5, "rewards/chosen": 2.7353830337524414, "rewards/margins": -0.3525110185146332, "rewards/rejected": 3.0878939628601074, "step": 77300 }, { "epoch": 3.589303124564743, "grad_norm": 53.52959442138672, "learning_rate": 8.473652444403175e-08, "logits/chosen": -18.180444717407227, "logits/rejected": -18.163915634155273, "logps/chosen": -423.2500915527344, "logps/rejected": -395.34136962890625, "loss": 0.8737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.809406280517578, "rewards/margins": 1.10447096824646, "rewards/rejected": 2.704935312271118, "step": 77310 }, { "epoch": 3.5897673986721763, "grad_norm": 113.71672058105469, "learning_rate": 8.470866799758578e-08, "logits/chosen": -19.265522003173828, "logits/rejected": -18.442880630493164, "logps/chosen": -425.4098205566406, "logps/rejected": -357.6912841796875, "loss": 0.6582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.684380292892456, "rewards/margins": 0.9479449391365051, "rewards/rejected": 2.7364351749420166, "step": 77320 }, { "epoch": 3.590231672779609, "grad_norm": 20.479278564453125, "learning_rate": 8.468081155113978e-08, "logits/chosen": -19.70895004272461, "logits/rejected": -18.80641746520996, "logps/chosen": -463.19122314453125, "logps/rejected": -395.1956787109375, "loss": 0.4269, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.433488845825195, "rewards/margins": 1.2831649780273438, "rewards/rejected": 
3.1503241062164307, "step": 77330 }, { "epoch": 3.590695946887042, "grad_norm": 1.0052706003189087, "learning_rate": 8.465295510469381e-08, "logits/chosen": -18.94973373413086, "logits/rejected": -17.61436653137207, "logps/chosen": -424.2859802246094, "logps/rejected": -358.1861267089844, "loss": 0.8429, "rewards/accuracies": 0.5, "rewards/chosen": 3.341212034225464, "rewards/margins": 1.2913973331451416, "rewards/rejected": 2.0498147010803223, "step": 77340 }, { "epoch": 3.591160220994475, "grad_norm": 60.44784927368164, "learning_rate": 8.462509865824782e-08, "logits/chosen": -19.3381290435791, "logits/rejected": -18.705860137939453, "logps/chosen": -360.5359191894531, "logps/rejected": -389.97601318359375, "loss": 0.7264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8932201862335205, "rewards/margins": 0.6497637033462524, "rewards/rejected": 3.2434563636779785, "step": 77350 }, { "epoch": 3.5916244951019083, "grad_norm": 7.9913105964660645, "learning_rate": 8.459724221180185e-08, "logits/chosen": -18.936511993408203, "logits/rejected": -18.266359329223633, "logps/chosen": -398.2181091308594, "logps/rejected": -356.0415954589844, "loss": 0.6442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3055026531219482, "rewards/margins": 1.8128044605255127, "rewards/rejected": 1.4926984310150146, "step": 77360 }, { "epoch": 3.592088769209341, "grad_norm": 75.420166015625, "learning_rate": 8.456938576535586e-08, "logits/chosen": -19.64040756225586, "logits/rejected": -18.15640640258789, "logps/chosen": -445.59906005859375, "logps/rejected": -336.97607421875, "loss": 0.4854, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.091360569000244, "rewards/margins": 0.9687578082084656, "rewards/rejected": 2.122602701187134, "step": 77370 }, { "epoch": 3.5925530433167743, "grad_norm": 79.92000579833984, "learning_rate": 8.454152931890989e-08, "logits/chosen": -19.554798126220703, "logits/rejected": -18.748249053955078, "logps/chosen": 
-323.6881408691406, "logps/rejected": -307.54583740234375, "loss": 0.8563, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.150019884109497, "rewards/margins": 0.34480780363082886, "rewards/rejected": 2.8052120208740234, "step": 77380 }, { "epoch": 3.5930173174242075, "grad_norm": 75.00927734375, "learning_rate": 8.45136728724639e-08, "logits/chosen": -18.79730987548828, "logits/rejected": -18.017921447753906, "logps/chosen": -355.30810546875, "logps/rejected": -285.1210632324219, "loss": 0.716, "rewards/accuracies": 0.5, "rewards/chosen": 2.7964327335357666, "rewards/margins": 1.3652677536010742, "rewards/rejected": 1.4311648607254028, "step": 77390 }, { "epoch": 3.5934815915316403, "grad_norm": 19.026906967163086, "learning_rate": 8.448581642601792e-08, "logits/chosen": -20.739046096801758, "logits/rejected": -19.738292694091797, "logps/chosen": -520.5715942382812, "logps/rejected": -405.14044189453125, "loss": 0.3061, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.6625237464904785, "rewards/margins": 1.9678376913070679, "rewards/rejected": 3.6946861743927, "step": 77400 }, { "epoch": 3.593945865639073, "grad_norm": 131.45608520507812, "learning_rate": 8.445795997957194e-08, "logits/chosen": -18.70012855529785, "logits/rejected": -18.127758026123047, "logps/chosen": -419.9502868652344, "logps/rejected": -378.0461120605469, "loss": 0.6377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.439959526062012, "rewards/margins": 0.5786604881286621, "rewards/rejected": 3.8612990379333496, "step": 77410 }, { "epoch": 3.5944101397465062, "grad_norm": 87.52277374267578, "learning_rate": 8.443010353312595e-08, "logits/chosen": -19.137840270996094, "logits/rejected": -18.132457733154297, "logps/chosen": -416.11126708984375, "logps/rejected": -297.07257080078125, "loss": 0.3277, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.033198356628418, "rewards/margins": 1.9464057683944702, "rewards/rejected": 2.0867931842803955, 
"step": 77420 }, { "epoch": 3.5948744138539395, "grad_norm": 18.841079711914062, "learning_rate": 8.440224708667998e-08, "logits/chosen": -18.543371200561523, "logits/rejected": -18.241348266601562, "logps/chosen": -446.001708984375, "logps/rejected": -386.546142578125, "loss": 0.5868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.870992660522461, "rewards/margins": 1.7012155055999756, "rewards/rejected": 3.1697771549224854, "step": 77430 }, { "epoch": 3.5953386879613722, "grad_norm": 13.78931713104248, "learning_rate": 8.437439064023398e-08, "logits/chosen": -19.04273223876953, "logits/rejected": -18.0092830657959, "logps/chosen": -453.11370849609375, "logps/rejected": -391.58343505859375, "loss": 0.2878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.199971675872803, "rewards/margins": 1.9316399097442627, "rewards/rejected": 2.2683322429656982, "step": 77440 }, { "epoch": 3.5958029620688055, "grad_norm": 159.67539978027344, "learning_rate": 8.434653419378801e-08, "logits/chosen": -19.37234878540039, "logits/rejected": -19.628097534179688, "logps/chosen": -395.36383056640625, "logps/rejected": -386.20654296875, "loss": 1.22, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3797664642333984, "rewards/margins": -0.4313777983188629, "rewards/rejected": 3.8111438751220703, "step": 77450 }, { "epoch": 3.5962672361762387, "grad_norm": 160.51927185058594, "learning_rate": 8.431867774734202e-08, "logits/chosen": -19.52865982055664, "logits/rejected": -18.43184471130371, "logps/chosen": -418.8861389160156, "logps/rejected": -275.23187255859375, "loss": 0.2902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.407467365264893, "rewards/margins": 2.885277509689331, "rewards/rejected": 1.5221892595291138, "step": 77460 }, { "epoch": 3.5967315102836714, "grad_norm": 76.24901580810547, "learning_rate": 8.429082130089605e-08, "logits/chosen": -19.85419464111328, "logits/rejected": -18.074363708496094, "logps/chosen": 
-400.1998596191406, "logps/rejected": -315.1593933105469, "loss": 0.2948, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.276867389678955, "rewards/margins": 3.2633635997772217, "rewards/rejected": 2.0135040283203125, "step": 77470 }, { "epoch": 3.5971957843911047, "grad_norm": 19.588001251220703, "learning_rate": 8.426296485445006e-08, "logits/chosen": -20.341684341430664, "logits/rejected": -19.055347442626953, "logps/chosen": -598.2691650390625, "logps/rejected": -412.7605895996094, "loss": 0.2696, "rewards/accuracies": 1.0, "rewards/chosen": 5.455567359924316, "rewards/margins": 2.716078519821167, "rewards/rejected": 2.7394890785217285, "step": 77480 }, { "epoch": 3.5976600584985374, "grad_norm": 37.306129455566406, "learning_rate": 8.423510840800409e-08, "logits/chosen": -19.469911575317383, "logits/rejected": -18.493154525756836, "logps/chosen": -381.3955993652344, "logps/rejected": -279.502685546875, "loss": 0.7669, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4722259044647217, "rewards/margins": 1.0048896074295044, "rewards/rejected": 2.467336416244507, "step": 77490 }, { "epoch": 3.5981243326059706, "grad_norm": 2.7244722843170166, "learning_rate": 8.420725196155809e-08, "logits/chosen": -18.756816864013672, "logits/rejected": -18.183732986450195, "logps/chosen": -380.4388732910156, "logps/rejected": -345.10174560546875, "loss": 0.6636, "rewards/accuracies": 0.5, "rewards/chosen": 3.7144291400909424, "rewards/margins": 1.3729411363601685, "rewards/rejected": 2.3414883613586426, "step": 77500 }, { "epoch": 3.5985886067134034, "grad_norm": 65.44622039794922, "learning_rate": 8.417939551511212e-08, "logits/chosen": -18.43063735961914, "logits/rejected": -18.26850128173828, "logps/chosen": -393.3245849609375, "logps/rejected": -394.7164611816406, "loss": 1.2603, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4858880043029785, "rewards/margins": 0.752644419670105, "rewards/rejected": 2.733243703842163, "step": 77510 
}, { "epoch": 3.5990528808208366, "grad_norm": 267.7353515625, "learning_rate": 8.415153906866614e-08, "logits/chosen": -18.796300888061523, "logits/rejected": -18.52251434326172, "logps/chosen": -356.4979553222656, "logps/rejected": -313.89837646484375, "loss": 0.6142, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8420116901397705, "rewards/margins": 0.6277950406074524, "rewards/rejected": 2.214216470718384, "step": 77520 }, { "epoch": 3.59951715492827, "grad_norm": 110.86639404296875, "learning_rate": 8.412368262222016e-08, "logits/chosen": -19.598485946655273, "logits/rejected": -18.923620223999023, "logps/chosen": -439.28375244140625, "logps/rejected": -341.77459716796875, "loss": 0.3717, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.771294355392456, "rewards/margins": 1.1280851364135742, "rewards/rejected": 2.64320969581604, "step": 77530 }, { "epoch": 3.5999814290357026, "grad_norm": 53.49076843261719, "learning_rate": 8.409582617577416e-08, "logits/chosen": -19.675931930541992, "logits/rejected": -20.06011199951172, "logps/chosen": -417.36572265625, "logps/rejected": -403.2921142578125, "loss": 1.1523, "rewards/accuracies": 0.5, "rewards/chosen": 4.180300235748291, "rewards/margins": 0.5622605085372925, "rewards/rejected": 3.61803936958313, "step": 77540 }, { "epoch": 3.600445703143136, "grad_norm": 19.774934768676758, "learning_rate": 8.406796972932819e-08, "logits/chosen": -19.078807830810547, "logits/rejected": -18.278305053710938, "logps/chosen": -445.31597900390625, "logps/rejected": -367.39227294921875, "loss": 0.5119, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.330897092819214, "rewards/margins": 1.0085389614105225, "rewards/rejected": 2.3223581314086914, "step": 77550 }, { "epoch": 3.6009099772505686, "grad_norm": 6.8526930809021, "learning_rate": 8.404011328288221e-08, "logits/chosen": -18.42636489868164, "logits/rejected": -17.962244033813477, "logps/chosen": -521.2228393554688, "logps/rejected": 
-517.2861938476562, "loss": 0.5154, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.531131267547607, "rewards/margins": 1.3432238101959229, "rewards/rejected": 3.1879079341888428, "step": 77560 }, { "epoch": 3.601374251358002, "grad_norm": 2.5702171325683594, "learning_rate": 8.401504248108083e-08, "logits/chosen": -18.831798553466797, "logits/rejected": -18.418188095092773, "logps/chosen": -385.9378662109375, "logps/rejected": -321.55108642578125, "loss": 0.9993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.378818988800049, "rewards/margins": 0.9223307371139526, "rewards/rejected": 2.4564881324768066, "step": 77570 }, { "epoch": 3.6018385254654346, "grad_norm": 70.74494934082031, "learning_rate": 8.398718603463486e-08, "logits/chosen": -18.339550018310547, "logits/rejected": -18.469392776489258, "logps/chosen": -266.9631652832031, "logps/rejected": -305.31195068359375, "loss": 1.0369, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.615692615509033, "rewards/margins": 0.06538544595241547, "rewards/rejected": 2.550307273864746, "step": 77580 }, { "epoch": 3.602302799572868, "grad_norm": 45.75593185424805, "learning_rate": 8.395932958818886e-08, "logits/chosen": -18.348613739013672, "logits/rejected": -17.847492218017578, "logps/chosen": -396.241943359375, "logps/rejected": -342.93377685546875, "loss": 0.7897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3215365409851074, "rewards/margins": 1.0135860443115234, "rewards/rejected": 2.307950496673584, "step": 77590 }, { "epoch": 3.602767073680301, "grad_norm": 34.099205017089844, "learning_rate": 8.393147314174288e-08, "logits/chosen": -19.032947540283203, "logits/rejected": -18.57042694091797, "logps/chosen": -513.0552978515625, "logps/rejected": -382.04400634765625, "loss": 0.3387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.95391845703125, "rewards/margins": 1.4790892601013184, "rewards/rejected": 2.4748289585113525, "step": 77600 }, { 
"epoch": 3.603231347787734, "grad_norm": 30.0655517578125, "learning_rate": 8.39036166952969e-08, "logits/chosen": -19.797866821289062, "logits/rejected": -18.958919525146484, "logps/chosen": -413.24224853515625, "logps/rejected": -238.61715698242188, "loss": 0.6595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.778630495071411, "rewards/margins": 1.7800753116607666, "rewards/rejected": 1.9985551834106445, "step": 77610 }, { "epoch": 3.603695621895167, "grad_norm": 212.0994415283203, "learning_rate": 8.387576024885093e-08, "logits/chosen": -19.394962310791016, "logits/rejected": -18.874244689941406, "logps/chosen": -457.05206298828125, "logps/rejected": -382.5931091308594, "loss": 1.0433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.097069263458252, "rewards/margins": 0.749438464641571, "rewards/rejected": 3.347630739212036, "step": 77620 }, { "epoch": 3.6041598960026, "grad_norm": 4.128126621246338, "learning_rate": 8.384790380240493e-08, "logits/chosen": -18.837675094604492, "logits/rejected": -17.332517623901367, "logps/chosen": -300.9912414550781, "logps/rejected": -183.98944091796875, "loss": 0.5301, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5066885948181152, "rewards/margins": 1.9786031246185303, "rewards/rejected": 0.5280854105949402, "step": 77630 }, { "epoch": 3.604624170110033, "grad_norm": 144.27708435058594, "learning_rate": 8.382004735595896e-08, "logits/chosen": -18.339988708496094, "logits/rejected": -18.723669052124023, "logps/chosen": -421.6475524902344, "logps/rejected": -433.7847595214844, "loss": 1.2959, "rewards/accuracies": 0.5, "rewards/chosen": 3.1242425441741943, "rewards/margins": -0.19428391754627228, "rewards/rejected": 3.3185267448425293, "step": 77640 }, { "epoch": 3.6050884442174658, "grad_norm": 77.73936462402344, "learning_rate": 8.379219090951297e-08, "logits/chosen": -18.404565811157227, "logits/rejected": -18.175519943237305, "logps/chosen": -391.7499084472656, "logps/rejected": 
-431.1551818847656, "loss": 1.11, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9246203899383545, "rewards/margins": -0.019173432141542435, "rewards/rejected": 2.943794012069702, "step": 77650 }, { "epoch": 3.605552718324899, "grad_norm": 260.28173828125, "learning_rate": 8.3764334463067e-08, "logits/chosen": -19.5874080657959, "logits/rejected": -19.237314224243164, "logps/chosen": -421.8734436035156, "logps/rejected": -387.99530029296875, "loss": 0.7217, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.493241786956787, "rewards/margins": 0.7694133520126343, "rewards/rejected": 2.7238283157348633, "step": 77660 }, { "epoch": 3.606016992432332, "grad_norm": 9.045784950256348, "learning_rate": 8.373647801662101e-08, "logits/chosen": -20.031904220581055, "logits/rejected": -19.629165649414062, "logps/chosen": -442.102783203125, "logps/rejected": -379.24591064453125, "loss": 0.6561, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9995665550231934, "rewards/margins": 0.5890635251998901, "rewards/rejected": 3.410503387451172, "step": 77670 }, { "epoch": 3.606481266539765, "grad_norm": 2.101635694503784, "learning_rate": 8.370862157017501e-08, "logits/chosen": -20.083423614501953, "logits/rejected": -19.57155990600586, "logps/chosen": -413.3544006347656, "logps/rejected": -333.1742858886719, "loss": 1.2998, "rewards/accuracies": 0.5, "rewards/chosen": 3.1465225219726562, "rewards/margins": 0.8750765919685364, "rewards/rejected": 2.2714457511901855, "step": 77680 }, { "epoch": 3.606945540647198, "grad_norm": 3.1178371906280518, "learning_rate": 8.368076512372904e-08, "logits/chosen": -20.127727508544922, "logits/rejected": -18.956905364990234, "logps/chosen": -441.21771240234375, "logps/rejected": -307.09124755859375, "loss": 0.5221, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.626133441925049, "rewards/margins": 2.2577929496765137, "rewards/rejected": 2.368339776992798, "step": 77690 }, { "epoch": 
3.6074098147546314, "grad_norm": 0.26743283867836, "learning_rate": 8.365290867728306e-08, "logits/chosen": -19.359539031982422, "logits/rejected": -18.795059204101562, "logps/chosen": -404.64501953125, "logps/rejected": -341.06988525390625, "loss": 0.3684, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.076717853546143, "rewards/margins": 2.876042604446411, "rewards/rejected": 2.200674533843994, "step": 77700 }, { "epoch": 3.607874088862064, "grad_norm": 0.25661155581474304, "learning_rate": 8.362505223083708e-08, "logits/chosen": -19.75881004333496, "logits/rejected": -18.939128875732422, "logps/chosen": -361.1927185058594, "logps/rejected": -227.489990234375, "loss": 0.7944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1754233837127686, "rewards/margins": 1.3718398809432983, "rewards/rejected": 1.8035835027694702, "step": 77710 }, { "epoch": 3.608338362969497, "grad_norm": 33.43780517578125, "learning_rate": 8.35971957843911e-08, "logits/chosen": -18.76628303527832, "logits/rejected": -17.410306930541992, "logps/chosen": -360.13751220703125, "logps/rejected": -294.2298583984375, "loss": 0.5463, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.529966354370117, "rewards/margins": 1.5854437351226807, "rewards/rejected": 1.944522500038147, "step": 77720 }, { "epoch": 3.60880263707693, "grad_norm": 81.01337432861328, "learning_rate": 8.356933933794513e-08, "logits/chosen": -18.765155792236328, "logits/rejected": -17.47830581665039, "logps/chosen": -417.146728515625, "logps/rejected": -279.9026794433594, "loss": 0.6382, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.706141948699951, "rewards/margins": 1.86053466796875, "rewards/rejected": 0.8456071615219116, "step": 77730 }, { "epoch": 3.6092669111843634, "grad_norm": 182.24819946289062, "learning_rate": 8.354148289149913e-08, "logits/chosen": -18.58460807800293, "logits/rejected": -18.636032104492188, "logps/chosen": -320.3326721191406, "logps/rejected": 
-247.7434844970703, "loss": 0.4624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.422238826751709, "rewards/margins": 1.5163028240203857, "rewards/rejected": 0.905936062335968, "step": 77740 }, { "epoch": 3.609731185291796, "grad_norm": 116.59769439697266, "learning_rate": 8.351362644505316e-08, "logits/chosen": -19.014049530029297, "logits/rejected": -18.022729873657227, "logps/chosen": -460.289794921875, "logps/rejected": -377.44586181640625, "loss": 0.768, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.148055076599121, "rewards/margins": 1.32021164894104, "rewards/rejected": 2.8278439044952393, "step": 77750 }, { "epoch": 3.6101954593992294, "grad_norm": 0.2515781819820404, "learning_rate": 8.348576999860717e-08, "logits/chosen": -17.961410522460938, "logits/rejected": -17.45089340209961, "logps/chosen": -411.24853515625, "logps/rejected": -393.25030517578125, "loss": 0.5741, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.113002300262451, "rewards/margins": 1.0452789068222046, "rewards/rejected": 2.067723035812378, "step": 77760 }, { "epoch": 3.6106597335066626, "grad_norm": 187.86572265625, "learning_rate": 8.34579135521612e-08, "logits/chosen": -18.528118133544922, "logits/rejected": -17.79393768310547, "logps/chosen": -367.98333740234375, "logps/rejected": -298.16094970703125, "loss": 1.0363, "rewards/accuracies": 0.5, "rewards/chosen": 3.958702564239502, "rewards/margins": 0.37566059827804565, "rewards/rejected": 3.583042621612549, "step": 77770 }, { "epoch": 3.6111240076140954, "grad_norm": 54.7931022644043, "learning_rate": 8.343005710571521e-08, "logits/chosen": -18.587631225585938, "logits/rejected": -17.951839447021484, "logps/chosen": -418.5262145996094, "logps/rejected": -346.3668212890625, "loss": 1.2894, "rewards/accuracies": 0.5, "rewards/chosen": 3.2720847129821777, "rewards/margins": 0.22309179604053497, "rewards/rejected": 3.048992872238159, "step": 77780 }, { "epoch": 3.611588281721528, "grad_norm": 
34.47636413574219, "learning_rate": 8.340220065926924e-08, "logits/chosen": -18.705097198486328, "logits/rejected": -18.41330337524414, "logps/chosen": -470.73773193359375, "logps/rejected": -385.3334045410156, "loss": 0.634, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8656692504882812, "rewards/margins": 1.4309475421905518, "rewards/rejected": 2.4347217082977295, "step": 77790 }, { "epoch": 3.6120525558289613, "grad_norm": 116.66838836669922, "learning_rate": 8.337434421282324e-08, "logits/chosen": -19.433799743652344, "logits/rejected": -18.338207244873047, "logps/chosen": -504.2373962402344, "logps/rejected": -340.40142822265625, "loss": 0.4576, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.816059112548828, "rewards/margins": 1.102099061012268, "rewards/rejected": 2.7139601707458496, "step": 77800 }, { "epoch": 3.6125168299363946, "grad_norm": 151.71624755859375, "learning_rate": 8.334648776637727e-08, "logits/chosen": -19.520206451416016, "logits/rejected": -18.155284881591797, "logps/chosen": -534.3146362304688, "logps/rejected": -392.6239929199219, "loss": 0.2617, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.639508247375488, "rewards/margins": 2.12037992477417, "rewards/rejected": 2.51912784576416, "step": 77810 }, { "epoch": 3.6129811040438273, "grad_norm": 76.84159088134766, "learning_rate": 8.331863131993128e-08, "logits/chosen": -18.77200698852539, "logits/rejected": -18.54842758178711, "logps/chosen": -400.01678466796875, "logps/rejected": -317.9393310546875, "loss": 0.5803, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0664100646972656, "rewards/margins": 0.6460890769958496, "rewards/rejected": 2.420320987701416, "step": 77820 }, { "epoch": 3.6134453781512605, "grad_norm": 48.14533996582031, "learning_rate": 8.329077487348531e-08, "logits/chosen": -19.32227325439453, "logits/rejected": -18.05809783935547, "logps/chosen": -412.3184509277344, "logps/rejected": -332.7889404296875, "loss": 
0.9581, "rewards/accuracies": 0.5, "rewards/chosen": 2.963022470474243, "rewards/margins": 0.5024818181991577, "rewards/rejected": 2.460541009902954, "step": 77830 }, { "epoch": 3.6139096522586938, "grad_norm": 106.27574920654297, "learning_rate": 8.326291842703931e-08, "logits/chosen": -19.135622024536133, "logits/rejected": -18.236942291259766, "logps/chosen": -472.9469299316406, "logps/rejected": -420.23577880859375, "loss": 0.4856, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.544286727905273, "rewards/margins": 1.7589555978775024, "rewards/rejected": 2.7853314876556396, "step": 77840 }, { "epoch": 3.6143739263661265, "grad_norm": 5.8419599533081055, "learning_rate": 8.323506198059334e-08, "logits/chosen": -18.746179580688477, "logits/rejected": -17.68631362915039, "logps/chosen": -567.607177734375, "logps/rejected": -433.73553466796875, "loss": 0.2851, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.171173572540283, "rewards/margins": 2.3601484298706055, "rewards/rejected": 2.8110249042510986, "step": 77850 }, { "epoch": 3.6148382004735597, "grad_norm": 17.789766311645508, "learning_rate": 8.320720553414736e-08, "logits/chosen": -18.790630340576172, "logits/rejected": -18.54905891418457, "logps/chosen": -395.75347900390625, "logps/rejected": -270.096923828125, "loss": 0.4002, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.097649097442627, "rewards/margins": 1.787981629371643, "rewards/rejected": 1.3096674680709839, "step": 77860 }, { "epoch": 3.6153024745809925, "grad_norm": 8.298139572143555, "learning_rate": 8.317934908770137e-08, "logits/chosen": -18.35659408569336, "logits/rejected": -17.873069763183594, "logps/chosen": -338.11328125, "logps/rejected": -261.42510986328125, "loss": 0.4378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4603512287139893, "rewards/margins": 1.7524913549423218, "rewards/rejected": 0.7078596949577332, "step": 77870 }, { "epoch": 3.6157667486884257, "grad_norm": 
34.6536979675293, "learning_rate": 8.31514926412554e-08, "logits/chosen": -19.06846046447754, "logits/rejected": -18.009552001953125, "logps/chosen": -478.026123046875, "logps/rejected": -327.7030944824219, "loss": 0.4463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.141087532043457, "rewards/margins": 1.279815912246704, "rewards/rejected": 1.8612715005874634, "step": 77880 }, { "epoch": 3.6162310227958585, "grad_norm": 17.109113693237305, "learning_rate": 8.31236361948094e-08, "logits/chosen": -18.613685607910156, "logits/rejected": -17.68018341064453, "logps/chosen": -378.5572204589844, "logps/rejected": -239.55477905273438, "loss": 0.1659, "rewards/accuracies": 1.0, "rewards/chosen": 3.7896244525909424, "rewards/margins": 2.3032517433166504, "rewards/rejected": 1.4863728284835815, "step": 77890 }, { "epoch": 3.6166952969032917, "grad_norm": 2.4488883018493652, "learning_rate": 8.309577974836343e-08, "logits/chosen": -19.611631393432617, "logits/rejected": -19.059310913085938, "logps/chosen": -327.93499755859375, "logps/rejected": -335.3520202636719, "loss": 0.6428, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.044419765472412, "rewards/margins": 0.8106447458267212, "rewards/rejected": 2.2337751388549805, "step": 77900 }, { "epoch": 3.617159571010725, "grad_norm": 22.234342575073242, "learning_rate": 8.306792330191744e-08, "logits/chosen": -19.694164276123047, "logits/rejected": -19.31845474243164, "logps/chosen": -277.35198974609375, "logps/rejected": -234.149658203125, "loss": 1.2129, "rewards/accuracies": 0.5, "rewards/chosen": 1.4647021293640137, "rewards/margins": 0.07254375517368317, "rewards/rejected": 1.3921583890914917, "step": 77910 }, { "epoch": 3.6176238451181577, "grad_norm": 16.823110580444336, "learning_rate": 8.304006685547147e-08, "logits/chosen": -19.27499771118164, "logits/rejected": -18.857311248779297, "logps/chosen": -556.4868774414062, "logps/rejected": -534.5250854492188, "loss": 0.9009, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.225010871887207, "rewards/margins": 0.9157489538192749, "rewards/rejected": 3.30926251411438, "step": 77920 }, { "epoch": 3.618088119225591, "grad_norm": 1.612713098526001, "learning_rate": 8.301221040902548e-08, "logits/chosen": -19.494234085083008, "logits/rejected": -18.697547912597656, "logps/chosen": -364.5980224609375, "logps/rejected": -336.40447998046875, "loss": 0.8437, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7587695121765137, "rewards/margins": 1.3044483661651611, "rewards/rejected": 2.4543209075927734, "step": 77930 }, { "epoch": 3.6185523933330237, "grad_norm": 78.61761474609375, "learning_rate": 8.298435396257951e-08, "logits/chosen": -18.64012336730957, "logits/rejected": -17.337297439575195, "logps/chosen": -326.4209899902344, "logps/rejected": -300.1498718261719, "loss": 0.8789, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.264197826385498, "rewards/margins": 0.8307560682296753, "rewards/rejected": 2.4334421157836914, "step": 77940 }, { "epoch": 3.619016667440457, "grad_norm": 62.343502044677734, "learning_rate": 8.295649751613351e-08, "logits/chosen": -19.849674224853516, "logits/rejected": -18.306760787963867, "logps/chosen": -421.5677795410156, "logps/rejected": -370.52252197265625, "loss": 0.526, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.888050079345703, "rewards/margins": 1.6833432912826538, "rewards/rejected": 2.204706907272339, "step": 77950 }, { "epoch": 3.6194809415478897, "grad_norm": 44.59721755981445, "learning_rate": 8.292864106968754e-08, "logits/chosen": -19.320974349975586, "logits/rejected": -18.653139114379883, "logps/chosen": -431.9847106933594, "logps/rejected": -388.4430236816406, "loss": 0.5797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.456834077835083, "rewards/margins": 0.6715995669364929, "rewards/rejected": 2.7852344512939453, "step": 77960 }, { "epoch": 3.619945215655323, "grad_norm": 
53.7072868347168, "learning_rate": 8.290078462324155e-08, "logits/chosen": -18.26553726196289, "logits/rejected": -18.495548248291016, "logps/chosen": -372.31854248046875, "logps/rejected": -358.96380615234375, "loss": 2.155, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7608587741851807, "rewards/margins": -0.7802461385726929, "rewards/rejected": 3.541104793548584, "step": 77970 }, { "epoch": 3.620409489762756, "grad_norm": 0.43332111835479736, "learning_rate": 8.287292817679558e-08, "logits/chosen": -18.786535263061523, "logits/rejected": -18.01784324645996, "logps/chosen": -389.6075744628906, "logps/rejected": -302.1981506347656, "loss": 0.4337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.032922744750977, "rewards/margins": 2.0195791721343994, "rewards/rejected": 2.013343334197998, "step": 77980 }, { "epoch": 3.620873763870189, "grad_norm": 55.5138053894043, "learning_rate": 8.28450717303496e-08, "logits/chosen": -18.816722869873047, "logits/rejected": -18.038169860839844, "logps/chosen": -367.40283203125, "logps/rejected": -302.5234680175781, "loss": 0.7371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.603130340576172, "rewards/margins": 1.1128480434417725, "rewards/rejected": 1.4902825355529785, "step": 77990 }, { "epoch": 3.621338037977622, "grad_norm": 74.11945343017578, "learning_rate": 8.281721528390362e-08, "logits/chosen": -18.25733757019043, "logits/rejected": -17.63406753540039, "logps/chosen": -281.4507751464844, "logps/rejected": -245.0518798828125, "loss": 0.7925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.544551372528076, "rewards/margins": 0.8265743255615234, "rewards/rejected": 1.7179772853851318, "step": 78000 }, { "epoch": 3.621802312085055, "grad_norm": 96.87596893310547, "learning_rate": 8.278935883745763e-08, "logits/chosen": -19.785884857177734, "logits/rejected": -18.661880493164062, "logps/chosen": -338.1484680175781, "logps/rejected": -361.81683349609375, "loss": 0.6151, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9278578758239746, "rewards/margins": 1.2408632040023804, "rewards/rejected": 2.6869945526123047, "step": 78010 }, { "epoch": 3.622266586192488, "grad_norm": 87.37191772460938, "learning_rate": 8.276150239101165e-08, "logits/chosen": -19.37933921813965, "logits/rejected": -17.756946563720703, "logps/chosen": -551.7484130859375, "logps/rejected": -336.513916015625, "loss": 0.1882, "rewards/accuracies": 1.0, "rewards/chosen": 5.317760467529297, "rewards/margins": 2.9316654205322266, "rewards/rejected": 2.3860950469970703, "step": 78020 }, { "epoch": 3.622730860299921, "grad_norm": 77.69979858398438, "learning_rate": 8.273364594456567e-08, "logits/chosen": -17.92061424255371, "logits/rejected": -18.414779663085938, "logps/chosen": -330.43206787109375, "logps/rejected": -346.2048645019531, "loss": 0.9269, "rewards/accuracies": 0.5, "rewards/chosen": 1.8897873163223267, "rewards/margins": 0.1904464066028595, "rewards/rejected": 1.6993408203125, "step": 78030 }, { "epoch": 3.623195134407354, "grad_norm": 15.626084327697754, "learning_rate": 8.27057894981197e-08, "logits/chosen": -18.419795989990234, "logits/rejected": -18.56902503967285, "logps/chosen": -395.8247985839844, "logps/rejected": -473.451904296875, "loss": 0.8173, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6536223888397217, "rewards/margins": 0.3593560457229614, "rewards/rejected": 3.29426646232605, "step": 78040 }, { "epoch": 3.6236594085147873, "grad_norm": 5.420627117156982, "learning_rate": 8.26779330516737e-08, "logits/chosen": -19.4077091217041, "logits/rejected": -18.714067459106445, "logps/chosen": -409.4081115722656, "logps/rejected": -328.7110900878906, "loss": 1.1118, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9261176586151123, "rewards/margins": 0.6633682250976562, "rewards/rejected": 3.2627499103546143, "step": 78050 }, { "epoch": 3.62412368262222, "grad_norm": 121.2001724243164, "learning_rate": 
8.265007660522773e-08, "logits/chosen": -19.0269832611084, "logits/rejected": -19.103771209716797, "logps/chosen": -306.4821472167969, "logps/rejected": -366.64569091796875, "loss": 1.2081, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.6134750843048096, "rewards/margins": -0.004766619298607111, "rewards/rejected": 3.618241786956787, "step": 78060 }, { "epoch": 3.6245879567296533, "grad_norm": 85.85736846923828, "learning_rate": 8.262222015878174e-08, "logits/chosen": -19.837045669555664, "logits/rejected": -18.602649688720703, "logps/chosen": -383.4031677246094, "logps/rejected": -311.91448974609375, "loss": 0.5698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.606567859649658, "rewards/margins": 1.2880430221557617, "rewards/rejected": 2.3185248374938965, "step": 78070 }, { "epoch": 3.625052230837086, "grad_norm": 244.2653350830078, "learning_rate": 8.259436371233575e-08, "logits/chosen": -19.990161895751953, "logits/rejected": -19.0612850189209, "logps/chosen": -424.4964294433594, "logps/rejected": -357.08929443359375, "loss": 0.4661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.824109077453613, "rewards/margins": 2.1250622272491455, "rewards/rejected": 2.699047088623047, "step": 78080 }, { "epoch": 3.6255165049445193, "grad_norm": 0.2846721410751343, "learning_rate": 8.256650726588978e-08, "logits/chosen": -19.40115737915039, "logits/rejected": -18.23711585998535, "logps/chosen": -371.221923828125, "logps/rejected": -330.540771484375, "loss": 0.2506, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2769038677215576, "rewards/margins": 2.4442999362945557, "rewards/rejected": 0.8326044082641602, "step": 78090 }, { "epoch": 3.625980779051952, "grad_norm": 8.259500503540039, "learning_rate": 8.253865081944378e-08, "logits/chosen": -19.27017593383789, "logits/rejected": -18.48906898498535, "logps/chosen": -450.9043884277344, "logps/rejected": -411.8765563964844, "loss": 0.8279, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 5.057729244232178, "rewards/margins": 1.654275894165039, "rewards/rejected": 3.4034533500671387, "step": 78100 }, { "epoch": 3.6264450531593853, "grad_norm": 94.9315414428711, "learning_rate": 8.251079437299781e-08, "logits/chosen": -18.473453521728516, "logits/rejected": -17.773893356323242, "logps/chosen": -407.8563537597656, "logps/rejected": -351.6073913574219, "loss": 0.6622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.034930229187012, "rewards/margins": 1.094015121459961, "rewards/rejected": 2.9409143924713135, "step": 78110 }, { "epoch": 3.6269093272668185, "grad_norm": 5.866820335388184, "learning_rate": 8.248293792655183e-08, "logits/chosen": -19.14869499206543, "logits/rejected": -17.373825073242188, "logps/chosen": -446.146728515625, "logps/rejected": -284.54193115234375, "loss": 0.4519, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.019374370574951, "rewards/margins": 3.549215316772461, "rewards/rejected": 1.4701592922210693, "step": 78120 }, { "epoch": 3.6273736013742512, "grad_norm": 95.93441772460938, "learning_rate": 8.245508148010585e-08, "logits/chosen": -18.310331344604492, "logits/rejected": -18.413257598876953, "logps/chosen": -295.88421630859375, "logps/rejected": -339.3153381347656, "loss": 1.6562, "rewards/accuracies": 0.5, "rewards/chosen": 2.6691925525665283, "rewards/margins": -0.3749566078186035, "rewards/rejected": 3.044149398803711, "step": 78130 }, { "epoch": 3.6278378754816845, "grad_norm": 149.6656494140625, "learning_rate": 8.242722503365987e-08, "logits/chosen": -18.940340042114258, "logits/rejected": -18.19054412841797, "logps/chosen": -397.111572265625, "logps/rejected": -256.64117431640625, "loss": 0.8258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4375526905059814, "rewards/margins": 1.8907525539398193, "rewards/rejected": 1.546799898147583, "step": 78140 }, { "epoch": 3.6283021495891177, "grad_norm": 40.330162048339844, "learning_rate": 
8.23993685872139e-08, "logits/chosen": -19.00337791442871, "logits/rejected": -18.71259307861328, "logps/chosen": -333.24822998046875, "logps/rejected": -297.26190185546875, "loss": 0.4559, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8127894401550293, "rewards/margins": 1.3127942085266113, "rewards/rejected": 1.499995470046997, "step": 78150 }, { "epoch": 3.6287664236965504, "grad_norm": 247.83792114257812, "learning_rate": 8.23715121407679e-08, "logits/chosen": -19.585784912109375, "logits/rejected": -19.403400421142578, "logps/chosen": -507.0770568847656, "logps/rejected": -440.17718505859375, "loss": 1.1146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.187126636505127, "rewards/margins": 0.5385489463806152, "rewards/rejected": 3.648577928543091, "step": 78160 }, { "epoch": 3.629230697803983, "grad_norm": 211.2663116455078, "learning_rate": 8.234365569432193e-08, "logits/chosen": -17.632797241210938, "logits/rejected": -18.037002563476562, "logps/chosen": -380.1756591796875, "logps/rejected": -422.00347900390625, "loss": 0.8267, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3142433166503906, "rewards/margins": 0.40130066871643066, "rewards/rejected": 2.91294264793396, "step": 78170 }, { "epoch": 3.6296949719114164, "grad_norm": 150.78146362304688, "learning_rate": 8.231579924787594e-08, "logits/chosen": -19.717443466186523, "logits/rejected": -17.660518646240234, "logps/chosen": -396.76104736328125, "logps/rejected": -287.0826110839844, "loss": 0.2972, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0998260974884033, "rewards/margins": 2.0057997703552246, "rewards/rejected": 1.0940262079238892, "step": 78180 }, { "epoch": 3.6301592460188497, "grad_norm": 0.06467664986848831, "learning_rate": 8.228794280142997e-08, "logits/chosen": -18.817859649658203, "logits/rejected": -17.560514450073242, "logps/chosen": -409.5346984863281, "logps/rejected": -284.91387939453125, "loss": 0.6428, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.383120059967041, "rewards/margins": 2.39265775680542, "rewards/rejected": 1.9904625415802002, "step": 78190 }, { "epoch": 3.6306235201262824, "grad_norm": 98.0693130493164, "learning_rate": 8.226008635498398e-08, "logits/chosen": -17.70136070251465, "logits/rejected": -17.190975189208984, "logps/chosen": -329.3199768066406, "logps/rejected": -253.7830352783203, "loss": 0.6478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.080129384994507, "rewards/margins": 0.7840534448623657, "rewards/rejected": 1.2960760593414307, "step": 78200 }, { "epoch": 3.6310877942337156, "grad_norm": 21.258169174194336, "learning_rate": 8.223222990853801e-08, "logits/chosen": -18.879093170166016, "logits/rejected": -18.360340118408203, "logps/chosen": -407.2041320800781, "logps/rejected": -345.5792541503906, "loss": 0.9059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.575493097305298, "rewards/margins": 0.5007423162460327, "rewards/rejected": 2.0747509002685547, "step": 78210 }, { "epoch": 3.631552068341149, "grad_norm": 93.81593322753906, "learning_rate": 8.220437346209201e-08, "logits/chosen": -20.744680404663086, "logits/rejected": -18.554710388183594, "logps/chosen": -393.96826171875, "logps/rejected": -261.68402099609375, "loss": 0.3213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.135982036590576, "rewards/margins": 2.6456620693206787, "rewards/rejected": 1.4903204441070557, "step": 78220 }, { "epoch": 3.6320163424485816, "grad_norm": 8.493637084960938, "learning_rate": 8.217651701564604e-08, "logits/chosen": -19.322948455810547, "logits/rejected": -18.650348663330078, "logps/chosen": -347.3235778808594, "logps/rejected": -319.02685546875, "loss": 0.6326, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7738516330718994, "rewards/margins": 0.7350680828094482, "rewards/rejected": 2.038783550262451, "step": 78230 }, { "epoch": 3.6324806165560144, "grad_norm": 
82.1558837890625, "learning_rate": 8.214866056920005e-08, "logits/chosen": -19.43646812438965, "logits/rejected": -18.432079315185547, "logps/chosen": -458.6708068847656, "logps/rejected": -334.95281982421875, "loss": 0.9366, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.417168140411377, "rewards/margins": 1.8386147022247314, "rewards/rejected": 2.5785531997680664, "step": 78240 }, { "epoch": 3.6329448906634476, "grad_norm": 16.49592399597168, "learning_rate": 8.212080412275408e-08, "logits/chosen": -18.598072052001953, "logits/rejected": -17.476665496826172, "logps/chosen": -309.8712158203125, "logps/rejected": -269.53033447265625, "loss": 0.7297, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.744764804840088, "rewards/margins": 1.9750595092773438, "rewards/rejected": 1.7697051763534546, "step": 78250 }, { "epoch": 3.633409164770881, "grad_norm": 25.259918212890625, "learning_rate": 8.209294767630808e-08, "logits/chosen": -18.600175857543945, "logits/rejected": -17.32235336303711, "logps/chosen": -333.7431335449219, "logps/rejected": -309.4699401855469, "loss": 1.0242, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0175271034240723, "rewards/margins": 1.0184839963912964, "rewards/rejected": 1.9990431070327759, "step": 78260 }, { "epoch": 3.6338734388783136, "grad_norm": 33.19819259643555, "learning_rate": 8.20650912298621e-08, "logits/chosen": -19.983203887939453, "logits/rejected": -19.10508155822754, "logps/chosen": -302.7371826171875, "logps/rejected": -264.474365234375, "loss": 0.6479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.625119924545288, "rewards/margins": 0.8381859660148621, "rewards/rejected": 1.78693425655365, "step": 78270 }, { "epoch": 3.634337712985747, "grad_norm": 2.875561475753784, "learning_rate": 8.203723478341612e-08, "logits/chosen": -19.24529457092285, "logits/rejected": -17.991989135742188, "logps/chosen": -508.4414978027344, "logps/rejected": -394.20721435546875, "loss": 
0.2458, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.773319721221924, "rewards/margins": 2.201788902282715, "rewards/rejected": 2.571530818939209, "step": 78280 }, { "epoch": 3.63480198709318, "grad_norm": 38.28824234008789, "learning_rate": 8.200937833697014e-08, "logits/chosen": -19.33340072631836, "logits/rejected": -17.602340698242188, "logps/chosen": -430.9728088378906, "logps/rejected": -249.4294891357422, "loss": 0.4924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.367827415466309, "rewards/margins": 3.1095235347747803, "rewards/rejected": 1.2583036422729492, "step": 78290 }, { "epoch": 3.635266261200613, "grad_norm": 51.57752227783203, "learning_rate": 8.198152189052417e-08, "logits/chosen": -18.03512954711914, "logits/rejected": -17.09445571899414, "logps/chosen": -363.25335693359375, "logps/rejected": -251.871337890625, "loss": 0.4954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.379624605178833, "rewards/margins": 1.5667160749435425, "rewards/rejected": 1.8129085302352905, "step": 78300 }, { "epoch": 3.635730535308046, "grad_norm": 37.62508010864258, "learning_rate": 8.195366544407817e-08, "logits/chosen": -20.682872772216797, "logits/rejected": -19.401071548461914, "logps/chosen": -333.4405822753906, "logps/rejected": -267.5664978027344, "loss": 0.3026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8731884956359863, "rewards/margins": 1.7850929498672485, "rewards/rejected": 2.0880956649780273, "step": 78310 }, { "epoch": 3.636194809415479, "grad_norm": 0.02112836390733719, "learning_rate": 8.19258089976322e-08, "logits/chosen": -19.4029483795166, "logits/rejected": -17.401830673217773, "logps/chosen": -538.2840576171875, "logps/rejected": -275.48126220703125, "loss": 0.2582, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.175040245056152, "rewards/margins": 2.9882938861846924, "rewards/rejected": 2.186746120452881, "step": 78320 }, { "epoch": 3.636659083522912, "grad_norm": 
7.1704583168029785, "learning_rate": 8.189795255118621e-08, "logits/chosen": -18.874378204345703, "logits/rejected": -17.999042510986328, "logps/chosen": -500.98974609375, "logps/rejected": -443.3487243652344, "loss": 0.6166, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.160449028015137, "rewards/margins": 1.5171668529510498, "rewards/rejected": 3.643282651901245, "step": 78330 }, { "epoch": 3.6371233576303448, "grad_norm": 135.32162475585938, "learning_rate": 8.187009610474024e-08, "logits/chosen": -19.430126190185547, "logits/rejected": -18.756900787353516, "logps/chosen": -483.5340881347656, "logps/rejected": -404.58612060546875, "loss": 0.7414, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.53065824508667, "rewards/margins": 1.6507856845855713, "rewards/rejected": 3.8798725605010986, "step": 78340 }, { "epoch": 3.637587631737778, "grad_norm": 258.0954895019531, "learning_rate": 8.184223965829425e-08, "logits/chosen": -19.536977767944336, "logits/rejected": -19.307815551757812, "logps/chosen": -341.30682373046875, "logps/rejected": -396.2134704589844, "loss": 0.793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.720543622970581, "rewards/margins": 0.41891950368881226, "rewards/rejected": 3.301624298095703, "step": 78350 }, { "epoch": 3.638051905845211, "grad_norm": 49.19013977050781, "learning_rate": 8.181438321184828e-08, "logits/chosen": -19.136106491088867, "logits/rejected": -17.856929779052734, "logps/chosen": -524.7559814453125, "logps/rejected": -383.8372802734375, "loss": 0.2285, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.039940357208252, "rewards/margins": 2.893179416656494, "rewards/rejected": 2.146761178970337, "step": 78360 }, { "epoch": 3.638516179952644, "grad_norm": 3.6046230792999268, "learning_rate": 8.178652676540228e-08, "logits/chosen": -19.794513702392578, "logits/rejected": -18.424970626831055, "logps/chosen": -386.1730041503906, "logps/rejected": -245.43899536132812, "loss": 
0.3926, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6475729942321777, "rewards/margins": 2.531235456466675, "rewards/rejected": 1.116337537765503, "step": 78370 }, { "epoch": 3.638980454060077, "grad_norm": 228.821044921875, "learning_rate": 8.175867031895631e-08, "logits/chosen": -18.89628791809082, "logits/rejected": -18.322681427001953, "logps/chosen": -341.56805419921875, "logps/rejected": -377.2026672363281, "loss": 0.9046, "rewards/accuracies": 0.5, "rewards/chosen": 2.725917100906372, "rewards/margins": 0.3187294006347656, "rewards/rejected": 2.4071877002716064, "step": 78380 }, { "epoch": 3.63944472816751, "grad_norm": 3.742428779602051, "learning_rate": 8.173081387251032e-08, "logits/chosen": -19.960355758666992, "logits/rejected": -19.271419525146484, "logps/chosen": -348.24688720703125, "logps/rejected": -204.68832397460938, "loss": 0.2428, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7101974487304688, "rewards/margins": 2.7014384269714355, "rewards/rejected": 1.0087590217590332, "step": 78390 }, { "epoch": 3.639909002274943, "grad_norm": 7.658459663391113, "learning_rate": 8.170295742606435e-08, "logits/chosen": -18.877016067504883, "logits/rejected": -17.851333618164062, "logps/chosen": -402.45318603515625, "logps/rejected": -328.11468505859375, "loss": 0.2502, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6207327842712402, "rewards/margins": 2.080414056777954, "rewards/rejected": 1.5403187274932861, "step": 78400 }, { "epoch": 3.640373276382376, "grad_norm": 1.4884519577026367, "learning_rate": 8.167510097961837e-08, "logits/chosen": -19.493114471435547, "logits/rejected": -18.207244873046875, "logps/chosen": -372.0506286621094, "logps/rejected": -255.6359100341797, "loss": 1.0266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0138607025146484, "rewards/margins": 0.9078255891799927, "rewards/rejected": 2.106034994125366, "step": 78410 }, { "epoch": 3.640837550489809, "grad_norm": 
196.7165985107422, "learning_rate": 8.16472445331724e-08, "logits/chosen": -18.674556732177734, "logits/rejected": -17.146785736083984, "logps/chosen": -446.80731201171875, "logps/rejected": -313.8118591308594, "loss": 0.9347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.073070049285889, "rewards/margins": 1.3917123079299927, "rewards/rejected": 2.6813576221466064, "step": 78420 }, { "epoch": 3.6413018245972424, "grad_norm": 37.18736267089844, "learning_rate": 8.16193880867264e-08, "logits/chosen": -17.284122467041016, "logits/rejected": -17.442726135253906, "logps/chosen": -295.7003479003906, "logps/rejected": -306.0481872558594, "loss": 0.6804, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.293980836868286, "rewards/margins": 0.7611096501350403, "rewards/rejected": 1.532871127128601, "step": 78430 }, { "epoch": 3.641766098704675, "grad_norm": 4.764109134674072, "learning_rate": 8.159153164028042e-08, "logits/chosen": -19.747304916381836, "logits/rejected": -18.65018081665039, "logps/chosen": -483.31390380859375, "logps/rejected": -407.46820068359375, "loss": 0.5236, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.000528812408447, "rewards/margins": 2.552921772003174, "rewards/rejected": 2.4476075172424316, "step": 78440 }, { "epoch": 3.6422303728121084, "grad_norm": 243.6584930419922, "learning_rate": 8.156367519383444e-08, "logits/chosen": -18.66179656982422, "logits/rejected": -18.427600860595703, "logps/chosen": -395.51837158203125, "logps/rejected": -367.5290832519531, "loss": 0.9525, "rewards/accuracies": 0.5, "rewards/chosen": 3.5741848945617676, "rewards/margins": 1.1248565912246704, "rewards/rejected": 2.4493281841278076, "step": 78450 }, { "epoch": 3.642694646919541, "grad_norm": 95.30056762695312, "learning_rate": 8.153581874738845e-08, "logits/chosen": -20.41714859008789, "logits/rejected": -19.56284523010254, "logps/chosen": -398.96783447265625, "logps/rejected": -320.781494140625, "loss": 0.414, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.73676872253418, "rewards/margins": 1.5201588869094849, "rewards/rejected": 3.2166099548339844, "step": 78460 }, { "epoch": 3.6431589210269744, "grad_norm": 66.12489318847656, "learning_rate": 8.150796230094247e-08, "logits/chosen": -18.633657455444336, "logits/rejected": -17.958351135253906, "logps/chosen": -419.81640625, "logps/rejected": -285.0385437011719, "loss": 0.3847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.194897651672363, "rewards/margins": 2.2181930541992188, "rewards/rejected": 1.9767048358917236, "step": 78470 }, { "epoch": 3.643623195134407, "grad_norm": 3.727971315383911, "learning_rate": 8.148010585449648e-08, "logits/chosen": -19.39254379272461, "logits/rejected": -18.619356155395508, "logps/chosen": -549.7008666992188, "logps/rejected": -466.95465087890625, "loss": 0.7519, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.767945766448975, "rewards/margins": 1.4676610231399536, "rewards/rejected": 3.3002846240997314, "step": 78480 }, { "epoch": 3.6440874692418403, "grad_norm": 0.13994336128234863, "learning_rate": 8.145224940805051e-08, "logits/chosen": -19.402488708496094, "logits/rejected": -18.625322341918945, "logps/chosen": -346.7416687011719, "logps/rejected": -312.9924621582031, "loss": 0.7089, "rewards/accuracies": 0.5, "rewards/chosen": 2.880478858947754, "rewards/margins": 1.1070690155029297, "rewards/rejected": 1.7734100818634033, "step": 78490 }, { "epoch": 3.6445517433492736, "grad_norm": 179.2539520263672, "learning_rate": 8.142439296160452e-08, "logits/chosen": -18.39322853088379, "logits/rejected": -17.7075138092041, "logps/chosen": -403.5127258300781, "logps/rejected": -304.35986328125, "loss": 0.6564, "rewards/accuracies": 0.5, "rewards/chosen": 3.476551055908203, "rewards/margins": 1.3767352104187012, "rewards/rejected": 2.099815845489502, "step": 78500 }, { "epoch": 3.6450160174567063, "grad_norm": 154.588623046875, "learning_rate": 
8.139653651515855e-08, "logits/chosen": -19.070165634155273, "logits/rejected": -18.240522384643555, "logps/chosen": -519.7553100585938, "logps/rejected": -452.60650634765625, "loss": 0.5348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.305322647094727, "rewards/margins": 1.104888677597046, "rewards/rejected": 3.2004342079162598, "step": 78510 }, { "epoch": 3.6454802915641396, "grad_norm": 244.7286376953125, "learning_rate": 8.136868006871255e-08, "logits/chosen": -18.6734676361084, "logits/rejected": -17.39569664001465, "logps/chosen": -424.32147216796875, "logps/rejected": -305.4771423339844, "loss": 0.7958, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.472956657409668, "rewards/margins": 1.9877372980117798, "rewards/rejected": 2.4852192401885986, "step": 78520 }, { "epoch": 3.6459445656715728, "grad_norm": 25.67150115966797, "learning_rate": 8.134082362226658e-08, "logits/chosen": -19.099170684814453, "logits/rejected": -18.992036819458008, "logps/chosen": -335.13702392578125, "logps/rejected": -290.6292724609375, "loss": 0.6933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9526360034942627, "rewards/margins": 1.2608168125152588, "rewards/rejected": 1.6918189525604248, "step": 78530 }, { "epoch": 3.6464088397790055, "grad_norm": 136.2620849609375, "learning_rate": 8.13129671758206e-08, "logits/chosen": -19.0253963470459, "logits/rejected": -18.810806274414062, "logps/chosen": -453.566650390625, "logps/rejected": -408.7903747558594, "loss": 0.2816, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.524493217468262, "rewards/margins": 1.8204715251922607, "rewards/rejected": 2.70402193069458, "step": 78540 }, { "epoch": 3.6468731138864383, "grad_norm": 29.11225128173828, "learning_rate": 8.128511072937462e-08, "logits/chosen": -18.957918167114258, "logits/rejected": -18.392690658569336, "logps/chosen": -366.1370544433594, "logps/rejected": -323.27191162109375, "loss": 0.5451, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.9564404487609863, "rewards/margins": 0.8824268579483032, "rewards/rejected": 2.0740139484405518, "step": 78550 }, { "epoch": 3.6473373879938715, "grad_norm": 0.07220568507909775, "learning_rate": 8.125725428292864e-08, "logits/chosen": -19.546768188476562, "logits/rejected": -18.582582473754883, "logps/chosen": -434.5824279785156, "logps/rejected": -291.79669189453125, "loss": 0.5076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.9012627601623535, "rewards/margins": 2.8951339721679688, "rewards/rejected": 2.0061287879943848, "step": 78560 }, { "epoch": 3.6478016621013047, "grad_norm": 209.224365234375, "learning_rate": 8.122939783648267e-08, "logits/chosen": -19.033145904541016, "logits/rejected": -18.593896865844727, "logps/chosen": -492.29498291015625, "logps/rejected": -462.6224670410156, "loss": 0.6511, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1206135749816895, "rewards/margins": 1.089796781539917, "rewards/rejected": 3.0308165550231934, "step": 78570 }, { "epoch": 3.6482659362087375, "grad_norm": 278.7164306640625, "learning_rate": 8.120154139003667e-08, "logits/chosen": -18.141902923583984, "logits/rejected": -18.380985260009766, "logps/chosen": -425.5701599121094, "logps/rejected": -403.3948669433594, "loss": 1.6181, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0924696922302246, "rewards/margins": -0.492567777633667, "rewards/rejected": 3.5850377082824707, "step": 78580 }, { "epoch": 3.6487302103161707, "grad_norm": 36.63792419433594, "learning_rate": 8.11736849435907e-08, "logits/chosen": -18.51782989501953, "logits/rejected": -17.86298179626465, "logps/chosen": -362.717041015625, "logps/rejected": -283.527587890625, "loss": 0.7853, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.753293037414551, "rewards/margins": 1.014241337776184, "rewards/rejected": 1.7390518188476562, "step": 78590 }, { "epoch": 3.649194484423604, "grad_norm": 23.546144485473633, 
"learning_rate": 8.114582849714471e-08, "logits/chosen": -19.974727630615234, "logits/rejected": -19.482309341430664, "logps/chosen": -326.448974609375, "logps/rejected": -298.9087219238281, "loss": 0.6985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7365524768829346, "rewards/margins": 1.1391942501068115, "rewards/rejected": 1.5973584651947021, "step": 78600 }, { "epoch": 3.6496587585310367, "grad_norm": 57.494224548339844, "learning_rate": 8.111797205069874e-08, "logits/chosen": -18.78888511657715, "logits/rejected": -17.95684051513672, "logps/chosen": -397.13763427734375, "logps/rejected": -317.68743896484375, "loss": 0.9579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1698532104492188, "rewards/margins": 0.9071502685546875, "rewards/rejected": 2.2627031803131104, "step": 78610 }, { "epoch": 3.6501230326384695, "grad_norm": 30.307958602905273, "learning_rate": 8.109011560425275e-08, "logits/chosen": -19.772674560546875, "logits/rejected": -19.406112670898438, "logps/chosen": -432.3387756347656, "logps/rejected": -334.27679443359375, "loss": 0.6242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.32729434967041, "rewards/margins": 1.1177812814712524, "rewards/rejected": 3.2095131874084473, "step": 78620 }, { "epoch": 3.6505873067459027, "grad_norm": 2.620375156402588, "learning_rate": 8.106225915780678e-08, "logits/chosen": -18.2979736328125, "logits/rejected": -18.074159622192383, "logps/chosen": -411.2349548339844, "logps/rejected": -342.48681640625, "loss": 0.9479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.966557741165161, "rewards/margins": 0.8529195785522461, "rewards/rejected": 3.113638401031494, "step": 78630 }, { "epoch": 3.651051580853336, "grad_norm": 194.99449157714844, "learning_rate": 8.103440271136078e-08, "logits/chosen": -19.272306442260742, "logits/rejected": -19.228191375732422, "logps/chosen": -379.4727478027344, "logps/rejected": -393.95562744140625, "loss": 0.6137, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.874927043914795, "rewards/margins": 0.4691091477870941, "rewards/rejected": 3.405818223953247, "step": 78640 }, { "epoch": 3.6515158549607687, "grad_norm": 24.593425750732422, "learning_rate": 8.100654626491481e-08, "logits/chosen": -19.3370418548584, "logits/rejected": -18.572887420654297, "logps/chosen": -542.7238159179688, "logps/rejected": -397.155517578125, "loss": 0.388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.512762546539307, "rewards/margins": 1.6116364002227783, "rewards/rejected": 2.901125431060791, "step": 78650 }, { "epoch": 3.651980129068202, "grad_norm": 2.275979518890381, "learning_rate": 8.097868981846882e-08, "logits/chosen": -19.358783721923828, "logits/rejected": -18.575613021850586, "logps/chosen": -419.8741760253906, "logps/rejected": -358.1165466308594, "loss": 0.6156, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.692082166671753, "rewards/margins": 1.8772302865982056, "rewards/rejected": 1.814851999282837, "step": 78660 }, { "epoch": 3.652444403175635, "grad_norm": 5.278325080871582, "learning_rate": 8.095083337202284e-08, "logits/chosen": -19.760700225830078, "logits/rejected": -18.796772003173828, "logps/chosen": -407.65155029296875, "logps/rejected": -365.1517028808594, "loss": 0.8861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.61429500579834, "rewards/margins": 1.3244383335113525, "rewards/rejected": 3.2898573875427246, "step": 78670 }, { "epoch": 3.652908677283068, "grad_norm": 65.7992172241211, "learning_rate": 8.092297692557685e-08, "logits/chosen": -20.148719787597656, "logits/rejected": -18.757169723510742, "logps/chosen": -451.3511657714844, "logps/rejected": -308.3291931152344, "loss": 0.3353, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.037436485290527, "rewards/margins": 2.1157772541046143, "rewards/rejected": 1.921659231185913, "step": 78680 }, { "epoch": 3.653372951390501, "grad_norm": 
123.98731231689453, "learning_rate": 8.089512047913087e-08, "logits/chosen": -18.8693904876709, "logits/rejected": -18.164112091064453, "logps/chosen": -390.4701843261719, "logps/rejected": -337.9581298828125, "loss": 0.4823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.029311180114746, "rewards/margins": 1.8131122589111328, "rewards/rejected": 2.2161991596221924, "step": 78690 }, { "epoch": 3.653837225497934, "grad_norm": 37.67160415649414, "learning_rate": 8.08672640326849e-08, "logits/chosen": -18.9560489654541, "logits/rejected": -18.17398452758789, "logps/chosen": -320.42169189453125, "logps/rejected": -250.39279174804688, "loss": 0.5245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8312923908233643, "rewards/margins": 1.3729400634765625, "rewards/rejected": 1.4583523273468018, "step": 78700 }, { "epoch": 3.654301499605367, "grad_norm": 2.612974166870117, "learning_rate": 8.083940758623891e-08, "logits/chosen": -18.56345558166504, "logits/rejected": -18.187578201293945, "logps/chosen": -389.27691650390625, "logps/rejected": -321.40460205078125, "loss": 0.628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8657522201538086, "rewards/margins": 0.9890971183776855, "rewards/rejected": 1.8766552209854126, "step": 78710 }, { "epoch": 3.6547657737128, "grad_norm": 57.72054672241211, "learning_rate": 8.081155113979294e-08, "logits/chosen": -19.85968780517578, "logits/rejected": -19.581802368164062, "logps/chosen": -391.4904479980469, "logps/rejected": -393.7786560058594, "loss": 0.5411, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2446510791778564, "rewards/margins": 0.5677447319030762, "rewards/rejected": 2.676906108856201, "step": 78720 }, { "epoch": 3.655230047820233, "grad_norm": 0.2997031807899475, "learning_rate": 8.078369469334694e-08, "logits/chosen": -19.161134719848633, "logits/rejected": -18.45772361755371, "logps/chosen": -478.19561767578125, "logps/rejected": -380.37921142578125, "loss": 
0.4085, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.634576797485352, "rewards/margins": 1.9889812469482422, "rewards/rejected": 2.6455960273742676, "step": 78730 }, { "epoch": 3.6556943219276663, "grad_norm": 78.1318130493164, "learning_rate": 8.075583824690097e-08, "logits/chosen": -19.458789825439453, "logits/rejected": -18.883975982666016, "logps/chosen": -416.77874755859375, "logps/rejected": -347.6505126953125, "loss": 0.5859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.557224750518799, "rewards/margins": 1.0298175811767578, "rewards/rejected": 2.527406930923462, "step": 78740 }, { "epoch": 3.656158596035099, "grad_norm": 53.72212600708008, "learning_rate": 8.072798180045498e-08, "logits/chosen": -18.981172561645508, "logits/rejected": -18.389421463012695, "logps/chosen": -426.14508056640625, "logps/rejected": -316.9446716308594, "loss": 0.2523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.088708400726318, "rewards/margins": 2.200610399246216, "rewards/rejected": 1.888098120689392, "step": 78750 }, { "epoch": 3.6566228701425323, "grad_norm": 46.42633819580078, "learning_rate": 8.070012535400901e-08, "logits/chosen": -19.102222442626953, "logits/rejected": -17.186174392700195, "logps/chosen": -490.5204162597656, "logps/rejected": -280.9131164550781, "loss": 0.2737, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1796562671661377, "rewards/margins": 2.3380396366119385, "rewards/rejected": 0.8416166305541992, "step": 78760 }, { "epoch": 3.657087144249965, "grad_norm": 89.8895492553711, "learning_rate": 8.067226890756302e-08, "logits/chosen": -19.11276626586914, "logits/rejected": -17.744693756103516, "logps/chosen": -410.0858459472656, "logps/rejected": -325.57073974609375, "loss": 0.5468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.680812358856201, "rewards/margins": 1.4368778467178345, "rewards/rejected": 2.2439346313476562, "step": 78770 }, { "epoch": 3.6575514183573983, 
"grad_norm": 5.965119361877441, "learning_rate": 8.064441246111705e-08, "logits/chosen": -18.94808578491211, "logits/rejected": -18.320098876953125, "logps/chosen": -330.7630310058594, "logps/rejected": -280.4710998535156, "loss": 0.4355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7504658699035645, "rewards/margins": 1.9981935024261475, "rewards/rejected": 1.7522720098495483, "step": 78780 }, { "epoch": 3.658015692464831, "grad_norm": 0.10798683017492294, "learning_rate": 8.061655601467105e-08, "logits/chosen": -18.751270294189453, "logits/rejected": -18.62826156616211, "logps/chosen": -327.2670593261719, "logps/rejected": -310.6471252441406, "loss": 0.7061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4706642627716064, "rewards/margins": 1.364195704460144, "rewards/rejected": 2.1064682006835938, "step": 78790 }, { "epoch": 3.6584799665722643, "grad_norm": 65.5243911743164, "learning_rate": 8.058869956822508e-08, "logits/chosen": -19.406543731689453, "logits/rejected": -18.11441421508789, "logps/chosen": -405.22198486328125, "logps/rejected": -320.1504821777344, "loss": 0.6169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.557767152786255, "rewards/margins": 1.2037500143051147, "rewards/rejected": 1.3540170192718506, "step": 78800 }, { "epoch": 3.6589442406796975, "grad_norm": 60.363807678222656, "learning_rate": 8.05608431217791e-08, "logits/chosen": -18.169221878051758, "logits/rejected": -17.13619613647461, "logps/chosen": -494.55419921875, "logps/rejected": -300.76177978515625, "loss": 0.5439, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5028903484344482, "rewards/margins": 1.8153797388076782, "rewards/rejected": 1.6875102519989014, "step": 78810 }, { "epoch": 3.6594085147871303, "grad_norm": 28.626901626586914, "learning_rate": 8.053298667533312e-08, "logits/chosen": -19.934417724609375, "logits/rejected": -19.565399169921875, "logps/chosen": -356.39892578125, "logps/rejected": 
-367.6105041503906, "loss": 0.4987, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1007041931152344, "rewards/margins": 0.9092668294906616, "rewards/rejected": 2.1914374828338623, "step": 78820 }, { "epoch": 3.6598727888945635, "grad_norm": 203.2877197265625, "learning_rate": 8.050513022888714e-08, "logits/chosen": -19.032228469848633, "logits/rejected": -19.845426559448242, "logps/chosen": -336.937255859375, "logps/rejected": -371.74627685546875, "loss": 0.9706, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9349327087402344, "rewards/margins": 0.6212008595466614, "rewards/rejected": 3.313732147216797, "step": 78830 }, { "epoch": 3.6603370630019962, "grad_norm": 19.86249542236328, "learning_rate": 8.047727378244116e-08, "logits/chosen": -19.131816864013672, "logits/rejected": -18.188358306884766, "logps/chosen": -382.8161315917969, "logps/rejected": -286.0841979980469, "loss": 0.3679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.02323579788208, "rewards/margins": 1.631911277770996, "rewards/rejected": 1.3913244009017944, "step": 78840 }, { "epoch": 3.6608013371094295, "grad_norm": 10.936079025268555, "learning_rate": 8.044941733599516e-08, "logits/chosen": -18.804582595825195, "logits/rejected": -18.17413330078125, "logps/chosen": -250.8561553955078, "logps/rejected": -205.15322875976562, "loss": 0.4792, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9437968730926514, "rewards/margins": 1.2741682529449463, "rewards/rejected": 0.6696287989616394, "step": 78850 }, { "epoch": 3.6612656112168622, "grad_norm": 5.611032485961914, "learning_rate": 8.042156088954918e-08, "logits/chosen": -18.5778865814209, "logits/rejected": -17.417221069335938, "logps/chosen": -360.60723876953125, "logps/rejected": -240.4558563232422, "loss": 0.3334, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0797905921936035, "rewards/margins": 1.5925408601760864, "rewards/rejected": 1.4872499704360962, "step": 78860 }, { 
"epoch": 3.6617298853242954, "grad_norm": 1.732567310333252, "learning_rate": 8.039370444310321e-08, "logits/chosen": -18.955055236816406, "logits/rejected": -17.056560516357422, "logps/chosen": -437.6205139160156, "logps/rejected": -241.30923461914062, "loss": 0.2404, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.094665050506592, "rewards/margins": 3.444080352783203, "rewards/rejected": 0.6505847573280334, "step": 78870 }, { "epoch": 3.6621941594317287, "grad_norm": 101.88606262207031, "learning_rate": 8.036584799665722e-08, "logits/chosen": -18.95547866821289, "logits/rejected": -18.374408721923828, "logps/chosen": -312.1493225097656, "logps/rejected": -290.84014892578125, "loss": 0.4415, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1811599731445312, "rewards/margins": 1.120063304901123, "rewards/rejected": 2.061096668243408, "step": 78880 }, { "epoch": 3.6626584335391614, "grad_norm": 12.213144302368164, "learning_rate": 8.033799155021124e-08, "logits/chosen": -19.70865249633789, "logits/rejected": -18.652301788330078, "logps/chosen": -441.8985290527344, "logps/rejected": -385.6724548339844, "loss": 0.3321, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.367846488952637, "rewards/margins": 1.891411542892456, "rewards/rejected": 2.4764351844787598, "step": 78890 }, { "epoch": 3.6631227076465946, "grad_norm": 158.41160583496094, "learning_rate": 8.031013510376525e-08, "logits/chosen": -18.91142463684082, "logits/rejected": -18.313194274902344, "logps/chosen": -450.45001220703125, "logps/rejected": -384.12939453125, "loss": 0.5568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.872537612915039, "rewards/margins": 1.2812938690185547, "rewards/rejected": 2.5912437438964844, "step": 78900 }, { "epoch": 3.6635869817540274, "grad_norm": 69.60514831542969, "learning_rate": 8.028227865731928e-08, "logits/chosen": -19.58736801147461, "logits/rejected": -18.820810317993164, "logps/chosen": -405.03887939453125, 
"logps/rejected": -336.2600402832031, "loss": 0.8066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.272075653076172, "rewards/margins": 0.29039469361305237, "rewards/rejected": 2.9816808700561523, "step": 78910 }, { "epoch": 3.6640512558614606, "grad_norm": 186.1656036376953, "learning_rate": 8.025442221087329e-08, "logits/chosen": -19.746122360229492, "logits/rejected": -19.025978088378906, "logps/chosen": -454.83575439453125, "logps/rejected": -398.1180419921875, "loss": 1.1331, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.453231334686279, "rewards/margins": 0.668540358543396, "rewards/rejected": 3.784691333770752, "step": 78920 }, { "epoch": 3.6645155299688934, "grad_norm": 186.67579650878906, "learning_rate": 8.022656576442732e-08, "logits/chosen": -19.75888442993164, "logits/rejected": -17.6293888092041, "logps/chosen": -422.4224548339844, "logps/rejected": -311.18798828125, "loss": 0.5765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8814971446990967, "rewards/margins": 1.6945488452911377, "rewards/rejected": 2.186948299407959, "step": 78930 }, { "epoch": 3.6649798040763266, "grad_norm": 69.45528411865234, "learning_rate": 8.019870931798132e-08, "logits/chosen": -18.750268936157227, "logits/rejected": -17.952407836914062, "logps/chosen": -418.40582275390625, "logps/rejected": -330.1827087402344, "loss": 0.4475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.207190752029419, "rewards/margins": 1.5710569620132446, "rewards/rejected": 1.6361334323883057, "step": 78940 }, { "epoch": 3.66544407818376, "grad_norm": 76.39217376708984, "learning_rate": 8.017085287153535e-08, "logits/chosen": -18.942893981933594, "logits/rejected": -18.33676528930664, "logps/chosen": -316.45086669921875, "logps/rejected": -253.00027465820312, "loss": 0.5107, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6871676445007324, "rewards/margins": 1.6800724267959595, "rewards/rejected": 2.0070953369140625, "step": 
78950 }, { "epoch": 3.6659083522911926, "grad_norm": 155.8890838623047, "learning_rate": 8.014299642508936e-08, "logits/chosen": -18.73779296875, "logits/rejected": -17.828611373901367, "logps/chosen": -334.6639404296875, "logps/rejected": -298.84930419921875, "loss": 0.8567, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7414894104003906, "rewards/margins": 0.8952228426933289, "rewards/rejected": 1.8462663888931274, "step": 78960 }, { "epoch": 3.666372626398626, "grad_norm": 15.386038780212402, "learning_rate": 8.011513997864339e-08, "logits/chosen": -18.292598724365234, "logits/rejected": -17.997922897338867, "logps/chosen": -315.7839660644531, "logps/rejected": -320.71099853515625, "loss": 0.4707, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8865878582000732, "rewards/margins": 1.0388938188552856, "rewards/rejected": 0.8476940393447876, "step": 78970 }, { "epoch": 3.666836900506059, "grad_norm": 65.48014068603516, "learning_rate": 8.008728353219741e-08, "logits/chosen": -19.933778762817383, "logits/rejected": -18.139827728271484, "logps/chosen": -461.92706298828125, "logps/rejected": -288.4370422363281, "loss": 0.3048, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.90029764175415, "rewards/margins": 3.2476210594177246, "rewards/rejected": 1.6526762247085571, "step": 78980 }, { "epoch": 3.667301174613492, "grad_norm": 1.0288140773773193, "learning_rate": 8.005942708575143e-08, "logits/chosen": -19.41609001159668, "logits/rejected": -18.248079299926758, "logps/chosen": -363.7156982421875, "logps/rejected": -310.8329162597656, "loss": 0.6509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9960103034973145, "rewards/margins": 1.5043327808380127, "rewards/rejected": 2.4916772842407227, "step": 78990 }, { "epoch": 3.6677654487209246, "grad_norm": 53.2911262512207, "learning_rate": 8.003157063930544e-08, "logits/chosen": -19.298200607299805, "logits/rejected": -19.291126251220703, "logps/chosen": 
-358.52947998046875, "logps/rejected": -338.6763000488281, "loss": 0.959, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0053772926330566, "rewards/margins": 0.15089797973632812, "rewards/rejected": 2.8544793128967285, "step": 79000 }, { "epoch": 3.668229722828358, "grad_norm": 37.57121658325195, "learning_rate": 8.000371419285946e-08, "logits/chosen": -19.198055267333984, "logits/rejected": -18.44194221496582, "logps/chosen": -380.6505432128906, "logps/rejected": -317.3935852050781, "loss": 0.6106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0026278495788574, "rewards/margins": 0.6145080327987671, "rewards/rejected": 1.3881198167800903, "step": 79010 }, { "epoch": 3.668693996935791, "grad_norm": 58.40058898925781, "learning_rate": 7.997585774641348e-08, "logits/chosen": -18.636886596679688, "logits/rejected": -18.69878578186035, "logps/chosen": -297.147216796875, "logps/rejected": -265.08135986328125, "loss": 0.8706, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.211230754852295, "rewards/margins": 0.5148375034332275, "rewards/rejected": 1.6963932514190674, "step": 79020 }, { "epoch": 3.669158271043224, "grad_norm": 13.498982429504395, "learning_rate": 7.99480012999675e-08, "logits/chosen": -19.31742286682129, "logits/rejected": -18.13406753540039, "logps/chosen": -392.35357666015625, "logps/rejected": -290.29534912109375, "loss": 0.5242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2007651329040527, "rewards/margins": 1.5009647607803345, "rewards/rejected": 1.6998004913330078, "step": 79030 }, { "epoch": 3.669622545150657, "grad_norm": 42.75019454956055, "learning_rate": 7.992014485352152e-08, "logits/chosen": -17.845521926879883, "logits/rejected": -17.69302749633789, "logps/chosen": -275.11773681640625, "logps/rejected": -294.86492919921875, "loss": 1.1047, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.639192581176758, "rewards/margins": 0.9162322878837585, "rewards/rejected": 
1.7229602336883545, "step": 79040 }, { "epoch": 3.67008681925809, "grad_norm": 12.983989715576172, "learning_rate": 7.989228840707555e-08, "logits/chosen": -19.301158905029297, "logits/rejected": -19.355566024780273, "logps/chosen": -339.56866455078125, "logps/rejected": -354.24945068359375, "loss": 0.4861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5278422832489014, "rewards/margins": 1.5501221418380737, "rewards/rejected": 1.9777202606201172, "step": 79050 }, { "epoch": 3.670551093365523, "grad_norm": 149.64230346679688, "learning_rate": 7.986443196062955e-08, "logits/chosen": -18.97998809814453, "logits/rejected": -18.908571243286133, "logps/chosen": -379.8042907714844, "logps/rejected": -490.63818359375, "loss": 0.875, "rewards/accuracies": 0.5, "rewards/chosen": 4.087154388427734, "rewards/margins": 0.43542805314064026, "rewards/rejected": 3.651726484298706, "step": 79060 }, { "epoch": 3.6710153674729558, "grad_norm": 67.58689880371094, "learning_rate": 7.983657551418356e-08, "logits/chosen": -19.04979705810547, "logits/rejected": -18.91034698486328, "logps/chosen": -404.5726013183594, "logps/rejected": -317.8439636230469, "loss": 0.6133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6786062717437744, "rewards/margins": 0.9107206463813782, "rewards/rejected": 1.767885446548462, "step": 79070 }, { "epoch": 3.671479641580389, "grad_norm": 24.788145065307617, "learning_rate": 7.980871906773759e-08, "logits/chosen": -17.61690330505371, "logits/rejected": -17.22771644592285, "logps/chosen": -238.2860565185547, "logps/rejected": -208.70761108398438, "loss": 0.8025, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.386578917503357, "rewards/margins": 0.844883143901825, "rewards/rejected": 0.5416958332061768, "step": 79080 }, { "epoch": 3.671943915687822, "grad_norm": 1.7154392004013062, "learning_rate": 7.97808626212916e-08, "logits/chosen": -18.796602249145508, "logits/rejected": -17.61174964904785, "logps/chosen": 
-403.35504150390625, "logps/rejected": -252.36746215820312, "loss": 0.2662, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.946805477142334, "rewards/margins": 2.2761638164520264, "rewards/rejected": 1.670641541481018, "step": 79090 }, { "epoch": 3.672408189795255, "grad_norm": 4.433302879333496, "learning_rate": 7.975300617484562e-08, "logits/chosen": -19.069049835205078, "logits/rejected": -17.647777557373047, "logps/chosen": -391.23516845703125, "logps/rejected": -271.04461669921875, "loss": 0.5435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8611629009246826, "rewards/margins": 2.0183849334716797, "rewards/rejected": 1.842777967453003, "step": 79100 }, { "epoch": 3.672872463902688, "grad_norm": 326.84906005859375, "learning_rate": 7.972514972839964e-08, "logits/chosen": -20.2563419342041, "logits/rejected": -19.959522247314453, "logps/chosen": -430.28338623046875, "logps/rejected": -409.9612731933594, "loss": 0.974, "rewards/accuracies": 0.5, "rewards/chosen": 4.4739789962768555, "rewards/margins": 0.3664136528968811, "rewards/rejected": 4.107564926147461, "step": 79110 }, { "epoch": 3.6733367380101214, "grad_norm": 44.56371307373047, "learning_rate": 7.969729328195366e-08, "logits/chosen": -18.175262451171875, "logits/rejected": -17.704715728759766, "logps/chosen": -356.09869384765625, "logps/rejected": -278.6910095214844, "loss": 0.5867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4657936096191406, "rewards/margins": 1.013933777809143, "rewards/rejected": 1.451859951019287, "step": 79120 }, { "epoch": 3.673801012117554, "grad_norm": 114.78108215332031, "learning_rate": 7.966943683550768e-08, "logits/chosen": -19.50094985961914, "logits/rejected": -20.18429183959961, "logps/chosen": -362.934814453125, "logps/rejected": -434.83868408203125, "loss": 1.3466, "rewards/accuracies": 0.5, "rewards/chosen": 4.1209259033203125, "rewards/margins": -0.44279584288597107, "rewards/rejected": 4.563721656799316, "step": 79130 
}, { "epoch": 3.6742652862249874, "grad_norm": 0.008643311448395252, "learning_rate": 7.96415803890617e-08, "logits/chosen": -19.20351219177246, "logits/rejected": -18.508663177490234, "logps/chosen": -326.6781921386719, "logps/rejected": -297.9100036621094, "loss": 1.1317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.732999086380005, "rewards/margins": 1.2969684600830078, "rewards/rejected": 2.436030149459839, "step": 79140 }, { "epoch": 3.67472956033242, "grad_norm": 153.3782958984375, "learning_rate": 7.961372394261571e-08, "logits/chosen": -18.753679275512695, "logits/rejected": -18.254749298095703, "logps/chosen": -316.9385681152344, "logps/rejected": -242.2031707763672, "loss": 0.8366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4915719032287598, "rewards/margins": 0.7683531641960144, "rewards/rejected": 1.7232189178466797, "step": 79150 }, { "epoch": 3.6751938344398534, "grad_norm": 9.151021957397461, "learning_rate": 7.958586749616973e-08, "logits/chosen": -19.396240234375, "logits/rejected": -18.221237182617188, "logps/chosen": -318.17010498046875, "logps/rejected": -252.59451293945312, "loss": 0.2816, "rewards/accuracies": 1.0, "rewards/chosen": 2.397273063659668, "rewards/margins": 1.5082452297210693, "rewards/rejected": 0.8890278935432434, "step": 79160 }, { "epoch": 3.675658108547286, "grad_norm": 62.30397415161133, "learning_rate": 7.955801104972375e-08, "logits/chosen": -18.41695785522461, "logits/rejected": -17.721364974975586, "logps/chosen": -405.1216125488281, "logps/rejected": -316.27239990234375, "loss": 0.2964, "rewards/accuracies": 1.0, "rewards/chosen": 3.542177200317383, "rewards/margins": 1.242891550064087, "rewards/rejected": 2.299285650253296, "step": 79170 }, { "epoch": 3.6761223826547194, "grad_norm": 8.538778305053711, "learning_rate": 7.953015460327778e-08, "logits/chosen": -19.85409164428711, "logits/rejected": -18.864810943603516, "logps/chosen": -445.92047119140625, "logps/rejected": 
-321.2705993652344, "loss": 0.629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.112214088439941, "rewards/margins": 1.3437249660491943, "rewards/rejected": 2.768489360809326, "step": 79180 }, { "epoch": 3.6765866567621526, "grad_norm": 0.7512451410293579, "learning_rate": 7.950229815683179e-08, "logits/chosen": -19.838109970092773, "logits/rejected": -18.154245376586914, "logps/chosen": -464.37774658203125, "logps/rejected": -301.49969482421875, "loss": 0.3632, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.616959095001221, "rewards/margins": 2.7136616706848145, "rewards/rejected": 1.9032968282699585, "step": 79190 }, { "epoch": 3.6770509308695853, "grad_norm": 15.246176719665527, "learning_rate": 7.947444171038582e-08, "logits/chosen": -19.10381507873535, "logits/rejected": -19.126005172729492, "logps/chosen": -392.6201477050781, "logps/rejected": -404.265869140625, "loss": 0.9422, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8304696083068848, "rewards/margins": 0.3074110150337219, "rewards/rejected": 2.5230584144592285, "step": 79200 }, { "epoch": 3.6775152049770186, "grad_norm": 3.895521879196167, "learning_rate": 7.944658526393982e-08, "logits/chosen": -19.926387786865234, "logits/rejected": -19.026355743408203, "logps/chosen": -446.143310546875, "logps/rejected": -419.39923095703125, "loss": 1.0295, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6287364959716797, "rewards/margins": 0.1113305538892746, "rewards/rejected": 3.5174057483673096, "step": 79210 }, { "epoch": 3.6779794790844513, "grad_norm": 83.11571502685547, "learning_rate": 7.941872881749385e-08, "logits/chosen": -19.614688873291016, "logits/rejected": -18.54166030883789, "logps/chosen": -356.2709655761719, "logps/rejected": -222.04159545898438, "loss": 0.6548, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.648749828338623, "rewards/margins": 1.8416095972061157, "rewards/rejected": 1.8071403503417969, "step": 79220 }, { 
"epoch": 3.6784437531918845, "grad_norm": 96.84101104736328, "learning_rate": 7.939087237104786e-08, "logits/chosen": -18.776779174804688, "logits/rejected": -18.276710510253906, "logps/chosen": -264.8491516113281, "logps/rejected": -198.6754608154297, "loss": 0.5473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0896613597869873, "rewards/margins": 0.9533129930496216, "rewards/rejected": 1.1363484859466553, "step": 79230 }, { "epoch": 3.6789080272993173, "grad_norm": 164.95700073242188, "learning_rate": 7.936301592460189e-08, "logits/chosen": -19.50025177001953, "logits/rejected": -18.335147857666016, "logps/chosen": -332.916259765625, "logps/rejected": -286.5283203125, "loss": 0.8554, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.934938669204712, "rewards/margins": 0.32121747732162476, "rewards/rejected": 2.6137211322784424, "step": 79240 }, { "epoch": 3.6793723014067505, "grad_norm": 34.657230377197266, "learning_rate": 7.93351594781559e-08, "logits/chosen": -18.725414276123047, "logits/rejected": -18.679672241210938, "logps/chosen": -350.12109375, "logps/rejected": -299.439697265625, "loss": 0.7668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.871910810470581, "rewards/margins": 0.9529268145561218, "rewards/rejected": 1.918983817100525, "step": 79250 }, { "epoch": 3.6798365755141837, "grad_norm": 5.689621925354004, "learning_rate": 7.93073030317099e-08, "logits/chosen": -19.249614715576172, "logits/rejected": -18.535078048706055, "logps/chosen": -326.6142883300781, "logps/rejected": -248.4450225830078, "loss": 0.6592, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2485828399658203, "rewards/margins": 1.7125635147094727, "rewards/rejected": 1.5360195636749268, "step": 79260 }, { "epoch": 3.6803008496216165, "grad_norm": 60.41046142578125, "learning_rate": 7.927944658526393e-08, "logits/chosen": -18.441688537597656, "logits/rejected": -17.350757598876953, "logps/chosen": -386.27630615234375, 
"logps/rejected": -305.95196533203125, "loss": 0.5326, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.247645139694214, "rewards/margins": 1.8097858428955078, "rewards/rejected": 1.4378594160079956, "step": 79270 }, { "epoch": 3.6807651237290497, "grad_norm": 8.14169979095459, "learning_rate": 7.925159013881795e-08, "logits/chosen": -18.22467041015625, "logits/rejected": -17.787649154663086, "logps/chosen": -320.1067199707031, "logps/rejected": -292.60528564453125, "loss": 0.6542, "rewards/accuracies": 0.5, "rewards/chosen": 2.122457981109619, "rewards/margins": 1.2299550771713257, "rewards/rejected": 0.8925027847290039, "step": 79280 }, { "epoch": 3.6812293978364825, "grad_norm": 1.009603500366211, "learning_rate": 7.922373369237198e-08, "logits/chosen": -19.639812469482422, "logits/rejected": -18.8071346282959, "logps/chosen": -366.27935791015625, "logps/rejected": -368.3546447753906, "loss": 0.8283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4584801197052, "rewards/margins": 1.0370023250579834, "rewards/rejected": 2.4214775562286377, "step": 79290 }, { "epoch": 3.6816936719439157, "grad_norm": 102.99702453613281, "learning_rate": 7.919587724592599e-08, "logits/chosen": -18.787395477294922, "logits/rejected": -18.19741439819336, "logps/chosen": -313.22833251953125, "logps/rejected": -246.73681640625, "loss": 0.876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3859589099884033, "rewards/margins": 0.6548571586608887, "rewards/rejected": 1.7311017513275146, "step": 79300 }, { "epoch": 3.6821579460513485, "grad_norm": 22.024316787719727, "learning_rate": 7.916802079948e-08, "logits/chosen": -18.817333221435547, "logits/rejected": -18.071943283081055, "logps/chosen": -342.86126708984375, "logps/rejected": -229.0359344482422, "loss": 0.7978, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.938044786453247, "rewards/margins": 1.9612184762954712, "rewards/rejected": 0.9768264889717102, "step": 79310 }, { 
"epoch": 3.6826222201587817, "grad_norm": 41.00246047973633, "learning_rate": 7.914016435303402e-08, "logits/chosen": -18.408178329467773, "logits/rejected": -18.71673583984375, "logps/chosen": -281.04876708984375, "logps/rejected": -296.42156982421875, "loss": 0.9177, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.083852767944336, "rewards/margins": 0.024573028087615967, "rewards/rejected": 2.0592799186706543, "step": 79320 }, { "epoch": 3.683086494266215, "grad_norm": 160.49623107910156, "learning_rate": 7.911230790658805e-08, "logits/chosen": -19.15656089782715, "logits/rejected": -18.366840362548828, "logps/chosen": -459.008056640625, "logps/rejected": -325.6507263183594, "loss": 0.982, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8302483558654785, "rewards/margins": 0.6495392918586731, "rewards/rejected": 3.1807093620300293, "step": 79330 }, { "epoch": 3.6835507683736477, "grad_norm": 113.29724884033203, "learning_rate": 7.908445146014206e-08, "logits/chosen": -18.95256996154785, "logits/rejected": -18.34981346130371, "logps/chosen": -327.12054443359375, "logps/rejected": -261.5392150878906, "loss": 0.4886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2028403282165527, "rewards/margins": 1.0351173877716064, "rewards/rejected": 2.1677231788635254, "step": 79340 }, { "epoch": 3.684015042481081, "grad_norm": 78.2276382446289, "learning_rate": 7.905659501369609e-08, "logits/chosen": -20.909984588623047, "logits/rejected": -19.00830078125, "logps/chosen": -543.1603393554688, "logps/rejected": -407.8939208984375, "loss": 0.2208, "rewards/accuracies": 1.0, "rewards/chosen": 5.137493133544922, "rewards/margins": 2.2115955352783203, "rewards/rejected": 2.9258975982666016, "step": 79350 }, { "epoch": 3.684479316588514, "grad_norm": 208.69268798828125, "learning_rate": 7.902873856725009e-08, "logits/chosen": -20.256553649902344, "logits/rejected": -18.848403930664062, "logps/chosen": -409.9676818847656, "logps/rejected": 
-306.2114562988281, "loss": 0.7383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.180245399475098, "rewards/margins": 1.6607481241226196, "rewards/rejected": 2.5194976329803467, "step": 79360 }, { "epoch": 3.684943590695947, "grad_norm": 33.804405212402344, "learning_rate": 7.900088212080412e-08, "logits/chosen": -18.698375701904297, "logits/rejected": -18.605945587158203, "logps/chosen": -377.2809143066406, "logps/rejected": -352.08514404296875, "loss": 0.5271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6479332447052, "rewards/margins": 1.3195600509643555, "rewards/rejected": 2.3283731937408447, "step": 79370 }, { "epoch": 3.6854078648033797, "grad_norm": 11.8799409866333, "learning_rate": 7.897302567435813e-08, "logits/chosen": -19.185840606689453, "logits/rejected": -18.381641387939453, "logps/chosen": -432.2201232910156, "logps/rejected": -315.14019775390625, "loss": 0.7106, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.900412082672119, "rewards/margins": 1.1644834280014038, "rewards/rejected": 2.735928773880005, "step": 79380 }, { "epoch": 3.685872138910813, "grad_norm": 199.0531463623047, "learning_rate": 7.894516922791216e-08, "logits/chosen": -20.01216697692871, "logits/rejected": -20.110502243041992, "logps/chosen": -478.31005859375, "logps/rejected": -440.04022216796875, "loss": 0.8848, "rewards/accuracies": 0.5, "rewards/chosen": 3.8872008323669434, "rewards/margins": 0.7345393896102905, "rewards/rejected": 3.1526615619659424, "step": 79390 }, { "epoch": 3.686336413018246, "grad_norm": 1.4259446859359741, "learning_rate": 7.891731278146618e-08, "logits/chosen": -19.425716400146484, "logits/rejected": -18.76688003540039, "logps/chosen": -500.79571533203125, "logps/rejected": -384.1384582519531, "loss": 0.5314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.2920308113098145, "rewards/margins": 1.192406415939331, "rewards/rejected": 3.0996248722076416, "step": 79400 }, { "epoch": 
3.686800687125679, "grad_norm": 55.317073822021484, "learning_rate": 7.88894563350202e-08, "logits/chosen": -19.057374954223633, "logits/rejected": -18.36306381225586, "logps/chosen": -359.7552490234375, "logps/rejected": -320.8369445800781, "loss": 0.5557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.107027769088745, "rewards/margins": 0.569354236125946, "rewards/rejected": 1.5376735925674438, "step": 79410 }, { "epoch": 3.687264961233112, "grad_norm": 3.530433416366577, "learning_rate": 7.88615998885742e-08, "logits/chosen": -18.89885902404785, "logits/rejected": -17.767017364501953, "logps/chosen": -326.49749755859375, "logps/rejected": -266.4833068847656, "loss": 0.6522, "rewards/accuracies": 0.5, "rewards/chosen": 2.2952263355255127, "rewards/margins": 0.4648050367832184, "rewards/rejected": 1.8304212093353271, "step": 79420 }, { "epoch": 3.6877292353405453, "grad_norm": 42.77751159667969, "learning_rate": 7.883374344212823e-08, "logits/chosen": -18.881092071533203, "logits/rejected": -17.701749801635742, "logps/chosen": -405.81121826171875, "logps/rejected": -334.48077392578125, "loss": 0.3878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.34327507019043, "rewards/margins": 1.6223684549331665, "rewards/rejected": 2.7209067344665527, "step": 79430 }, { "epoch": 3.688193509447978, "grad_norm": 60.23033142089844, "learning_rate": 7.880588699568225e-08, "logits/chosen": -18.359630584716797, "logits/rejected": -17.692691802978516, "logps/chosen": -439.46612548828125, "logps/rejected": -305.23651123046875, "loss": 0.6507, "rewards/accuracies": 0.5, "rewards/chosen": 2.48689866065979, "rewards/margins": 0.8203698992729187, "rewards/rejected": 1.6665289402008057, "step": 79440 }, { "epoch": 3.688657783555411, "grad_norm": 37.345088958740234, "learning_rate": 7.877803054923626e-08, "logits/chosen": -19.130186080932617, "logits/rejected": -17.981477737426758, "logps/chosen": -406.1402893066406, "logps/rejected": -294.6868591308594, 
"loss": 0.5338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.998377561569214, "rewards/margins": 1.727582573890686, "rewards/rejected": 2.270794630050659, "step": 79450 }, { "epoch": 3.689122057662844, "grad_norm": 36.320499420166016, "learning_rate": 7.875017410279029e-08, "logits/chosen": -18.84789276123047, "logits/rejected": -17.872669219970703, "logps/chosen": -540.4607543945312, "logps/rejected": -451.962646484375, "loss": 0.5551, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.180048942565918, "rewards/margins": 2.0962469577789307, "rewards/rejected": 2.0838019847869873, "step": 79460 }, { "epoch": 3.6895863317702773, "grad_norm": 35.552852630615234, "learning_rate": 7.872231765634429e-08, "logits/chosen": -19.261911392211914, "logits/rejected": -18.237079620361328, "logps/chosen": -406.5285339355469, "logps/rejected": -301.46905517578125, "loss": 0.3618, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1045479774475098, "rewards/margins": 1.4625418186187744, "rewards/rejected": 1.642006278038025, "step": 79470 }, { "epoch": 3.69005060587771, "grad_norm": 21.126075744628906, "learning_rate": 7.869446120989832e-08, "logits/chosen": -18.669994354248047, "logits/rejected": -18.090131759643555, "logps/chosen": -327.2933349609375, "logps/rejected": -285.0617980957031, "loss": 0.4864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4003143310546875, "rewards/margins": 1.6261411905288696, "rewards/rejected": 1.7741730213165283, "step": 79480 }, { "epoch": 3.6905148799851433, "grad_norm": 111.05313873291016, "learning_rate": 7.866660476345233e-08, "logits/chosen": -19.34605598449707, "logits/rejected": -18.653676986694336, "logps/chosen": -438.0724182128906, "logps/rejected": -313.72296142578125, "loss": 0.4881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.460974931716919, "rewards/margins": 1.2357779741287231, "rewards/rejected": 2.2251968383789062, "step": 79490 }, { "epoch": 
3.6909791540925765, "grad_norm": 18.682151794433594, "learning_rate": 7.863874831700636e-08, "logits/chosen": -19.728107452392578, "logits/rejected": -18.728450775146484, "logps/chosen": -382.9960632324219, "logps/rejected": -353.75714111328125, "loss": 0.4827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.25724720954895, "rewards/margins": 1.7728922367095947, "rewards/rejected": 1.484355092048645, "step": 79500 }, { "epoch": 3.6914434282000093, "grad_norm": 124.38137817382812, "learning_rate": 7.861089187056038e-08, "logits/chosen": -20.02963638305664, "logits/rejected": -18.888031005859375, "logps/chosen": -510.2308654785156, "logps/rejected": -416.33599853515625, "loss": 0.8648, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8835415840148926, "rewards/margins": 0.4443284869194031, "rewards/rejected": 3.439213275909424, "step": 79510 }, { "epoch": 3.6919077023074425, "grad_norm": 278.1385192871094, "learning_rate": 7.858303542411439e-08, "logits/chosen": -18.957763671875, "logits/rejected": -18.715293884277344, "logps/chosen": -474.3814392089844, "logps/rejected": -433.9356384277344, "loss": 1.1479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9426345825195312, "rewards/margins": 1.2664365768432617, "rewards/rejected": 2.6761977672576904, "step": 79520 }, { "epoch": 3.6923719764148752, "grad_norm": 1.7846875190734863, "learning_rate": 7.85551789776684e-08, "logits/chosen": -19.40585708618164, "logits/rejected": -19.016340255737305, "logps/chosen": -431.9414978027344, "logps/rejected": -353.6875915527344, "loss": 0.4436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5429904460906982, "rewards/margins": 1.7956939935684204, "rewards/rejected": 1.747296690940857, "step": 79530 }, { "epoch": 3.6928362505223085, "grad_norm": 32.60715103149414, "learning_rate": 7.852732253122243e-08, "logits/chosen": -19.281452178955078, "logits/rejected": -18.53672218322754, "logps/chosen": -398.7576599121094, 
"logps/rejected": -278.09228515625, "loss": 0.6074, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.732877016067505, "rewards/margins": 0.5504438281059265, "rewards/rejected": 3.1824326515197754, "step": 79540 }, { "epoch": 3.6933005246297412, "grad_norm": 33.99908447265625, "learning_rate": 7.849946608477645e-08, "logits/chosen": -18.977184295654297, "logits/rejected": -18.383291244506836, "logps/chosen": -346.1292419433594, "logps/rejected": -240.2144012451172, "loss": 1.0342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.144340991973877, "rewards/margins": 1.1229760646820068, "rewards/rejected": 2.021365165710449, "step": 79550 }, { "epoch": 3.6937647987371744, "grad_norm": 56.73384094238281, "learning_rate": 7.847160963833048e-08, "logits/chosen": -19.220998764038086, "logits/rejected": -18.551008224487305, "logps/chosen": -380.0047302246094, "logps/rejected": -251.92056274414062, "loss": 0.971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.912415027618408, "rewards/margins": 1.291071891784668, "rewards/rejected": 1.6213430166244507, "step": 79560 }, { "epoch": 3.6942290728446077, "grad_norm": 152.16944885253906, "learning_rate": 7.844375319188448e-08, "logits/chosen": -18.496206283569336, "logits/rejected": -18.912757873535156, "logps/chosen": -356.18109130859375, "logps/rejected": -360.863525390625, "loss": 0.9583, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.967808246612549, "rewards/margins": 0.29313957691192627, "rewards/rejected": 2.674668788909912, "step": 79570 }, { "epoch": 3.6946933469520404, "grad_norm": 263.9489440917969, "learning_rate": 7.84158967454385e-08, "logits/chosen": -18.988140106201172, "logits/rejected": -18.599674224853516, "logps/chosen": -425.40020751953125, "logps/rejected": -416.89080810546875, "loss": 1.1035, "rewards/accuracies": 0.5, "rewards/chosen": 3.711650848388672, "rewards/margins": 0.387241929769516, "rewards/rejected": 3.324408769607544, "step": 79580 }, { 
"epoch": 3.6951576210594737, "grad_norm": 80.58430480957031, "learning_rate": 7.838804029899252e-08, "logits/chosen": -17.786754608154297, "logits/rejected": -18.431880950927734, "logps/chosen": -350.2012634277344, "logps/rejected": -248.5128936767578, "loss": 1.1349, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.938291311264038, "rewards/margins": 0.6835526823997498, "rewards/rejected": 2.2547385692596436, "step": 79590 }, { "epoch": 3.6956218951669064, "grad_norm": 7.140209197998047, "learning_rate": 7.836018385254655e-08, "logits/chosen": -19.01894760131836, "logits/rejected": -18.18929100036621, "logps/chosen": -388.64508056640625, "logps/rejected": -287.94976806640625, "loss": 0.2875, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.494795799255371, "rewards/margins": 1.628909707069397, "rewards/rejected": 0.8658859133720398, "step": 79600 }, { "epoch": 3.6960861692743396, "grad_norm": 12.382866859436035, "learning_rate": 7.833232740610056e-08, "logits/chosen": -18.560882568359375, "logits/rejected": -18.055316925048828, "logps/chosen": -341.0267028808594, "logps/rejected": -279.8731994628906, "loss": 0.7338, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9130043983459473, "rewards/margins": 0.7122847437858582, "rewards/rejected": 2.2007198333740234, "step": 79610 }, { "epoch": 3.6965504433817724, "grad_norm": 3.5263173580169678, "learning_rate": 7.830447095965459e-08, "logits/chosen": -19.449604034423828, "logits/rejected": -18.344152450561523, "logps/chosen": -360.5947265625, "logps/rejected": -230.8075714111328, "loss": 0.385, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.042691707611084, "rewards/margins": 2.466181993484497, "rewards/rejected": 0.5765097737312317, "step": 79620 }, { "epoch": 3.6970147174892056, "grad_norm": 68.00355529785156, "learning_rate": 7.827661451320859e-08, "logits/chosen": -19.274019241333008, "logits/rejected": -17.844669342041016, "logps/chosen": -353.02728271484375, 
"logps/rejected": -151.2042236328125, "loss": 0.1868, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1441330909729, "rewards/margins": 3.794008255004883, "rewards/rejected": 0.3501249849796295, "step": 79630 }, { "epoch": 3.697478991596639, "grad_norm": 101.41291046142578, "learning_rate": 7.824875806676262e-08, "logits/chosen": -20.62417221069336, "logits/rejected": -20.40737533569336, "logps/chosen": -399.069091796875, "logps/rejected": -344.52734375, "loss": 0.7966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.230419635772705, "rewards/margins": 0.7434395551681519, "rewards/rejected": 3.4869797229766846, "step": 79640 }, { "epoch": 3.6979432657040716, "grad_norm": 203.6280517578125, "learning_rate": 7.822090162031663e-08, "logits/chosen": -18.867244720458984, "logits/rejected": -18.662240982055664, "logps/chosen": -358.50433349609375, "logps/rejected": -346.3291931152344, "loss": 0.708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.904517650604248, "rewards/margins": 0.5719575881958008, "rewards/rejected": 2.3325603008270264, "step": 79650 }, { "epoch": 3.698407539811505, "grad_norm": 32.80546569824219, "learning_rate": 7.819304517387065e-08, "logits/chosen": -18.900564193725586, "logits/rejected": -18.187511444091797, "logps/chosen": -438.406494140625, "logps/rejected": -365.55291748046875, "loss": 0.3381, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6998069286346436, "rewards/margins": 1.5533983707427979, "rewards/rejected": 2.1464085578918457, "step": 79660 }, { "epoch": 3.6988718139189376, "grad_norm": 16.963430404663086, "learning_rate": 7.816518872742467e-08, "logits/chosen": -18.60024642944336, "logits/rejected": -18.190444946289062, "logps/chosen": -371.32745361328125, "logps/rejected": -372.0337829589844, "loss": 1.1261, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9758880138397217, "rewards/margins": 0.7375578284263611, "rewards/rejected": 3.238330125808716, "step": 79670 }, 
{ "epoch": 3.699336088026371, "grad_norm": 4.963045120239258, "learning_rate": 7.813733228097868e-08, "logits/chosen": -18.684755325317383, "logits/rejected": -18.0799503326416, "logps/chosen": -544.7514038085938, "logps/rejected": -413.53814697265625, "loss": 0.4388, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.503665447235107, "rewards/margins": 1.52090322971344, "rewards/rejected": 2.982761859893799, "step": 79680 }, { "epoch": 3.6998003621338036, "grad_norm": 102.97701263427734, "learning_rate": 7.81094758345327e-08, "logits/chosen": -19.545812606811523, "logits/rejected": -18.702116012573242, "logps/chosen": -526.0553588867188, "logps/rejected": -425.09539794921875, "loss": 0.4289, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.883443832397461, "rewards/margins": 1.0485254526138306, "rewards/rejected": 3.834918975830078, "step": 79690 }, { "epoch": 3.700264636241237, "grad_norm": 269.61590576171875, "learning_rate": 7.808161938808672e-08, "logits/chosen": -19.201526641845703, "logits/rejected": -18.707670211791992, "logps/chosen": -420.7980041503906, "logps/rejected": -373.1255798339844, "loss": 1.1878, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.245756149291992, "rewards/margins": -0.21552856266498566, "rewards/rejected": 3.461284637451172, "step": 79700 }, { "epoch": 3.70072891034867, "grad_norm": 16.09191131591797, "learning_rate": 7.805376294164075e-08, "logits/chosen": -19.035030364990234, "logits/rejected": -17.184085845947266, "logps/chosen": -372.3052673339844, "logps/rejected": -241.4918975830078, "loss": 0.3368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6738245487213135, "rewards/margins": 2.4559919834136963, "rewards/rejected": 0.21783223748207092, "step": 79710 }, { "epoch": 3.701193184456103, "grad_norm": 4.924525260925293, "learning_rate": 7.802590649519476e-08, "logits/chosen": -19.001022338867188, "logits/rejected": -17.8049373626709, "logps/chosen": -423.55938720703125, 
"logps/rejected": -350.59552001953125, "loss": 0.5797, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8903236389160156, "rewards/margins": 1.3317493200302124, "rewards/rejected": 2.5585744380950928, "step": 79720 }, { "epoch": 3.701657458563536, "grad_norm": 82.5755844116211, "learning_rate": 7.799805004874878e-08, "logits/chosen": -19.879703521728516, "logits/rejected": -18.657001495361328, "logps/chosen": -356.68426513671875, "logps/rejected": -300.59161376953125, "loss": 0.5598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5786900520324707, "rewards/margins": 0.9289430379867554, "rewards/rejected": 2.649747133255005, "step": 79730 }, { "epoch": 3.702121732670969, "grad_norm": 13.713175773620605, "learning_rate": 7.797019360230279e-08, "logits/chosen": -19.04106903076172, "logits/rejected": -18.652847290039062, "logps/chosen": -394.9888000488281, "logps/rejected": -333.2342224121094, "loss": 0.4097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8342556953430176, "rewards/margins": 1.3919265270233154, "rewards/rejected": 2.442329168319702, "step": 79740 }, { "epoch": 3.702586006778402, "grad_norm": 0.06918759644031525, "learning_rate": 7.794233715585682e-08, "logits/chosen": -18.610641479492188, "logits/rejected": -17.806638717651367, "logps/chosen": -439.62567138671875, "logps/rejected": -304.6480712890625, "loss": 0.5804, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.752450942993164, "rewards/margins": 1.561727523803711, "rewards/rejected": 1.1907234191894531, "step": 79750 }, { "epoch": 3.7030502808858348, "grad_norm": null, "learning_rate": 7.791726635405544e-08, "logits/chosen": -18.736873626708984, "logits/rejected": -18.545833587646484, "logps/chosen": -299.64703369140625, "logps/rejected": -342.5162353515625, "loss": 1.1371, "rewards/accuracies": 0.5, "rewards/chosen": 2.006995677947998, "rewards/margins": -0.15355277061462402, "rewards/rejected": 2.1605489253997803, "step": 79760 }, { "epoch": 
3.703514554993268, "grad_norm": 194.49578857421875, "learning_rate": 7.788940990760944e-08, "logits/chosen": -18.99220848083496, "logits/rejected": -18.14852523803711, "logps/chosen": -315.9915466308594, "logps/rejected": -301.67584228515625, "loss": 0.8754, "rewards/accuracies": 0.5, "rewards/chosen": 2.8111674785614014, "rewards/margins": 1.567720651626587, "rewards/rejected": 1.243447184562683, "step": 79770 }, { "epoch": 3.703978829100701, "grad_norm": 167.72235107421875, "learning_rate": 7.786155346116347e-08, "logits/chosen": -18.450687408447266, "logits/rejected": -17.781505584716797, "logps/chosen": -431.02874755859375, "logps/rejected": -386.4737243652344, "loss": 0.6061, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.026619911193848, "rewards/margins": 1.5334365367889404, "rewards/rejected": 2.4931833744049072, "step": 79780 }, { "epoch": 3.704443103208134, "grad_norm": 46.549468994140625, "learning_rate": 7.783369701471748e-08, "logits/chosen": -19.564517974853516, "logits/rejected": -18.601295471191406, "logps/chosen": -330.1167907714844, "logps/rejected": -242.7604522705078, "loss": 0.4318, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0861656665802, "rewards/margins": 1.5083339214324951, "rewards/rejected": 1.5778316259384155, "step": 79790 }, { "epoch": 3.704907377315567, "grad_norm": 39.8072509765625, "learning_rate": 7.780584056827151e-08, "logits/chosen": -19.7285213470459, "logits/rejected": -18.64249038696289, "logps/chosen": -360.4390869140625, "logps/rejected": -243.4883270263672, "loss": 0.4141, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.037522792816162, "rewards/margins": 2.3165860176086426, "rewards/rejected": 0.7209369540214539, "step": 79800 }, { "epoch": 3.7053716514230004, "grad_norm": 33.08473205566406, "learning_rate": 7.777798412182552e-08, "logits/chosen": -19.096302032470703, "logits/rejected": -19.45956802368164, "logps/chosen": -477.212158203125, "logps/rejected": 
-422.1541442871094, "loss": 0.8411, "rewards/accuracies": 0.5, "rewards/chosen": 3.9867336750030518, "rewards/margins": 0.3266112506389618, "rewards/rejected": 3.6601219177246094, "step": 79810 }, { "epoch": 3.705835925530433, "grad_norm": 60.37605667114258, "learning_rate": 7.775012767537954e-08, "logits/chosen": -19.24728775024414, "logits/rejected": -18.44388198852539, "logps/chosen": -267.6622009277344, "logps/rejected": -185.4906005859375, "loss": 0.8131, "rewards/accuracies": 0.5, "rewards/chosen": 1.669435739517212, "rewards/margins": 0.725477397441864, "rewards/rejected": 0.9439584612846375, "step": 79820 }, { "epoch": 3.706300199637866, "grad_norm": 175.59286499023438, "learning_rate": 7.772227122893355e-08, "logits/chosen": -19.458309173583984, "logits/rejected": -19.078899383544922, "logps/chosen": -412.7230529785156, "logps/rejected": -418.2840270996094, "loss": 1.0283, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.534575939178467, "rewards/margins": -0.002846088958904147, "rewards/rejected": 3.5374221801757812, "step": 79830 }, { "epoch": 3.706764473745299, "grad_norm": 1.257524847984314, "learning_rate": 7.769441478248758e-08, "logits/chosen": -19.310815811157227, "logits/rejected": -17.452865600585938, "logps/chosen": -347.57415771484375, "logps/rejected": -170.482666015625, "loss": 0.1683, "rewards/accuracies": 1.0, "rewards/chosen": 4.7466044425964355, "rewards/margins": 3.6356310844421387, "rewards/rejected": 1.1109734773635864, "step": 79840 }, { "epoch": 3.7072287478527324, "grad_norm": 52.583656311035156, "learning_rate": 7.76665583360416e-08, "logits/chosen": -20.095308303833008, "logits/rejected": -19.47956085205078, "logps/chosen": -479.55743408203125, "logps/rejected": -367.47479248046875, "loss": 0.483, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.229727745056152, "rewards/margins": 1.9824771881103516, "rewards/rejected": 3.247251033782959, "step": 79850 }, { "epoch": 3.707693021960165, "grad_norm": 
79.67198181152344, "learning_rate": 7.763870188959562e-08, "logits/chosen": -19.477123260498047, "logits/rejected": -19.2343692779541, "logps/chosen": -427.44866943359375, "logps/rejected": -366.5780334472656, "loss": 1.036, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.040371894836426, "rewards/margins": 0.5160449743270874, "rewards/rejected": 3.524326801300049, "step": 79860 }, { "epoch": 3.7081572960675984, "grad_norm": 210.79367065429688, "learning_rate": 7.761084544314962e-08, "logits/chosen": -17.793460845947266, "logits/rejected": -17.771366119384766, "logps/chosen": -357.87200927734375, "logps/rejected": -355.1518859863281, "loss": 1.2772, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3398401737213135, "rewards/margins": -0.68136066198349, "rewards/rejected": 3.0212008953094482, "step": 79870 }, { "epoch": 3.7086215701750316, "grad_norm": 36.62587356567383, "learning_rate": 7.758298899670365e-08, "logits/chosen": -20.129179000854492, "logits/rejected": -19.833696365356445, "logps/chosen": -449.6236267089844, "logps/rejected": -410.405517578125, "loss": 0.3971, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.9732842445373535, "rewards/margins": 1.094949722290039, "rewards/rejected": 3.8783340454101562, "step": 79880 }, { "epoch": 3.7090858442824644, "grad_norm": 22.785585403442383, "learning_rate": 7.755513255025767e-08, "logits/chosen": -19.043832778930664, "logits/rejected": -18.190820693969727, "logps/chosen": -499.94158935546875, "logps/rejected": -467.0162658691406, "loss": 0.6248, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.984492301940918, "rewards/margins": 2.002401828765869, "rewards/rejected": 2.982090473175049, "step": 79890 }, { "epoch": 3.709550118389897, "grad_norm": 11.052196502685547, "learning_rate": 7.75272761038117e-08, "logits/chosen": -18.364904403686523, "logits/rejected": -17.793149948120117, "logps/chosen": -268.33282470703125, "logps/rejected": -240.4998779296875, "loss": 
1.3663, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1087191104888916, "rewards/margins": 0.7395505309104919, "rewards/rejected": 1.3691685199737549, "step": 79900 }, { "epoch": 3.7100143924973303, "grad_norm": 41.412879943847656, "learning_rate": 7.749941965736571e-08, "logits/chosen": -19.176868438720703, "logits/rejected": -18.779293060302734, "logps/chosen": -418.10845947265625, "logps/rejected": -367.435302734375, "loss": 0.7022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.691620349884033, "rewards/margins": 1.1315174102783203, "rewards/rejected": 2.5601024627685547, "step": 79910 }, { "epoch": 3.7104786666047636, "grad_norm": 3.3802225589752197, "learning_rate": 7.747156321091971e-08, "logits/chosen": -20.169166564941406, "logits/rejected": -19.74330711364746, "logps/chosen": -337.50177001953125, "logps/rejected": -335.3215637207031, "loss": 0.6744, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3125858306884766, "rewards/margins": 1.4096853733062744, "rewards/rejected": 1.9029000997543335, "step": 79920 }, { "epoch": 3.7109429407121963, "grad_norm": 0.07730582356452942, "learning_rate": 7.744370676447374e-08, "logits/chosen": -19.03472137451172, "logits/rejected": -18.14975357055664, "logps/chosen": -335.80035400390625, "logps/rejected": -259.80328369140625, "loss": 0.5362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0621674060821533, "rewards/margins": 1.7726471424102783, "rewards/rejected": 1.2895203828811646, "step": 79930 }, { "epoch": 3.7114072148196295, "grad_norm": 161.45266723632812, "learning_rate": 7.741585031802775e-08, "logits/chosen": -20.5819149017334, "logits/rejected": -19.744606018066406, "logps/chosen": -388.6631774902344, "logps/rejected": -278.57794189453125, "loss": 0.9119, "rewards/accuracies": 0.5, "rewards/chosen": 3.203303098678589, "rewards/margins": 0.5486569404602051, "rewards/rejected": 2.654646396636963, "step": 79940 }, { "epoch": 3.7118714889270628, "grad_norm": 
41.143096923828125, "learning_rate": 7.738799387158178e-08, "logits/chosen": -18.722009658813477, "logits/rejected": -18.270122528076172, "logps/chosen": -451.32696533203125, "logps/rejected": -375.4990234375, "loss": 0.5661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.002485752105713, "rewards/margins": 1.5373517274856567, "rewards/rejected": 1.4651339054107666, "step": 79950 }, { "epoch": 3.7123357630344955, "grad_norm": 68.15341186523438, "learning_rate": 7.73601374251358e-08, "logits/chosen": -18.54060935974121, "logits/rejected": -17.388513565063477, "logps/chosen": -541.0950927734375, "logps/rejected": -361.6543273925781, "loss": 0.3454, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.195219993591309, "rewards/margins": 3.2379183769226074, "rewards/rejected": 1.9573014974594116, "step": 79960 }, { "epoch": 3.7128000371419287, "grad_norm": 14.101422309875488, "learning_rate": 7.733228097868982e-08, "logits/chosen": -18.299556732177734, "logits/rejected": -18.292476654052734, "logps/chosen": -313.142822265625, "logps/rejected": -329.9859313964844, "loss": 1.1128, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.187788486480713, "rewards/margins": 0.05962109565734863, "rewards/rejected": 2.1281676292419434, "step": 79970 }, { "epoch": 3.7132643112493615, "grad_norm": 37.8192024230957, "learning_rate": 7.730442453224382e-08, "logits/chosen": -18.996631622314453, "logits/rejected": -17.68010902404785, "logps/chosen": -343.93328857421875, "logps/rejected": -212.68252563476562, "loss": 0.1838, "rewards/accuracies": 1.0, "rewards/chosen": 2.786548376083374, "rewards/margins": 2.170837163925171, "rewards/rejected": 0.6157113313674927, "step": 79980 }, { "epoch": 3.7137285853567947, "grad_norm": 179.81434631347656, "learning_rate": 7.727656808579785e-08, "logits/chosen": -19.181381225585938, "logits/rejected": -18.233673095703125, "logps/chosen": -367.0752258300781, "logps/rejected": -296.85504150390625, "loss": 0.4335, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8435099124908447, "rewards/margins": 1.9609479904174805, "rewards/rejected": 0.8825620412826538, "step": 79990 }, { "epoch": 3.7141928594642275, "grad_norm": 30.024768829345703, "learning_rate": 7.724871163935187e-08, "logits/chosen": -18.404165267944336, "logits/rejected": -17.62653160095215, "logps/chosen": -306.834716796875, "logps/rejected": -205.2470245361328, "loss": 0.5167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0222716331481934, "rewards/margins": 1.4205667972564697, "rewards/rejected": 0.6017052531242371, "step": 80000 }, { "epoch": 3.7146571335716607, "grad_norm": 86.58438873291016, "learning_rate": 7.72208551929059e-08, "logits/chosen": -19.827009201049805, "logits/rejected": -19.21384048461914, "logps/chosen": -420.7603454589844, "logps/rejected": -337.6572570800781, "loss": 1.4567, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4617984294891357, "rewards/margins": 0.2627226412296295, "rewards/rejected": 3.19907546043396, "step": 80010 }, { "epoch": 3.715121407679094, "grad_norm": 12.350931167602539, "learning_rate": 7.719299874645991e-08, "logits/chosen": -19.771190643310547, "logits/rejected": -18.90361785888672, "logps/chosen": -284.122314453125, "logps/rejected": -238.837158203125, "loss": 0.5909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8488556146621704, "rewards/margins": 0.7233521342277527, "rewards/rejected": 1.125503420829773, "step": 80020 }, { "epoch": 3.7155856817865267, "grad_norm": 20.861600875854492, "learning_rate": 7.716514230001392e-08, "logits/chosen": -19.2392635345459, "logits/rejected": -18.944047927856445, "logps/chosen": -291.5246887207031, "logps/rejected": -274.50836181640625, "loss": 0.4692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4921317100524902, "rewards/margins": 1.4768444299697876, "rewards/rejected": 1.0152876377105713, "step": 80030 }, { "epoch": 3.71604995589396, "grad_norm": 
139.72006225585938, "learning_rate": 7.713728585356794e-08, "logits/chosen": -19.624534606933594, "logits/rejected": -18.024938583374023, "logps/chosen": -370.3970642089844, "logps/rejected": -236.984619140625, "loss": 0.2388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4860846996307373, "rewards/margins": 2.4318583011627197, "rewards/rejected": 1.0542266368865967, "step": 80040 }, { "epoch": 3.7165142300013927, "grad_norm": 103.16248321533203, "learning_rate": 7.710942940712197e-08, "logits/chosen": -18.94993019104004, "logits/rejected": -18.419147491455078, "logps/chosen": -362.88006591796875, "logps/rejected": -264.6207580566406, "loss": 0.5739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7913730144500732, "rewards/margins": 1.5249862670898438, "rewards/rejected": 1.2663867473602295, "step": 80050 }, { "epoch": 3.716978504108826, "grad_norm": 48.07693099975586, "learning_rate": 7.708157296067598e-08, "logits/chosen": -20.006303787231445, "logits/rejected": -18.551233291625977, "logps/chosen": -450.1678161621094, "logps/rejected": -320.5057373046875, "loss": 0.2886, "rewards/accuracies": 1.0, "rewards/chosen": 4.178220272064209, "rewards/margins": 1.78398859500885, "rewards/rejected": 2.3942315578460693, "step": 80060 }, { "epoch": 3.7174427782162587, "grad_norm": 68.69119262695312, "learning_rate": 7.705371651423001e-08, "logits/chosen": -17.67819595336914, "logits/rejected": -18.254175186157227, "logps/chosen": -297.0690002441406, "logps/rejected": -430.3876037597656, "loss": 1.7152, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.8684349060058594, "rewards/margins": -0.41081342101097107, "rewards/rejected": 3.279247999191284, "step": 80070 }, { "epoch": 3.717907052323692, "grad_norm": 159.82017517089844, "learning_rate": 7.702586006778401e-08, "logits/chosen": -18.9824275970459, "logits/rejected": -18.779048919677734, "logps/chosen": -273.1308288574219, "logps/rejected": -248.1937255859375, "loss": 1.2037, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.054100513458252, "rewards/margins": 0.5900176167488098, "rewards/rejected": 2.464083194732666, "step": 80080 }, { "epoch": 3.718371326431125, "grad_norm": 127.29637145996094, "learning_rate": 7.699800362133804e-08, "logits/chosen": -20.218713760375977, "logits/rejected": -19.183879852294922, "logps/chosen": -356.82781982421875, "logps/rejected": -295.132080078125, "loss": 1.0336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.059473991394043, "rewards/margins": 0.43367043137550354, "rewards/rejected": 3.625802993774414, "step": 80090 }, { "epoch": 3.718835600538558, "grad_norm": 44.50246047973633, "learning_rate": 7.697014717489205e-08, "logits/chosen": -20.311845779418945, "logits/rejected": -19.048892974853516, "logps/chosen": -325.5381164550781, "logps/rejected": -297.251708984375, "loss": 0.9553, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.737492322921753, "rewards/margins": 0.9525747299194336, "rewards/rejected": 1.7849174737930298, "step": 80100 }, { "epoch": 3.719299874645991, "grad_norm": 119.2596206665039, "learning_rate": 7.694229072844607e-08, "logits/chosen": -19.094532012939453, "logits/rejected": -18.835601806640625, "logps/chosen": -367.45599365234375, "logps/rejected": -317.3043212890625, "loss": 0.6087, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9237313270568848, "rewards/margins": 0.722917377948761, "rewards/rejected": 2.2008137702941895, "step": 80110 }, { "epoch": 3.719764148753424, "grad_norm": 271.75054931640625, "learning_rate": 7.69144342820001e-08, "logits/chosen": -19.483257293701172, "logits/rejected": -18.950916290283203, "logps/chosen": -405.42584228515625, "logps/rejected": -348.0904235839844, "loss": 0.7056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6417438983917236, "rewards/margins": 0.9731674194335938, "rewards/rejected": 2.668576717376709, "step": 80120 }, { "epoch": 3.720228422860857, "grad_norm": 
213.642822265625, "learning_rate": 7.68865778355541e-08, "logits/chosen": -18.14404296875, "logits/rejected": -17.414913177490234, "logps/chosen": -331.1921081542969, "logps/rejected": -246.94699096679688, "loss": 0.6007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.817021131515503, "rewards/margins": 0.8543431162834167, "rewards/rejected": 0.9626784324645996, "step": 80130 }, { "epoch": 3.72069269696829, "grad_norm": 211.13169860839844, "learning_rate": 7.685872138910812e-08, "logits/chosen": -19.010581970214844, "logits/rejected": -19.845476150512695, "logps/chosen": -255.33914184570312, "logps/rejected": -290.2727966308594, "loss": 1.303, "rewards/accuracies": 0.5, "rewards/chosen": 2.459719181060791, "rewards/margins": -0.584535539150238, "rewards/rejected": 3.044254779815674, "step": 80140 }, { "epoch": 3.721156971075723, "grad_norm": 22.134061813354492, "learning_rate": 7.683086494266214e-08, "logits/chosen": -18.939861297607422, "logits/rejected": -18.56740379333496, "logps/chosen": -217.9744415283203, "logps/rejected": -210.30630493164062, "loss": 1.1981, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.6358663439750671, "rewards/margins": -0.02009117603302002, "rewards/rejected": 0.6559574604034424, "step": 80150 }, { "epoch": 3.7216212451831563, "grad_norm": 197.63941955566406, "learning_rate": 7.680300849621617e-08, "logits/chosen": -19.597280502319336, "logits/rejected": -19.1873836517334, "logps/chosen": -377.40594482421875, "logps/rejected": -380.366943359375, "loss": 1.0372, "rewards/accuracies": 0.5, "rewards/chosen": 4.181873798370361, "rewards/margins": 0.667035698890686, "rewards/rejected": 3.5148377418518066, "step": 80160 }, { "epoch": 3.722085519290589, "grad_norm": 31.306602478027344, "learning_rate": 7.677515204977018e-08, "logits/chosen": -18.097328186035156, "logits/rejected": -17.980480194091797, "logps/chosen": -382.2121276855469, "logps/rejected": -326.25054931640625, "loss": 0.6881, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.9438843727111816, "rewards/margins": 0.46249979734420776, "rewards/rejected": 2.481384754180908, "step": 80170 }, { "epoch": 3.7225497933980223, "grad_norm": 8.639196395874023, "learning_rate": 7.674729560332421e-08, "logits/chosen": -20.1732177734375, "logits/rejected": -20.694604873657227, "logps/chosen": -361.4267883300781, "logps/rejected": -364.60101318359375, "loss": 0.9775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9030022621154785, "rewards/margins": 0.25836068391799927, "rewards/rejected": 2.644641399383545, "step": 80180 }, { "epoch": 3.7230140675054555, "grad_norm": 131.44093322753906, "learning_rate": 7.671943915687821e-08, "logits/chosen": -19.091571807861328, "logits/rejected": -17.949390411376953, "logps/chosen": -357.6022644042969, "logps/rejected": -230.33859252929688, "loss": 0.5325, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3027846813201904, "rewards/margins": 1.1590383052825928, "rewards/rejected": 1.1437463760375977, "step": 80190 }, { "epoch": 3.7234783416128883, "grad_norm": 35.55690383911133, "learning_rate": 7.669158271043224e-08, "logits/chosen": -19.749130249023438, "logits/rejected": -18.73070526123047, "logps/chosen": -388.3750305175781, "logps/rejected": -308.00115966796875, "loss": 0.4849, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.703533172607422, "rewards/margins": 1.0897960662841797, "rewards/rejected": 2.613737106323242, "step": 80200 }, { "epoch": 3.723942615720321, "grad_norm": 2.8206820487976074, "learning_rate": 7.666372626398625e-08, "logits/chosen": -19.243946075439453, "logits/rejected": -18.26187515258789, "logps/chosen": -559.6276245117188, "logps/rejected": -380.58953857421875, "loss": 0.294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.643138885498047, "rewards/margins": 2.9623637199401855, "rewards/rejected": 2.6807751655578613, "step": 80210 }, { "epoch": 3.7244068898277543, "grad_norm": 57.47500991821289, 
"learning_rate": 7.663586981754028e-08, "logits/chosen": -19.0944881439209, "logits/rejected": -18.71457862854004, "logps/chosen": -359.4554748535156, "logps/rejected": -292.744873046875, "loss": 0.63, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1140804290771484, "rewards/margins": 1.0922409296035767, "rewards/rejected": 2.021839141845703, "step": 80220 }, { "epoch": 3.7248711639351875, "grad_norm": 87.91526794433594, "learning_rate": 7.66080133710943e-08, "logits/chosen": -19.100223541259766, "logits/rejected": -18.5672550201416, "logps/chosen": -406.9883117675781, "logps/rejected": -392.18231201171875, "loss": 0.5392, "rewards/accuracies": 0.5, "rewards/chosen": 3.4017727375030518, "rewards/margins": 0.9053341150283813, "rewards/rejected": 2.49643874168396, "step": 80230 }, { "epoch": 3.7253354380426202, "grad_norm": 36.13347625732422, "learning_rate": 7.658015692464831e-08, "logits/chosen": -19.229101181030273, "logits/rejected": -18.85591697692871, "logps/chosen": -397.57000732421875, "logps/rejected": -428.60455322265625, "loss": 1.1723, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2514374256134033, "rewards/margins": 0.10986447334289551, "rewards/rejected": 3.1415724754333496, "step": 80240 }, { "epoch": 3.7257997121500535, "grad_norm": 54.605323791503906, "learning_rate": 7.655230047820232e-08, "logits/chosen": -18.617019653320312, "logits/rejected": -18.540428161621094, "logps/chosen": -281.55023193359375, "logps/rejected": -241.2694091796875, "loss": 0.4486, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7896106243133545, "rewards/margins": 1.3529404401779175, "rewards/rejected": 1.4366700649261475, "step": 80250 }, { "epoch": 3.7262639862574867, "grad_norm": 20.042240142822266, "learning_rate": 7.652444403175635e-08, "logits/chosen": -18.04880142211914, "logits/rejected": -17.393112182617188, "logps/chosen": -387.00018310546875, "logps/rejected": -272.2597961425781, "loss": 0.6822, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.8218960762023926, "rewards/margins": 0.9648901224136353, "rewards/rejected": 1.857006311416626, "step": 80260 }, { "epoch": 3.7267282603649194, "grad_norm": 128.01937866210938, "learning_rate": 7.649658758531037e-08, "logits/chosen": -19.500865936279297, "logits/rejected": -18.329130172729492, "logps/chosen": -413.76385498046875, "logps/rejected": -390.8173828125, "loss": 0.2059, "rewards/accuracies": 1.0, "rewards/chosen": 3.4451732635498047, "rewards/margins": 1.951780915260315, "rewards/rejected": 1.4933923482894897, "step": 80270 }, { "epoch": 3.727192534472352, "grad_norm": 238.98208618164062, "learning_rate": 7.646873113886439e-08, "logits/chosen": -19.65506935119629, "logits/rejected": -18.688350677490234, "logps/chosen": -454.64007568359375, "logps/rejected": -418.0555725097656, "loss": 1.2162, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3418374061584473, "rewards/margins": 0.3252384662628174, "rewards/rejected": 3.01659893989563, "step": 80280 }, { "epoch": 3.7276568085797854, "grad_norm": 218.38121032714844, "learning_rate": 7.64408746924184e-08, "logits/chosen": -18.298553466796875, "logits/rejected": -17.857067108154297, "logps/chosen": -405.59942626953125, "logps/rejected": -287.83599853515625, "loss": 1.0548, "rewards/accuracies": 0.5, "rewards/chosen": 3.4955825805664062, "rewards/margins": 0.9870842695236206, "rewards/rejected": 2.5084989070892334, "step": 80290 }, { "epoch": 3.7281210826872186, "grad_norm": 83.60043334960938, "learning_rate": 7.641301824597241e-08, "logits/chosen": -19.042327880859375, "logits/rejected": -18.81039810180664, "logps/chosen": -302.315185546875, "logps/rejected": -241.3612823486328, "loss": 0.7212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5737335681915283, "rewards/margins": 0.6457588076591492, "rewards/rejected": 0.9279748797416687, "step": 80300 }, { "epoch": 3.7285853567946514, "grad_norm": 262.7124328613281, "learning_rate": 
7.638516179952644e-08, "logits/chosen": -18.87456703186035, "logits/rejected": -18.86232566833496, "logps/chosen": -345.4625244140625, "logps/rejected": -392.2843017578125, "loss": 0.7722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.323516845703125, "rewards/margins": 0.23086532950401306, "rewards/rejected": 2.092651605606079, "step": 80310 }, { "epoch": 3.7290496309020846, "grad_norm": 35.56182861328125, "learning_rate": 7.635730535308045e-08, "logits/chosen": -18.981403350830078, "logits/rejected": -17.91879653930664, "logps/chosen": -463.58526611328125, "logps/rejected": -313.40911865234375, "loss": 0.4719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0644946098327637, "rewards/margins": 1.5086463689804077, "rewards/rejected": 1.5558481216430664, "step": 80320 }, { "epoch": 3.729513905009518, "grad_norm": 42.26586151123047, "learning_rate": 7.632944890663448e-08, "logits/chosen": -19.52411460876465, "logits/rejected": -18.62028694152832, "logps/chosen": -442.6119689941406, "logps/rejected": -353.77984619140625, "loss": 0.3734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.455591917037964, "rewards/margins": 1.5358213186264038, "rewards/rejected": 1.9197708368301392, "step": 80330 }, { "epoch": 3.7299781791169506, "grad_norm": 163.9945831298828, "learning_rate": 7.630159246018848e-08, "logits/chosen": -20.616098403930664, "logits/rejected": -19.500890731811523, "logps/chosen": -399.71661376953125, "logps/rejected": -402.0232849121094, "loss": 0.9017, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3254151344299316, "rewards/margins": 0.5400999784469604, "rewards/rejected": 2.7853147983551025, "step": 80340 }, { "epoch": 3.730442453224384, "grad_norm": 193.08151245117188, "learning_rate": 7.627373601374251e-08, "logits/chosen": -19.237163543701172, "logits/rejected": -18.417104721069336, "logps/chosen": -444.2068786621094, "logps/rejected": -434.319580078125, "loss": 0.5009, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 4.1652913093566895, "rewards/margins": 1.371285319328308, "rewards/rejected": 2.794005870819092, "step": 80350 }, { "epoch": 3.7309067273318166, "grad_norm": 42.006473541259766, "learning_rate": 7.624587956729652e-08, "logits/chosen": -19.367307662963867, "logits/rejected": -18.825748443603516, "logps/chosen": -466.37530517578125, "logps/rejected": -408.6501159667969, "loss": 0.6361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5128142833709717, "rewards/margins": 0.6245058178901672, "rewards/rejected": 2.888308525085449, "step": 80360 }, { "epoch": 3.73137100143925, "grad_norm": 63.10228729248047, "learning_rate": 7.621802312085055e-08, "logits/chosen": -19.913114547729492, "logits/rejected": -19.26374626159668, "logps/chosen": -377.0909118652344, "logps/rejected": -296.8323059082031, "loss": 0.8303, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.862973690032959, "rewards/margins": 0.8950302004814148, "rewards/rejected": 2.9679436683654785, "step": 80370 }, { "epoch": 3.7318352755466826, "grad_norm": 27.516115188598633, "learning_rate": 7.619016667440456e-08, "logits/chosen": -19.197860717773438, "logits/rejected": -18.558292388916016, "logps/chosen": -438.01507568359375, "logps/rejected": -325.61737060546875, "loss": 0.6321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.549694061279297, "rewards/margins": 1.1704400777816772, "rewards/rejected": 2.37925386428833, "step": 80380 }, { "epoch": 3.732299549654116, "grad_norm": 6.852163791656494, "learning_rate": 7.616231022795859e-08, "logits/chosen": -19.16490364074707, "logits/rejected": -16.860971450805664, "logps/chosen": -508.90087890625, "logps/rejected": -270.43536376953125, "loss": 0.5456, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7512588500976562, "rewards/margins": 2.428722381591797, "rewards/rejected": 1.3225364685058594, "step": 80390 }, { "epoch": 3.732763823761549, "grad_norm": 172.40487670898438, 
"learning_rate": 7.61344537815126e-08, "logits/chosen": -18.577716827392578, "logits/rejected": -19.088918685913086, "logps/chosen": -305.8524169921875, "logps/rejected": -379.24859619140625, "loss": 2.1888, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.357799530029297, "rewards/margins": -1.326183557510376, "rewards/rejected": 3.6839828491210938, "step": 80400 }, { "epoch": 3.733228097868982, "grad_norm": 128.8633575439453, "learning_rate": 7.610659733506662e-08, "logits/chosen": -19.184818267822266, "logits/rejected": -18.490999221801758, "logps/chosen": -405.9876403808594, "logps/rejected": -297.33709716796875, "loss": 0.4899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.229583263397217, "rewards/margins": 1.8777501583099365, "rewards/rejected": 2.351832866668701, "step": 80410 }, { "epoch": 3.733692371976415, "grad_norm": 18.398874282836914, "learning_rate": 7.607874088862064e-08, "logits/chosen": -19.00554847717285, "logits/rejected": -18.50956153869629, "logps/chosen": -479.4895935058594, "logps/rejected": -400.5852966308594, "loss": 0.5564, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8475277423858643, "rewards/margins": 1.7284578084945679, "rewards/rejected": 2.119070291519165, "step": 80420 }, { "epoch": 3.734156646083848, "grad_norm": 19.948144912719727, "learning_rate": 7.605088444217466e-08, "logits/chosen": -18.84633445739746, "logits/rejected": -18.173175811767578, "logps/chosen": -289.20361328125, "logps/rejected": -248.0714874267578, "loss": 0.4226, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4263033866882324, "rewards/margins": 1.0333127975463867, "rewards/rejected": 1.3929908275604248, "step": 80430 }, { "epoch": 3.734620920191281, "grad_norm": 78.28591918945312, "learning_rate": 7.602302799572868e-08, "logits/chosen": -18.51506805419922, "logits/rejected": -17.258769989013672, "logps/chosen": -345.957275390625, "logps/rejected": -216.77978515625, "loss": 0.2501, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.086343765258789, "rewards/margins": 2.562098979949951, "rewards/rejected": 0.5242449641227722, "step": 80440 }, { "epoch": 3.7350851942987138, "grad_norm": 12.995226860046387, "learning_rate": 7.599517154928269e-08, "logits/chosen": -18.855016708374023, "logits/rejected": -17.983068466186523, "logps/chosen": -443.93731689453125, "logps/rejected": -361.394775390625, "loss": 0.4218, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1929233074188232, "rewards/margins": 1.3572105169296265, "rewards/rejected": 1.8357126712799072, "step": 80450 }, { "epoch": 3.735549468406147, "grad_norm": 25.36522674560547, "learning_rate": 7.596731510283671e-08, "logits/chosen": -19.100793838500977, "logits/rejected": -18.57035255432129, "logps/chosen": -346.49249267578125, "logps/rejected": -303.8271179199219, "loss": 0.3507, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.243365526199341, "rewards/margins": 1.4932838678359985, "rewards/rejected": 1.7500814199447632, "step": 80460 }, { "epoch": 3.73601374251358, "grad_norm": 4.583748817443848, "learning_rate": 7.593945865639074e-08, "logits/chosen": -18.313579559326172, "logits/rejected": -17.866670608520508, "logps/chosen": -260.88653564453125, "logps/rejected": -304.25115966796875, "loss": 1.3735, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8978865146636963, "rewards/margins": 0.9386280179023743, "rewards/rejected": 0.9592584371566772, "step": 80470 }, { "epoch": 3.736478016621013, "grad_norm": 1.1334303617477417, "learning_rate": 7.591160220994475e-08, "logits/chosen": -19.216289520263672, "logits/rejected": -18.973419189453125, "logps/chosen": -465.0982971191406, "logps/rejected": -428.24346923828125, "loss": 0.873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.596139430999756, "rewards/margins": 0.757688045501709, "rewards/rejected": 3.838451862335205, "step": 80480 }, { "epoch": 3.736942290728446, "grad_norm": 
108.11204528808594, "learning_rate": 7.588374576349878e-08, "logits/chosen": -18.745697021484375, "logits/rejected": -17.79984474182129, "logps/chosen": -435.1597595214844, "logps/rejected": -379.38909912109375, "loss": 0.6954, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8847053050994873, "rewards/margins": 1.9565349817276, "rewards/rejected": 1.9281704425811768, "step": 80490 }, { "epoch": 3.737406564835879, "grad_norm": 0.1226217970252037, "learning_rate": 7.585588931705278e-08, "logits/chosen": -19.072750091552734, "logits/rejected": -17.684246063232422, "logps/chosen": -443.760986328125, "logps/rejected": -300.76776123046875, "loss": 0.5757, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.513517379760742, "rewards/margins": 2.570420026779175, "rewards/rejected": 1.943097710609436, "step": 80500 }, { "epoch": 3.737870838943312, "grad_norm": 18.30752182006836, "learning_rate": 7.58280328706068e-08, "logits/chosen": -18.534238815307617, "logits/rejected": -17.330297470092773, "logps/chosen": -351.8349914550781, "logps/rejected": -288.927001953125, "loss": 0.5975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1223390102386475, "rewards/margins": 1.5046660900115967, "rewards/rejected": 1.6176729202270508, "step": 80510 }, { "epoch": 3.738335113050745, "grad_norm": 0.9129414558410645, "learning_rate": 7.580017642416082e-08, "logits/chosen": -19.050655364990234, "logits/rejected": -17.346981048583984, "logps/chosen": -488.1080017089844, "logps/rejected": -328.0333557128906, "loss": 0.3778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.770571231842041, "rewards/margins": 1.9812963008880615, "rewards/rejected": 1.7892745733261108, "step": 80520 }, { "epoch": 3.738799387158178, "grad_norm": 138.61871337890625, "learning_rate": 7.577231997771484e-08, "logits/chosen": -19.357593536376953, "logits/rejected": -18.353862762451172, "logps/chosen": -543.2179565429688, "logps/rejected": -400.2008361816406, "loss": 0.51, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.458963871002197, "rewards/margins": 2.2105250358581543, "rewards/rejected": 3.2484383583068848, "step": 80530 }, { "epoch": 3.7392636612656114, "grad_norm": 20.642013549804688, "learning_rate": 7.574446353126886e-08, "logits/chosen": -18.953527450561523, "logits/rejected": -18.630834579467773, "logps/chosen": -325.1871643066406, "logps/rejected": -237.5931396484375, "loss": 0.6933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.606407642364502, "rewards/margins": 0.4914734363555908, "rewards/rejected": 2.1149344444274902, "step": 80540 }, { "epoch": 3.739727935373044, "grad_norm": 86.58158874511719, "learning_rate": 7.571660708482286e-08, "logits/chosen": -19.47341537475586, "logits/rejected": -18.157373428344727, "logps/chosen": -403.4973449707031, "logps/rejected": -334.966064453125, "loss": 0.7354, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4780337810516357, "rewards/margins": 1.997302770614624, "rewards/rejected": 1.4807311296463013, "step": 80550 }, { "epoch": 3.7401922094804774, "grad_norm": 15.692829132080078, "learning_rate": 7.568875063837689e-08, "logits/chosen": -19.248470306396484, "logits/rejected": -18.69715690612793, "logps/chosen": -336.62335205078125, "logps/rejected": -310.263916015625, "loss": 0.7989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7321345806121826, "rewards/margins": 1.0535869598388672, "rewards/rejected": 2.678546667098999, "step": 80560 }, { "epoch": 3.74065648358791, "grad_norm": 20.54981231689453, "learning_rate": 7.566089419193091e-08, "logits/chosen": -18.34830093383789, "logits/rejected": -18.48288345336914, "logps/chosen": -400.6360778808594, "logps/rejected": -351.26715087890625, "loss": 0.5397, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.087930679321289, "rewards/margins": 1.5580620765686035, "rewards/rejected": 1.529868483543396, "step": 80570 }, { "epoch": 3.7411207576953434, "grad_norm": 
76.3666763305664, "learning_rate": 7.563303774548493e-08, "logits/chosen": -19.143903732299805, "logits/rejected": -18.006710052490234, "logps/chosen": -478.897216796875, "logps/rejected": -351.4316101074219, "loss": 0.4275, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9732489585876465, "rewards/margins": 1.8803876638412476, "rewards/rejected": 2.0928618907928467, "step": 80580 }, { "epoch": 3.741585031802776, "grad_norm": 69.78885650634766, "learning_rate": 7.560518129903895e-08, "logits/chosen": -19.276363372802734, "logits/rejected": -19.009130477905273, "logps/chosen": -341.3001708984375, "logps/rejected": -340.95111083984375, "loss": 0.6657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9627058506011963, "rewards/margins": 0.6706322431564331, "rewards/rejected": 2.2920732498168945, "step": 80590 }, { "epoch": 3.7420493059102093, "grad_norm": 45.43616485595703, "learning_rate": 7.557732485259298e-08, "logits/chosen": -19.152393341064453, "logits/rejected": -17.916095733642578, "logps/chosen": -514.6732788085938, "logps/rejected": -337.8052978515625, "loss": 0.5827, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.782007694244385, "rewards/margins": 3.7624008655548096, "rewards/rejected": 3.0196080207824707, "step": 80600 }, { "epoch": 3.7425135800176426, "grad_norm": 106.57882690429688, "learning_rate": 7.554946840614698e-08, "logits/chosen": -19.1831111907959, "logits/rejected": -17.994421005249023, "logps/chosen": -428.41009521484375, "logps/rejected": -330.8617248535156, "loss": 0.3657, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1491758823394775, "rewards/margins": 1.7217161655426025, "rewards/rejected": 1.427459955215454, "step": 80610 }, { "epoch": 3.7429778541250753, "grad_norm": 6.5356059074401855, "learning_rate": 7.5521611959701e-08, "logits/chosen": -20.663148880004883, "logits/rejected": -20.31642723083496, "logps/chosen": -284.53448486328125, "logps/rejected": -260.3475036621094, "loss": 
0.4547, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.017547130584717, "rewards/margins": 0.9821378588676453, "rewards/rejected": 2.0354092121124268, "step": 80620 }, { "epoch": 3.7434421282325085, "grad_norm": 17.598526000976562, "learning_rate": 7.549375551325502e-08, "logits/chosen": -18.03492546081543, "logits/rejected": -17.311767578125, "logps/chosen": -376.81927490234375, "logps/rejected": -330.7624206542969, "loss": 0.6056, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.685110569000244, "rewards/margins": 1.1571223735809326, "rewards/rejected": 1.5279881954193115, "step": 80630 }, { "epoch": 3.7439064023399418, "grad_norm": 15.764968872070312, "learning_rate": 7.546589906680905e-08, "logits/chosen": -19.2712459564209, "logits/rejected": -18.367605209350586, "logps/chosen": -451.99652099609375, "logps/rejected": -365.0743713378906, "loss": 0.4104, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.569823265075684, "rewards/margins": 1.718692421913147, "rewards/rejected": 2.851130723953247, "step": 80640 }, { "epoch": 3.7443706764473745, "grad_norm": 85.80660247802734, "learning_rate": 7.543804262036306e-08, "logits/chosen": -19.529356002807617, "logits/rejected": -18.981531143188477, "logps/chosen": -386.2467346191406, "logps/rejected": -340.64306640625, "loss": 0.4789, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.766171932220459, "rewards/margins": 1.180657148361206, "rewards/rejected": 2.585514783859253, "step": 80650 }, { "epoch": 3.7448349505548073, "grad_norm": 177.19764709472656, "learning_rate": 7.541018617391708e-08, "logits/chosen": -18.426597595214844, "logits/rejected": -18.535320281982422, "logps/chosen": -326.65667724609375, "logps/rejected": -394.9989929199219, "loss": 1.4252, "rewards/accuracies": 0.5, "rewards/chosen": 1.9515520334243774, "rewards/margins": -0.23762276768684387, "rewards/rejected": 2.1891746520996094, "step": 80660 }, { "epoch": 3.7452992246622405, "grad_norm": 
157.40745544433594, "learning_rate": 7.538232972747109e-08, "logits/chosen": -19.539411544799805, "logits/rejected": -18.713909149169922, "logps/chosen": -319.64385986328125, "logps/rejected": -276.5811767578125, "loss": 0.9855, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.731416702270508, "rewards/margins": 0.2759692370891571, "rewards/rejected": 2.455447196960449, "step": 80670 }, { "epoch": 3.7457634987696737, "grad_norm": 120.22444915771484, "learning_rate": 7.535447328102512e-08, "logits/chosen": -18.80569076538086, "logits/rejected": -17.745595932006836, "logps/chosen": -426.919921875, "logps/rejected": -307.206787109375, "loss": 0.5731, "rewards/accuracies": 0.5, "rewards/chosen": 3.639232635498047, "rewards/margins": 2.2657899856567383, "rewards/rejected": 1.3734428882598877, "step": 80680 }, { "epoch": 3.7462277728771065, "grad_norm": 19.24198341369629, "learning_rate": 7.532661683457913e-08, "logits/chosen": -19.197742462158203, "logits/rejected": -17.877582550048828, "logps/chosen": -298.5828857421875, "logps/rejected": -194.5599822998047, "loss": 0.4359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8878743648529053, "rewards/margins": 1.624159574508667, "rewards/rejected": 0.26371484994888306, "step": 80690 }, { "epoch": 3.7466920469845397, "grad_norm": 67.20944213867188, "learning_rate": 7.529876038813314e-08, "logits/chosen": -18.645626068115234, "logits/rejected": -17.792490005493164, "logps/chosen": -355.01995849609375, "logps/rejected": -286.2786560058594, "loss": 0.6783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.125882387161255, "rewards/margins": 1.4675118923187256, "rewards/rejected": 1.6583706140518188, "step": 80700 }, { "epoch": 3.747156321091973, "grad_norm": 0.9598274230957031, "learning_rate": 7.527090394168716e-08, "logits/chosen": -18.580955505371094, "logits/rejected": -17.499767303466797, "logps/chosen": -383.26007080078125, "logps/rejected": -240.3294219970703, "loss": 0.6125, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9411425590515137, "rewards/margins": 1.9705082178115845, "rewards/rejected": 0.9706344604492188, "step": 80710 }, { "epoch": 3.7476205951994057, "grad_norm": 0.16352681815624237, "learning_rate": 7.524304749524118e-08, "logits/chosen": -19.244747161865234, "logits/rejected": -17.87112045288086, "logps/chosen": -349.76202392578125, "logps/rejected": -264.5469970703125, "loss": 0.2413, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8756394386291504, "rewards/margins": 2.651366949081421, "rewards/rejected": 1.2242720127105713, "step": 80720 }, { "epoch": 3.7480848693068385, "grad_norm": 167.6126708984375, "learning_rate": 7.52151910487952e-08, "logits/chosen": -19.86083221435547, "logits/rejected": -17.82242774963379, "logps/chosen": -496.1966247558594, "logps/rejected": -302.56866455078125, "loss": 0.2861, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.849331378936768, "rewards/margins": 3.3399269580841064, "rewards/rejected": 2.509403705596924, "step": 80730 }, { "epoch": 3.7485491434142717, "grad_norm": 140.05825805664062, "learning_rate": 7.518733460234922e-08, "logits/chosen": -19.042926788330078, "logits/rejected": -18.94528579711914, "logps/chosen": -408.1148986816406, "logps/rejected": -371.3479919433594, "loss": 0.861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2570481300354004, "rewards/margins": 1.1434999704360962, "rewards/rejected": 2.1135478019714355, "step": 80740 }, { "epoch": 3.749013417521705, "grad_norm": 185.80892944335938, "learning_rate": 7.515947815590325e-08, "logits/chosen": -19.491313934326172, "logits/rejected": -18.9408016204834, "logps/chosen": -478.03289794921875, "logps/rejected": -384.89288330078125, "loss": 0.6824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.751152515411377, "rewards/margins": 0.517158031463623, "rewards/rejected": 3.233994722366333, "step": 80750 }, { "epoch": 3.7494776916291377, "grad_norm": 
55.678123474121094, "learning_rate": 7.513162170945725e-08, "logits/chosen": -18.733333587646484, "logits/rejected": -17.670188903808594, "logps/chosen": -332.69573974609375, "logps/rejected": -229.74380493164062, "loss": 0.2954, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5746357440948486, "rewards/margins": 1.6996898651123047, "rewards/rejected": 1.8749454021453857, "step": 80760 }, { "epoch": 3.749941965736571, "grad_norm": 16.83530616760254, "learning_rate": 7.510376526301128e-08, "logits/chosen": -18.933292388916016, "logits/rejected": -18.32974624633789, "logps/chosen": -305.2939147949219, "logps/rejected": -216.8625946044922, "loss": 0.4206, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.583366870880127, "rewards/margins": 1.298945426940918, "rewards/rejected": 1.2844213247299194, "step": 80770 }, { "epoch": 3.750406239844004, "grad_norm": 0.11494865268468857, "learning_rate": 7.507590881656529e-08, "logits/chosen": -19.54646873474121, "logits/rejected": -18.386877059936523, "logps/chosen": -389.0319519042969, "logps/rejected": -292.50543212890625, "loss": 0.5211, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6096062660217285, "rewards/margins": 1.8039848804473877, "rewards/rejected": 1.8056217432022095, "step": 80780 }, { "epoch": 3.750870513951437, "grad_norm": 0.46137428283691406, "learning_rate": 7.504805237011932e-08, "logits/chosen": -20.137344360351562, "logits/rejected": -19.5108642578125, "logps/chosen": -430.18865966796875, "logps/rejected": -375.03985595703125, "loss": 0.3986, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.467349052429199, "rewards/margins": 2.4919543266296387, "rewards/rejected": 1.9753954410552979, "step": 80790 }, { "epoch": 3.75133478805887, "grad_norm": 83.46963500976562, "learning_rate": 7.502019592367333e-08, "logits/chosen": -18.2054386138916, "logits/rejected": -17.633581161499023, "logps/chosen": -302.7144470214844, "logps/rejected": -251.31503295898438, 
"loss": 0.3949, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4422059059143066, "rewards/margins": 1.4448556900024414, "rewards/rejected": 0.9973505735397339, "step": 80800 }, { "epoch": 3.751799062166303, "grad_norm": 42.69755172729492, "learning_rate": 7.499233947722735e-08, "logits/chosen": -18.636465072631836, "logits/rejected": -17.981605529785156, "logps/chosen": -438.802001953125, "logps/rejected": -307.8101806640625, "loss": 0.7081, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.029845714569092, "rewards/margins": 1.5549838542938232, "rewards/rejected": 2.4748618602752686, "step": 80810 }, { "epoch": 3.752263336273736, "grad_norm": 1.1808593273162842, "learning_rate": 7.496448303078136e-08, "logits/chosen": -19.740997314453125, "logits/rejected": -18.663631439208984, "logps/chosen": -435.0269470214844, "logps/rejected": -349.0586242675781, "loss": 0.5909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2547993659973145, "rewards/margins": 1.8579193353652954, "rewards/rejected": 2.3968796730041504, "step": 80820 }, { "epoch": 3.752727610381169, "grad_norm": 74.0251693725586, "learning_rate": 7.493662658433539e-08, "logits/chosen": -19.160539627075195, "logits/rejected": -18.707317352294922, "logps/chosen": -374.10723876953125, "logps/rejected": -333.2857360839844, "loss": 0.6072, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.077730417251587, "rewards/margins": 0.9164009094238281, "rewards/rejected": 2.161329507827759, "step": 80830 }, { "epoch": 3.753191884488602, "grad_norm": 3.225233316421509, "learning_rate": 7.49087701378894e-08, "logits/chosen": -18.913516998291016, "logits/rejected": -18.212909698486328, "logps/chosen": -350.0008850097656, "logps/rejected": -265.89849853515625, "loss": 0.5787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.392176389694214, "rewards/margins": 1.1995604038238525, "rewards/rejected": 2.192615509033203, "step": 80840 }, { "epoch": 3.7536561585960353, 
"grad_norm": 73.93233489990234, "learning_rate": 7.488091369144342e-08, "logits/chosen": -19.683597564697266, "logits/rejected": -18.528085708618164, "logps/chosen": -393.6938171386719, "logps/rejected": -308.9371643066406, "loss": 0.4858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.687283754348755, "rewards/margins": 1.9905017614364624, "rewards/rejected": 1.6967817544937134, "step": 80850 }, { "epoch": 3.754120432703468, "grad_norm": 6.466360569000244, "learning_rate": 7.485305724499745e-08, "logits/chosen": -18.621681213378906, "logits/rejected": -18.30130386352539, "logps/chosen": -403.5025634765625, "logps/rejected": -369.18487548828125, "loss": 0.5865, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.346818447113037, "rewards/margins": 1.0761890411376953, "rewards/rejected": 2.270629405975342, "step": 80860 }, { "epoch": 3.7545847068109013, "grad_norm": 127.8771743774414, "learning_rate": 7.482520079855146e-08, "logits/chosen": -19.08169174194336, "logits/rejected": -19.45529556274414, "logps/chosen": -450.0899353027344, "logps/rejected": -412.01861572265625, "loss": 0.9503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.051419734954834, "rewards/margins": 0.5388593673706055, "rewards/rejected": 2.5125601291656494, "step": 80870 }, { "epoch": 3.755048980918334, "grad_norm": 55.117862701416016, "learning_rate": 7.479734435210548e-08, "logits/chosen": -19.289440155029297, "logits/rejected": -18.43449592590332, "logps/chosen": -446.03851318359375, "logps/rejected": -320.76495361328125, "loss": 0.4084, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.619206190109253, "rewards/margins": 1.5182619094848633, "rewards/rejected": 2.1009445190429688, "step": 80880 }, { "epoch": 3.7555132550257673, "grad_norm": 163.4297637939453, "learning_rate": 7.476948790565949e-08, "logits/chosen": -19.433773040771484, "logits/rejected": -18.90707778930664, "logps/chosen": -360.8135986328125, "logps/rejected": 
-289.83917236328125, "loss": 0.764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5122110843658447, "rewards/margins": 0.8636471033096313, "rewards/rejected": 1.648564100265503, "step": 80890 }, { "epoch": 3.7559775291332, "grad_norm": 0.05659255012869835, "learning_rate": 7.474163145921352e-08, "logits/chosen": -18.88442039489746, "logits/rejected": -18.478832244873047, "logps/chosen": -387.48248291015625, "logps/rejected": -366.5782165527344, "loss": 0.6852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.851539134979248, "rewards/margins": 2.0618529319763184, "rewards/rejected": 1.7896859645843506, "step": 80900 }, { "epoch": 3.7564418032406333, "grad_norm": 107.7719497680664, "learning_rate": 7.471377501276753e-08, "logits/chosen": -19.43744468688965, "logits/rejected": -18.43043327331543, "logps/chosen": -389.32452392578125, "logps/rejected": -335.5246276855469, "loss": 1.4029, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.802971601486206, "rewards/margins": 0.4428127706050873, "rewards/rejected": 3.360159397125244, "step": 80910 }, { "epoch": 3.7569060773480665, "grad_norm": 95.68220520019531, "learning_rate": 7.468591856632155e-08, "logits/chosen": -19.14371109008789, "logits/rejected": -17.61166000366211, "logps/chosen": -353.9314880371094, "logps/rejected": -244.6324920654297, "loss": 0.2607, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.661768674850464, "rewards/margins": 1.676719307899475, "rewards/rejected": 0.9850494265556335, "step": 80920 }, { "epoch": 3.7573703514554992, "grad_norm": 40.46425247192383, "learning_rate": 7.465806211987558e-08, "logits/chosen": -18.898954391479492, "logits/rejected": -18.245126724243164, "logps/chosen": -433.1605529785156, "logps/rejected": -341.9924011230469, "loss": 0.1569, "rewards/accuracies": 1.0, "rewards/chosen": 3.7426559925079346, "rewards/margins": 2.2113540172576904, "rewards/rejected": 1.5313016176223755, "step": 80930 }, { "epoch": 
3.7578346255629325, "grad_norm": 9.703835487365723, "learning_rate": 7.463020567342959e-08, "logits/chosen": -19.447404861450195, "logits/rejected": -18.866029739379883, "logps/chosen": -402.9479675292969, "logps/rejected": -345.19384765625, "loss": 0.7241, "rewards/accuracies": 0.5, "rewards/chosen": 3.6407687664031982, "rewards/margins": 0.8749243021011353, "rewards/rejected": 2.7658443450927734, "step": 80940 }, { "epoch": 3.7582988996703652, "grad_norm": 203.7744903564453, "learning_rate": 7.46023492269836e-08, "logits/chosen": -19.190250396728516, "logits/rejected": -17.960264205932617, "logps/chosen": -356.1336975097656, "logps/rejected": -256.4476318359375, "loss": 0.537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2860374450683594, "rewards/margins": 2.39713454246521, "rewards/rejected": 0.8889029622077942, "step": 80950 }, { "epoch": 3.7587631737777984, "grad_norm": 45.4366340637207, "learning_rate": 7.457449278053763e-08, "logits/chosen": -18.721506118774414, "logits/rejected": -18.058792114257812, "logps/chosen": -264.5223388671875, "logps/rejected": -211.9499053955078, "loss": 0.6903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.375082492828369, "rewards/margins": 1.06183660030365, "rewards/rejected": 1.3132458925247192, "step": 80960 }, { "epoch": 3.759227447885231, "grad_norm": 75.79374694824219, "learning_rate": 7.454663633409165e-08, "logits/chosen": -18.89845848083496, "logits/rejected": -18.693645477294922, "logps/chosen": -363.8795471191406, "logps/rejected": -327.95318603515625, "loss": 1.1433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.774388551712036, "rewards/margins": 0.02975916862487793, "rewards/rejected": 2.744629383087158, "step": 80970 }, { "epoch": 3.7596917219926644, "grad_norm": 228.25985717773438, "learning_rate": 7.451877988764566e-08, "logits/chosen": -19.236984252929688, "logits/rejected": -19.07794952392578, "logps/chosen": -486.40606689453125, "logps/rejected": 
-363.9485778808594, "loss": 0.9953, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5537407398223877, "rewards/margins": 1.479143738746643, "rewards/rejected": 2.074596881866455, "step": 80980 }, { "epoch": 3.7601559961000977, "grad_norm": 87.97264099121094, "learning_rate": 7.449092344119969e-08, "logits/chosen": -19.468280792236328, "logits/rejected": -17.838848114013672, "logps/chosen": -451.95343017578125, "logps/rejected": -328.5057067871094, "loss": 0.5354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.37279748916626, "rewards/margins": 2.1746745109558105, "rewards/rejected": 2.1981234550476074, "step": 80990 }, { "epoch": 3.7606202702075304, "grad_norm": 5.894491195678711, "learning_rate": 7.446306699475369e-08, "logits/chosen": -18.40823745727539, "logits/rejected": -17.550067901611328, "logps/chosen": -444.12237548828125, "logps/rejected": -340.41485595703125, "loss": 0.1918, "rewards/accuracies": 1.0, "rewards/chosen": 4.1600446701049805, "rewards/margins": 2.4894886016845703, "rewards/rejected": 1.6705564260482788, "step": 81000 }, { "epoch": 3.7610845443149636, "grad_norm": 86.57478332519531, "learning_rate": 7.443521054830772e-08, "logits/chosen": -19.058551788330078, "logits/rejected": -18.602842330932617, "logps/chosen": -406.6027526855469, "logps/rejected": -396.71624755859375, "loss": 0.2774, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.440883636474609, "rewards/margins": 1.9414021968841553, "rewards/rejected": 2.4994821548461914, "step": 81010 }, { "epoch": 3.761548818422397, "grad_norm": 94.69452667236328, "learning_rate": 7.440735410186173e-08, "logits/chosen": -19.09832000732422, "logits/rejected": -18.582761764526367, "logps/chosen": -418.0313415527344, "logps/rejected": -307.4343566894531, "loss": 0.844, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7061142921447754, "rewards/margins": 0.9884273409843445, "rewards/rejected": 1.7176870107650757, "step": 81020 }, { "epoch": 
3.7620130925298296, "grad_norm": 8.596595764160156, "learning_rate": 7.437949765541575e-08, "logits/chosen": -19.749746322631836, "logits/rejected": -19.510385513305664, "logps/chosen": -433.56695556640625, "logps/rejected": -337.22381591796875, "loss": 0.5266, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.87225604057312, "rewards/margins": 1.5694029331207275, "rewards/rejected": 2.3028531074523926, "step": 81030 }, { "epoch": 3.7624773666372624, "grad_norm": 294.83172607421875, "learning_rate": 7.435164120896978e-08, "logits/chosen": -18.467670440673828, "logits/rejected": -18.35182762145996, "logps/chosen": -389.5602722167969, "logps/rejected": -399.88555908203125, "loss": 1.7071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4579243659973145, "rewards/margins": 0.023809481412172318, "rewards/rejected": 3.434114933013916, "step": 81040 }, { "epoch": 3.7629416407446956, "grad_norm": 80.28314971923828, "learning_rate": 7.432378476252379e-08, "logits/chosen": -19.278881072998047, "logits/rejected": -17.942350387573242, "logps/chosen": -474.8836364746094, "logps/rejected": -323.92724609375, "loss": 0.5633, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.78908634185791, "rewards/margins": 2.016911029815674, "rewards/rejected": 2.772174835205078, "step": 81050 }, { "epoch": 3.763405914852129, "grad_norm": 204.63937377929688, "learning_rate": 7.42959283160778e-08, "logits/chosen": -18.776981353759766, "logits/rejected": -18.25510025024414, "logps/chosen": -365.0211486816406, "logps/rejected": -344.4299621582031, "loss": 1.2424, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.817847728729248, "rewards/margins": 0.09720981121063232, "rewards/rejected": 2.7206380367279053, "step": 81060 }, { "epoch": 3.7638701889595616, "grad_norm": 133.14842224121094, "learning_rate": 7.426807186963183e-08, "logits/chosen": -19.693056106567383, "logits/rejected": -18.553131103515625, "logps/chosen": -382.0245361328125, 
"logps/rejected": -352.15716552734375, "loss": 0.623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0970985889434814, "rewards/margins": 0.6841471195220947, "rewards/rejected": 2.412951707839966, "step": 81070 }, { "epoch": 3.764334463066995, "grad_norm": 84.52191925048828, "learning_rate": 7.424021542318585e-08, "logits/chosen": -19.277118682861328, "logits/rejected": -18.747873306274414, "logps/chosen": -345.26055908203125, "logps/rejected": -290.8685302734375, "loss": 0.7673, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4107251167297363, "rewards/margins": 0.5985733866691589, "rewards/rejected": 1.8121519088745117, "step": 81080 }, { "epoch": 3.764798737174428, "grad_norm": 107.70600891113281, "learning_rate": 7.421235897673986e-08, "logits/chosen": -18.234834671020508, "logits/rejected": -17.696216583251953, "logps/chosen": -425.118408203125, "logps/rejected": -325.52935791015625, "loss": 0.4932, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6244239807128906, "rewards/margins": 1.611577033996582, "rewards/rejected": 2.0128467082977295, "step": 81090 }, { "epoch": 3.765263011281861, "grad_norm": 83.07746887207031, "learning_rate": 7.418450253029388e-08, "logits/chosen": -20.435562133789062, "logits/rejected": -19.66710090637207, "logps/chosen": -516.6585693359375, "logps/rejected": -476.54461669921875, "loss": 0.7444, "rewards/accuracies": 0.5, "rewards/chosen": 4.05357027053833, "rewards/margins": 0.3998032212257385, "rewards/rejected": 3.6537671089172363, "step": 81100 }, { "epoch": 3.7657272853892936, "grad_norm": 49.16649627685547, "learning_rate": 7.41566460838479e-08, "logits/chosen": -18.77407455444336, "logits/rejected": -18.420007705688477, "logps/chosen": -398.0693359375, "logps/rejected": -336.89605712890625, "loss": 0.5952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1102042198181152, "rewards/margins": 0.8993104696273804, "rewards/rejected": 2.210893392562866, "step": 81110 }, { 
"epoch": 3.766191559496727, "grad_norm": 14.257735252380371, "learning_rate": 7.412878963740192e-08, "logits/chosen": -19.48148536682129, "logits/rejected": -18.99397087097168, "logps/chosen": -323.31280517578125, "logps/rejected": -327.115234375, "loss": 0.7173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.758702039718628, "rewards/margins": 0.7662016749382019, "rewards/rejected": 1.9925005435943604, "step": 81120 }, { "epoch": 3.76665583360416, "grad_norm": 215.68382263183594, "learning_rate": 7.410093319095593e-08, "logits/chosen": -18.055709838867188, "logits/rejected": -18.418315887451172, "logps/chosen": -365.14569091796875, "logps/rejected": -393.58319091796875, "loss": 1.3462, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.751171112060547, "rewards/margins": -0.0612398162484169, "rewards/rejected": 2.812410831451416, "step": 81130 }, { "epoch": 3.767120107711593, "grad_norm": 33.830745697021484, "learning_rate": 7.407307674450996e-08, "logits/chosen": -19.04193687438965, "logits/rejected": -18.47982406616211, "logps/chosen": -432.62884521484375, "logps/rejected": -401.0542907714844, "loss": 0.4681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.749331474304199, "rewards/margins": 1.3299652338027954, "rewards/rejected": 3.419365644454956, "step": 81140 }, { "epoch": 3.767584381819026, "grad_norm": 50.05875015258789, "learning_rate": 7.404522029806398e-08, "logits/chosen": -18.615524291992188, "logits/rejected": -17.986743927001953, "logps/chosen": -423.4156799316406, "logps/rejected": -408.6387023925781, "loss": 0.8224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3720927238464355, "rewards/margins": 0.8062281608581543, "rewards/rejected": 2.565864324569702, "step": 81150 }, { "epoch": 3.768048655926459, "grad_norm": 149.5019989013672, "learning_rate": 7.401736385161799e-08, "logits/chosen": -19.294979095458984, "logits/rejected": -19.549480438232422, "logps/chosen": -262.13800048828125, 
"logps/rejected": -250.2930450439453, "loss": 0.9619, "rewards/accuracies": 0.5, "rewards/chosen": 2.529510021209717, "rewards/margins": 0.46972113847732544, "rewards/rejected": 2.0597891807556152, "step": 81160 }, { "epoch": 3.768512930033892, "grad_norm": 42.042564392089844, "learning_rate": 7.398950740517202e-08, "logits/chosen": -18.917400360107422, "logits/rejected": -19.166038513183594, "logps/chosen": -333.4600830078125, "logps/rejected": -286.4678649902344, "loss": 0.8338, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5225799083709717, "rewards/margins": 0.7120054364204407, "rewards/rejected": 1.8105742931365967, "step": 81170 }, { "epoch": 3.768977204141325, "grad_norm": 0.6847004890441895, "learning_rate": 7.396165095872603e-08, "logits/chosen": -18.19166374206543, "logits/rejected": -17.605802536010742, "logps/chosen": -532.53466796875, "logps/rejected": -441.72332763671875, "loss": 0.7406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.1290812492370605, "rewards/margins": 0.6807810068130493, "rewards/rejected": 3.44830060005188, "step": 81180 }, { "epoch": 3.769441478248758, "grad_norm": 187.72085571289062, "learning_rate": 7.393379451228005e-08, "logits/chosen": -18.36954116821289, "logits/rejected": -18.896522521972656, "logps/chosen": -277.22442626953125, "logps/rejected": -316.604248046875, "loss": 1.4893, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.0117619037628174, "rewards/margins": -0.2432374507188797, "rewards/rejected": 1.2549992799758911, "step": 81190 }, { "epoch": 3.769905752356191, "grad_norm": 105.98175811767578, "learning_rate": 7.390593806583406e-08, "logits/chosen": -19.308530807495117, "logits/rejected": -19.243560791015625, "logps/chosen": -387.4250183105469, "logps/rejected": -330.41156005859375, "loss": 0.4815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.915771484375, "rewards/margins": 1.0278373956680298, "rewards/rejected": 2.8879342079162598, "step": 81200 }, { 
"epoch": 3.770370026463624, "grad_norm": 0.04406260326504707, "learning_rate": 7.387808161938808e-08, "logits/chosen": -20.012096405029297, "logits/rejected": -18.0240421295166, "logps/chosen": -431.88055419921875, "logps/rejected": -257.8759765625, "loss": 0.0827, "rewards/accuracies": 1.0, "rewards/chosen": 5.810463905334473, "rewards/margins": 4.32356071472168, "rewards/rejected": 1.4869033098220825, "step": 81210 }, { "epoch": 3.770834300571057, "grad_norm": 234.95640563964844, "learning_rate": 7.38502251729421e-08, "logits/chosen": -19.76458168029785, "logits/rejected": -18.53069305419922, "logps/chosen": -429.69537353515625, "logps/rejected": -343.03143310546875, "loss": 0.4494, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.357844591140747, "rewards/margins": 1.6854569911956787, "rewards/rejected": 1.672387719154358, "step": 81220 }, { "epoch": 3.7712985746784904, "grad_norm": 93.25713348388672, "learning_rate": 7.382236872649612e-08, "logits/chosen": -19.742061614990234, "logits/rejected": -18.90190887451172, "logps/chosen": -502.017333984375, "logps/rejected": -429.8575134277344, "loss": 0.7313, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.610203742980957, "rewards/margins": 1.06894850730896, "rewards/rejected": 3.541254758834839, "step": 81230 }, { "epoch": 3.771762848785923, "grad_norm": 85.83174896240234, "learning_rate": 7.379451228005013e-08, "logits/chosen": -19.20917510986328, "logits/rejected": -17.605785369873047, "logps/chosen": -382.474609375, "logps/rejected": -283.0791931152344, "loss": 0.3743, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.16713285446167, "rewards/margins": 1.845249891281128, "rewards/rejected": 1.3218828439712524, "step": 81240 }, { "epoch": 3.7722271228933564, "grad_norm": 24.388139724731445, "learning_rate": 7.376665583360416e-08, "logits/chosen": -19.720355987548828, "logits/rejected": -19.2959041595459, "logps/chosen": -423.8096618652344, "logps/rejected": -372.96697998046875, 
"loss": 0.8269, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8990931510925293, "rewards/margins": 0.3346601128578186, "rewards/rejected": 3.5644333362579346, "step": 81250 }, { "epoch": 3.772691397000789, "grad_norm": 24.817649841308594, "learning_rate": 7.373879938715817e-08, "logits/chosen": -20.042339324951172, "logits/rejected": -19.314498901367188, "logps/chosen": -342.7456970214844, "logps/rejected": -276.8060607910156, "loss": 0.3416, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9709877967834473, "rewards/margins": 1.5113096237182617, "rewards/rejected": 2.4596779346466064, "step": 81260 }, { "epoch": 3.7731556711082224, "grad_norm": 191.4666290283203, "learning_rate": 7.371094294071219e-08, "logits/chosen": -17.931793212890625, "logits/rejected": -17.394853591918945, "logps/chosen": -435.33673095703125, "logps/rejected": -291.14544677734375, "loss": 0.4685, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8326504230499268, "rewards/margins": 1.9058812856674194, "rewards/rejected": 0.9267688989639282, "step": 81270 }, { "epoch": 3.773619945215655, "grad_norm": 12.722603797912598, "learning_rate": 7.368308649426622e-08, "logits/chosen": -19.242483139038086, "logits/rejected": -18.92997932434082, "logps/chosen": -323.36676025390625, "logps/rejected": -311.2186584472656, "loss": 0.6835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9973583221435547, "rewards/margins": 0.4800412058830261, "rewards/rejected": 2.517317056655884, "step": 81280 }, { "epoch": 3.7740842193230884, "grad_norm": 32.58916473388672, "learning_rate": 7.365523004782023e-08, "logits/chosen": -18.96955108642578, "logits/rejected": -20.0441951751709, "logps/chosen": -452.17156982421875, "logps/rejected": -436.6429138183594, "loss": 1.6401, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.7184174060821533, "rewards/margins": -0.7492320537567139, "rewards/rejected": 4.467648983001709, "step": 81290 }, { "epoch": 
3.7745484934305216, "grad_norm": 144.99948120117188, "learning_rate": 7.362737360137425e-08, "logits/chosen": -18.323204040527344, "logits/rejected": -18.51750373840332, "logps/chosen": -361.08978271484375, "logps/rejected": -337.5216369628906, "loss": 0.9203, "rewards/accuracies": 0.5, "rewards/chosen": 2.666599750518799, "rewards/margins": 0.5985466837882996, "rewards/rejected": 2.0680532455444336, "step": 81300 }, { "epoch": 3.7750127675379543, "grad_norm": 320.1716613769531, "learning_rate": 7.359951715492826e-08, "logits/chosen": -19.05833625793457, "logits/rejected": -18.54880142211914, "logps/chosen": -527.0325927734375, "logps/rejected": -437.11639404296875, "loss": 0.9708, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.299391984939575, "rewards/margins": 0.29064902663230896, "rewards/rejected": 3.0087428092956543, "step": 81310 }, { "epoch": 3.7754770416453876, "grad_norm": 63.23134231567383, "learning_rate": 7.357166070848229e-08, "logits/chosen": -19.693954467773438, "logits/rejected": -18.749845504760742, "logps/chosen": -488.7808532714844, "logps/rejected": -379.09869384765625, "loss": 0.5117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3636977672576904, "rewards/margins": 0.928211510181427, "rewards/rejected": 2.435486078262329, "step": 81320 }, { "epoch": 3.7759413157528203, "grad_norm": 90.76832580566406, "learning_rate": 7.35438042620363e-08, "logits/chosen": -19.57375717163086, "logits/rejected": -19.39854621887207, "logps/chosen": -362.3243103027344, "logps/rejected": -426.4932556152344, "loss": 0.8113, "rewards/accuracies": 0.5, "rewards/chosen": 4.013264179229736, "rewards/margins": -0.10933598130941391, "rewards/rejected": 4.122600555419922, "step": 81330 }, { "epoch": 3.7764055898602535, "grad_norm": 3.445429801940918, "learning_rate": 7.351594781559032e-08, "logits/chosen": -18.33827018737793, "logits/rejected": -18.043413162231445, "logps/chosen": -295.54632568359375, "logps/rejected": -283.2851867675781, 
"loss": 0.3395, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.526092052459717, "rewards/margins": 1.496573567390442, "rewards/rejected": 2.0295186042785645, "step": 81340 }, { "epoch": 3.7768698639676863, "grad_norm": 452.20770263671875, "learning_rate": 7.348809136914435e-08, "logits/chosen": -18.91176414489746, "logits/rejected": -17.940216064453125, "logps/chosen": -532.1943359375, "logps/rejected": -455.3314514160156, "loss": 0.5273, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.408870220184326, "rewards/margins": 1.503629446029663, "rewards/rejected": 2.905240774154663, "step": 81350 }, { "epoch": 3.7773341380751195, "grad_norm": 75.47703552246094, "learning_rate": 7.346023492269836e-08, "logits/chosen": -19.395475387573242, "logits/rejected": -18.857177734375, "logps/chosen": -499.0545349121094, "logps/rejected": -362.91522216796875, "loss": 0.9957, "rewards/accuracies": 0.5, "rewards/chosen": 3.764313220977783, "rewards/margins": 0.5357156991958618, "rewards/rejected": 3.228597640991211, "step": 81360 }, { "epoch": 3.7777984121825527, "grad_norm": 52.8315544128418, "learning_rate": 7.343237847625237e-08, "logits/chosen": -19.308002471923828, "logits/rejected": -17.967544555664062, "logps/chosen": -400.99578857421875, "logps/rejected": -268.0631408691406, "loss": 0.5827, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.234342575073242, "rewards/margins": 1.8742955923080444, "rewards/rejected": 1.3600469827651978, "step": 81370 }, { "epoch": 3.7782626862899855, "grad_norm": 0.6685777306556702, "learning_rate": 7.34045220298064e-08, "logits/chosen": -19.534137725830078, "logits/rejected": -18.35832977294922, "logps/chosen": -392.4898986816406, "logps/rejected": -269.7132873535156, "loss": 0.1991, "rewards/accuracies": 1.0, "rewards/chosen": 4.105935096740723, "rewards/margins": 2.9551093578338623, "rewards/rejected": 1.1508256196975708, "step": 81380 }, { "epoch": 3.7787269603974187, "grad_norm": 0.10648783296346664, 
"learning_rate": 7.33766655833604e-08, "logits/chosen": -18.679973602294922, "logits/rejected": -17.85983657836914, "logps/chosen": -516.6061401367188, "logps/rejected": -406.23553466796875, "loss": 0.7793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.383951187133789, "rewards/margins": 1.2874284982681274, "rewards/rejected": 2.096522808074951, "step": 81390 }, { "epoch": 3.7791912345048515, "grad_norm": 21.397003173828125, "learning_rate": 7.334880913691443e-08, "logits/chosen": -19.219165802001953, "logits/rejected": -19.188074111938477, "logps/chosen": -336.72100830078125, "logps/rejected": -297.28863525390625, "loss": 0.5998, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5055673122406006, "rewards/margins": 1.1803288459777832, "rewards/rejected": 1.3252383470535278, "step": 81400 }, { "epoch": 3.7796555086122847, "grad_norm": 70.26849365234375, "learning_rate": 7.332095269046845e-08, "logits/chosen": -18.350433349609375, "logits/rejected": -18.110687255859375, "logps/chosen": -308.2024230957031, "logps/rejected": -246.10678100585938, "loss": 0.7549, "rewards/accuracies": 0.5, "rewards/chosen": 1.3403258323669434, "rewards/margins": 0.40831151604652405, "rewards/rejected": 0.9320142865180969, "step": 81410 }, { "epoch": 3.7801197827197175, "grad_norm": 43.474124908447266, "learning_rate": 7.329309624402246e-08, "logits/chosen": -18.729839324951172, "logits/rejected": -18.111774444580078, "logps/chosen": -335.8220520019531, "logps/rejected": -271.45050048828125, "loss": 0.5712, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0204055309295654, "rewards/margins": 0.7458893060684204, "rewards/rejected": 2.2745163440704346, "step": 81420 }, { "epoch": 3.7805840568271507, "grad_norm": 51.380516052246094, "learning_rate": 7.326523979757649e-08, "logits/chosen": -18.937904357910156, "logits/rejected": -17.875003814697266, "logps/chosen": -352.7769470214844, "logps/rejected": -204.053955078125, "loss": 0.8276, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1428418159484863, "rewards/margins": 1.5145546197891235, "rewards/rejected": 1.6282875537872314, "step": 81430 }, { "epoch": 3.781048330934584, "grad_norm": 47.924713134765625, "learning_rate": 7.32373833511305e-08, "logits/chosen": -19.1070499420166, "logits/rejected": -18.875282287597656, "logps/chosen": -369.0557556152344, "logps/rejected": -366.54742431640625, "loss": 0.903, "rewards/accuracies": 0.5, "rewards/chosen": 4.328028202056885, "rewards/margins": 0.9165267944335938, "rewards/rejected": 3.41150164604187, "step": 81440 }, { "epoch": 3.7815126050420167, "grad_norm": 0.26547205448150635, "learning_rate": 7.320952690468452e-08, "logits/chosen": -19.694807052612305, "logits/rejected": -18.666379928588867, "logps/chosen": -382.7240295410156, "logps/rejected": -337.88238525390625, "loss": 0.5681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.216339111328125, "rewards/margins": 1.7595579624176025, "rewards/rejected": 2.4567818641662598, "step": 81450 }, { "epoch": 3.78197687914945, "grad_norm": 297.37713623046875, "learning_rate": 7.318167045823854e-08, "logits/chosen": -19.202342987060547, "logits/rejected": -17.855728149414062, "logps/chosen": -281.0910339355469, "logps/rejected": -216.8686065673828, "loss": 1.119, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.558293581008911, "rewards/margins": 2.0985870361328125, "rewards/rejected": 1.459706425666809, "step": 81460 }, { "epoch": 3.782441153256883, "grad_norm": 0.6896206736564636, "learning_rate": 7.315381401179256e-08, "logits/chosen": -19.303293228149414, "logits/rejected": -18.541728973388672, "logps/chosen": -284.68304443359375, "logps/rejected": -245.36434936523438, "loss": 0.5532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.310166597366333, "rewards/margins": 1.1430060863494873, "rewards/rejected": 1.1671603918075562, "step": 81470 }, { "epoch": 3.782905427364316, "grad_norm": 
2.4553582668304443, "learning_rate": 7.312595756534657e-08, "logits/chosen": -19.314647674560547, "logits/rejected": -17.992523193359375, "logps/chosen": -393.8476867675781, "logps/rejected": -224.99227905273438, "loss": 0.2265, "rewards/accuracies": 1.0, "rewards/chosen": 4.469977855682373, "rewards/margins": 3.2224204540252686, "rewards/rejected": 1.2475574016571045, "step": 81480 }, { "epoch": 3.7833697014717487, "grad_norm": 88.48316955566406, "learning_rate": 7.30981011189006e-08, "logits/chosen": -19.72555923461914, "logits/rejected": -18.44325828552246, "logps/chosen": -362.92816162109375, "logps/rejected": -258.2245178222656, "loss": 0.5742, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4537367820739746, "rewards/margins": 1.3182369470596313, "rewards/rejected": 2.1354997158050537, "step": 81490 }, { "epoch": 3.783833975579182, "grad_norm": 129.03282165527344, "learning_rate": 7.307024467245462e-08, "logits/chosen": -19.36740493774414, "logits/rejected": -19.077089309692383, "logps/chosen": -391.9637451171875, "logps/rejected": -360.6651916503906, "loss": 0.734, "rewards/accuracies": 0.5, "rewards/chosen": 3.2743935585021973, "rewards/margins": 0.48140794038772583, "rewards/rejected": 2.792985439300537, "step": 81500 }, { "epoch": 3.784298249686615, "grad_norm": 49.0450553894043, "learning_rate": 7.304238822600863e-08, "logits/chosen": -18.243274688720703, "logits/rejected": -17.464799880981445, "logps/chosen": -402.82391357421875, "logps/rejected": -335.5608215332031, "loss": 0.7031, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.191455602645874, "rewards/margins": 1.2424366474151611, "rewards/rejected": 1.9490190744400024, "step": 81510 }, { "epoch": 3.784762523794048, "grad_norm": 176.12364196777344, "learning_rate": 7.301453177956265e-08, "logits/chosen": -18.397554397583008, "logits/rejected": -18.53060531616211, "logps/chosen": -318.8247375488281, "logps/rejected": -352.8179931640625, "loss": 1.5711, "rewards/accuracies": 
0.20000000298023224, "rewards/chosen": 2.0779247283935547, "rewards/margins": -0.981576144695282, "rewards/rejected": 3.0595006942749023, "step": 81520 }, { "epoch": 3.785226797901481, "grad_norm": 253.6454315185547, "learning_rate": 7.298667533311667e-08, "logits/chosen": -19.306285858154297, "logits/rejected": -18.769062042236328, "logps/chosen": -464.35772705078125, "logps/rejected": -367.2562561035156, "loss": 0.4725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.420945644378662, "rewards/margins": 2.0652382373809814, "rewards/rejected": 2.3557076454162598, "step": 81530 }, { "epoch": 3.7856910720089143, "grad_norm": 10.0171480178833, "learning_rate": 7.295881888667069e-08, "logits/chosen": -19.078290939331055, "logits/rejected": -17.32601547241211, "logps/chosen": -394.99896240234375, "logps/rejected": -213.32723999023438, "loss": 0.3885, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7421865463256836, "rewards/margins": 1.9413543939590454, "rewards/rejected": 0.8008321523666382, "step": 81540 }, { "epoch": 3.786155346116347, "grad_norm": 24.90880012512207, "learning_rate": 7.29309624402247e-08, "logits/chosen": -19.40207290649414, "logits/rejected": -19.42517852783203, "logps/chosen": -392.7533264160156, "logps/rejected": -354.24871826171875, "loss": 0.4558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.669409990310669, "rewards/margins": 1.4220030307769775, "rewards/rejected": 2.2474071979522705, "step": 81550 }, { "epoch": 3.78661962022378, "grad_norm": 4.568849563598633, "learning_rate": 7.290310599377873e-08, "logits/chosen": -19.7992000579834, "logits/rejected": -18.553077697753906, "logps/chosen": -339.7998352050781, "logps/rejected": -278.3177490234375, "loss": 0.5961, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.15708327293396, "rewards/margins": 1.2801443338394165, "rewards/rejected": 1.876938819885254, "step": 81560 }, { "epoch": 3.787083894331213, "grad_norm": 52.892276763916016, 
"learning_rate": 7.287524954733274e-08, "logits/chosen": -17.743419647216797, "logits/rejected": -17.702693939208984, "logps/chosen": -409.6136169433594, "logps/rejected": -403.1097412109375, "loss": 1.4165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.758892059326172, "rewards/margins": 0.09782291948795319, "rewards/rejected": 2.66106915473938, "step": 81570 }, { "epoch": 3.7875481684386463, "grad_norm": 25.282787322998047, "learning_rate": 7.284739310088676e-08, "logits/chosen": -19.486286163330078, "logits/rejected": -17.749237060546875, "logps/chosen": -431.30413818359375, "logps/rejected": -276.86651611328125, "loss": 0.2691, "rewards/accuracies": 1.0, "rewards/chosen": 4.104004859924316, "rewards/margins": 2.4737799167633057, "rewards/rejected": 1.630224585533142, "step": 81580 }, { "epoch": 3.788012442546079, "grad_norm": 112.03762817382812, "learning_rate": 7.281953665444077e-08, "logits/chosen": -18.99207878112793, "logits/rejected": -18.167715072631836, "logps/chosen": -380.11260986328125, "logps/rejected": -313.0476379394531, "loss": 0.437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.634955644607544, "rewards/margins": 1.9086929559707642, "rewards/rejected": 1.7262630462646484, "step": 81590 }, { "epoch": 3.7884767166535123, "grad_norm": 55.63211441040039, "learning_rate": 7.279168020799479e-08, "logits/chosen": -19.327579498291016, "logits/rejected": -18.587255477905273, "logps/chosen": -339.2114562988281, "logps/rejected": -299.6545104980469, "loss": 0.5819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.411137104034424, "rewards/margins": 0.8332456350326538, "rewards/rejected": 2.5778913497924805, "step": 81600 }, { "epoch": 3.7889409907609455, "grad_norm": 55.98026657104492, "learning_rate": 7.276382376154882e-08, "logits/chosen": -19.332782745361328, "logits/rejected": -18.345962524414062, "logps/chosen": -421.9366149902344, "logps/rejected": -343.05072021484375, "loss": 0.7255, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 4.3090925216674805, "rewards/margins": 1.4069976806640625, "rewards/rejected": 2.902094602584839, "step": 81610 }, { "epoch": 3.7894052648683783, "grad_norm": 70.85002899169922, "learning_rate": 7.273596731510283e-08, "logits/chosen": -18.794633865356445, "logits/rejected": -18.962039947509766, "logps/chosen": -375.05914306640625, "logps/rejected": -410.3081970214844, "loss": 0.9728, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3362228870391846, "rewards/margins": 0.016074752435088158, "rewards/rejected": 3.320148468017578, "step": 81620 }, { "epoch": 3.7898695389758115, "grad_norm": 42.812355041503906, "learning_rate": 7.270811086865685e-08, "logits/chosen": -19.89420509338379, "logits/rejected": -19.040380477905273, "logps/chosen": -417.36810302734375, "logps/rejected": -351.615478515625, "loss": 0.5281, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3178157806396484, "rewards/margins": 1.092887043952942, "rewards/rejected": 2.224929094314575, "step": 81630 }, { "epoch": 3.7903338130832442, "grad_norm": 44.750022888183594, "learning_rate": 7.268025442221087e-08, "logits/chosen": -19.4060001373291, "logits/rejected": -18.476118087768555, "logps/chosen": -400.4785461425781, "logps/rejected": -364.4305725097656, "loss": 0.8743, "rewards/accuracies": 0.5, "rewards/chosen": 2.6336193084716797, "rewards/margins": 0.26585355401039124, "rewards/rejected": 2.367765426635742, "step": 81640 }, { "epoch": 3.7907980871906775, "grad_norm": 0.256931334733963, "learning_rate": 7.265239797576489e-08, "logits/chosen": -20.001415252685547, "logits/rejected": -18.76461410522461, "logps/chosen": -526.3750610351562, "logps/rejected": -352.74517822265625, "loss": 0.3245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.370645523071289, "rewards/margins": 2.7282776832580566, "rewards/rejected": 2.6423683166503906, "step": 81650 }, { "epoch": 3.7912623612981102, "grad_norm": 19.856157302856445, 
"learning_rate": 7.26245415293189e-08, "logits/chosen": -20.081161499023438, "logits/rejected": -18.38712501525879, "logps/chosen": -311.5973205566406, "logps/rejected": -234.2265167236328, "loss": 0.3557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6788859367370605, "rewards/margins": 1.9790947437286377, "rewards/rejected": 0.6997911334037781, "step": 81660 }, { "epoch": 3.7917266354055434, "grad_norm": 191.31581115722656, "learning_rate": 7.259668508287293e-08, "logits/chosen": -18.870746612548828, "logits/rejected": -18.243253707885742, "logps/chosen": -556.6491088867188, "logps/rejected": -553.5169677734375, "loss": 0.8294, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.962802410125732, "rewards/margins": 0.7887300848960876, "rewards/rejected": 4.174072265625, "step": 81670 }, { "epoch": 3.7921909095129767, "grad_norm": 77.88939666748047, "learning_rate": 7.256882863642694e-08, "logits/chosen": -18.605009078979492, "logits/rejected": -17.752588272094727, "logps/chosen": -461.1160583496094, "logps/rejected": -342.9286804199219, "loss": 0.5022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.829075574874878, "rewards/margins": 1.2251359224319458, "rewards/rejected": 2.6039397716522217, "step": 81680 }, { "epoch": 3.7926551836204094, "grad_norm": 69.69046020507812, "learning_rate": 7.254097218998096e-08, "logits/chosen": -18.92365074157715, "logits/rejected": -18.034910202026367, "logps/chosen": -453.42041015625, "logps/rejected": -305.08184814453125, "loss": 0.6945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5693938732147217, "rewards/margins": 1.4165425300598145, "rewards/rejected": 2.1528515815734863, "step": 81690 }, { "epoch": 3.7931194577278426, "grad_norm": 87.26325225830078, "learning_rate": 7.251311574353499e-08, "logits/chosen": -18.876163482666016, "logits/rejected": -18.848743438720703, "logps/chosen": -367.37506103515625, "logps/rejected": -352.06915283203125, "loss": 0.9338, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.781637668609619, "rewards/margins": 0.24224753677845, "rewards/rejected": 2.5393900871276855, "step": 81700 }, { "epoch": 3.7935837318352754, "grad_norm": 3.9501328468322754, "learning_rate": 7.2485259297089e-08, "logits/chosen": -17.998092651367188, "logits/rejected": -17.617088317871094, "logps/chosen": -409.28167724609375, "logps/rejected": -343.9869384765625, "loss": 0.9429, "rewards/accuracies": 0.5, "rewards/chosen": 3.240940809249878, "rewards/margins": 0.5602307915687561, "rewards/rejected": 2.6807103157043457, "step": 81710 }, { "epoch": 3.7940480059427086, "grad_norm": 8.202629089355469, "learning_rate": 7.245740285064302e-08, "logits/chosen": -18.453067779541016, "logits/rejected": -17.933584213256836, "logps/chosen": -438.813232421875, "logps/rejected": -286.2715148925781, "loss": 0.5937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.751051664352417, "rewards/margins": 2.169766902923584, "rewards/rejected": 1.581284761428833, "step": 81720 }, { "epoch": 3.7945122800501414, "grad_norm": 54.12489700317383, "learning_rate": 7.242954640419703e-08, "logits/chosen": -20.150516510009766, "logits/rejected": -19.152082443237305, "logps/chosen": -443.6412048339844, "logps/rejected": -380.0003967285156, "loss": 0.54, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.474862575531006, "rewards/margins": 1.4697500467300415, "rewards/rejected": 3.005112409591675, "step": 81730 }, { "epoch": 3.7949765541575746, "grad_norm": 24.29770851135254, "learning_rate": 7.240168995775106e-08, "logits/chosen": -19.769733428955078, "logits/rejected": -18.44588851928711, "logps/chosen": -470.189697265625, "logps/rejected": -335.23992919921875, "loss": 0.2194, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.42379093170166, "rewards/margins": 2.8110971450805664, "rewards/rejected": 2.612694025039673, "step": 81740 }, { "epoch": 3.795440828265008, "grad_norm": 22.439577102661133, 
"learning_rate": 7.237383351130507e-08, "logits/chosen": -19.347309112548828, "logits/rejected": -19.330486297607422, "logps/chosen": -370.2178649902344, "logps/rejected": -434.77386474609375, "loss": 1.1954, "rewards/accuracies": 0.5, "rewards/chosen": 3.894578456878662, "rewards/margins": -0.050442636013031006, "rewards/rejected": 3.945021390914917, "step": 81750 }, { "epoch": 3.7959051023724406, "grad_norm": 169.029541015625, "learning_rate": 7.234597706485909e-08, "logits/chosen": -18.983041763305664, "logits/rejected": -17.73227310180664, "logps/chosen": -393.9122009277344, "logps/rejected": -279.85748291015625, "loss": 0.6667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2573046684265137, "rewards/margins": 1.5350587368011475, "rewards/rejected": 1.7222461700439453, "step": 81760 }, { "epoch": 3.796369376479874, "grad_norm": 54.73313903808594, "learning_rate": 7.231812061841311e-08, "logits/chosen": -19.18032455444336, "logits/rejected": -18.40502166748047, "logps/chosen": -348.178466796875, "logps/rejected": -305.19805908203125, "loss": 0.434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2765049934387207, "rewards/margins": 1.052802562713623, "rewards/rejected": 2.2237026691436768, "step": 81770 }, { "epoch": 3.7968336505873066, "grad_norm": 47.54803466796875, "learning_rate": 7.229026417196713e-08, "logits/chosen": -18.74370574951172, "logits/rejected": -17.94809341430664, "logps/chosen": -368.72320556640625, "logps/rejected": -252.3836669921875, "loss": 0.7294, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.614694595336914, "rewards/margins": 0.6353092193603516, "rewards/rejected": 1.9793856143951416, "step": 81780 }, { "epoch": 3.79729792469474, "grad_norm": 116.24020385742188, "learning_rate": 7.226240772552114e-08, "logits/chosen": -19.271106719970703, "logits/rejected": -18.287260055541992, "logps/chosen": -425.97662353515625, "logps/rejected": -366.4681701660156, "loss": 0.9519, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.6486239433288574, "rewards/margins": 1.3105838298797607, "rewards/rejected": 2.3380401134490967, "step": 81790 }, { "epoch": 3.7977621988021726, "grad_norm": 54.86404800415039, "learning_rate": 7.223455127907516e-08, "logits/chosen": -19.164981842041016, "logits/rejected": -17.877927780151367, "logps/chosen": -403.67608642578125, "logps/rejected": -305.91107177734375, "loss": 0.4738, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4774391651153564, "rewards/margins": 1.976409673690796, "rewards/rejected": 1.5010299682617188, "step": 81800 }, { "epoch": 3.798226472909606, "grad_norm": 69.43089294433594, "learning_rate": 7.220669483262917e-08, "logits/chosen": -19.932987213134766, "logits/rejected": -18.503549575805664, "logps/chosen": -545.6474609375, "logps/rejected": -417.41705322265625, "loss": 0.3686, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.883698463439941, "rewards/margins": 1.9424673318862915, "rewards/rejected": 2.9412314891815186, "step": 81810 }, { "epoch": 3.798690747017039, "grad_norm": 45.762027740478516, "learning_rate": 7.21788383861832e-08, "logits/chosen": -20.075387954711914, "logits/rejected": -18.33304214477539, "logps/chosen": -507.69293212890625, "logps/rejected": -312.21209716796875, "loss": 0.5108, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3345112800598145, "rewards/margins": 2.403982639312744, "rewards/rejected": 1.9305286407470703, "step": 81820 }, { "epoch": 3.799155021124472, "grad_norm": 162.93260192871094, "learning_rate": 7.215098193973722e-08, "logits/chosen": -19.272769927978516, "logits/rejected": -18.324697494506836, "logps/chosen": -383.51470947265625, "logps/rejected": -336.9106140136719, "loss": 0.7805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.190993309020996, "rewards/margins": 0.985021710395813, "rewards/rejected": 3.2059714794158936, "step": 81830 }, { "epoch": 3.799619295231905, "grad_norm": 90.94599151611328, 
"learning_rate": 7.212312549329123e-08, "logits/chosen": -18.8597469329834, "logits/rejected": -18.312721252441406, "logps/chosen": -316.73016357421875, "logps/rejected": -281.4776306152344, "loss": 0.7553, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.675105571746826, "rewards/margins": 0.9841381311416626, "rewards/rejected": 1.6909677982330322, "step": 81840 }, { "epoch": 3.800083569339338, "grad_norm": 139.18704223632812, "learning_rate": 7.209526904684526e-08, "logits/chosen": -18.802236557006836, "logits/rejected": -18.35398292541504, "logps/chosen": -358.44024658203125, "logps/rejected": -335.3746337890625, "loss": 0.9104, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.595407009124756, "rewards/margins": 0.5485605001449585, "rewards/rejected": 2.046846389770508, "step": 81850 }, { "epoch": 3.800547843446771, "grad_norm": 65.92853546142578, "learning_rate": 7.206741260039927e-08, "logits/chosen": -20.347570419311523, "logits/rejected": -18.9876708984375, "logps/chosen": -438.26116943359375, "logps/rejected": -312.5609130859375, "loss": 0.6148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9665443897247314, "rewards/margins": 1.694239616394043, "rewards/rejected": 2.2723050117492676, "step": 81860 }, { "epoch": 3.8010121175542038, "grad_norm": 51.51165008544922, "learning_rate": 7.203955615395329e-08, "logits/chosen": -19.294795989990234, "logits/rejected": -18.515722274780273, "logps/chosen": -349.9157409667969, "logps/rejected": -298.19561767578125, "loss": 1.1242, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.568363666534424, "rewards/margins": -0.2254050225019455, "rewards/rejected": 2.7937684059143066, "step": 81870 }, { "epoch": 3.801476391661637, "grad_norm": 114.62417602539062, "learning_rate": 7.201169970750731e-08, "logits/chosen": -18.628902435302734, "logits/rejected": -18.419631958007812, "logps/chosen": -431.55389404296875, "logps/rejected": -338.031982421875, "loss": 0.3513, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6077141761779785, "rewards/margins": 1.3588502407073975, "rewards/rejected": 2.248863697052002, "step": 81880 }, { "epoch": 3.80194066576907, "grad_norm": 8.707931518554688, "learning_rate": 7.198384326106133e-08, "logits/chosen": -18.421018600463867, "logits/rejected": -17.51738929748535, "logps/chosen": -360.81683349609375, "logps/rejected": -276.6018371582031, "loss": 0.9869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4039719104766846, "rewards/margins": 1.7682307958602905, "rewards/rejected": 1.6357412338256836, "step": 81890 }, { "epoch": 3.802404939876503, "grad_norm": 61.056156158447266, "learning_rate": 7.195598681461534e-08, "logits/chosen": -20.448884963989258, "logits/rejected": -19.504531860351562, "logps/chosen": -474.30755615234375, "logps/rejected": -454.49578857421875, "loss": 0.4249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.549198627471924, "rewards/margins": 1.1643483638763428, "rewards/rejected": 3.3848495483398438, "step": 81900 }, { "epoch": 3.802869213983936, "grad_norm": 0.42397540807724, "learning_rate": 7.192813036816937e-08, "logits/chosen": -19.69815444946289, "logits/rejected": -18.641353607177734, "logps/chosen": -392.0938720703125, "logps/rejected": -320.1042785644531, "loss": 0.4746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0273499488830566, "rewards/margins": 1.4366868734359741, "rewards/rejected": 1.5906628370285034, "step": 81910 }, { "epoch": 3.8033334880913694, "grad_norm": 7.146401405334473, "learning_rate": 7.190027392172339e-08, "logits/chosen": -19.532352447509766, "logits/rejected": -18.98209571838379, "logps/chosen": -441.876708984375, "logps/rejected": -241.70053100585938, "loss": 0.485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.113354206085205, "rewards/margins": 2.089829444885254, "rewards/rejected": 1.023524522781372, "step": 81920 }, { "epoch": 3.803797762198802, "grad_norm": 
275.1770324707031, "learning_rate": 7.1875203119922e-08, "logits/chosen": -19.048118591308594, "logits/rejected": -18.9750919342041, "logps/chosen": -380.04998779296875, "logps/rejected": -378.6296691894531, "loss": 1.4789, "rewards/accuracies": 0.5, "rewards/chosen": 3.7463295459747314, "rewards/margins": -0.15475448966026306, "rewards/rejected": 3.9010844230651855, "step": 81930 }, { "epoch": 3.804262036306235, "grad_norm": 30.887882232666016, "learning_rate": 7.184734667347602e-08, "logits/chosen": -19.746150970458984, "logits/rejected": -17.578285217285156, "logps/chosen": -479.7765197753906, "logps/rejected": -294.3229675292969, "loss": 0.3286, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.340086936950684, "rewards/margins": 3.4759726524353027, "rewards/rejected": 1.8641144037246704, "step": 81940 }, { "epoch": 3.804726310413668, "grad_norm": 204.41400146484375, "learning_rate": 7.181949022703004e-08, "logits/chosen": -18.861297607421875, "logits/rejected": -17.940872192382812, "logps/chosen": -455.862060546875, "logps/rejected": -370.7510681152344, "loss": 0.3996, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.778565406799316, "rewards/margins": 2.806967258453369, "rewards/rejected": 1.9715986251831055, "step": 81950 }, { "epoch": 3.8051905845211014, "grad_norm": 2.063878297805786, "learning_rate": 7.179163378058405e-08, "logits/chosen": -19.337726593017578, "logits/rejected": -18.82372283935547, "logps/chosen": -422.194580078125, "logps/rejected": -375.1431884765625, "loss": 0.5263, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.4640398025512695, "rewards/margins": 1.2328274250030518, "rewards/rejected": 3.2312121391296387, "step": 81960 }, { "epoch": 3.805654858628534, "grad_norm": 2.2005326747894287, "learning_rate": 7.176377733413808e-08, "logits/chosen": -18.98495864868164, "logits/rejected": -17.834625244140625, "logps/chosen": -486.0481872558594, "logps/rejected": -223.33627319335938, "loss": 0.4561, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.003100395202637, "rewards/margins": 2.646143913269043, "rewards/rejected": 1.3569563627243042, "step": 81970 }, { "epoch": 3.8061191327359674, "grad_norm": 97.68190002441406, "learning_rate": 7.173592088769209e-08, "logits/chosen": -18.933124542236328, "logits/rejected": -18.278837203979492, "logps/chosen": -286.30517578125, "logps/rejected": -183.11782836914062, "loss": 0.2694, "rewards/accuracies": 1.0, "rewards/chosen": 2.3881611824035645, "rewards/margins": 1.900445580482483, "rewards/rejected": 0.48771578073501587, "step": 81980 }, { "epoch": 3.8065834068434006, "grad_norm": 176.7742156982422, "learning_rate": 7.170806444124611e-08, "logits/chosen": -19.14612579345703, "logits/rejected": -18.04421615600586, "logps/chosen": -358.6403503417969, "logps/rejected": -268.126220703125, "loss": 1.1007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.36279296875, "rewards/margins": 1.3015401363372803, "rewards/rejected": 2.061253070831299, "step": 81990 }, { "epoch": 3.8070476809508333, "grad_norm": 16.00275230407715, "learning_rate": 7.168020799480013e-08, "logits/chosen": -19.0761775970459, "logits/rejected": -18.7122802734375, "logps/chosen": -452.5655822753906, "logps/rejected": -317.9239196777344, "loss": 0.5075, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7523295879364014, "rewards/margins": 1.4084991216659546, "rewards/rejected": 2.3438308238983154, "step": 82000 }, { "epoch": 3.8075119550582666, "grad_norm": 32.957855224609375, "learning_rate": 7.165235154835415e-08, "logits/chosen": -18.133867263793945, "logits/rejected": -18.34723472595215, "logps/chosen": -381.8232421875, "logps/rejected": -337.53125, "loss": 1.3241, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1497788429260254, "rewards/margins": 0.3451521396636963, "rewards/rejected": 2.804626941680908, "step": 82010 }, { "epoch": 3.8079762291656993, "grad_norm": 0.10291578620672226, "learning_rate": 
7.162449510190816e-08, "logits/chosen": -18.57122230529785, "logits/rejected": -16.768360137939453, "logps/chosen": -412.3367614746094, "logps/rejected": -232.1903533935547, "loss": 0.4019, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2135708332061768, "rewards/margins": 2.7677721977233887, "rewards/rejected": 0.4457985460758209, "step": 82020 }, { "epoch": 3.8084405032731325, "grad_norm": 0.49877414107322693, "learning_rate": 7.159663865546219e-08, "logits/chosen": -21.70430564880371, "logits/rejected": -20.872982025146484, "logps/chosen": -326.2623291015625, "logps/rejected": -223.26986694335938, "loss": 0.7738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6495800018310547, "rewards/margins": 1.760349988937378, "rewards/rejected": 1.8892300128936768, "step": 82030 }, { "epoch": 3.8089047773805653, "grad_norm": 27.278362274169922, "learning_rate": 7.156878220901619e-08, "logits/chosen": -19.67150115966797, "logits/rejected": -19.425561904907227, "logps/chosen": -405.4195861816406, "logps/rejected": -356.4561462402344, "loss": 0.8454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6678237915039062, "rewards/margins": 1.2992000579833984, "rewards/rejected": 2.368623971939087, "step": 82040 }, { "epoch": 3.8093690514879985, "grad_norm": 12.479241371154785, "learning_rate": 7.154092576257022e-08, "logits/chosen": -20.947086334228516, "logits/rejected": -20.323152542114258, "logps/chosen": -444.3826599121094, "logps/rejected": -364.50567626953125, "loss": 0.6106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8947672843933105, "rewards/margins": 0.5430909395217896, "rewards/rejected": 3.3516762256622314, "step": 82050 }, { "epoch": 3.8098333255954318, "grad_norm": 0.12456444650888443, "learning_rate": 7.151306931612424e-08, "logits/chosen": -18.313880920410156, "logits/rejected": -18.450706481933594, "logps/chosen": -417.190673828125, "logps/rejected": -382.03582763671875, "loss": 0.5919, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7795214653015137, "rewards/margins": 1.7072359323501587, "rewards/rejected": 2.072286367416382, "step": 82060 }, { "epoch": 3.8102975997028645, "grad_norm": 97.3000717163086, "learning_rate": 7.148521286967825e-08, "logits/chosen": -18.949398040771484, "logits/rejected": -19.423782348632812, "logps/chosen": -334.0573425292969, "logps/rejected": -391.04022216796875, "loss": 1.2555, "rewards/accuracies": 0.5, "rewards/chosen": 3.0831658840179443, "rewards/margins": -0.2240900695323944, "rewards/rejected": 3.3072562217712402, "step": 82070 }, { "epoch": 3.8107618738102977, "grad_norm": 182.2950439453125, "learning_rate": 7.145735642323226e-08, "logits/chosen": -19.37521743774414, "logits/rejected": -18.74843978881836, "logps/chosen": -570.0477294921875, "logps/rejected": -482.843017578125, "loss": 0.5785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.380824565887451, "rewards/margins": 0.46605080366134644, "rewards/rejected": 3.914773941040039, "step": 82080 }, { "epoch": 3.8112261479177305, "grad_norm": 117.5119857788086, "learning_rate": 7.142949997678629e-08, "logits/chosen": -17.76833152770996, "logits/rejected": -18.63912010192871, "logps/chosen": -341.9315490722656, "logps/rejected": -383.87615966796875, "loss": 1.461, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.914381980895996, "rewards/margins": -0.5646637678146362, "rewards/rejected": 2.479045867919922, "step": 82090 }, { "epoch": 3.8116904220251637, "grad_norm": 3.1896450519561768, "learning_rate": 7.140164353034031e-08, "logits/chosen": -17.885181427001953, "logits/rejected": -17.223520278930664, "logps/chosen": -360.74053955078125, "logps/rejected": -298.69000244140625, "loss": 0.4051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0250351428985596, "rewards/margins": 1.1218042373657227, "rewards/rejected": 1.903230905532837, "step": 82100 }, { "epoch": 3.8121546961325965, "grad_norm": 
27.43022918701172, "learning_rate": 7.137378708389432e-08, "logits/chosen": -18.968624114990234, "logits/rejected": -18.623456954956055, "logps/chosen": -368.54754638671875, "logps/rejected": -331.7667541503906, "loss": 0.8911, "rewards/accuracies": 0.5, "rewards/chosen": 3.021996021270752, "rewards/margins": 0.4855195879936218, "rewards/rejected": 2.5364766120910645, "step": 82110 }, { "epoch": 3.8126189702400297, "grad_norm": 43.01313400268555, "learning_rate": 7.134593063744835e-08, "logits/chosen": -18.953380584716797, "logits/rejected": -18.899709701538086, "logps/chosen": -362.5790100097656, "logps/rejected": -352.0711669921875, "loss": 0.5807, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.657104730606079, "rewards/margins": 0.7963340282440186, "rewards/rejected": 1.86077082157135, "step": 82120 }, { "epoch": 3.813083244347463, "grad_norm": 53.499874114990234, "learning_rate": 7.131807419100236e-08, "logits/chosen": -18.983423233032227, "logits/rejected": -19.247270584106445, "logps/chosen": -382.52337646484375, "logps/rejected": -421.9571838378906, "loss": 1.0955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3570384979248047, "rewards/margins": -0.08641424030065536, "rewards/rejected": 3.443453311920166, "step": 82130 }, { "epoch": 3.8135475184548957, "grad_norm": 56.236236572265625, "learning_rate": 7.129021774455638e-08, "logits/chosen": -18.863012313842773, "logits/rejected": -18.330114364624023, "logps/chosen": -429.1302795410156, "logps/rejected": -375.1617126464844, "loss": 0.8461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.767744541168213, "rewards/margins": 0.8496776819229126, "rewards/rejected": 2.9180667400360107, "step": 82140 }, { "epoch": 3.814011792562329, "grad_norm": 213.4974822998047, "learning_rate": 7.12623612981104e-08, "logits/chosen": -20.183727264404297, "logits/rejected": -18.09307861328125, "logps/chosen": -443.9306640625, "logps/rejected": -333.58404541015625, "loss": 0.3027, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.600578308105469, "rewards/margins": 2.9766459465026855, "rewards/rejected": 1.623932123184204, "step": 82150 }, { "epoch": 3.8144760666697617, "grad_norm": 114.68730926513672, "learning_rate": 7.123450485166442e-08, "logits/chosen": -19.860286712646484, "logits/rejected": -19.315988540649414, "logps/chosen": -386.3365173339844, "logps/rejected": -259.04010009765625, "loss": 1.0571, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6245932579040527, "rewards/margins": 1.3655476570129395, "rewards/rejected": 2.259045124053955, "step": 82160 }, { "epoch": 3.814940340777195, "grad_norm": 177.20863342285156, "learning_rate": 7.120664840521844e-08, "logits/chosen": -18.292442321777344, "logits/rejected": -17.581920623779297, "logps/chosen": -509.7037048339844, "logps/rejected": -366.3202209472656, "loss": 0.5263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.009946346282959, "rewards/margins": 1.8736684322357178, "rewards/rejected": 2.136277914047241, "step": 82170 }, { "epoch": 3.8154046148846277, "grad_norm": 84.27967834472656, "learning_rate": 7.117879195877246e-08, "logits/chosen": -18.986143112182617, "logits/rejected": -19.08443832397461, "logps/chosen": -268.15325927734375, "logps/rejected": -320.68988037109375, "loss": 0.9162, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9438812732696533, "rewards/margins": 0.10877492278814316, "rewards/rejected": 1.8351062536239624, "step": 82180 }, { "epoch": 3.815868888992061, "grad_norm": 148.27099609375, "learning_rate": 7.115093551232648e-08, "logits/chosen": -19.11415672302246, "logits/rejected": -18.844432830810547, "logps/chosen": -372.9560852050781, "logps/rejected": -312.80609130859375, "loss": 0.8626, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.1424198150634766, "rewards/margins": 0.1825236827135086, "rewards/rejected": 2.9598960876464844, "step": 82190 }, { "epoch": 3.816333163099494, 
"grad_norm": 12.664292335510254, "learning_rate": 7.112307906588049e-08, "logits/chosen": -19.27114486694336, "logits/rejected": -17.671192169189453, "logps/chosen": -410.3798828125, "logps/rejected": -250.8873748779297, "loss": 0.2231, "rewards/accuracies": 1.0, "rewards/chosen": 3.5256800651550293, "rewards/margins": 2.0230255126953125, "rewards/rejected": 1.5026546716690063, "step": 82200 }, { "epoch": 3.816797437206927, "grad_norm": 1.3083951473236084, "learning_rate": 7.109522261943452e-08, "logits/chosen": -19.94371795654297, "logits/rejected": -18.09233856201172, "logps/chosen": -393.0466003417969, "logps/rejected": -272.7481994628906, "loss": 0.3196, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.40596866607666, "rewards/margins": 2.825697183609009, "rewards/rejected": 1.5802717208862305, "step": 82210 }, { "epoch": 3.81726171131436, "grad_norm": 79.11154174804688, "learning_rate": 7.106736617298853e-08, "logits/chosen": -18.427722930908203, "logits/rejected": -17.45330238342285, "logps/chosen": -453.0399475097656, "logps/rejected": -320.97991943359375, "loss": 0.464, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.439830303192139, "rewards/margins": 2.761138439178467, "rewards/rejected": 1.6786916255950928, "step": 82220 }, { "epoch": 3.817725985421793, "grad_norm": 12.292957305908203, "learning_rate": 7.103950972654255e-08, "logits/chosen": -18.441802978515625, "logits/rejected": -17.647932052612305, "logps/chosen": -430.9087829589844, "logps/rejected": -322.00836181640625, "loss": 1.0542, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.745699644088745, "rewards/margins": 1.7434009313583374, "rewards/rejected": 2.0022988319396973, "step": 82230 }, { "epoch": 3.818190259529226, "grad_norm": 9.573587417602539, "learning_rate": 7.101165328009656e-08, "logits/chosen": -19.564130783081055, "logits/rejected": -19.156902313232422, "logps/chosen": -460.884521484375, "logps/rejected": -389.51007080078125, "loss": 0.5132, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.046665668487549, "rewards/margins": 2.017944097518921, "rewards/rejected": 2.028721332550049, "step": 82240 }, { "epoch": 3.818654533636659, "grad_norm": 107.51573944091797, "learning_rate": 7.098379683365058e-08, "logits/chosen": -20.066564559936523, "logits/rejected": -19.492891311645508, "logps/chosen": -283.71136474609375, "logps/rejected": -284.30328369140625, "loss": 1.3061, "rewards/accuracies": 0.5, "rewards/chosen": 1.8368151187896729, "rewards/margins": -0.5393203496932983, "rewards/rejected": 2.3761353492736816, "step": 82250 }, { "epoch": 3.819118807744092, "grad_norm": 16.71050453186035, "learning_rate": 7.09559403872046e-08, "logits/chosen": -19.73574447631836, "logits/rejected": -17.555505752563477, "logps/chosen": -500.1593322753906, "logps/rejected": -333.6190490722656, "loss": 0.277, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.281133651733398, "rewards/margins": 2.2747206687927246, "rewards/rejected": 2.006412982940674, "step": 82260 }, { "epoch": 3.8195830818515253, "grad_norm": 0.4180022180080414, "learning_rate": 7.092808394075862e-08, "logits/chosen": -19.54971694946289, "logits/rejected": -17.358596801757812, "logps/chosen": -416.67645263671875, "logps/rejected": -246.68508911132812, "loss": 0.4879, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.266383171081543, "rewards/margins": 3.0710766315460205, "rewards/rejected": 1.1953065395355225, "step": 82270 }, { "epoch": 3.820047355958958, "grad_norm": 7.000195026397705, "learning_rate": 7.090022749431263e-08, "logits/chosen": -18.367061614990234, "logits/rejected": -18.03989601135254, "logps/chosen": -419.59661865234375, "logps/rejected": -415.8079528808594, "loss": 1.4473, "rewards/accuracies": 0.5, "rewards/chosen": 2.922262668609619, "rewards/margins": -0.3138119578361511, "rewards/rejected": 3.236074924468994, "step": 82280 }, { "epoch": 3.8205116300663913, "grad_norm": 14.284148216247559, 
"learning_rate": 7.087237104786665e-08, "logits/chosen": -19.211589813232422, "logits/rejected": -18.212648391723633, "logps/chosen": -291.7978515625, "logps/rejected": -251.59634399414062, "loss": 0.9367, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.046281576156616, "rewards/margins": 0.2675929367542267, "rewards/rejected": 1.7786884307861328, "step": 82290 }, { "epoch": 3.8209759041738245, "grad_norm": 14.194896697998047, "learning_rate": 7.084451460142068e-08, "logits/chosen": -19.109310150146484, "logits/rejected": -18.558500289916992, "logps/chosen": -397.4279479980469, "logps/rejected": -387.727294921875, "loss": 0.5772, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5929858684539795, "rewards/margins": 1.0046290159225464, "rewards/rejected": 2.5883564949035645, "step": 82300 }, { "epoch": 3.8214401782812573, "grad_norm": 126.37739562988281, "learning_rate": 7.081665815497469e-08, "logits/chosen": -20.063095092773438, "logits/rejected": -18.947429656982422, "logps/chosen": -404.60296630859375, "logps/rejected": -363.2660827636719, "loss": 0.3468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.165379524230957, "rewards/margins": 1.9381053447723389, "rewards/rejected": 3.227273941040039, "step": 82310 }, { "epoch": 3.82190445238869, "grad_norm": 12.133460998535156, "learning_rate": 7.07888017085287e-08, "logits/chosen": -17.955677032470703, "logits/rejected": -18.31940269470215, "logps/chosen": -349.3394775390625, "logps/rejected": -396.844482421875, "loss": 1.913, "rewards/accuracies": 0.5, "rewards/chosen": 2.4023518562316895, "rewards/margins": -0.7155725359916687, "rewards/rejected": 3.117924213409424, "step": 82320 }, { "epoch": 3.8223687264961232, "grad_norm": 11.02440357208252, "learning_rate": 7.076094526208273e-08, "logits/chosen": -19.000751495361328, "logits/rejected": -17.98678970336914, "logps/chosen": -344.60638427734375, "logps/rejected": -316.0876770019531, "loss": 0.8234, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.937171220779419, "rewards/margins": 0.7431475520133972, "rewards/rejected": 2.194023609161377, "step": 82330 }, { "epoch": 3.8228330006035565, "grad_norm": 51.88401794433594, "learning_rate": 7.073308881563675e-08, "logits/chosen": -18.981887817382812, "logits/rejected": -17.42548370361328, "logps/chosen": -429.4554748535156, "logps/rejected": -315.20794677734375, "loss": 0.2326, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.792298316955566, "rewards/margins": 2.76112699508667, "rewards/rejected": 2.0311713218688965, "step": 82340 }, { "epoch": 3.8232972747109892, "grad_norm": 1.2900806665420532, "learning_rate": 7.070523236919076e-08, "logits/chosen": -19.562158584594727, "logits/rejected": -18.99528694152832, "logps/chosen": -393.9217224121094, "logps/rejected": -377.8933410644531, "loss": 0.8283, "rewards/accuracies": 0.5, "rewards/chosen": 3.9203357696533203, "rewards/margins": 0.3563304543495178, "rewards/rejected": 3.5640056133270264, "step": 82350 }, { "epoch": 3.8237615488184225, "grad_norm": 110.6402587890625, "learning_rate": 7.067737592274479e-08, "logits/chosen": -18.65554428100586, "logits/rejected": -18.21358299255371, "logps/chosen": -436.2640075683594, "logps/rejected": -371.2799377441406, "loss": 0.7777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.002465009689331, "rewards/margins": 0.7205615639686584, "rewards/rejected": 2.2819037437438965, "step": 82360 }, { "epoch": 3.8242258229258557, "grad_norm": 260.3298034667969, "learning_rate": 7.06495194762988e-08, "logits/chosen": -18.837533950805664, "logits/rejected": -18.114465713500977, "logps/chosen": -474.663818359375, "logps/rejected": -384.0081481933594, "loss": 0.5196, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.446362495422363, "rewards/margins": 1.4708093404769897, "rewards/rejected": 2.975553274154663, "step": 82370 }, { "epoch": 3.8246900970332884, "grad_norm": 128.034912109375, "learning_rate": 
7.062166302985282e-08, "logits/chosen": -19.013219833374023, "logits/rejected": -19.14425277709961, "logps/chosen": -333.64703369140625, "logps/rejected": -285.40447998046875, "loss": 0.9153, "rewards/accuracies": 0.5, "rewards/chosen": 3.4288291931152344, "rewards/margins": 0.7404853105545044, "rewards/rejected": 2.6883437633514404, "step": 82380 }, { "epoch": 3.825154371140721, "grad_norm": 55.639102935791016, "learning_rate": 7.059380658340685e-08, "logits/chosen": -19.22445297241211, "logits/rejected": -18.660865783691406, "logps/chosen": -506.1905822753906, "logps/rejected": -377.66339111328125, "loss": 0.786, "rewards/accuracies": 0.5, "rewards/chosen": 4.120203018188477, "rewards/margins": 0.30864447355270386, "rewards/rejected": 3.8115592002868652, "step": 82390 }, { "epoch": 3.8256186452481544, "grad_norm": 41.517555236816406, "learning_rate": 7.056595013696086e-08, "logits/chosen": -18.584102630615234, "logits/rejected": -17.599870681762695, "logps/chosen": -483.8095703125, "logps/rejected": -351.42803955078125, "loss": 0.6491, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.231851577758789, "rewards/margins": 2.070077419281006, "rewards/rejected": 2.1617748737335205, "step": 82400 }, { "epoch": 3.8260829193555876, "grad_norm": 14.27036190032959, "learning_rate": 7.053809369051488e-08, "logits/chosen": -19.499797821044922, "logits/rejected": -19.660146713256836, "logps/chosen": -459.662353515625, "logps/rejected": -414.9339904785156, "loss": 1.3814, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.923334121704102, "rewards/margins": 0.7676913738250732, "rewards/rejected": 4.155642032623291, "step": 82410 }, { "epoch": 3.8265471934630204, "grad_norm": 146.96234130859375, "learning_rate": 7.05102372440689e-08, "logits/chosen": -19.14623260498047, "logits/rejected": -18.055021286010742, "logps/chosen": -410.00994873046875, "logps/rejected": -311.0628662109375, "loss": 0.7786, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 4.2835235595703125, "rewards/margins": 1.8967844247817993, "rewards/rejected": 2.386739492416382, "step": 82420 }, { "epoch": 3.8270114675704536, "grad_norm": 93.91610717773438, "learning_rate": 7.048238079762292e-08, "logits/chosen": -18.363201141357422, "logits/rejected": -17.962648391723633, "logps/chosen": -354.4610900878906, "logps/rejected": -326.6798095703125, "loss": 0.6727, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.710627317428589, "rewards/margins": 0.7296725511550903, "rewards/rejected": 1.980954885482788, "step": 82430 }, { "epoch": 3.827475741677887, "grad_norm": 0.3927411735057831, "learning_rate": 7.045452435117693e-08, "logits/chosen": -19.562156677246094, "logits/rejected": -18.408475875854492, "logps/chosen": -434.14111328125, "logps/rejected": -332.3562927246094, "loss": 0.3537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6580452919006348, "rewards/margins": 1.8588228225708008, "rewards/rejected": 1.7992223501205444, "step": 82440 }, { "epoch": 3.8279400157853196, "grad_norm": 27.359100341796875, "learning_rate": 7.042666790473095e-08, "logits/chosen": -19.75546646118164, "logits/rejected": -19.984468460083008, "logps/chosen": -356.64996337890625, "logps/rejected": -339.6038818359375, "loss": 0.5531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.576166868209839, "rewards/margins": 0.8352494239807129, "rewards/rejected": 1.7409178018569946, "step": 82450 }, { "epoch": 3.828404289892753, "grad_norm": 9.942727088928223, "learning_rate": 7.039881145828496e-08, "logits/chosen": -20.119691848754883, "logits/rejected": -19.131847381591797, "logps/chosen": -409.7999572753906, "logps/rejected": -383.2306213378906, "loss": 0.2974, "rewards/accuracies": 1.0, "rewards/chosen": 4.209656238555908, "rewards/margins": 1.4738091230392456, "rewards/rejected": 2.735846996307373, "step": 82460 }, { "epoch": 3.8288685640001856, "grad_norm": 12.329623222351074, "learning_rate": 
7.037095501183899e-08, "logits/chosen": -19.08394432067871, "logits/rejected": -18.482242584228516, "logps/chosen": -444.70745849609375, "logps/rejected": -400.25909423828125, "loss": 0.7601, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2663192749023438, "rewards/margins": 0.6896953582763672, "rewards/rejected": 2.5766239166259766, "step": 82470 }, { "epoch": 3.829332838107619, "grad_norm": 52.25526809692383, "learning_rate": 7.0343098565393e-08, "logits/chosen": -18.090774536132812, "logits/rejected": -17.765146255493164, "logps/chosen": -398.1746520996094, "logps/rejected": -345.3794860839844, "loss": 0.8805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8573246002197266, "rewards/margins": 0.25017452239990234, "rewards/rejected": 2.607150077819824, "step": 82480 }, { "epoch": 3.8297971122150516, "grad_norm": 217.29884338378906, "learning_rate": 7.031524211894702e-08, "logits/chosen": -17.173236846923828, "logits/rejected": -18.414264678955078, "logps/chosen": -306.713134765625, "logps/rejected": -397.88824462890625, "loss": 1.5962, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.799683928489685, "rewards/margins": -0.5501223802566528, "rewards/rejected": 2.349806547164917, "step": 82490 }, { "epoch": 3.830261386322485, "grad_norm": 48.69525146484375, "learning_rate": 7.028738567250103e-08, "logits/chosen": -18.97088623046875, "logits/rejected": -18.152002334594727, "logps/chosen": -348.8006286621094, "logps/rejected": -301.3504638671875, "loss": 0.5823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.61426043510437, "rewards/margins": 1.5467411279678345, "rewards/rejected": 2.067518949508667, "step": 82500 }, { "epoch": 3.830725660429918, "grad_norm": 1.1056164503097534, "learning_rate": 7.025952922605506e-08, "logits/chosen": -18.754961013793945, "logits/rejected": -18.00307846069336, "logps/chosen": -389.14825439453125, "logps/rejected": -291.0267333984375, "loss": 0.6225, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.620593547821045, "rewards/margins": 1.293046236038208, "rewards/rejected": 1.327547311782837, "step": 82510 }, { "epoch": 3.831189934537351, "grad_norm": 51.21971893310547, "learning_rate": 7.023167277960908e-08, "logits/chosen": -20.373157501220703, "logits/rejected": -19.12271499633789, "logps/chosen": -338.8224182128906, "logps/rejected": -229.65780639648438, "loss": 0.7301, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6657798290252686, "rewards/margins": 1.4338505268096924, "rewards/rejected": 1.2319295406341553, "step": 82520 }, { "epoch": 3.831654208644784, "grad_norm": 33.71019744873047, "learning_rate": 7.020381633316309e-08, "logits/chosen": -19.08970069885254, "logits/rejected": -18.793405532836914, "logps/chosen": -354.87164306640625, "logps/rejected": -322.63470458984375, "loss": 0.7005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.605485200881958, "rewards/margins": 0.5574624538421631, "rewards/rejected": 2.048022747039795, "step": 82530 }, { "epoch": 3.832118482752217, "grad_norm": 0.23749887943267822, "learning_rate": 7.017595988671712e-08, "logits/chosen": -20.71894645690918, "logits/rejected": -19.039356231689453, "logps/chosen": -349.72662353515625, "logps/rejected": -277.60638427734375, "loss": 0.1987, "rewards/accuracies": 1.0, "rewards/chosen": 4.330567836761475, "rewards/margins": 2.688969850540161, "rewards/rejected": 1.641598105430603, "step": 82540 }, { "epoch": 3.83258275685965, "grad_norm": 171.24151611328125, "learning_rate": 7.014810344027113e-08, "logits/chosen": -19.505146026611328, "logits/rejected": -19.413570404052734, "logps/chosen": -448.23114013671875, "logps/rejected": -466.84136962890625, "loss": 1.319, "rewards/accuracies": 0.5, "rewards/chosen": 3.486403226852417, "rewards/margins": -0.2791774868965149, "rewards/rejected": 3.7655811309814453, "step": 82550 }, { "epoch": 3.8330470309670828, "grad_norm": 59.93989562988281, "learning_rate": 
7.012024699382515e-08, "logits/chosen": -18.785152435302734, "logits/rejected": -17.646635055541992, "logps/chosen": -415.28631591796875, "logps/rejected": -321.59906005859375, "loss": 1.0389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.186659574508667, "rewards/margins": 1.4736508131027222, "rewards/rejected": 1.7130088806152344, "step": 82560 }, { "epoch": 3.833511305074516, "grad_norm": 160.68240356445312, "learning_rate": 7.009239054737918e-08, "logits/chosen": -19.065868377685547, "logits/rejected": -17.64626121520996, "logps/chosen": -398.6656799316406, "logps/rejected": -269.60638427734375, "loss": 0.4965, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.918419361114502, "rewards/margins": 1.766502022743225, "rewards/rejected": 1.1519174575805664, "step": 82570 }, { "epoch": 3.833975579181949, "grad_norm": 36.12257766723633, "learning_rate": 7.006453410093319e-08, "logits/chosen": -18.51624870300293, "logits/rejected": -18.661407470703125, "logps/chosen": -399.67218017578125, "logps/rejected": -271.6795959472656, "loss": 0.6503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.189495086669922, "rewards/margins": 1.6202056407928467, "rewards/rejected": 1.569289207458496, "step": 82580 }, { "epoch": 3.834439853289382, "grad_norm": 261.952392578125, "learning_rate": 7.00366776544872e-08, "logits/chosen": -19.288982391357422, "logits/rejected": -18.72334861755371, "logps/chosen": -447.3333435058594, "logps/rejected": -416.8650817871094, "loss": 0.7486, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.06430721282959, "rewards/margins": 0.8444395065307617, "rewards/rejected": 3.219867706298828, "step": 82590 }, { "epoch": 3.834904127396815, "grad_norm": 58.89418411254883, "learning_rate": 7.000882120804123e-08, "logits/chosen": -20.183061599731445, "logits/rejected": -18.31332015991211, "logps/chosen": -470.034423828125, "logps/rejected": -321.8089599609375, "loss": 0.1898, "rewards/accuracies": 1.0, 
"rewards/chosen": 4.243399620056152, "rewards/margins": 2.5922701358795166, "rewards/rejected": 1.6511293649673462, "step": 82600 }, { "epoch": 3.835368401504248, "grad_norm": 181.40138244628906, "learning_rate": 6.998096476159525e-08, "logits/chosen": -18.27383041381836, "logits/rejected": -17.943849563598633, "logps/chosen": -361.53985595703125, "logps/rejected": -291.445556640625, "loss": 0.8413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5411901473999023, "rewards/margins": 1.3178002834320068, "rewards/rejected": 1.2233898639678955, "step": 82610 }, { "epoch": 3.835832675611681, "grad_norm": 30.183530807495117, "learning_rate": 6.995310831514926e-08, "logits/chosen": -19.2929630279541, "logits/rejected": -18.32431983947754, "logps/chosen": -312.3221435546875, "logps/rejected": -247.35971069335938, "loss": 0.7231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0250134468078613, "rewards/margins": 1.0411348342895508, "rewards/rejected": 0.9838787317276001, "step": 82620 }, { "epoch": 3.836296949719114, "grad_norm": 0.6766113042831421, "learning_rate": 6.992525186870329e-08, "logits/chosen": -18.58698272705078, "logits/rejected": -17.33920669555664, "logps/chosen": -364.9551696777344, "logps/rejected": -261.0218811035156, "loss": 0.4045, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.597961902618408, "rewards/margins": 2.315060615539551, "rewards/rejected": 0.28290146589279175, "step": 82630 }, { "epoch": 3.836761223826547, "grad_norm": 99.9471664428711, "learning_rate": 6.989739542225729e-08, "logits/chosen": -19.274372100830078, "logits/rejected": -17.93207550048828, "logps/chosen": -362.78814697265625, "logps/rejected": -280.93231201171875, "loss": 0.4212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4357521533966064, "rewards/margins": 1.4559326171875, "rewards/rejected": 1.9798195362091064, "step": 82640 }, { "epoch": 3.8372254979339804, "grad_norm": 120.45852661132812, "learning_rate": 
6.986953897581132e-08, "logits/chosen": -19.471088409423828, "logits/rejected": -18.49178695678711, "logps/chosen": -397.043701171875, "logps/rejected": -309.7095642089844, "loss": 0.7326, "rewards/accuracies": 0.5, "rewards/chosen": 3.275519847869873, "rewards/margins": 0.6010184288024902, "rewards/rejected": 2.674501657485962, "step": 82650 }, { "epoch": 3.837689772041413, "grad_norm": 70.49151611328125, "learning_rate": 6.984168252936533e-08, "logits/chosen": -19.57760238647461, "logits/rejected": -19.160747528076172, "logps/chosen": -450.6854553222656, "logps/rejected": -380.4737854003906, "loss": 0.3058, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.461874961853027, "rewards/margins": 1.9291120767593384, "rewards/rejected": 2.5327634811401367, "step": 82660 }, { "epoch": 3.8381540461488464, "grad_norm": 42.82469177246094, "learning_rate": 6.981382608291935e-08, "logits/chosen": -18.088375091552734, "logits/rejected": -17.35291290283203, "logps/chosen": -290.5821228027344, "logps/rejected": -213.4430389404297, "loss": 0.5617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1144065856933594, "rewards/margins": 1.092315673828125, "rewards/rejected": 1.022091031074524, "step": 82670 }, { "epoch": 3.8386183202562796, "grad_norm": 4.600039005279541, "learning_rate": 6.978596963647337e-08, "logits/chosen": -18.99445915222168, "logits/rejected": -18.967159271240234, "logps/chosen": -340.31634521484375, "logps/rejected": -341.93115234375, "loss": 0.5864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6950762271881104, "rewards/margins": 0.7602640390396118, "rewards/rejected": 1.934812307357788, "step": 82680 }, { "epoch": 3.8390825943637124, "grad_norm": 10.847960472106934, "learning_rate": 6.9760898834672e-08, "logits/chosen": -20.044147491455078, "logits/rejected": -18.668994903564453, "logps/chosen": -582.8841552734375, "logps/rejected": -447.3695373535156, "loss": 0.7838, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.9466094970703125, "rewards/margins": 1.3439959287643433, "rewards/rejected": 2.6026134490966797, "step": 82690 }, { "epoch": 3.839546868471145, "grad_norm": 126.28250122070312, "learning_rate": 6.9733042388226e-08, "logits/chosen": -18.881481170654297, "logits/rejected": -17.604923248291016, "logps/chosen": -405.63677978515625, "logps/rejected": -300.080810546875, "loss": 0.3521, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.858733654022217, "rewards/margins": 2.0382885932922363, "rewards/rejected": 1.8204450607299805, "step": 82700 }, { "epoch": 3.8400111425785783, "grad_norm": 28.481517791748047, "learning_rate": 6.970518594178003e-08, "logits/chosen": -19.69443130493164, "logits/rejected": -19.084129333496094, "logps/chosen": -417.6935119628906, "logps/rejected": -367.85650634765625, "loss": 0.5812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.360226631164551, "rewards/margins": 1.4344505071640015, "rewards/rejected": 2.9257760047912598, "step": 82710 }, { "epoch": 3.8404754166860116, "grad_norm": 83.33032989501953, "learning_rate": 6.967732949533404e-08, "logits/chosen": -18.69788932800293, "logits/rejected": -19.314544677734375, "logps/chosen": -363.0544128417969, "logps/rejected": -378.24224853515625, "loss": 1.2493, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7553935050964355, "rewards/margins": 0.67647784948349, "rewards/rejected": 3.07891583442688, "step": 82720 }, { "epoch": 3.8409396907934443, "grad_norm": 77.97649383544922, "learning_rate": 6.964947304888805e-08, "logits/chosen": -19.328075408935547, "logits/rejected": -19.308700561523438, "logps/chosen": -303.0702819824219, "logps/rejected": -351.7948303222656, "loss": 1.1082, "rewards/accuracies": 0.5, "rewards/chosen": 3.134178400039673, "rewards/margins": 0.36484095454216003, "rewards/rejected": 2.7693374156951904, "step": 82730 }, { "epoch": 3.8414039649008775, "grad_norm": 63.315250396728516, "learning_rate": 
6.962161660244208e-08, "logits/chosen": -19.174121856689453, "logits/rejected": -17.912303924560547, "logps/chosen": -342.0198669433594, "logps/rejected": -189.30014038085938, "loss": 0.4031, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0451254844665527, "rewards/margins": 1.7479093074798584, "rewards/rejected": 1.2972162961959839, "step": 82740 }, { "epoch": 3.8418682390083108, "grad_norm": 26.64607048034668, "learning_rate": 6.95937601559961e-08, "logits/chosen": -18.978191375732422, "logits/rejected": -17.866117477416992, "logps/chosen": -397.35986328125, "logps/rejected": -266.6436462402344, "loss": 0.471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.906947374343872, "rewards/margins": 1.7060010433197021, "rewards/rejected": 2.20094633102417, "step": 82750 }, { "epoch": 3.8423325131157435, "grad_norm": 4.905397415161133, "learning_rate": 6.956590370955011e-08, "logits/chosen": -19.43796157836914, "logits/rejected": -18.449382781982422, "logps/chosen": -558.3621215820312, "logps/rejected": -383.33831787109375, "loss": 0.4662, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.314053058624268, "rewards/margins": 1.615983247756958, "rewards/rejected": 2.6980693340301514, "step": 82760 }, { "epoch": 3.8427967872231763, "grad_norm": 27.618797302246094, "learning_rate": 6.953804726310414e-08, "logits/chosen": -19.072900772094727, "logits/rejected": -18.684215545654297, "logps/chosen": -320.66864013671875, "logps/rejected": -266.5968322753906, "loss": 0.8344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.719487190246582, "rewards/margins": 1.6805400848388672, "rewards/rejected": 1.0389469861984253, "step": 82770 }, { "epoch": 3.8432610613306095, "grad_norm": 8.563541412353516, "learning_rate": 6.951019081665815e-08, "logits/chosen": -18.999961853027344, "logits/rejected": -17.743921279907227, "logps/chosen": -439.363037109375, "logps/rejected": -297.3880615234375, "loss": 0.4038, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.8781535625457764, "rewards/margins": 1.4934227466583252, "rewards/rejected": 2.3847310543060303, "step": 82780 }, { "epoch": 3.8437253354380427, "grad_norm": 172.22911071777344, "learning_rate": 6.948233437021217e-08, "logits/chosen": -18.09160614013672, "logits/rejected": -17.375656127929688, "logps/chosen": -330.7923889160156, "logps/rejected": -244.04806518554688, "loss": 1.2627, "rewards/accuracies": 0.5, "rewards/chosen": 2.1569571495056152, "rewards/margins": 0.5270928144454956, "rewards/rejected": 1.6298644542694092, "step": 82790 }, { "epoch": 3.8441896095454755, "grad_norm": 59.234046936035156, "learning_rate": 6.945447792376618e-08, "logits/chosen": -19.58481216430664, "logits/rejected": -18.752981185913086, "logps/chosen": -405.6495666503906, "logps/rejected": -354.1210021972656, "loss": 1.0664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3705010414123535, "rewards/margins": 0.6368796825408936, "rewards/rejected": 2.73362135887146, "step": 82800 }, { "epoch": 3.8446538836529087, "grad_norm": 183.70413208007812, "learning_rate": 6.942662147732021e-08, "logits/chosen": -19.192970275878906, "logits/rejected": -18.493913650512695, "logps/chosen": -435.239501953125, "logps/rejected": -346.7238464355469, "loss": 0.764, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.597315549850464, "rewards/margins": 0.7319122552871704, "rewards/rejected": 2.865403413772583, "step": 82810 }, { "epoch": 3.845118157760342, "grad_norm": 0.055261094123125076, "learning_rate": 6.939876503087422e-08, "logits/chosen": -19.85549545288086, "logits/rejected": -18.58570098876953, "logps/chosen": -492.2687072753906, "logps/rejected": -424.23199462890625, "loss": 0.8187, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.73067045211792, "rewards/margins": 1.927896499633789, "rewards/rejected": 3.802773952484131, "step": 82820 }, { "epoch": 3.8455824318677747, "grad_norm": 29.357254028320312, "learning_rate": 
6.937090858442824e-08, "logits/chosen": -18.965618133544922, "logits/rejected": -18.409013748168945, "logps/chosen": -438.53753662109375, "logps/rejected": -449.0547790527344, "loss": 0.4602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.403804302215576, "rewards/margins": 1.1177537441253662, "rewards/rejected": 2.286050319671631, "step": 82830 }, { "epoch": 3.846046705975208, "grad_norm": 25.35873031616211, "learning_rate": 6.934305213798227e-08, "logits/chosen": -18.60951805114746, "logits/rejected": -17.825782775878906, "logps/chosen": -363.08563232421875, "logps/rejected": -271.8993225097656, "loss": 0.8322, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.661522626876831, "rewards/margins": 0.837298572063446, "rewards/rejected": 2.824223756790161, "step": 82840 }, { "epoch": 3.8465109800826407, "grad_norm": 14.187904357910156, "learning_rate": 6.931519569153628e-08, "logits/chosen": -19.292203903198242, "logits/rejected": -18.286962509155273, "logps/chosen": -321.0242919921875, "logps/rejected": -262.7847900390625, "loss": 0.3794, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.519172191619873, "rewards/margins": 1.9482342004776, "rewards/rejected": 1.570937991142273, "step": 82850 }, { "epoch": 3.846975254190074, "grad_norm": 1.8256925344467163, "learning_rate": 6.92873392450903e-08, "logits/chosen": -18.463726043701172, "logits/rejected": -17.587589263916016, "logps/chosen": -448.1845703125, "logps/rejected": -385.9537658691406, "loss": 0.6366, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2924022674560547, "rewards/margins": 1.3354910612106323, "rewards/rejected": 1.9569108486175537, "step": 82860 }, { "epoch": 3.8474395282975067, "grad_norm": 65.77098083496094, "learning_rate": 6.925948279864432e-08, "logits/chosen": -19.225948333740234, "logits/rejected": -18.924192428588867, "logps/chosen": -422.0940856933594, "logps/rejected": -465.706298828125, "loss": 1.0508, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 4.491336822509766, "rewards/margins": 0.9266079068183899, "rewards/rejected": 3.5647284984588623, "step": 82870 }, { "epoch": 3.84790380240494, "grad_norm": 171.11227416992188, "learning_rate": 6.923162635219834e-08, "logits/chosen": -17.25986099243164, "logits/rejected": -17.673152923583984, "logps/chosen": -292.4560241699219, "logps/rejected": -310.5705871582031, "loss": 1.1444, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3457815647125244, "rewards/margins": 0.9661914706230164, "rewards/rejected": 1.3795900344848633, "step": 82880 }, { "epoch": 3.848368076512373, "grad_norm": 118.43526458740234, "learning_rate": 6.920376990575235e-08, "logits/chosen": -19.28803062438965, "logits/rejected": -19.56048583984375, "logps/chosen": -427.2181701660156, "logps/rejected": -474.03955078125, "loss": 1.1399, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.930091619491577, "rewards/margins": -0.2510322630405426, "rewards/rejected": 3.181123971939087, "step": 82890 }, { "epoch": 3.848832350619806, "grad_norm": 4.703688144683838, "learning_rate": 6.917591345930637e-08, "logits/chosen": -18.286067962646484, "logits/rejected": -17.8140811920166, "logps/chosen": -399.5721740722656, "logps/rejected": -325.29534912109375, "loss": 0.6503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0919342041015625, "rewards/margins": 1.7524017095565796, "rewards/rejected": 2.3395328521728516, "step": 82900 }, { "epoch": 3.849296624727239, "grad_norm": 234.3141326904297, "learning_rate": 6.914805701286038e-08, "logits/chosen": -18.496410369873047, "logits/rejected": -19.60198211669922, "logps/chosen": -355.64794921875, "logps/rejected": -433.5859375, "loss": 1.2146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5368285179138184, "rewards/margins": -0.06644539535045624, "rewards/rejected": 3.6032745838165283, "step": 82910 }, { "epoch": 3.849760898834672, "grad_norm": 41.6307373046875, "learning_rate": 
6.912020056641441e-08, "logits/chosen": -18.640472412109375, "logits/rejected": -17.23788833618164, "logps/chosen": -374.85980224609375, "logps/rejected": -248.33236694335938, "loss": 0.7779, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.943419933319092, "rewards/margins": 1.586075782775879, "rewards/rejected": 1.357344388961792, "step": 82920 }, { "epoch": 3.850225172942105, "grad_norm": 90.78601837158203, "learning_rate": 6.909234411996842e-08, "logits/chosen": -20.244897842407227, "logits/rejected": -19.002849578857422, "logps/chosen": -436.2967224121094, "logps/rejected": -425.3692932128906, "loss": 0.3219, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.437746047973633, "rewards/margins": 1.6705875396728516, "rewards/rejected": 2.7671587467193604, "step": 82930 }, { "epoch": 3.850689447049538, "grad_norm": 71.77496337890625, "learning_rate": 6.906448767352244e-08, "logits/chosen": -18.59955596923828, "logits/rejected": -17.459041595458984, "logps/chosen": -472.83905029296875, "logps/rejected": -317.0156555175781, "loss": 0.5976, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.256882667541504, "rewards/margins": 1.8873878717422485, "rewards/rejected": 2.369495153427124, "step": 82940 }, { "epoch": 3.851153721156971, "grad_norm": 3.1633410453796387, "learning_rate": 6.903663122707647e-08, "logits/chosen": -19.653263092041016, "logits/rejected": -17.661327362060547, "logps/chosen": -459.74530029296875, "logps/rejected": -349.59808349609375, "loss": 0.2386, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6410186290740967, "rewards/margins": 2.1482009887695312, "rewards/rejected": 1.4928176403045654, "step": 82950 }, { "epoch": 3.8516179952644043, "grad_norm": 136.2979278564453, "learning_rate": 6.900877478063048e-08, "logits/chosen": -18.838327407836914, "logits/rejected": -18.397502899169922, "logps/chosen": -333.9546813964844, "logps/rejected": -319.3922119140625, "loss": 1.0067, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.5194458961486816, "rewards/margins": 0.5613917112350464, "rewards/rejected": 1.9580539464950562, "step": 82960 }, { "epoch": 3.852082269371837, "grad_norm": 29.016597747802734, "learning_rate": 6.89809183341845e-08, "logits/chosen": -19.17471694946289, "logits/rejected": -18.35928726196289, "logps/chosen": -403.9818420410156, "logps/rejected": -314.37689208984375, "loss": 0.5156, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6770622730255127, "rewards/margins": 1.5678282976150513, "rewards/rejected": 2.109233856201172, "step": 82970 }, { "epoch": 3.8525465434792703, "grad_norm": 123.27198791503906, "learning_rate": 6.895306188773852e-08, "logits/chosen": -18.447376251220703, "logits/rejected": -17.885784149169922, "logps/chosen": -316.5374450683594, "logps/rejected": -177.89385986328125, "loss": 0.626, "rewards/accuracies": 0.5, "rewards/chosen": 1.6174399852752686, "rewards/margins": 1.4590730667114258, "rewards/rejected": 0.15836700797080994, "step": 82980 }, { "epoch": 3.853010817586703, "grad_norm": 89.5006103515625, "learning_rate": 6.892520544129254e-08, "logits/chosen": -19.59791374206543, "logits/rejected": -18.72304916381836, "logps/chosen": -311.4092102050781, "logps/rejected": -227.71817016601562, "loss": 0.6317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.723171591758728, "rewards/margins": 0.9131560325622559, "rewards/rejected": 0.8100155591964722, "step": 82990 }, { "epoch": 3.8534750916941363, "grad_norm": 250.66146850585938, "learning_rate": 6.889734899484655e-08, "logits/chosen": -18.26765251159668, "logits/rejected": -17.641908645629883, "logps/chosen": -366.1690673828125, "logps/rejected": -266.3424377441406, "loss": 0.9589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2989721298217773, "rewards/margins": 0.6712281703948975, "rewards/rejected": 1.6277440786361694, "step": 83000 }, { "epoch": 3.853939365801569, "grad_norm": 196.37777709960938, "learning_rate": 
6.886949254840057e-08, "logits/chosen": -19.771289825439453, "logits/rejected": -19.271581649780273, "logps/chosen": -348.75921630859375, "logps/rejected": -308.92523193359375, "loss": 0.7341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4447579383850098, "rewards/margins": 0.33289483189582825, "rewards/rejected": 2.111863136291504, "step": 83010 }, { "epoch": 3.8544036399090023, "grad_norm": 0.02850738726556301, "learning_rate": 6.88416361019546e-08, "logits/chosen": -19.204652786254883, "logits/rejected": -18.722007751464844, "logps/chosen": -392.6307678222656, "logps/rejected": -311.89923095703125, "loss": 0.5737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5311195850372314, "rewards/margins": 1.8470134735107422, "rewards/rejected": 1.684105634689331, "step": 83020 }, { "epoch": 3.8548679140164355, "grad_norm": 80.38414001464844, "learning_rate": 6.881377965550861e-08, "logits/chosen": -19.99285888671875, "logits/rejected": -18.06136703491211, "logps/chosen": -385.07366943359375, "logps/rejected": -280.7999572753906, "loss": 0.4768, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.124640464782715, "rewards/margins": 2.340038776397705, "rewards/rejected": 1.7846019268035889, "step": 83030 }, { "epoch": 3.8553321881238682, "grad_norm": 61.3717041015625, "learning_rate": 6.878592320906262e-08, "logits/chosen": -19.567731857299805, "logits/rejected": -18.223281860351562, "logps/chosen": -464.66943359375, "logps/rejected": -358.261474609375, "loss": 0.5552, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8773484230041504, "rewards/margins": 1.6139402389526367, "rewards/rejected": 2.2634081840515137, "step": 83040 }, { "epoch": 3.8557964622313015, "grad_norm": 1.6595113277435303, "learning_rate": 6.875806676261665e-08, "logits/chosen": -19.641324996948242, "logits/rejected": -19.433780670166016, "logps/chosen": -344.88140869140625, "logps/rejected": -362.05206298828125, "loss": 1.3153, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.8161704540252686, "rewards/margins": 1.0559720993041992, "rewards/rejected": 1.7601983547210693, "step": 83050 }, { "epoch": 3.8562607363387342, "grad_norm": 84.6695785522461, "learning_rate": 6.873021031617067e-08, "logits/chosen": -18.830448150634766, "logits/rejected": -18.16927719116211, "logps/chosen": -387.57745361328125, "logps/rejected": -301.15753173828125, "loss": 0.9308, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.621289014816284, "rewards/margins": 1.2594802379608154, "rewards/rejected": 2.3618083000183105, "step": 83060 }, { "epoch": 3.8567250104461674, "grad_norm": 101.1635513305664, "learning_rate": 6.870235386972468e-08, "logits/chosen": -19.236221313476562, "logits/rejected": -18.48905372619629, "logps/chosen": -513.9224853515625, "logps/rejected": -430.39837646484375, "loss": 0.8803, "rewards/accuracies": 0.5, "rewards/chosen": 3.3446216583251953, "rewards/margins": 0.9330185055732727, "rewards/rejected": 2.4116032123565674, "step": 83070 }, { "epoch": 3.8571892845536, "grad_norm": 263.0240478515625, "learning_rate": 6.867449742327871e-08, "logits/chosen": -18.25802230834961, "logits/rejected": -18.412761688232422, "logps/chosen": -358.5531311035156, "logps/rejected": -387.46868896484375, "loss": 1.5633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1108031272888184, "rewards/margins": -0.06379926204681396, "rewards/rejected": 3.174602508544922, "step": 83080 }, { "epoch": 3.8576535586610334, "grad_norm": 6.828197002410889, "learning_rate": 6.864664097683271e-08, "logits/chosen": -19.172748565673828, "logits/rejected": -18.34885025024414, "logps/chosen": -358.1543273925781, "logps/rejected": -256.857666015625, "loss": 0.6157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.775125503540039, "rewards/margins": 1.1137025356292725, "rewards/rejected": 1.6614229679107666, "step": 83090 }, { "epoch": 3.8581178327684666, "grad_norm": 196.58612060546875, "learning_rate": 
6.861878453038674e-08, "logits/chosen": -18.252887725830078, "logits/rejected": -18.195354461669922, "logps/chosen": -331.7142639160156, "logps/rejected": -278.091064453125, "loss": 0.5286, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.664066791534424, "rewards/margins": 1.4541585445404053, "rewards/rejected": 2.2099082469940186, "step": 83100 }, { "epoch": 3.8585821068758994, "grad_norm": 55.95177459716797, "learning_rate": 6.859092808394075e-08, "logits/chosen": -19.432979583740234, "logits/rejected": -17.872791290283203, "logps/chosen": -246.1913604736328, "logps/rejected": -202.08035278320312, "loss": 0.8061, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8440558910369873, "rewards/margins": 1.7989113330841064, "rewards/rejected": 1.0451446771621704, "step": 83110 }, { "epoch": 3.8590463809833326, "grad_norm": 50.4084587097168, "learning_rate": 6.856307163749477e-08, "logits/chosen": -18.53499412536621, "logits/rejected": -18.517898559570312, "logps/chosen": -407.13885498046875, "logps/rejected": -352.6810607910156, "loss": 1.2117, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.842076063156128, "rewards/margins": -0.22205662727355957, "rewards/rejected": 3.0641324520111084, "step": 83120 }, { "epoch": 3.859510655090766, "grad_norm": 2.1461048126220703, "learning_rate": 6.85352151910488e-08, "logits/chosen": -19.120418548583984, "logits/rejected": -18.151582717895508, "logps/chosen": -440.83544921875, "logps/rejected": -366.05767822265625, "loss": 0.2518, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.383194923400879, "rewards/margins": 2.489786148071289, "rewards/rejected": 1.8934085369110107, "step": 83130 }, { "epoch": 3.8599749291981986, "grad_norm": 17.32701301574707, "learning_rate": 6.850735874460281e-08, "logits/chosen": -19.06890106201172, "logits/rejected": -17.776729583740234, "logps/chosen": -407.7281494140625, "logps/rejected": -366.9337158203125, "loss": 0.9303, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.470457077026367, "rewards/margins": 0.9008628726005554, "rewards/rejected": 2.569594144821167, "step": 83140 }, { "epoch": 3.8604392033056314, "grad_norm": 114.55855560302734, "learning_rate": 6.847950229815682e-08, "logits/chosen": -18.888486862182617, "logits/rejected": -18.982603073120117, "logps/chosen": -331.9040222167969, "logps/rejected": -344.8931884765625, "loss": 1.0767, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.194826602935791, "rewards/margins": 0.4721211791038513, "rewards/rejected": 2.722705364227295, "step": 83150 }, { "epoch": 3.8609034774130646, "grad_norm": 53.59537124633789, "learning_rate": 6.845164585171085e-08, "logits/chosen": -19.660850524902344, "logits/rejected": -18.084125518798828, "logps/chosen": -396.1886901855469, "logps/rejected": -287.73577880859375, "loss": 0.4195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5613536834716797, "rewards/margins": 1.745391845703125, "rewards/rejected": 1.8159618377685547, "step": 83160 }, { "epoch": 3.861367751520498, "grad_norm": 38.3410758972168, "learning_rate": 6.842378940526487e-08, "logits/chosen": -19.456336975097656, "logits/rejected": -18.922527313232422, "logps/chosen": -475.719482421875, "logps/rejected": -371.7920837402344, "loss": 0.9258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.411975383758545, "rewards/margins": 0.39084991812705994, "rewards/rejected": 3.021125316619873, "step": 83170 }, { "epoch": 3.8618320256279306, "grad_norm": 55.99730682373047, "learning_rate": 6.839593295881888e-08, "logits/chosen": -19.020992279052734, "logits/rejected": -18.19707679748535, "logps/chosen": -492.150390625, "logps/rejected": -448.0868225097656, "loss": 0.8472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5787289142608643, "rewards/margins": 0.3314064145088196, "rewards/rejected": 3.2473225593566895, "step": 83180 }, { "epoch": 3.862296299735364, "grad_norm": 19.510705947875977, 
"learning_rate": 6.836807651237291e-08, "logits/chosen": -18.800832748413086, "logits/rejected": -18.340118408203125, "logps/chosen": -310.0818786621094, "logps/rejected": -281.7327575683594, "loss": 0.5175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3920111656188965, "rewards/margins": 1.4830700159072876, "rewards/rejected": 1.9089412689208984, "step": 83190 }, { "epoch": 3.862760573842797, "grad_norm": 11.054143905639648, "learning_rate": 6.834022006592692e-08, "logits/chosen": -18.456533432006836, "logits/rejected": -17.83530044555664, "logps/chosen": -387.06964111328125, "logps/rejected": -340.2308654785156, "loss": 0.715, "rewards/accuracies": 0.5, "rewards/chosen": 3.530679225921631, "rewards/margins": 0.8863745927810669, "rewards/rejected": 2.6443047523498535, "step": 83200 }, { "epoch": 3.86322484795023, "grad_norm": 50.56161117553711, "learning_rate": 6.831236361948094e-08, "logits/chosen": -19.959909439086914, "logits/rejected": -17.80406379699707, "logps/chosen": -417.79217529296875, "logps/rejected": -190.39755249023438, "loss": 0.2691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.568141222000122, "rewards/margins": 2.7467474937438965, "rewards/rejected": 0.8213933706283569, "step": 83210 }, { "epoch": 3.8636891220576626, "grad_norm": 226.88719177246094, "learning_rate": 6.828450717303495e-08, "logits/chosen": -18.336450576782227, "logits/rejected": -19.027433395385742, "logps/chosen": -368.4238586425781, "logps/rejected": -399.2201232910156, "loss": 1.7684, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.13714861869812, "rewards/margins": -0.9648746252059937, "rewards/rejected": 3.1020236015319824, "step": 83220 }, { "epoch": 3.864153396165096, "grad_norm": 63.55339431762695, "learning_rate": 6.825665072658898e-08, "logits/chosen": -19.285621643066406, "logits/rejected": -19.203350067138672, "logps/chosen": -344.1467590332031, "logps/rejected": -339.3214111328125, "loss": 0.7818, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 3.4617130756378174, "rewards/margins": 1.4357370138168335, "rewards/rejected": 2.0259759426116943, "step": 83230 }, { "epoch": 3.864617670272529, "grad_norm": 143.6300506591797, "learning_rate": 6.8228794280143e-08, "logits/chosen": -19.247966766357422, "logits/rejected": -18.56056022644043, "logps/chosen": -347.08953857421875, "logps/rejected": -305.794921875, "loss": 0.9994, "rewards/accuracies": 0.5, "rewards/chosen": 3.6581146717071533, "rewards/margins": 0.8514358401298523, "rewards/rejected": 2.806678295135498, "step": 83240 }, { "epoch": 3.8650819443799618, "grad_norm": 45.83924865722656, "learning_rate": 6.820093783369701e-08, "logits/chosen": -19.022676467895508, "logits/rejected": -18.090648651123047, "logps/chosen": -419.4461975097656, "logps/rejected": -319.0638732910156, "loss": 0.4017, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0784149169921875, "rewards/margins": 1.9688937664031982, "rewards/rejected": 2.1095213890075684, "step": 83250 }, { "epoch": 3.865546218487395, "grad_norm": 85.66883850097656, "learning_rate": 6.817308138725104e-08, "logits/chosen": -18.571121215820312, "logits/rejected": -17.962398529052734, "logps/chosen": -236.0616912841797, "logps/rejected": -291.78314208984375, "loss": 1.2816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.011561870574951, "rewards/margins": 0.8532019853591919, "rewards/rejected": 1.1583597660064697, "step": 83260 }, { "epoch": 3.866010492594828, "grad_norm": 9.949991226196289, "learning_rate": 6.814522494080505e-08, "logits/chosen": -19.006635665893555, "logits/rejected": -18.181995391845703, "logps/chosen": -408.3558349609375, "logps/rejected": -337.6304016113281, "loss": 0.6259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1662847995758057, "rewards/margins": 0.9319504499435425, "rewards/rejected": 2.2343344688415527, "step": 83270 }, { "epoch": 3.866474766702261, "grad_norm": 106.78714752197266, "learning_rate": 
6.811736849435907e-08, "logits/chosen": -19.33186149597168, "logits/rejected": -19.570011138916016, "logps/chosen": -397.7899475097656, "logps/rejected": -345.1612243652344, "loss": 1.3808, "rewards/accuracies": 0.5, "rewards/chosen": 1.5879828929901123, "rewards/margins": -0.1912427395582199, "rewards/rejected": 1.7792257070541382, "step": 83280 }, { "epoch": 3.866939040809694, "grad_norm": 147.1942596435547, "learning_rate": 6.808951204791308e-08, "logits/chosen": -18.878787994384766, "logits/rejected": -17.53584861755371, "logps/chosen": -324.82794189453125, "logps/rejected": -219.0811309814453, "loss": 0.5751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.585209608078003, "rewards/margins": 1.9658092260360718, "rewards/rejected": 1.6194006204605103, "step": 83290 }, { "epoch": 3.867403314917127, "grad_norm": 19.807514190673828, "learning_rate": 6.80616556014671e-08, "logits/chosen": -18.340192794799805, "logits/rejected": -18.097915649414062, "logps/chosen": -318.91314697265625, "logps/rejected": -314.39892578125, "loss": 0.5402, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.62188982963562, "rewards/margins": 0.7083841562271118, "rewards/rejected": 1.9135059118270874, "step": 83300 }, { "epoch": 3.86786758902456, "grad_norm": 0.008620452135801315, "learning_rate": 6.803379915502112e-08, "logits/chosen": -19.22941780090332, "logits/rejected": -17.849695205688477, "logps/chosen": -330.1048889160156, "logps/rejected": -232.49679565429688, "loss": 0.4687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.847327709197998, "rewards/margins": 2.4653942584991455, "rewards/rejected": 0.3819335997104645, "step": 83310 }, { "epoch": 3.868331863131993, "grad_norm": 120.57203674316406, "learning_rate": 6.800594270857514e-08, "logits/chosen": -19.73676872253418, "logits/rejected": -18.68575096130371, "logps/chosen": -430.28863525390625, "logps/rejected": -388.59234619140625, "loss": 0.89, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 3.5997416973114014, "rewards/margins": 0.6331290602684021, "rewards/rejected": 2.9666130542755127, "step": 83320 }, { "epoch": 3.868796137239426, "grad_norm": 8.629732131958008, "learning_rate": 6.797808626212915e-08, "logits/chosen": -19.649402618408203, "logits/rejected": -19.731426239013672, "logps/chosen": -440.77850341796875, "logps/rejected": -472.80938720703125, "loss": 1.167, "rewards/accuracies": 0.5, "rewards/chosen": 3.378777027130127, "rewards/margins": 0.022022509947419167, "rewards/rejected": 3.3567538261413574, "step": 83330 }, { "epoch": 3.8692604113468594, "grad_norm": 175.42587280273438, "learning_rate": 6.795022981568318e-08, "logits/chosen": -17.78219985961914, "logits/rejected": -17.81266975402832, "logps/chosen": -280.20538330078125, "logps/rejected": -323.5339660644531, "loss": 1.3528, "rewards/accuracies": 0.5, "rewards/chosen": 2.463284969329834, "rewards/margins": 0.5434213876724243, "rewards/rejected": 1.9198634624481201, "step": 83340 }, { "epoch": 3.869724685454292, "grad_norm": 37.40005111694336, "learning_rate": 6.79223733692372e-08, "logits/chosen": -19.75771713256836, "logits/rejected": -19.90283203125, "logps/chosen": -402.55902099609375, "logps/rejected": -386.9107360839844, "loss": 1.4902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.725864887237549, "rewards/margins": -0.4940146505832672, "rewards/rejected": 3.219879627227783, "step": 83350 }, { "epoch": 3.8701889595617254, "grad_norm": 0.2791077196598053, "learning_rate": 6.789451692279121e-08, "logits/chosen": -19.49423599243164, "logits/rejected": -18.422325134277344, "logps/chosen": -322.56048583984375, "logps/rejected": -285.11431884765625, "loss": 0.5623, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8205840587615967, "rewards/margins": 0.8297876119613647, "rewards/rejected": 1.990796685218811, "step": 83360 }, { "epoch": 3.870653233669158, "grad_norm": 110.9983139038086, "learning_rate": 6.786666047634524e-08, 
"logits/chosen": -19.115571975708008, "logits/rejected": -18.441987991333008, "logps/chosen": -380.99072265625, "logps/rejected": -309.12188720703125, "loss": 0.5766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.519827365875244, "rewards/margins": 1.0240764617919922, "rewards/rejected": 2.495750904083252, "step": 83370 }, { "epoch": 3.8711175077765914, "grad_norm": 12.309714317321777, "learning_rate": 6.783880402989925e-08, "logits/chosen": -19.21698570251465, "logits/rejected": -18.338459014892578, "logps/chosen": -496.4999084472656, "logps/rejected": -356.797119140625, "loss": 0.4325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.520037651062012, "rewards/margins": 1.920531988143921, "rewards/rejected": 2.59950590133667, "step": 83380 }, { "epoch": 3.871581781884024, "grad_norm": 222.15512084960938, "learning_rate": 6.781094758345327e-08, "logits/chosen": -18.655237197875977, "logits/rejected": -19.380077362060547, "logps/chosen": -343.17950439453125, "logps/rejected": -401.4005126953125, "loss": 1.6076, "rewards/accuracies": 0.5, "rewards/chosen": 3.9343605041503906, "rewards/margins": 0.357060968875885, "rewards/rejected": 3.5772995948791504, "step": 83390 }, { "epoch": 3.8720460559914573, "grad_norm": 0.030504778027534485, "learning_rate": 6.778309113700729e-08, "logits/chosen": -18.631269454956055, "logits/rejected": -17.4733829498291, "logps/chosen": -365.22894287109375, "logps/rejected": -247.6499786376953, "loss": 0.4037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8249242305755615, "rewards/margins": 2.3247742652893066, "rewards/rejected": 0.5001503229141235, "step": 83400 }, { "epoch": 3.8725103300988906, "grad_norm": 95.45449829101562, "learning_rate": 6.775523469056131e-08, "logits/chosen": -18.74335479736328, "logits/rejected": -18.897811889648438, "logps/chosen": -380.33868408203125, "logps/rejected": -357.55572509765625, "loss": 0.7569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
2.9459950923919678, "rewards/margins": 0.5401400327682495, "rewards/rejected": 2.4058549404144287, "step": 83410 }, { "epoch": 3.8729746042063233, "grad_norm": 50.392822265625, "learning_rate": 6.772737824411532e-08, "logits/chosen": -19.425683975219727, "logits/rejected": -18.443973541259766, "logps/chosen": -389.414306640625, "logps/rejected": -271.14764404296875, "loss": 0.4253, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7872047424316406, "rewards/margins": 1.4336035251617432, "rewards/rejected": 2.3536014556884766, "step": 83420 }, { "epoch": 3.8734388783137566, "grad_norm": 243.28712463378906, "learning_rate": 6.769952179766934e-08, "logits/chosen": -19.0905704498291, "logits/rejected": -18.562639236450195, "logps/chosen": -439.8740234375, "logps/rejected": -379.350341796875, "loss": 0.8706, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.915557861328125, "rewards/margins": 1.3967044353485107, "rewards/rejected": 2.518852949142456, "step": 83430 }, { "epoch": 3.8739031524211893, "grad_norm": 223.5845184326172, "learning_rate": 6.767166535122336e-08, "logits/chosen": -19.31787109375, "logits/rejected": -19.358003616333008, "logps/chosen": -372.1022033691406, "logps/rejected": -377.12628173828125, "loss": 1.0717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.474775791168213, "rewards/margins": 0.4546341001987457, "rewards/rejected": 3.0201416015625, "step": 83440 }, { "epoch": 3.8743674265286225, "grad_norm": 90.44889068603516, "learning_rate": 6.764380890477738e-08, "logits/chosen": -19.197608947753906, "logits/rejected": -18.632450103759766, "logps/chosen": -411.7037048339844, "logps/rejected": -347.38336181640625, "loss": 0.5596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.266803741455078, "rewards/margins": 0.7957821488380432, "rewards/rejected": 2.4710214138031006, "step": 83450 }, { "epoch": 3.8748317006360553, "grad_norm": 118.18647766113281, "learning_rate": 6.761595245833139e-08, 
"logits/chosen": -18.875484466552734, "logits/rejected": -19.2412109375, "logps/chosen": -370.0308532714844, "logps/rejected": -365.50018310546875, "loss": 0.5607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.749171018600464, "rewards/margins": 1.0572420358657837, "rewards/rejected": 2.6919291019439697, "step": 83460 }, { "epoch": 3.8752959747434885, "grad_norm": 84.22698211669922, "learning_rate": 6.758809601188542e-08, "logits/chosen": -18.678503036499023, "logits/rejected": -17.9500789642334, "logps/chosen": -385.3847351074219, "logps/rejected": -298.8756103515625, "loss": 0.8091, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9219303131103516, "rewards/margins": 1.3753143548965454, "rewards/rejected": 1.5466158390045166, "step": 83470 }, { "epoch": 3.8757602488509217, "grad_norm": 8.606375694274902, "learning_rate": 6.756023956543944e-08, "logits/chosen": -18.751516342163086, "logits/rejected": -17.633148193359375, "logps/chosen": -453.0447692871094, "logps/rejected": -313.76104736328125, "loss": 0.2104, "rewards/accuracies": 1.0, "rewards/chosen": 3.460070848464966, "rewards/margins": 2.384068489074707, "rewards/rejected": 1.076002597808838, "step": 83480 }, { "epoch": 3.8762245229583545, "grad_norm": 0.036536265164613724, "learning_rate": 6.753238311899345e-08, "logits/chosen": -18.898094177246094, "logits/rejected": -17.209598541259766, "logps/chosen": -404.90997314453125, "logps/rejected": -233.53530883789062, "loss": 0.1834, "rewards/accuracies": 1.0, "rewards/chosen": 4.209765911102295, "rewards/margins": 2.944997787475586, "rewards/rejected": 1.264768123626709, "step": 83490 }, { "epoch": 3.8766887970657877, "grad_norm": 11.431798934936523, "learning_rate": 6.750452667254746e-08, "logits/chosen": -18.863086700439453, "logits/rejected": -18.13851547241211, "logps/chosen": -376.02374267578125, "logps/rejected": -307.18670654296875, "loss": 0.7333, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9605560302734375, 
"rewards/margins": 1.0424325466156006, "rewards/rejected": 1.9181230068206787, "step": 83500 }, { "epoch": 3.877153071173221, "grad_norm": 9.531046867370605, "learning_rate": 6.747667022610148e-08, "logits/chosen": -19.352832794189453, "logits/rejected": -18.099140167236328, "logps/chosen": -430.5023498535156, "logps/rejected": -335.4076843261719, "loss": 0.221, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.700230121612549, "rewards/margins": 2.8315978050231934, "rewards/rejected": 1.8686320781707764, "step": 83510 }, { "epoch": 3.8776173452806537, "grad_norm": 0.7848460078239441, "learning_rate": 6.744881377965551e-08, "logits/chosen": -20.070594787597656, "logits/rejected": -19.459232330322266, "logps/chosen": -392.46221923828125, "logps/rejected": -303.4393005371094, "loss": 0.4663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.805157423019409, "rewards/margins": 1.755141019821167, "rewards/rejected": 2.0500168800354004, "step": 83520 }, { "epoch": 3.8780816193880865, "grad_norm": 202.2367401123047, "learning_rate": 6.742095733320952e-08, "logits/chosen": -18.305553436279297, "logits/rejected": -18.478160858154297, "logps/chosen": -355.6667785644531, "logps/rejected": -395.8020324707031, "loss": 1.7684, "rewards/accuracies": 0.5, "rewards/chosen": 2.4840810298919678, "rewards/margins": -0.5259817838668823, "rewards/rejected": 3.0100626945495605, "step": 83530 }, { "epoch": 3.8785458934955197, "grad_norm": 367.2061462402344, "learning_rate": 6.739310088676354e-08, "logits/chosen": -19.338516235351562, "logits/rejected": -19.319826126098633, "logps/chosen": -538.0533447265625, "logps/rejected": -568.7679443359375, "loss": 0.9826, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.684871196746826, "rewards/margins": 0.44485634565353394, "rewards/rejected": 3.2400150299072266, "step": 83540 }, { "epoch": 3.879010167602953, "grad_norm": 32.300743103027344, "learning_rate": 6.736524444031756e-08, "logits/chosen": 
-17.884593963623047, "logits/rejected": -17.636348724365234, "logps/chosen": -519.7507934570312, "logps/rejected": -418.63873291015625, "loss": 0.9375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.477963447570801, "rewards/margins": 1.0284910202026367, "rewards/rejected": 3.449472427368164, "step": 83550 }, { "epoch": 3.8794744417103857, "grad_norm": 0.08113771677017212, "learning_rate": 6.733738799387158e-08, "logits/chosen": -19.14596176147461, "logits/rejected": -18.652130126953125, "logps/chosen": -384.25811767578125, "logps/rejected": -300.8114929199219, "loss": 1.6871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1034605503082275, "rewards/margins": 0.4980948567390442, "rewards/rejected": 2.605365514755249, "step": 83560 }, { "epoch": 3.879938715817819, "grad_norm": 135.5502166748047, "learning_rate": 6.730953154742559e-08, "logits/chosen": -19.233577728271484, "logits/rejected": -18.82040786743164, "logps/chosen": -287.307861328125, "logps/rejected": -239.15396118164062, "loss": 0.8859, "rewards/accuracies": 0.5, "rewards/chosen": 2.6640706062316895, "rewards/margins": 0.6548472046852112, "rewards/rejected": 2.009223461151123, "step": 83570 }, { "epoch": 3.880402989925252, "grad_norm": 87.8908462524414, "learning_rate": 6.728167510097962e-08, "logits/chosen": -20.075082778930664, "logits/rejected": -18.699817657470703, "logps/chosen": -534.0682983398438, "logps/rejected": -343.61126708984375, "loss": 0.3358, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.50604772567749, "rewards/margins": 2.514086961746216, "rewards/rejected": 2.9919605255126953, "step": 83580 }, { "epoch": 3.880867264032685, "grad_norm": 50.033905029296875, "learning_rate": 6.725381865453364e-08, "logits/chosen": -19.540653228759766, "logits/rejected": -18.330856323242188, "logps/chosen": -438.91680908203125, "logps/rejected": -347.72442626953125, "loss": 0.3219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.845046043395996, 
"rewards/margins": 2.9626286029815674, "rewards/rejected": 1.882417917251587, "step": 83590 }, { "epoch": 3.8813315381401177, "grad_norm": 59.3780517578125, "learning_rate": 6.722596220808765e-08, "logits/chosen": -19.063129425048828, "logits/rejected": -18.447301864624023, "logps/chosen": -376.8631286621094, "logps/rejected": -365.3336486816406, "loss": 1.2663, "rewards/accuracies": 0.5, "rewards/chosen": 3.691359758377075, "rewards/margins": 0.48744553327560425, "rewards/rejected": 3.2039146423339844, "step": 83600 }, { "epoch": 3.881795812247551, "grad_norm": 86.80658721923828, "learning_rate": 6.719810576164168e-08, "logits/chosen": -20.706947326660156, "logits/rejected": -19.88002586364746, "logps/chosen": -422.4228515625, "logps/rejected": -380.72723388671875, "loss": 1.1782, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.847888708114624, "rewards/margins": -0.31568580865859985, "rewards/rejected": 3.163574695587158, "step": 83610 }, { "epoch": 3.882260086354984, "grad_norm": 143.13694763183594, "learning_rate": 6.717024931519569e-08, "logits/chosen": -18.129140853881836, "logits/rejected": -17.507211685180664, "logps/chosen": -390.84442138671875, "logps/rejected": -260.9144592285156, "loss": 0.5336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.017742395401001, "rewards/margins": 1.5719950199127197, "rewards/rejected": 1.4457472562789917, "step": 83620 }, { "epoch": 3.882724360462417, "grad_norm": 181.1947784423828, "learning_rate": 6.71423928687497e-08, "logits/chosen": -19.104843139648438, "logits/rejected": -18.79624366760254, "logps/chosen": -368.34521484375, "logps/rejected": -382.13568115234375, "loss": 1.3288, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.869596242904663, "rewards/margins": -0.0057347179390490055, "rewards/rejected": 2.875331163406372, "step": 83630 }, { "epoch": 3.88318863456985, "grad_norm": 230.90908813476562, "learning_rate": 6.711453642230372e-08, "logits/chosen": -19.366989135742188, 
"logits/rejected": -18.502178192138672, "logps/chosen": -373.6533203125, "logps/rejected": -320.8853454589844, "loss": 0.8879, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9421284198760986, "rewards/margins": 1.4863765239715576, "rewards/rejected": 2.455751895904541, "step": 83640 }, { "epoch": 3.8836529086772833, "grad_norm": 55.496585845947266, "learning_rate": 6.708667997585775e-08, "logits/chosen": -18.959779739379883, "logits/rejected": -17.604074478149414, "logps/chosen": -375.35870361328125, "logps/rejected": -250.6071319580078, "loss": 0.4269, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.998218536376953, "rewards/margins": 1.3745821714401245, "rewards/rejected": 1.6236364841461182, "step": 83650 }, { "epoch": 3.884117182784716, "grad_norm": 17.514209747314453, "learning_rate": 6.705882352941176e-08, "logits/chosen": -19.555511474609375, "logits/rejected": -18.35560417175293, "logps/chosen": -385.1622314453125, "logps/rejected": -291.96990966796875, "loss": 0.6146, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.031524658203125, "rewards/margins": 2.2452635765075684, "rewards/rejected": 1.7862606048583984, "step": 83660 }, { "epoch": 3.8845814568921493, "grad_norm": 161.6426544189453, "learning_rate": 6.703096708296578e-08, "logits/chosen": -19.48507308959961, "logits/rejected": -18.2769718170166, "logps/chosen": -440.2476501464844, "logps/rejected": -368.4419860839844, "loss": 0.8905, "rewards/accuracies": 0.5, "rewards/chosen": 3.632209300994873, "rewards/margins": 0.33475208282470703, "rewards/rejected": 3.297457218170166, "step": 83670 }, { "epoch": 3.885045730999582, "grad_norm": 70.81555938720703, "learning_rate": 6.70031106365198e-08, "logits/chosen": -19.61261749267578, "logits/rejected": -18.91722297668457, "logps/chosen": -401.62994384765625, "logps/rejected": -334.9603271484375, "loss": 0.5959, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.35958194732666, "rewards/margins": 
1.301945686340332, "rewards/rejected": 3.057636022567749, "step": 83680 }, { "epoch": 3.8855100051070153, "grad_norm": 118.01411437988281, "learning_rate": 6.697525419007381e-08, "logits/chosen": -19.278610229492188, "logits/rejected": -18.82252311706543, "logps/chosen": -397.9342346191406, "logps/rejected": -362.3501281738281, "loss": 0.7284, "rewards/accuracies": 0.5, "rewards/chosen": 3.861347198486328, "rewards/margins": 0.09944518655538559, "rewards/rejected": 3.761902332305908, "step": 83690 }, { "epoch": 3.885974279214448, "grad_norm": 35.443119049072266, "learning_rate": 6.694739774362783e-08, "logits/chosen": -19.286518096923828, "logits/rejected": -18.956689834594727, "logps/chosen": -452.86328125, "logps/rejected": -409.33721923828125, "loss": 0.8752, "rewards/accuracies": 0.5, "rewards/chosen": 2.547456979751587, "rewards/margins": 0.866499125957489, "rewards/rejected": 1.6809574365615845, "step": 83700 }, { "epoch": 3.8864385533218813, "grad_norm": 38.94902420043945, "learning_rate": 6.691954129718185e-08, "logits/chosen": -20.670825958251953, "logits/rejected": -18.67987823486328, "logps/chosen": -460.19683837890625, "logps/rejected": -313.4102783203125, "loss": 0.2627, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.027632713317871, "rewards/margins": 2.623582124710083, "rewards/rejected": 2.404050350189209, "step": 83710 }, { "epoch": 3.8869028274293145, "grad_norm": 57.60503387451172, "learning_rate": 6.689168485073586e-08, "logits/chosen": -18.671924591064453, "logits/rejected": -18.96628761291504, "logps/chosen": -355.96099853515625, "logps/rejected": -328.2414245605469, "loss": 0.7059, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6753413677215576, "rewards/margins": 0.6884049773216248, "rewards/rejected": 2.986936569213867, "step": 83720 }, { "epoch": 3.8873671015367472, "grad_norm": 29.555095672607422, "learning_rate": 6.686382840428989e-08, "logits/chosen": -19.39398956298828, "logits/rejected": 
-19.955324172973633, "logps/chosen": -456.59307861328125, "logps/rejected": -429.55657958984375, "loss": 0.8738, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.9827232360839844, "rewards/margins": 0.15887174010276794, "rewards/rejected": 3.8238518238067627, "step": 83730 }, { "epoch": 3.8878313756441805, "grad_norm": 102.93517303466797, "learning_rate": 6.68359719578439e-08, "logits/chosen": -17.996456146240234, "logits/rejected": -17.723892211914062, "logps/chosen": -312.26068115234375, "logps/rejected": -330.4823303222656, "loss": 1.3638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.25100040435791, "rewards/margins": -0.3817533254623413, "rewards/rejected": 2.632753849029541, "step": 83740 }, { "epoch": 3.8882956497516132, "grad_norm": 64.87890625, "learning_rate": 6.680811551139792e-08, "logits/chosen": -18.101516723632812, "logits/rejected": -17.9215030670166, "logps/chosen": -293.26800537109375, "logps/rejected": -292.19830322265625, "loss": 1.1386, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3589253425598145, "rewards/margins": 0.7002813816070557, "rewards/rejected": 2.658644199371338, "step": 83750 }, { "epoch": 3.8887599238590465, "grad_norm": 43.777305603027344, "learning_rate": 6.678025906495195e-08, "logits/chosen": -18.948162078857422, "logits/rejected": -18.410316467285156, "logps/chosen": -347.29620361328125, "logps/rejected": -295.3915100097656, "loss": 0.4982, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.048380136489868, "rewards/margins": 0.8204485177993774, "rewards/rejected": 2.227931499481201, "step": 83760 }, { "epoch": 3.8892241979664792, "grad_norm": 22.567686080932617, "learning_rate": 6.675240261850596e-08, "logits/chosen": -18.812637329101562, "logits/rejected": -18.96322250366211, "logps/chosen": -542.2919921875, "logps/rejected": -463.91070556640625, "loss": 0.9861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.921980381011963, "rewards/margins": 
0.1715896874666214, "rewards/rejected": 4.75039005279541, "step": 83770 }, { "epoch": 3.8896884720739124, "grad_norm": 3.388545274734497, "learning_rate": 6.672454617205998e-08, "logits/chosen": -19.254451751708984, "logits/rejected": -17.406404495239258, "logps/chosen": -464.7227478027344, "logps/rejected": -310.12860107421875, "loss": 0.3024, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.657561302185059, "rewards/margins": 2.887265682220459, "rewards/rejected": 1.77029550075531, "step": 83780 }, { "epoch": 3.8901527461813457, "grad_norm": 45.59442901611328, "learning_rate": 6.6696689725614e-08, "logits/chosen": -18.765382766723633, "logits/rejected": -18.411911010742188, "logps/chosen": -438.38958740234375, "logps/rejected": -360.92120361328125, "loss": 0.4237, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.017319202423096, "rewards/margins": 1.102107048034668, "rewards/rejected": 2.9152116775512695, "step": 83790 }, { "epoch": 3.8906170202887784, "grad_norm": 0.26205602288246155, "learning_rate": 6.666883327916802e-08, "logits/chosen": -19.21249771118164, "logits/rejected": -18.774507522583008, "logps/chosen": -326.693115234375, "logps/rejected": -292.2266540527344, "loss": 1.0082, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7872474193573, "rewards/margins": 0.08567655086517334, "rewards/rejected": 2.701570510864258, "step": 83800 }, { "epoch": 3.8910812943962116, "grad_norm": 66.35730743408203, "learning_rate": 6.664097683272203e-08, "logits/chosen": -17.900402069091797, "logits/rejected": -17.41292381286621, "logps/chosen": -331.627685546875, "logps/rejected": -307.9882507324219, "loss": 0.5322, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5542593002319336, "rewards/margins": 1.669991135597229, "rewards/rejected": 0.8842681646347046, "step": 83810 }, { "epoch": 3.8915455685036444, "grad_norm": 8.360135078430176, "learning_rate": 6.661312038627606e-08, "logits/chosen": -20.34487533569336, 
"logits/rejected": -19.34736442565918, "logps/chosen": -412.1064453125, "logps/rejected": -322.259033203125, "loss": 0.3969, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.388604640960693, "rewards/margins": 1.9852300882339478, "rewards/rejected": 2.403374671936035, "step": 83820 }, { "epoch": 3.8920098426110776, "grad_norm": 43.138450622558594, "learning_rate": 6.658526393983008e-08, "logits/chosen": -19.571985244750977, "logits/rejected": -19.51734161376953, "logps/chosen": -452.79595947265625, "logps/rejected": -324.3371887207031, "loss": 0.3427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.232007026672363, "rewards/margins": 1.9621989727020264, "rewards/rejected": 2.269808292388916, "step": 83830 }, { "epoch": 3.8924741167185104, "grad_norm": 25.627012252807617, "learning_rate": 6.655740749338409e-08, "logits/chosen": -19.461528778076172, "logits/rejected": -18.53296661376953, "logps/chosen": -442.794677734375, "logps/rejected": -347.9832763671875, "loss": 0.309, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.280649185180664, "rewards/margins": 1.3219538927078247, "rewards/rejected": 2.958695888519287, "step": 83840 }, { "epoch": 3.8929383908259436, "grad_norm": 16.659841537475586, "learning_rate": 6.65295510469381e-08, "logits/chosen": -19.598251342773438, "logits/rejected": -18.867841720581055, "logps/chosen": -358.269775390625, "logps/rejected": -288.03765869140625, "loss": 0.4242, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.90521240234375, "rewards/margins": 1.0524237155914307, "rewards/rejected": 1.8527886867523193, "step": 83850 }, { "epoch": 3.893402664933377, "grad_norm": 186.49234008789062, "learning_rate": 6.650169460049213e-08, "logits/chosen": -19.539819717407227, "logits/rejected": -19.714603424072266, "logps/chosen": -386.81195068359375, "logps/rejected": -369.13177490234375, "loss": 1.0983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.143467903137207, 
"rewards/margins": 0.006247973535209894, "rewards/rejected": 4.137219429016113, "step": 83860 }, { "epoch": 3.8938669390408096, "grad_norm": 2.931213140487671, "learning_rate": 6.647383815404615e-08, "logits/chosen": -18.31120491027832, "logits/rejected": -17.965970993041992, "logps/chosen": -299.84454345703125, "logps/rejected": -302.58062744140625, "loss": 1.0506, "rewards/accuracies": 0.5, "rewards/chosen": 2.35599946975708, "rewards/margins": 0.6141807436943054, "rewards/rejected": 1.7418190240859985, "step": 83870 }, { "epoch": 3.894331213148243, "grad_norm": 102.87145233154297, "learning_rate": 6.644598170760016e-08, "logits/chosen": -18.21095085144043, "logits/rejected": -18.17913246154785, "logps/chosen": -308.6466369628906, "logps/rejected": -302.01336669921875, "loss": 0.6748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.811166286468506, "rewards/margins": 1.159292459487915, "rewards/rejected": 1.6518739461898804, "step": 83880 }, { "epoch": 3.8947954872556756, "grad_norm": 93.50906372070312, "learning_rate": 6.641812526115418e-08, "logits/chosen": -18.185260772705078, "logits/rejected": -17.152713775634766, "logps/chosen": -476.9034118652344, "logps/rejected": -348.81170654296875, "loss": 0.39, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.096654415130615, "rewards/margins": 1.4287301301956177, "rewards/rejected": 2.667924404144287, "step": 83890 }, { "epoch": 3.895259761363109, "grad_norm": 675.4229125976562, "learning_rate": 6.639026881470819e-08, "logits/chosen": -19.509061813354492, "logits/rejected": -19.1091365814209, "logps/chosen": -398.5319519042969, "logps/rejected": -281.4526672363281, "loss": 0.8273, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.834761381149292, "rewards/margins": 0.9041540026664734, "rewards/rejected": 1.9306071996688843, "step": 83900 }, { "epoch": 3.8957240354705416, "grad_norm": 202.4513397216797, "learning_rate": 6.636241236826222e-08, "logits/chosen": -18.4334659576416, 
"logits/rejected": -18.8012638092041, "logps/chosen": -418.56658935546875, "logps/rejected": -436.21856689453125, "loss": 1.9359, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.601515769958496, "rewards/margins": -1.0648338794708252, "rewards/rejected": 3.6663501262664795, "step": 83910 }, { "epoch": 3.896188309577975, "grad_norm": 83.30846405029297, "learning_rate": 6.633455592181623e-08, "logits/chosen": -19.46686363220215, "logits/rejected": -18.412370681762695, "logps/chosen": -404.0231018066406, "logps/rejected": -326.0473327636719, "loss": 0.3946, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.072964191436768, "rewards/margins": 2.219954013824463, "rewards/rejected": 1.8530105352401733, "step": 83920 }, { "epoch": 3.896652583685408, "grad_norm": 1.427042007446289, "learning_rate": 6.630669947537025e-08, "logits/chosen": -18.968399047851562, "logits/rejected": -18.182849884033203, "logps/chosen": -442.4674377441406, "logps/rejected": -381.49810791015625, "loss": 0.5842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.780489683151245, "rewards/margins": 0.8985570669174194, "rewards/rejected": 2.8819329738616943, "step": 83930 }, { "epoch": 3.897116857792841, "grad_norm": 172.25694274902344, "learning_rate": 6.627884302892428e-08, "logits/chosen": -20.196258544921875, "logits/rejected": -19.59206771850586, "logps/chosen": -386.36871337890625, "logps/rejected": -362.2970886230469, "loss": 1.2386, "rewards/accuracies": 0.5, "rewards/chosen": 3.9272141456604004, "rewards/margins": -0.12157297134399414, "rewards/rejected": 4.0487871170043945, "step": 83940 }, { "epoch": 3.897581131900274, "grad_norm": 31.24770164489746, "learning_rate": 6.625098658247829e-08, "logits/chosen": -19.852584838867188, "logits/rejected": -19.722253799438477, "logps/chosen": -340.3541564941406, "logps/rejected": -302.13616943359375, "loss": 0.9139, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5509839057922363, "rewards/margins": 
0.6104103326797485, "rewards/rejected": 2.94057297706604, "step": 83950 }, { "epoch": 3.898045406007707, "grad_norm": 16.72226905822754, "learning_rate": 6.62231301360323e-08, "logits/chosen": -18.894479751586914, "logits/rejected": -17.69122886657715, "logps/chosen": -377.20599365234375, "logps/rejected": -246.01644897460938, "loss": 0.7131, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1057779788970947, "rewards/margins": 1.6330331563949585, "rewards/rejected": 0.4727448523044586, "step": 83960 }, { "epoch": 3.89850968011514, "grad_norm": 189.27603149414062, "learning_rate": 6.619527368958633e-08, "logits/chosen": -19.203609466552734, "logits/rejected": -18.483333587646484, "logps/chosen": -389.6803894042969, "logps/rejected": -384.879150390625, "loss": 0.6022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.460883378982544, "rewards/margins": 0.7309004068374634, "rewards/rejected": 2.729982852935791, "step": 83970 }, { "epoch": 3.8989739542225728, "grad_norm": 3.6534087657928467, "learning_rate": 6.616741724314035e-08, "logits/chosen": -18.716110229492188, "logits/rejected": -18.19845199584961, "logps/chosen": -430.76910400390625, "logps/rejected": -354.7948303222656, "loss": 0.6457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.820828676223755, "rewards/margins": 1.1309285163879395, "rewards/rejected": 2.6899003982543945, "step": 83980 }, { "epoch": 3.899438228330006, "grad_norm": 23.949203491210938, "learning_rate": 6.613956079669436e-08, "logits/chosen": -19.228256225585938, "logits/rejected": -19.5943603515625, "logps/chosen": -366.9690246582031, "logps/rejected": -405.76385498046875, "loss": 1.2405, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.200202226638794, "rewards/margins": -0.3584023416042328, "rewards/rejected": 2.5586044788360596, "step": 83990 }, { "epoch": 3.899902502437439, "grad_norm": 2.105034351348877, "learning_rate": 6.611170435024839e-08, "logits/chosen": -18.87721061706543, 
"logits/rejected": -17.33332633972168, "logps/chosen": -404.2878723144531, "logps/rejected": -260.4331359863281, "loss": 0.3677, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.82563853263855, "rewards/margins": 2.454711675643921, "rewards/rejected": 1.370926856994629, "step": 84000 }, { "epoch": 3.900366776544872, "grad_norm": 32.70994186401367, "learning_rate": 6.60838479038024e-08, "logits/chosen": -19.314085006713867, "logits/rejected": -17.858259201049805, "logps/chosen": -333.3265075683594, "logps/rejected": -238.7610626220703, "loss": 0.8414, "rewards/accuracies": 0.5, "rewards/chosen": 2.5837206840515137, "rewards/margins": 0.48961418867111206, "rewards/rejected": 2.094106674194336, "step": 84010 }, { "epoch": 3.900831050652305, "grad_norm": 38.89448547363281, "learning_rate": 6.605599145735642e-08, "logits/chosen": -19.01270866394043, "logits/rejected": -19.0332088470459, "logps/chosen": -383.7535095214844, "logps/rejected": -415.63134765625, "loss": 1.1083, "rewards/accuracies": 0.5, "rewards/chosen": 3.246840238571167, "rewards/margins": -0.19981738924980164, "rewards/rejected": 3.446657180786133, "step": 84020 }, { "epoch": 3.9012953247597384, "grad_norm": 200.95285034179688, "learning_rate": 6.602813501091045e-08, "logits/chosen": -20.138309478759766, "logits/rejected": -19.127391815185547, "logps/chosen": -358.38360595703125, "logps/rejected": -294.15277099609375, "loss": 0.7257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8739898204803467, "rewards/margins": 1.449836015701294, "rewards/rejected": 2.4241538047790527, "step": 84030 }, { "epoch": 3.901759598867171, "grad_norm": 1.4369990825653076, "learning_rate": 6.600027856446446e-08, "logits/chosen": -19.293495178222656, "logits/rejected": -18.612621307373047, "logps/chosen": -435.79541015625, "logps/rejected": -329.82672119140625, "loss": 1.032, "rewards/accuracies": 0.5, "rewards/chosen": 3.5348458290100098, "rewards/margins": 1.5149577856063843, "rewards/rejected": 
2.019888162612915, "step": 84040 }, { "epoch": 3.902223872974604, "grad_norm": 15.192971229553223, "learning_rate": 6.597242211801848e-08, "logits/chosen": -18.726430892944336, "logits/rejected": -18.150705337524414, "logps/chosen": -399.1216735839844, "logps/rejected": -336.018798828125, "loss": 0.5331, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.365253448486328, "rewards/margins": 1.0073895454406738, "rewards/rejected": 2.3578639030456543, "step": 84050 }, { "epoch": 3.902688147082037, "grad_norm": 113.9559326171875, "learning_rate": 6.594456567157249e-08, "logits/chosen": -19.216501235961914, "logits/rejected": -18.261547088623047, "logps/chosen": -408.2545166015625, "logps/rejected": -294.4457702636719, "loss": 0.7, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7586123943328857, "rewards/margins": 1.2541491985321045, "rewards/rejected": 2.5044636726379395, "step": 84060 }, { "epoch": 3.9031524211894704, "grad_norm": 54.627235412597656, "learning_rate": 6.591670922512652e-08, "logits/chosen": -18.193756103515625, "logits/rejected": -17.532390594482422, "logps/chosen": -324.4866638183594, "logps/rejected": -255.45571899414062, "loss": 1.1793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.267904281616211, "rewards/margins": 0.8298803567886353, "rewards/rejected": 2.438023805618286, "step": 84070 }, { "epoch": 3.903616695296903, "grad_norm": 232.1663360595703, "learning_rate": 6.588885277868052e-08, "logits/chosen": -18.791423797607422, "logits/rejected": -18.105928421020508, "logps/chosen": -513.9459838867188, "logps/rejected": -430.1944274902344, "loss": 0.7362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.076842308044434, "rewards/margins": 1.3489398956298828, "rewards/rejected": 2.72790265083313, "step": 84080 }, { "epoch": 3.9040809694043364, "grad_norm": 14.189334869384766, "learning_rate": 6.586099633223455e-08, "logits/chosen": -19.304302215576172, "logits/rejected": -18.01993179321289, 
"logps/chosen": -313.69775390625, "logps/rejected": -234.0298614501953, "loss": 0.8233, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5542664527893066, "rewards/margins": 0.941268801689148, "rewards/rejected": 1.6129977703094482, "step": 84090 }, { "epoch": 3.9045452435117696, "grad_norm": 171.24630737304688, "learning_rate": 6.583313988578856e-08, "logits/chosen": -19.24958038330078, "logits/rejected": -18.293447494506836, "logps/chosen": -360.4772033691406, "logps/rejected": -281.0696105957031, "loss": 1.0241, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6328399181365967, "rewards/margins": 1.4913768768310547, "rewards/rejected": 2.141463041305542, "step": 84100 }, { "epoch": 3.9050095176192023, "grad_norm": 106.19756317138672, "learning_rate": 6.580528343934258e-08, "logits/chosen": -20.349637985229492, "logits/rejected": -19.155595779418945, "logps/chosen": -416.5174255371094, "logps/rejected": -303.4716796875, "loss": 0.6116, "rewards/accuracies": 0.5, "rewards/chosen": 4.228062152862549, "rewards/margins": 0.8722609281539917, "rewards/rejected": 3.3558011054992676, "step": 84110 }, { "epoch": 3.9054737917266356, "grad_norm": 72.40499877929688, "learning_rate": 6.57774269928966e-08, "logits/chosen": -18.725208282470703, "logits/rejected": -17.424354553222656, "logps/chosen": -343.95697021484375, "logps/rejected": -202.73980712890625, "loss": 0.4106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8243165016174316, "rewards/margins": 2.591309070587158, "rewards/rejected": 1.2330074310302734, "step": 84120 }, { "epoch": 3.9059380658340683, "grad_norm": 18.725522994995117, "learning_rate": 6.574957054645062e-08, "logits/chosen": -19.399255752563477, "logits/rejected": -18.827634811401367, "logps/chosen": -352.5404357910156, "logps/rejected": -288.95831298828125, "loss": 0.4825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.593191146850586, "rewards/margins": 1.2668384313583374, "rewards/rejected": 
2.326352596282959, "step": 84130 }, { "epoch": 3.9064023399415015, "grad_norm": 44.97898864746094, "learning_rate": 6.572171410000463e-08, "logits/chosen": -18.76078224182129, "logits/rejected": -17.824962615966797, "logps/chosen": -326.3005676269531, "logps/rejected": -283.5957336425781, "loss": 0.7173, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3125500679016113, "rewards/margins": 1.178754210472107, "rewards/rejected": 1.133795976638794, "step": 84140 }, { "epoch": 3.9068666140489343, "grad_norm": 11.09322452545166, "learning_rate": 6.569385765355866e-08, "logits/chosen": -18.136577606201172, "logits/rejected": -17.5452880859375, "logps/chosen": -383.4258117675781, "logps/rejected": -352.73626708984375, "loss": 0.5109, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.841550350189209, "rewards/margins": 1.523862361907959, "rewards/rejected": 2.317688465118408, "step": 84150 }, { "epoch": 3.9073308881563675, "grad_norm": 6.6405510902404785, "learning_rate": 6.566600120711268e-08, "logits/chosen": -18.6344051361084, "logits/rejected": -18.193145751953125, "logps/chosen": -422.89080810546875, "logps/rejected": -378.8515319824219, "loss": 0.7709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2371726036071777, "rewards/margins": 1.0076212882995605, "rewards/rejected": 2.229551315307617, "step": 84160 }, { "epoch": 3.9077951622638007, "grad_norm": 0.13038311898708344, "learning_rate": 6.563814476066669e-08, "logits/chosen": -18.342317581176758, "logits/rejected": -18.526227951049805, "logps/chosen": -370.6702575683594, "logps/rejected": -304.61761474609375, "loss": 0.4895, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6838302612304688, "rewards/margins": 1.4297349452972412, "rewards/rejected": 2.2540953159332275, "step": 84170 }, { "epoch": 3.9082594363712335, "grad_norm": 142.52944946289062, "learning_rate": 6.561028831422072e-08, "logits/chosen": -18.127580642700195, "logits/rejected": -18.009624481201172, 
"logps/chosen": -266.5430603027344, "logps/rejected": -212.25949096679688, "loss": 0.6701, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0693917274475098, "rewards/margins": 0.8413907885551453, "rewards/rejected": 1.2280009984970093, "step": 84180 }, { "epoch": 3.9087237104786667, "grad_norm": 28.960886001586914, "learning_rate": 6.558243186777473e-08, "logits/chosen": -19.424976348876953, "logits/rejected": -18.147855758666992, "logps/chosen": -344.9874267578125, "logps/rejected": -532.1837158203125, "loss": 0.4369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.742830276489258, "rewards/margins": 2.455753803253174, "rewards/rejected": 0.2870761752128601, "step": 84190 }, { "epoch": 3.9091879845860995, "grad_norm": 43.68040466308594, "learning_rate": 6.555457542132875e-08, "logits/chosen": -18.190996170043945, "logits/rejected": -17.884092330932617, "logps/chosen": -435.4384765625, "logps/rejected": -347.1155700683594, "loss": 0.516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4847233295440674, "rewards/margins": 0.6342189311981201, "rewards/rejected": 1.8505042791366577, "step": 84200 }, { "epoch": 3.9096522586935327, "grad_norm": 56.5424690246582, "learning_rate": 6.552671897488277e-08, "logits/chosen": -18.79568099975586, "logits/rejected": -17.35741424560547, "logps/chosen": -373.06353759765625, "logps/rejected": -225.3444366455078, "loss": 0.2259, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.864800214767456, "rewards/margins": 2.720729112625122, "rewards/rejected": 1.1440708637237549, "step": 84210 }, { "epoch": 3.9101165328009655, "grad_norm": 203.33517456054688, "learning_rate": 6.549886252843679e-08, "logits/chosen": -19.076684951782227, "logits/rejected": -19.295902252197266, "logps/chosen": -423.88653564453125, "logps/rejected": -386.1885070800781, "loss": 1.9411, "rewards/accuracies": 0.5, "rewards/chosen": 3.845446825027466, "rewards/margins": -0.5890695452690125, "rewards/rejected": 
4.434516429901123, "step": 84220 }, { "epoch": 3.9105808069083987, "grad_norm": 22.59850311279297, "learning_rate": 6.54710060819908e-08, "logits/chosen": -18.807186126708984, "logits/rejected": -17.254165649414062, "logps/chosen": -456.7149353027344, "logps/rejected": -311.3751525878906, "loss": 0.2235, "rewards/accuracies": 1.0, "rewards/chosen": 4.356869220733643, "rewards/margins": 3.0844569206237793, "rewards/rejected": 1.2724121809005737, "step": 84230 }, { "epoch": 3.911045081015832, "grad_norm": 0.12683142721652985, "learning_rate": 6.544314963554483e-08, "logits/chosen": -20.43851661682129, "logits/rejected": -19.451274871826172, "logps/chosen": -443.60845947265625, "logps/rejected": -351.3703918457031, "loss": 0.5766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.9625678062438965, "rewards/margins": 1.7979066371917725, "rewards/rejected": 3.1646618843078613, "step": 84240 }, { "epoch": 3.9115093551232647, "grad_norm": 256.0137023925781, "learning_rate": 6.541529318909885e-08, "logits/chosen": -18.768640518188477, "logits/rejected": -17.73142433166504, "logps/chosen": -398.9678955078125, "logps/rejected": -285.8780822753906, "loss": 1.0771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4368762969970703, "rewards/margins": 0.8455663919448853, "rewards/rejected": 1.5913100242614746, "step": 84250 }, { "epoch": 3.911973629230698, "grad_norm": 22.62544822692871, "learning_rate": 6.538743674265286e-08, "logits/chosen": -18.543163299560547, "logits/rejected": -18.269153594970703, "logps/chosen": -464.7802734375, "logps/rejected": -442.3211975097656, "loss": 0.6265, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.399288177490234, "rewards/margins": 1.7061622142791748, "rewards/rejected": 2.6931264400482178, "step": 84260 }, { "epoch": 3.9124379033381307, "grad_norm": 1.1761902570724487, "learning_rate": 6.535958029620688e-08, "logits/chosen": -19.40328025817871, "logits/rejected": -18.17809295654297, "logps/chosen": 
-373.83416748046875, "logps/rejected": -315.34423828125, "loss": 0.299, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.20125675201416, "rewards/margins": 2.004441499710083, "rewards/rejected": 2.196815252304077, "step": 84270 }, { "epoch": 3.912902177445564, "grad_norm": 52.712032318115234, "learning_rate": 6.533172384976089e-08, "logits/chosen": -18.356962203979492, "logits/rejected": -17.90798568725586, "logps/chosen": -299.5628356933594, "logps/rejected": -255.24557495117188, "loss": 0.6246, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.348733425140381, "rewards/margins": 1.2113876342773438, "rewards/rejected": 2.137346029281616, "step": 84280 }, { "epoch": 3.9133664515529967, "grad_norm": 59.00001907348633, "learning_rate": 6.53038674033149e-08, "logits/chosen": -18.137638092041016, "logits/rejected": -18.10030174255371, "logps/chosen": -301.54254150390625, "logps/rejected": -266.8402404785156, "loss": 0.7867, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2009987831115723, "rewards/margins": 0.18685157597064972, "rewards/rejected": 2.0141472816467285, "step": 84290 }, { "epoch": 3.91383072566043, "grad_norm": 1.2042016983032227, "learning_rate": 6.527601095686893e-08, "logits/chosen": -18.926738739013672, "logits/rejected": -18.160554885864258, "logps/chosen": -308.15631103515625, "logps/rejected": -240.15878295898438, "loss": 0.245, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.750950336456299, "rewards/margins": 2.1906490325927734, "rewards/rejected": 0.5603010654449463, "step": 84300 }, { "epoch": 3.914294999767863, "grad_norm": 40.062503814697266, "learning_rate": 6.524815451042295e-08, "logits/chosen": -18.631275177001953, "logits/rejected": -16.64082908630371, "logps/chosen": -331.6647033691406, "logps/rejected": -201.76046752929688, "loss": 0.232, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.668944835662842, "rewards/margins": 2.1884007453918457, "rewards/rejected": 
0.4805440902709961, "step": 84310 }, { "epoch": 3.914759273875296, "grad_norm": 204.26702880859375, "learning_rate": 6.522029806397696e-08, "logits/chosen": -19.125364303588867, "logits/rejected": -18.117551803588867, "logps/chosen": -367.4419250488281, "logps/rejected": -316.0423278808594, "loss": 0.3716, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.007662296295166, "rewards/margins": 1.2625501155853271, "rewards/rejected": 1.7451120615005493, "step": 84320 }, { "epoch": 3.915223547982729, "grad_norm": 18.294086456298828, "learning_rate": 6.519244161753099e-08, "logits/chosen": -19.63479232788086, "logits/rejected": -18.284461975097656, "logps/chosen": -518.3358764648438, "logps/rejected": -345.1438293457031, "loss": 0.1793, "rewards/accuracies": 1.0, "rewards/chosen": 5.018204689025879, "rewards/margins": 2.5828702449798584, "rewards/rejected": 2.4353339672088623, "step": 84330 }, { "epoch": 3.9156878220901623, "grad_norm": 87.11324310302734, "learning_rate": 6.5164585171085e-08, "logits/chosen": -19.672603607177734, "logits/rejected": -19.693418502807617, "logps/chosen": -254.0330810546875, "logps/rejected": -269.77532958984375, "loss": 1.245, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4566073417663574, "rewards/margins": 0.6072307825088501, "rewards/rejected": 1.8493766784667969, "step": 84340 }, { "epoch": 3.916152096197595, "grad_norm": 27.22275733947754, "learning_rate": 6.513672872463902e-08, "logits/chosen": -18.931272506713867, "logits/rejected": -18.996898651123047, "logps/chosen": -309.4859313964844, "logps/rejected": -334.6663513183594, "loss": 0.8965, "rewards/accuracies": 0.5, "rewards/chosen": 2.0378119945526123, "rewards/margins": 0.14796903729438782, "rewards/rejected": 1.889843225479126, "step": 84350 }, { "epoch": 3.916616370305028, "grad_norm": 0.0189284086227417, "learning_rate": 6.510887227819305e-08, "logits/chosen": -18.825008392333984, "logits/rejected": -18.026229858398438, "logps/chosen": 
-592.2230224609375, "logps/rejected": -463.939453125, "loss": 0.6005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.484522342681885, "rewards/margins": 2.169024705886841, "rewards/rejected": 3.315497636795044, "step": 84360 }, { "epoch": 3.917080644412461, "grad_norm": 29.928634643554688, "learning_rate": 6.508101583174706e-08, "logits/chosen": -19.094879150390625, "logits/rejected": -18.359628677368164, "logps/chosen": -395.70794677734375, "logps/rejected": -305.1390686035156, "loss": 0.5431, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.94897723197937, "rewards/margins": 0.5668101906776428, "rewards/rejected": 2.382167339324951, "step": 84370 }, { "epoch": 3.9175449185198943, "grad_norm": 165.67901611328125, "learning_rate": 6.505315938530107e-08, "logits/chosen": -18.451358795166016, "logits/rejected": -18.414993286132812, "logps/chosen": -494.6398010253906, "logps/rejected": -430.1500549316406, "loss": 0.8866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4028077125549316, "rewards/margins": 0.5691083669662476, "rewards/rejected": 2.8336987495422363, "step": 84380 }, { "epoch": 3.918009192627327, "grad_norm": 45.393795013427734, "learning_rate": 6.50253029388551e-08, "logits/chosen": -19.276140213012695, "logits/rejected": -18.09490203857422, "logps/chosen": -435.99542236328125, "logps/rejected": -310.41900634765625, "loss": 0.7137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.515293121337891, "rewards/margins": 2.1999125480651855, "rewards/rejected": 2.315380334854126, "step": 84390 }, { "epoch": 3.9184734667347603, "grad_norm": 10.554024696350098, "learning_rate": 6.499744649240912e-08, "logits/chosen": -19.21718978881836, "logits/rejected": -18.54525375366211, "logps/chosen": -328.5167541503906, "logps/rejected": -312.40509033203125, "loss": 0.5355, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1079344749450684, "rewards/margins": 1.2201564311981201, "rewards/rejected": 
1.8877782821655273, "step": 84400 }, { "epoch": 3.9189377408421935, "grad_norm": 24.295740127563477, "learning_rate": 6.496959004596313e-08, "logits/chosen": -18.525346755981445, "logits/rejected": -18.581205368041992, "logps/chosen": -481.721923828125, "logps/rejected": -476.67626953125, "loss": 0.6901, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8567192554473877, "rewards/margins": 0.40528231859207153, "rewards/rejected": 2.451436996459961, "step": 84410 }, { "epoch": 3.9194020149496263, "grad_norm": 53.93294906616211, "learning_rate": 6.494173359951716e-08, "logits/chosen": -19.587791442871094, "logits/rejected": -19.0814151763916, "logps/chosen": -388.4432373046875, "logps/rejected": -316.65521240234375, "loss": 0.2902, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2254130840301514, "rewards/margins": 1.4693291187286377, "rewards/rejected": 1.7560840845108032, "step": 84420 }, { "epoch": 3.919866289057059, "grad_norm": 23.770883560180664, "learning_rate": 6.491387715307117e-08, "logits/chosen": -19.26851463317871, "logits/rejected": -18.031139373779297, "logps/chosen": -396.34893798828125, "logps/rejected": -296.6791687011719, "loss": 0.4637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.79512095451355, "rewards/margins": 2.3331310749053955, "rewards/rejected": 1.4619897603988647, "step": 84430 }, { "epoch": 3.9203305631644922, "grad_norm": 191.7220916748047, "learning_rate": 6.488602070662519e-08, "logits/chosen": -18.70809555053711, "logits/rejected": -17.866474151611328, "logps/chosen": -425.12139892578125, "logps/rejected": -326.3572692871094, "loss": 0.3654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.857165575027466, "rewards/margins": 1.7863982915878296, "rewards/rejected": 2.0707671642303467, "step": 84440 }, { "epoch": 3.9207948372719255, "grad_norm": 168.0272979736328, "learning_rate": 6.485816426017922e-08, "logits/chosen": -18.095767974853516, "logits/rejected": -18.298625946044922, 
"logps/chosen": -294.16290283203125, "logps/rejected": -380.9183654785156, "loss": 1.2304, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.312927722930908, "rewards/margins": -0.19004793465137482, "rewards/rejected": 2.5029757022857666, "step": 84450 }, { "epoch": 3.9212591113793582, "grad_norm": 34.130393981933594, "learning_rate": 6.483030781373323e-08, "logits/chosen": -19.316696166992188, "logits/rejected": -18.860416412353516, "logps/chosen": -430.16741943359375, "logps/rejected": -391.6951599121094, "loss": 0.4716, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.011256217956543, "rewards/margins": 1.0866155624389648, "rewards/rejected": 2.92464017868042, "step": 84460 }, { "epoch": 3.9217233854867914, "grad_norm": 54.91353225708008, "learning_rate": 6.480245136728725e-08, "logits/chosen": -19.054336547851562, "logits/rejected": -18.88920021057129, "logps/chosen": -351.6539306640625, "logps/rejected": -340.952880859375, "loss": 0.68, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8144304752349854, "rewards/margins": 0.5673493146896362, "rewards/rejected": 2.2470812797546387, "step": 84470 }, { "epoch": 3.9221876595942247, "grad_norm": 0.3737688660621643, "learning_rate": 6.477459492084126e-08, "logits/chosen": -18.610088348388672, "logits/rejected": -18.145633697509766, "logps/chosen": -397.97650146484375, "logps/rejected": -402.0596923828125, "loss": 1.032, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.182492256164551, "rewards/margins": 0.8246628046035767, "rewards/rejected": 3.3578293323516846, "step": 84480 }, { "epoch": 3.9226519337016574, "grad_norm": 123.70046997070312, "learning_rate": 6.474673847439527e-08, "logits/chosen": -18.99249267578125, "logits/rejected": -18.690446853637695, "logps/chosen": -454.0592346191406, "logps/rejected": -384.6270751953125, "loss": 0.8044, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.459871292114258, "rewards/margins": 0.16081050038337708, 
"rewards/rejected": 3.299060821533203, "step": 84490 }, { "epoch": 3.9231162078090906, "grad_norm": 53.58122253417969, "learning_rate": 6.471888202794929e-08, "logits/chosen": -19.421579360961914, "logits/rejected": -19.272449493408203, "logps/chosen": -294.3502197265625, "logps/rejected": -292.18011474609375, "loss": 0.7808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7125885486602783, "rewards/margins": 0.4024890065193176, "rewards/rejected": 2.3100993633270264, "step": 84500 }, { "epoch": 3.9235804819165234, "grad_norm": 35.24113845825195, "learning_rate": 6.469102558150332e-08, "logits/chosen": -19.021869659423828, "logits/rejected": -18.71137809753418, "logps/chosen": -470.6234436035156, "logps/rejected": -437.53802490234375, "loss": 0.8567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.293356418609619, "rewards/margins": 1.0497510433197021, "rewards/rejected": 3.243605136871338, "step": 84510 }, { "epoch": 3.9240447560239566, "grad_norm": 195.2259979248047, "learning_rate": 6.466316913505733e-08, "logits/chosen": -19.169904708862305, "logits/rejected": -18.22869300842285, "logps/chosen": -403.047607421875, "logps/rejected": -312.56158447265625, "loss": 0.4588, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1849019527435303, "rewards/margins": 1.7311547994613647, "rewards/rejected": 1.4537471532821655, "step": 84520 }, { "epoch": 3.9245090301313894, "grad_norm": 0.017148107290267944, "learning_rate": 6.463531268861135e-08, "logits/chosen": -18.477018356323242, "logits/rejected": -18.182918548583984, "logps/chosen": -324.32867431640625, "logps/rejected": -254.4798126220703, "loss": 0.9221, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9984869956970215, "rewards/margins": 1.866080641746521, "rewards/rejected": 1.1324061155319214, "step": 84530 }, { "epoch": 3.9249733042388226, "grad_norm": 177.13023376464844, "learning_rate": 6.460745624216537e-08, "logits/chosen": -19.117570877075195, 
"logits/rejected": -18.19545555114746, "logps/chosen": -432.3330993652344, "logps/rejected": -324.3127746582031, "loss": 0.6211, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.748324155807495, "rewards/margins": 1.288281798362732, "rewards/rejected": 2.4600419998168945, "step": 84540 }, { "epoch": 3.925437578346256, "grad_norm": 151.09808349609375, "learning_rate": 6.457959979571939e-08, "logits/chosen": -18.652597427368164, "logits/rejected": -18.188650131225586, "logps/chosen": -350.39453125, "logps/rejected": -358.1575012207031, "loss": 1.0507, "rewards/accuracies": 0.5, "rewards/chosen": 3.8383700847625732, "rewards/margins": 0.23003900051116943, "rewards/rejected": 3.6083309650421143, "step": 84550 }, { "epoch": 3.9259018524536886, "grad_norm": 34.15740203857422, "learning_rate": 6.45517433492734e-08, "logits/chosen": -18.602581024169922, "logits/rejected": -19.496999740600586, "logps/chosen": -380.6752014160156, "logps/rejected": -355.81036376953125, "loss": 1.1829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5717129707336426, "rewards/margins": -0.05212615802884102, "rewards/rejected": 2.6238391399383545, "step": 84560 }, { "epoch": 3.926366126561122, "grad_norm": 65.11540985107422, "learning_rate": 6.452388690282743e-08, "logits/chosen": -19.353206634521484, "logits/rejected": -18.930988311767578, "logps/chosen": -390.32635498046875, "logps/rejected": -336.2194519042969, "loss": 0.8992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6800777912139893, "rewards/margins": 1.2238799333572388, "rewards/rejected": 2.456198215484619, "step": 84570 }, { "epoch": 3.9268304006685546, "grad_norm": 72.18653869628906, "learning_rate": 6.449603045638144e-08, "logits/chosen": -20.30356788635254, "logits/rejected": -20.167112350463867, "logps/chosen": -353.0306701660156, "logps/rejected": -336.38690185546875, "loss": 0.6874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.175853967666626, "rewards/margins": 
0.4617990553379059, "rewards/rejected": 2.714054822921753, "step": 84580 }, { "epoch": 3.927294674775988, "grad_norm": 83.64645385742188, "learning_rate": 6.446817400993546e-08, "logits/chosen": -18.39504623413086, "logits/rejected": -18.74053192138672, "logps/chosen": -355.42144775390625, "logps/rejected": -399.15301513671875, "loss": 1.3257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5976295471191406, "rewards/margins": -0.22594156861305237, "rewards/rejected": 2.82357120513916, "step": 84590 }, { "epoch": 3.9277589488834206, "grad_norm": 43.6380729675293, "learning_rate": 6.444031756348949e-08, "logits/chosen": -19.478260040283203, "logits/rejected": -17.49190902709961, "logps/chosen": -471.936767578125, "logps/rejected": -281.0962219238281, "loss": 0.4087, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.536947727203369, "rewards/margins": 2.0310301780700684, "rewards/rejected": 1.5059174299240112, "step": 84600 }, { "epoch": 3.928223222990854, "grad_norm": 104.16887664794922, "learning_rate": 6.44124611170435e-08, "logits/chosen": -19.74152183532715, "logits/rejected": -19.064260482788086, "logps/chosen": -467.60595703125, "logps/rejected": -370.52740478515625, "loss": 0.9931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.983721733093262, "rewards/margins": 1.0311214923858643, "rewards/rejected": 3.9526000022888184, "step": 84610 }, { "epoch": 3.928687497098287, "grad_norm": 15.590496063232422, "learning_rate": 6.438460467059752e-08, "logits/chosen": -18.841861724853516, "logits/rejected": -17.344913482666016, "logps/chosen": -438.7102966308594, "logps/rejected": -261.38702392578125, "loss": 0.3315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.803304672241211, "rewards/margins": 2.1268208026885986, "rewards/rejected": 1.6764843463897705, "step": 84620 }, { "epoch": 3.92915177120572, "grad_norm": 47.95005416870117, "learning_rate": 6.435674822415154e-08, "logits/chosen": -18.606714248657227, 
"logits/rejected": -18.05817413330078, "logps/chosen": -432.3556213378906, "logps/rejected": -371.70391845703125, "loss": 1.1139, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.329853057861328, "rewards/margins": 0.4880637228488922, "rewards/rejected": 2.8417892456054688, "step": 84630 }, { "epoch": 3.929616045313153, "grad_norm": 185.90830993652344, "learning_rate": 6.432889177770556e-08, "logits/chosen": -19.979122161865234, "logits/rejected": -19.24504280090332, "logps/chosen": -431.93353271484375, "logps/rejected": -376.72308349609375, "loss": 0.5561, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.36892032623291, "rewards/margins": 0.9458069801330566, "rewards/rejected": 3.4231133460998535, "step": 84640 }, { "epoch": 3.9300803194205858, "grad_norm": 193.10025024414062, "learning_rate": 6.430103533125957e-08, "logits/chosen": -19.229007720947266, "logits/rejected": -19.247610092163086, "logps/chosen": -438.49462890625, "logps/rejected": -446.72003173828125, "loss": 1.2533, "rewards/accuracies": 0.5, "rewards/chosen": 3.720813274383545, "rewards/margins": -0.09202395379543304, "rewards/rejected": 3.812837600708008, "step": 84650 }, { "epoch": 3.930544593528019, "grad_norm": 211.001708984375, "learning_rate": 6.42731788848136e-08, "logits/chosen": -18.838191986083984, "logits/rejected": -18.678442001342773, "logps/chosen": -333.803466796875, "logps/rejected": -336.62744140625, "loss": 1.701, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9537734985351562, "rewards/margins": -0.6372452974319458, "rewards/rejected": 2.5910186767578125, "step": 84660 }, { "epoch": 3.9310088676354518, "grad_norm": 52.44597625732422, "learning_rate": 6.424532243836762e-08, "logits/chosen": -17.677255630493164, "logits/rejected": -17.398113250732422, "logps/chosen": -339.7994689941406, "logps/rejected": -323.32891845703125, "loss": 0.6935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.014626979827881, "rewards/margins": 
1.3307808637619019, "rewards/rejected": 1.6838462352752686, "step": 84670 }, { "epoch": 3.931473141742885, "grad_norm": 2.6214141845703125, "learning_rate": 6.421746599192163e-08, "logits/chosen": -19.325626373291016, "logits/rejected": -17.7513370513916, "logps/chosen": -560.0324096679688, "logps/rejected": -310.33819580078125, "loss": 0.3256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.240015983581543, "rewards/margins": 2.4399497509002686, "rewards/rejected": 1.800066590309143, "step": 84680 }, { "epoch": 3.931937415850318, "grad_norm": 1.9034096002578735, "learning_rate": 6.418960954547564e-08, "logits/chosen": -19.591806411743164, "logits/rejected": -19.618764877319336, "logps/chosen": -484.1849670410156, "logps/rejected": -404.7006530761719, "loss": 0.5469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.396653652191162, "rewards/margins": 1.3357534408569336, "rewards/rejected": 2.0609004497528076, "step": 84690 }, { "epoch": 3.932401689957751, "grad_norm": 19.041776657104492, "learning_rate": 6.416175309902966e-08, "logits/chosen": -18.81094741821289, "logits/rejected": -17.99752426147461, "logps/chosen": -345.0314025878906, "logps/rejected": -158.56564331054688, "loss": 0.2085, "rewards/accuracies": 1.0, "rewards/chosen": 3.0320348739624023, "rewards/margins": 2.77899432182312, "rewards/rejected": 0.25304049253463745, "step": 84700 }, { "epoch": 3.932865964065184, "grad_norm": 25.542129516601562, "learning_rate": 6.413389665258367e-08, "logits/chosen": -19.496417999267578, "logits/rejected": -18.555200576782227, "logps/chosen": -390.72186279296875, "logps/rejected": -324.78973388671875, "loss": 0.2292, "rewards/accuracies": 1.0, "rewards/chosen": 2.9731950759887695, "rewards/margins": 2.059401273727417, "rewards/rejected": 0.9137933850288391, "step": 84710 }, { "epoch": 3.933330238172617, "grad_norm": 164.9372100830078, "learning_rate": 6.41060402061377e-08, "logits/chosen": -19.850210189819336, "logits/rejected": 
-19.871654510498047, "logps/chosen": -487.892822265625, "logps/rejected": -505.1952209472656, "loss": 0.9078, "rewards/accuracies": 0.5, "rewards/chosen": 3.934157133102417, "rewards/margins": 0.08082306385040283, "rewards/rejected": 3.853334426879883, "step": 84720 }, { "epoch": 3.93379451228005, "grad_norm": 31.670934677124023, "learning_rate": 6.407818375969172e-08, "logits/chosen": -19.68820571899414, "logits/rejected": -19.45567512512207, "logps/chosen": -415.2879333496094, "logps/rejected": -413.65570068359375, "loss": 1.4769, "rewards/accuracies": 0.5, "rewards/chosen": 3.4707858562469482, "rewards/margins": 0.021235942840576172, "rewards/rejected": 3.449549913406372, "step": 84730 }, { "epoch": 3.934258786387483, "grad_norm": 73.94093322753906, "learning_rate": 6.405032731324573e-08, "logits/chosen": -19.425769805908203, "logits/rejected": -18.027095794677734, "logps/chosen": -440.0811462402344, "logps/rejected": -345.93341064453125, "loss": 0.1923, "rewards/accuracies": 1.0, "rewards/chosen": 4.169264316558838, "rewards/margins": 2.093400478363037, "rewards/rejected": 2.0758633613586426, "step": 84740 }, { "epoch": 3.934723060494916, "grad_norm": 175.19903564453125, "learning_rate": 6.402247086679976e-08, "logits/chosen": -18.615306854248047, "logits/rejected": -17.699745178222656, "logps/chosen": -277.8166809082031, "logps/rejected": -215.6940155029297, "loss": 0.6456, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4466466903686523, "rewards/margins": 1.289587140083313, "rewards/rejected": 1.1570594310760498, "step": 84750 }, { "epoch": 3.9351873346023494, "grad_norm": 5.741724014282227, "learning_rate": 6.399461442035377e-08, "logits/chosen": -19.1490421295166, "logits/rejected": -17.94010353088379, "logps/chosen": -379.53045654296875, "logps/rejected": -279.5968017578125, "loss": 0.6894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.306918144226074, "rewards/margins": 2.1945431232452393, "rewards/rejected": 
2.112375020980835, "step": 84760 }, { "epoch": 3.935651608709782, "grad_norm": 18.85141372680664, "learning_rate": 6.396675797390779e-08, "logits/chosen": -18.05788803100586, "logits/rejected": -17.015766143798828, "logps/chosen": -417.4012145996094, "logps/rejected": -291.6497497558594, "loss": 0.5715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8621222972869873, "rewards/margins": 2.2170474529266357, "rewards/rejected": 1.6450748443603516, "step": 84770 }, { "epoch": 3.9361158828172154, "grad_norm": 0.5539250373840332, "learning_rate": 6.393890152746182e-08, "logits/chosen": -18.851604461669922, "logits/rejected": -17.274852752685547, "logps/chosen": -475.982421875, "logps/rejected": -317.64910888671875, "loss": 0.2984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.539632320404053, "rewards/margins": 2.73321795463562, "rewards/rejected": 1.806414008140564, "step": 84780 }, { "epoch": 3.9365801569246486, "grad_norm": 116.9456558227539, "learning_rate": 6.391104508101583e-08, "logits/chosen": -19.16238784790039, "logits/rejected": -18.195056915283203, "logps/chosen": -437.17645263671875, "logps/rejected": -329.63446044921875, "loss": 0.605, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.315532684326172, "rewards/margins": 0.8152216076850891, "rewards/rejected": 2.5003113746643066, "step": 84790 }, { "epoch": 3.9370444310320813, "grad_norm": 8.359155654907227, "learning_rate": 6.388597427921445e-08, "logits/chosen": -20.492382049560547, "logits/rejected": -20.698904037475586, "logps/chosen": -493.25750732421875, "logps/rejected": -386.8310546875, "loss": 0.8219, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3604774475097656, "rewards/margins": 0.30105945467948914, "rewards/rejected": 3.059417963027954, "step": 84800 }, { "epoch": 3.937508705139514, "grad_norm": 70.29492950439453, "learning_rate": 6.385811783276847e-08, "logits/chosen": -19.04037857055664, "logits/rejected": -18.54644012451172, 
"logps/chosen": -430.1265563964844, "logps/rejected": -339.760498046875, "loss": 0.5698, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2673392295837402, "rewards/margins": 1.1462501287460327, "rewards/rejected": 2.121089220046997, "step": 84810 }, { "epoch": 3.9379729792469473, "grad_norm": 125.72160339355469, "learning_rate": 6.383026138632248e-08, "logits/chosen": -18.687692642211914, "logits/rejected": -19.334392547607422, "logps/chosen": -324.29815673828125, "logps/rejected": -332.1873474121094, "loss": 0.6244, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.428288221359253, "rewards/margins": 0.5696643590927124, "rewards/rejected": 1.8586238622665405, "step": 84820 }, { "epoch": 3.9384372533543806, "grad_norm": 19.56196403503418, "learning_rate": 6.38024049398765e-08, "logits/chosen": -20.27109146118164, "logits/rejected": -19.388957977294922, "logps/chosen": -499.41485595703125, "logps/rejected": -417.3627014160156, "loss": 0.642, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.623181343078613, "rewards/margins": 1.8444774150848389, "rewards/rejected": 3.778703212738037, "step": 84830 }, { "epoch": 3.9389015274618133, "grad_norm": 135.09837341308594, "learning_rate": 6.377454849343052e-08, "logits/chosen": -19.706829071044922, "logits/rejected": -17.612445831298828, "logps/chosen": -479.2242736816406, "logps/rejected": -257.78240966796875, "loss": 0.1705, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.996295928955078, "rewards/margins": 3.5532219409942627, "rewards/rejected": 1.443074345588684, "step": 84840 }, { "epoch": 3.9393658015692465, "grad_norm": 30.82343101501465, "learning_rate": 6.374669204698454e-08, "logits/chosen": -18.810558319091797, "logits/rejected": -17.984981536865234, "logps/chosen": -412.99591064453125, "logps/rejected": -307.35882568359375, "loss": 0.6659, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.772421836853027, "rewards/margins": 2.379246711730957, 
"rewards/rejected": 2.393174886703491, "step": 84850 }, { "epoch": 3.9398300756766798, "grad_norm": 0.44274210929870605, "learning_rate": 6.371883560053855e-08, "logits/chosen": -18.845287322998047, "logits/rejected": -18.120906829833984, "logps/chosen": -441.8895568847656, "logps/rejected": -381.2094421386719, "loss": 0.6973, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.803258180618286, "rewards/margins": 0.9089662432670593, "rewards/rejected": 2.8942923545837402, "step": 84860 }, { "epoch": 3.9402943497841125, "grad_norm": 5.910613059997559, "learning_rate": 6.369097915409258e-08, "logits/chosen": -20.02606201171875, "logits/rejected": -18.691905975341797, "logps/chosen": -308.9320068359375, "logps/rejected": -233.01229858398438, "loss": 0.5181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.349229335784912, "rewards/margins": 1.3679975271224976, "rewards/rejected": 1.981231927871704, "step": 84870 }, { "epoch": 3.9407586238915453, "grad_norm": 3.7639291286468506, "learning_rate": 6.366312270764659e-08, "logits/chosen": -18.909454345703125, "logits/rejected": -17.669506072998047, "logps/chosen": -370.66363525390625, "logps/rejected": -361.43304443359375, "loss": 0.6819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.23760986328125, "rewards/margins": 2.2979273796081543, "rewards/rejected": 0.9396823644638062, "step": 84880 }, { "epoch": 3.9412228979989785, "grad_norm": 80.60633087158203, "learning_rate": 6.363526626120061e-08, "logits/chosen": -19.745962142944336, "logits/rejected": -19.29330062866211, "logps/chosen": -347.61737060546875, "logps/rejected": -314.5465087890625, "loss": 0.6769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3633310794830322, "rewards/margins": 0.6427547931671143, "rewards/rejected": 1.720576286315918, "step": 84890 }, { "epoch": 3.9416871721064117, "grad_norm": 0.32040491700172424, "learning_rate": 6.360740981475464e-08, "logits/chosen": -18.87298583984375, 
"logits/rejected": -17.905643463134766, "logps/chosen": -469.29119873046875, "logps/rejected": -357.6742858886719, "loss": 0.4394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9567036628723145, "rewards/margins": 1.969852089881897, "rewards/rejected": 1.9868520498275757, "step": 84900 }, { "epoch": 3.9421514462138445, "grad_norm": 156.93316650390625, "learning_rate": 6.357955336830865e-08, "logits/chosen": -19.155349731445312, "logits/rejected": -18.378108978271484, "logps/chosen": -392.2906799316406, "logps/rejected": -349.2376403808594, "loss": 0.5546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.055622100830078, "rewards/margins": 1.0232232809066772, "rewards/rejected": 2.0323984622955322, "step": 84910 }, { "epoch": 3.9426157203212777, "grad_norm": 28.787677764892578, "learning_rate": 6.355169692186266e-08, "logits/chosen": -17.847152709960938, "logits/rejected": -17.75461196899414, "logps/chosen": -435.6014099121094, "logps/rejected": -440.5311584472656, "loss": 0.9705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4320473670959473, "rewards/margins": 0.4974687099456787, "rewards/rejected": 1.9345782995224, "step": 84920 }, { "epoch": 3.943079994428711, "grad_norm": 0.9221758842468262, "learning_rate": 6.352384047541668e-08, "logits/chosen": -19.265928268432617, "logits/rejected": -19.066635131835938, "logps/chosen": -450.3172302246094, "logps/rejected": -445.30023193359375, "loss": 1.012, "rewards/accuracies": 0.5, "rewards/chosen": 3.569714069366455, "rewards/margins": 0.4825262129306793, "rewards/rejected": 3.0871877670288086, "step": 84930 }, { "epoch": 3.9435442685361437, "grad_norm": 0.8908770084381104, "learning_rate": 6.34959840289707e-08, "logits/chosen": -19.10573959350586, "logits/rejected": -19.152103424072266, "logps/chosen": -396.90203857421875, "logps/rejected": -353.47149658203125, "loss": 0.4703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.914616107940674, "rewards/margins": 
1.4751527309417725, "rewards/rejected": 2.4394631385803223, "step": 84940 }, { "epoch": 3.944008542643577, "grad_norm": 33.88606262207031, "learning_rate": 6.346812758252472e-08, "logits/chosen": -19.828176498413086, "logits/rejected": -19.755075454711914, "logps/chosen": -344.0177917480469, "logps/rejected": -386.0017395019531, "loss": 1.764, "rewards/accuracies": 0.5, "rewards/chosen": 3.179245710372925, "rewards/margins": -0.5636781454086304, "rewards/rejected": 3.7429232597351074, "step": 84950 }, { "epoch": 3.9444728167510097, "grad_norm": 37.80941390991211, "learning_rate": 6.344027113607874e-08, "logits/chosen": -18.109060287475586, "logits/rejected": -17.88369369506836, "logps/chosen": -307.099365234375, "logps/rejected": -279.8067321777344, "loss": 0.5717, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.940037965774536, "rewards/margins": 0.7618069648742676, "rewards/rejected": 2.1782310009002686, "step": 84960 }, { "epoch": 3.944937090858443, "grad_norm": 81.85223388671875, "learning_rate": 6.341241468963275e-08, "logits/chosen": -19.06864356994629, "logits/rejected": -18.544788360595703, "logps/chosen": -448.073486328125, "logps/rejected": -317.1857604980469, "loss": 0.9657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.843374729156494, "rewards/margins": 1.0778653621673584, "rewards/rejected": 2.7655093669891357, "step": 84970 }, { "epoch": 3.9454013649658757, "grad_norm": 123.0018310546875, "learning_rate": 6.338455824318678e-08, "logits/chosen": -19.698915481567383, "logits/rejected": -19.514869689941406, "logps/chosen": -327.1632385253906, "logps/rejected": -264.7063903808594, "loss": 0.6418, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1887526512145996, "rewards/margins": 1.9435287714004517, "rewards/rejected": 1.245223879814148, "step": 84980 }, { "epoch": 3.945865639073309, "grad_norm": 29.084793090820312, "learning_rate": 6.335670179674079e-08, "logits/chosen": -19.353445053100586, "logits/rejected": 
-18.8869571685791, "logps/chosen": -463.15081787109375, "logps/rejected": -325.07666015625, "loss": 0.5808, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4230637550354004, "rewards/margins": 0.9657142758369446, "rewards/rejected": 2.4573490619659424, "step": 84990 }, { "epoch": 3.946329913180742, "grad_norm": 168.8478546142578, "learning_rate": 6.332884535029481e-08, "logits/chosen": -19.077205657958984, "logits/rejected": -18.292688369750977, "logps/chosen": -454.08978271484375, "logps/rejected": -415.71954345703125, "loss": 0.3821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.494090557098389, "rewards/margins": 1.6549656391143799, "rewards/rejected": 3.839125156402588, "step": 85000 }, { "epoch": 3.946794187288175, "grad_norm": 19.934444427490234, "learning_rate": 6.330098890384884e-08, "logits/chosen": -18.48027992248535, "logits/rejected": -18.962020874023438, "logps/chosen": -314.98052978515625, "logps/rejected": -294.80621337890625, "loss": 1.7348, "rewards/accuracies": 0.5, "rewards/chosen": 2.3765406608581543, "rewards/margins": -0.24431395530700684, "rewards/rejected": 2.620854616165161, "step": 85010 }, { "epoch": 3.947258461395608, "grad_norm": 0.7270900011062622, "learning_rate": 6.327313245740285e-08, "logits/chosen": -19.336685180664062, "logits/rejected": -17.312450408935547, "logps/chosen": -416.0316467285156, "logps/rejected": -213.7653045654297, "loss": 0.1719, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.75028657913208, "rewards/margins": 3.0675530433654785, "rewards/rejected": 0.6827334761619568, "step": 85020 }, { "epoch": 3.947722735503041, "grad_norm": 1.1205790042877197, "learning_rate": 6.324527601095686e-08, "logits/chosen": -19.347795486450195, "logits/rejected": -18.35625457763672, "logps/chosen": -471.498779296875, "logps/rejected": -337.22381591796875, "loss": 0.9376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.541136264801025, "rewards/margins": 1.8782535791397095, 
"rewards/rejected": 2.6628825664520264, "step": 85030 }, { "epoch": 3.948187009610474, "grad_norm": 198.28172302246094, "learning_rate": 6.321741956451088e-08, "logits/chosen": -18.734516143798828, "logits/rejected": -18.185121536254883, "logps/chosen": -327.91680908203125, "logps/rejected": -251.47598266601562, "loss": 0.8624, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.503345012664795, "rewards/margins": 1.4946982860565186, "rewards/rejected": 2.0086469650268555, "step": 85040 }, { "epoch": 3.948651283717907, "grad_norm": 70.61682891845703, "learning_rate": 6.318956311806491e-08, "logits/chosen": -19.061853408813477, "logits/rejected": -18.624332427978516, "logps/chosen": -478.538818359375, "logps/rejected": -392.4251403808594, "loss": 0.5127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.6358442306518555, "rewards/margins": 1.395742654800415, "rewards/rejected": 3.2401020526885986, "step": 85050 }, { "epoch": 3.94911555782534, "grad_norm": 236.90728759765625, "learning_rate": 6.316170667161892e-08, "logits/chosen": -18.548980712890625, "logits/rejected": -18.33152961730957, "logps/chosen": -360.47174072265625, "logps/rejected": -336.8711853027344, "loss": 1.1189, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0347042083740234, "rewards/margins": 0.22747115790843964, "rewards/rejected": 2.8072330951690674, "step": 85060 }, { "epoch": 3.9495798319327733, "grad_norm": 44.887760162353516, "learning_rate": 6.313385022517294e-08, "logits/chosen": -19.240150451660156, "logits/rejected": -18.218341827392578, "logps/chosen": -374.5462646484375, "logps/rejected": -246.52987670898438, "loss": 0.6582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.74285888671875, "rewards/margins": 2.270186185836792, "rewards/rejected": 1.472672462463379, "step": 85070 }, { "epoch": 3.950044106040206, "grad_norm": 139.55564880371094, "learning_rate": 6.310599377872696e-08, "logits/chosen": -19.56790542602539, "logits/rejected": 
-18.309850692749023, "logps/chosen": -341.091552734375, "logps/rejected": -318.41033935546875, "loss": 0.4621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.158093452453613, "rewards/margins": 1.8085944652557373, "rewards/rejected": 2.349499225616455, "step": 85080 }, { "epoch": 3.9505083801476393, "grad_norm": 22.060096740722656, "learning_rate": 6.307813733228098e-08, "logits/chosen": -18.829509735107422, "logits/rejected": -17.323314666748047, "logps/chosen": -429.6470642089844, "logps/rejected": -263.3935852050781, "loss": 0.8033, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.484116792678833, "rewards/margins": 1.8372091054916382, "rewards/rejected": 1.6469074487686157, "step": 85090 }, { "epoch": 3.950972654255072, "grad_norm": 73.87777709960938, "learning_rate": 6.305028088583499e-08, "logits/chosen": -21.098264694213867, "logits/rejected": -20.867990493774414, "logps/chosen": -397.8309020996094, "logps/rejected": -377.2681579589844, "loss": 0.5579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3833000659942627, "rewards/margins": 0.7928260564804077, "rewards/rejected": 2.5904736518859863, "step": 85100 }, { "epoch": 3.9514369283625053, "grad_norm": 186.54209899902344, "learning_rate": 6.302242443938902e-08, "logits/chosen": -19.25406265258789, "logits/rejected": -19.97382926940918, "logps/chosen": -451.48565673828125, "logps/rejected": -438.88629150390625, "loss": 1.2472, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.613152265548706, "rewards/margins": -0.4678754210472107, "rewards/rejected": 4.081027507781982, "step": 85110 }, { "epoch": 3.951901202469938, "grad_norm": 12.324043273925781, "learning_rate": 6.299456799294303e-08, "logits/chosen": -19.554645538330078, "logits/rejected": -18.70926284790039, "logps/chosen": -302.9822692871094, "logps/rejected": -242.66610717773438, "loss": 0.5063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1497745513916016, "rewards/margins": 
0.7643169164657593, "rewards/rejected": 1.3854577541351318, "step": 85120 }, { "epoch": 3.9523654765773713, "grad_norm": 242.95547485351562, "learning_rate": 6.296671154649705e-08, "logits/chosen": -19.709857940673828, "logits/rejected": -18.91845703125, "logps/chosen": -476.50225830078125, "logps/rejected": -396.2652282714844, "loss": 1.1079, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.656510829925537, "rewards/margins": 0.5001578330993652, "rewards/rejected": 2.1563527584075928, "step": 85130 }, { "epoch": 3.9528297506848045, "grad_norm": 43.887184143066406, "learning_rate": 6.293885510005106e-08, "logits/chosen": -19.275545120239258, "logits/rejected": -18.43598747253418, "logps/chosen": -344.76226806640625, "logps/rejected": -284.10638427734375, "loss": 0.5319, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9801697731018066, "rewards/margins": 1.0446473360061646, "rewards/rejected": 1.935522437095642, "step": 85140 }, { "epoch": 3.9532940247922372, "grad_norm": 0.2577086389064789, "learning_rate": 6.291099865360508e-08, "logits/chosen": -18.067150115966797, "logits/rejected": -17.797836303710938, "logps/chosen": -346.43682861328125, "logps/rejected": -342.43231201171875, "loss": 0.502, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9279963970184326, "rewards/margins": 1.1739225387573242, "rewards/rejected": 1.7540738582611084, "step": 85150 }, { "epoch": 3.9537582988996705, "grad_norm": 43.362239837646484, "learning_rate": 6.28831422071591e-08, "logits/chosen": -19.264039993286133, "logits/rejected": -18.15843963623047, "logps/chosen": -346.6811218261719, "logps/rejected": -241.05459594726562, "loss": 0.3804, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7217068672180176, "rewards/margins": 2.1754508018493652, "rewards/rejected": 1.5462558269500732, "step": 85160 }, { "epoch": 3.9542225730071037, "grad_norm": 45.21062469482422, "learning_rate": 6.285528576071312e-08, "logits/chosen": 
-19.315969467163086, "logits/rejected": -18.136486053466797, "logps/chosen": -346.3556823730469, "logps/rejected": -243.9335479736328, "loss": 0.3262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.621758222579956, "rewards/margins": 2.3768310546875, "rewards/rejected": 1.2449268102645874, "step": 85170 }, { "epoch": 3.9546868471145364, "grad_norm": 68.14388275146484, "learning_rate": 6.282742931426714e-08, "logits/chosen": -19.522953033447266, "logits/rejected": -18.809062957763672, "logps/chosen": -357.8813171386719, "logps/rejected": -280.9727478027344, "loss": 0.8312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.334984540939331, "rewards/margins": 1.388559341430664, "rewards/rejected": 1.9464250802993774, "step": 85180 }, { "epoch": 3.955151121221969, "grad_norm": 5.656580924987793, "learning_rate": 6.279957286782116e-08, "logits/chosen": -19.06489372253418, "logits/rejected": -18.99321937561035, "logps/chosen": -335.5340881347656, "logps/rejected": -362.7983703613281, "loss": 0.996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.1469316482543945, "rewards/margins": 0.1854027956724167, "rewards/rejected": 3.96152925491333, "step": 85190 }, { "epoch": 3.9556153953294024, "grad_norm": 59.53868865966797, "learning_rate": 6.277171642137518e-08, "logits/chosen": -19.798202514648438, "logits/rejected": -19.41533851623535, "logps/chosen": -445.70770263671875, "logps/rejected": -377.3526306152344, "loss": 0.5486, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.453049182891846, "rewards/margins": 0.8203984498977661, "rewards/rejected": 3.632650375366211, "step": 85200 }, { "epoch": 3.9560796694368356, "grad_norm": 195.14016723632812, "learning_rate": 6.274385997492919e-08, "logits/chosen": -18.690706253051758, "logits/rejected": -17.448001861572266, "logps/chosen": -422.74139404296875, "logps/rejected": -295.31170654296875, "loss": 0.9115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.4805665016174316, "rewards/margins": 1.5389659404754639, "rewards/rejected": 1.9416002035140991, "step": 85210 }, { "epoch": 3.9565439435442684, "grad_norm": 138.44046020507812, "learning_rate": 6.271600352848322e-08, "logits/chosen": -19.308456420898438, "logits/rejected": -18.867725372314453, "logps/chosen": -427.9267578125, "logps/rejected": -365.90948486328125, "loss": 0.8037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.640172481536865, "rewards/margins": 0.9704271554946899, "rewards/rejected": 3.669745683670044, "step": 85220 }, { "epoch": 3.9570082176517016, "grad_norm": 0.05176996439695358, "learning_rate": 6.268814708203723e-08, "logits/chosen": -20.035823822021484, "logits/rejected": -18.418685913085938, "logps/chosen": -419.18572998046875, "logps/rejected": -323.2266540527344, "loss": 0.6588, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.412322998046875, "rewards/margins": 1.4794399738311768, "rewards/rejected": 1.9328832626342773, "step": 85230 }, { "epoch": 3.957472491759135, "grad_norm": 122.80873107910156, "learning_rate": 6.266029063559125e-08, "logits/chosen": -18.43703842163086, "logits/rejected": -18.502452850341797, "logps/chosen": -361.1705322265625, "logps/rejected": -429.690185546875, "loss": 1.3288, "rewards/accuracies": 0.5, "rewards/chosen": 2.9466729164123535, "rewards/margins": -0.20785360038280487, "rewards/rejected": 3.154526472091675, "step": 85240 }, { "epoch": 3.9579367658665676, "grad_norm": 32.523372650146484, "learning_rate": 6.263243418914526e-08, "logits/chosen": -18.21815299987793, "logits/rejected": -18.177661895751953, "logps/chosen": -277.789794921875, "logps/rejected": -265.4390869140625, "loss": 0.9874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6715247631072998, "rewards/margins": -0.16004498302936554, "rewards/rejected": 1.8315699100494385, "step": 85250 }, { "epoch": 3.9584010399740004, "grad_norm": 18.414827346801758, "learning_rate": 6.260457774269929e-08, 
"logits/chosen": -19.5926570892334, "logits/rejected": -18.767230987548828, "logps/chosen": -393.88323974609375, "logps/rejected": -335.38629150390625, "loss": 0.3845, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0558533668518066, "rewards/margins": 1.2243479490280151, "rewards/rejected": 1.8315051794052124, "step": 85260 }, { "epoch": 3.9588653140814336, "grad_norm": 0.2634376287460327, "learning_rate": 6.25767212962533e-08, "logits/chosen": -19.000041961669922, "logits/rejected": -18.140043258666992, "logps/chosen": -342.4188537597656, "logps/rejected": -258.50482177734375, "loss": 0.6428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.294903516769409, "rewards/margins": 1.4678397178649902, "rewards/rejected": 0.8270637392997742, "step": 85270 }, { "epoch": 3.959329588188867, "grad_norm": 135.54202270507812, "learning_rate": 6.254886484980732e-08, "logits/chosen": -18.530391693115234, "logits/rejected": -17.82175064086914, "logps/chosen": -499.7379455566406, "logps/rejected": -370.3997802734375, "loss": 0.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.071588516235352, "rewards/margins": 2.087735652923584, "rewards/rejected": 1.983852744102478, "step": 85280 }, { "epoch": 3.9597938622962996, "grad_norm": 231.0936279296875, "learning_rate": 6.252100840336135e-08, "logits/chosen": -19.717941284179688, "logits/rejected": -19.501623153686523, "logps/chosen": -430.5433654785156, "logps/rejected": -467.6695861816406, "loss": 0.7592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.665259599685669, "rewards/margins": 0.10882220417261124, "rewards/rejected": 3.5564377307891846, "step": 85290 }, { "epoch": 3.960258136403733, "grad_norm": 33.34235763549805, "learning_rate": 6.249315195691536e-08, "logits/chosen": -19.19443702697754, "logits/rejected": -18.151535034179688, "logps/chosen": -457.46942138671875, "logps/rejected": -357.0614318847656, "loss": 0.5393, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 3.35461163520813, "rewards/margins": 1.2977334260940552, "rewards/rejected": 2.0568783283233643, "step": 85300 }, { "epoch": 3.960722410511166, "grad_norm": 9.462959289550781, "learning_rate": 6.246529551046938e-08, "logits/chosen": -18.90106773376465, "logits/rejected": -18.142230987548828, "logps/chosen": -311.42236328125, "logps/rejected": -258.1402893066406, "loss": 0.3178, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.378166675567627, "rewards/margins": 1.6439859867095947, "rewards/rejected": 0.7341808080673218, "step": 85310 }, { "epoch": 3.961186684618599, "grad_norm": 280.3699645996094, "learning_rate": 6.24374390640234e-08, "logits/chosen": -18.650606155395508, "logits/rejected": -18.1698055267334, "logps/chosen": -323.8889465332031, "logps/rejected": -354.2730407714844, "loss": 1.2677, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6270251274108887, "rewards/margins": 0.6215068101882935, "rewards/rejected": 2.0055181980133057, "step": 85320 }, { "epoch": 3.961650958726032, "grad_norm": 1.814361333847046, "learning_rate": 6.24095826175774e-08, "logits/chosen": -20.22372817993164, "logits/rejected": -19.32352066040039, "logps/chosen": -407.2505798339844, "logps/rejected": -413.8016662597656, "loss": 0.4678, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.780513763427734, "rewards/margins": 1.2457705736160278, "rewards/rejected": 3.534743547439575, "step": 85330 }, { "epoch": 3.962115232833465, "grad_norm": 18.637325286865234, "learning_rate": 6.238172617113143e-08, "logits/chosen": -19.658802032470703, "logits/rejected": -18.118942260742188, "logps/chosen": -518.5392456054688, "logps/rejected": -341.32098388671875, "loss": 0.1781, "rewards/accuracies": 1.0, "rewards/chosen": 4.975239276885986, "rewards/margins": 2.3169307708740234, "rewards/rejected": 2.658308506011963, "step": 85340 }, { "epoch": 3.962579506940898, "grad_norm": 55.57835388183594, "learning_rate": 6.235386972468545e-08, 
"logits/chosen": -18.70376205444336, "logits/rejected": -18.797510147094727, "logps/chosen": -309.82244873046875, "logps/rejected": -281.71044921875, "loss": 0.8931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8829963207244873, "rewards/margins": 0.369198739528656, "rewards/rejected": 2.5137979984283447, "step": 85350 }, { "epoch": 3.9630437810483308, "grad_norm": 239.27540588378906, "learning_rate": 6.232601327823946e-08, "logits/chosen": -18.759632110595703, "logits/rejected": -18.142223358154297, "logps/chosen": -387.5661315917969, "logps/rejected": -337.2460632324219, "loss": 1.8819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2041189670562744, "rewards/margins": 0.1308729648590088, "rewards/rejected": 2.0732462406158447, "step": 85360 }, { "epoch": 3.963508055155764, "grad_norm": 232.2117462158203, "learning_rate": 6.229815683179349e-08, "logits/chosen": -19.274417877197266, "logits/rejected": -19.257314682006836, "logps/chosen": -446.41375732421875, "logps/rejected": -329.17694091796875, "loss": 1.1178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2039456367492676, "rewards/margins": 0.7681716680526733, "rewards/rejected": 2.4357738494873047, "step": 85370 }, { "epoch": 3.963972329263197, "grad_norm": 40.702720642089844, "learning_rate": 6.22703003853475e-08, "logits/chosen": -18.587688446044922, "logits/rejected": -17.67668914794922, "logps/chosen": -355.5891418457031, "logps/rejected": -275.9471130371094, "loss": 0.4726, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.190250873565674, "rewards/margins": 2.1002259254455566, "rewards/rejected": 1.090024709701538, "step": 85380 }, { "epoch": 3.96443660337063, "grad_norm": 16.406740188598633, "learning_rate": 6.224244393890152e-08, "logits/chosen": -18.956865310668945, "logits/rejected": -18.050899505615234, "logps/chosen": -269.37445068359375, "logps/rejected": -221.45321655273438, "loss": 0.1967, "rewards/accuracies": 1.0, "rewards/chosen": 
3.0932774543762207, "rewards/margins": 2.2097580432891846, "rewards/rejected": 0.8835195302963257, "step": 85390 }, { "epoch": 3.964900877478063, "grad_norm": 15.848085403442383, "learning_rate": 6.221458749245555e-08, "logits/chosen": -20.003734588623047, "logits/rejected": -18.452491760253906, "logps/chosen": -328.04901123046875, "logps/rejected": -261.28497314453125, "loss": 0.8692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8125369548797607, "rewards/margins": 1.6624714136123657, "rewards/rejected": 2.1500658988952637, "step": 85400 }, { "epoch": 3.965365151585496, "grad_norm": 53.59614562988281, "learning_rate": 6.218673104600956e-08, "logits/chosen": -19.237878799438477, "logits/rejected": -18.63428497314453, "logps/chosen": -438.83099365234375, "logps/rejected": -405.28564453125, "loss": 1.03, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.553497314453125, "rewards/margins": 0.18808627128601074, "rewards/rejected": 3.3654110431671143, "step": 85410 }, { "epoch": 3.965829425692929, "grad_norm": 3.6216442584991455, "learning_rate": 6.215887459956358e-08, "logits/chosen": -19.34564971923828, "logits/rejected": -17.72673988342285, "logps/chosen": -410.7491149902344, "logps/rejected": -261.4142150878906, "loss": 0.6321, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.300261974334717, "rewards/margins": 1.8986423015594482, "rewards/rejected": 1.4016190767288208, "step": 85420 }, { "epoch": 3.966293699800362, "grad_norm": 0.8408811092376709, "learning_rate": 6.21310181531176e-08, "logits/chosen": -19.67356300354004, "logits/rejected": -19.83698272705078, "logps/chosen": -344.82061767578125, "logps/rejected": -372.3453063964844, "loss": 0.856, "rewards/accuracies": 0.5, "rewards/chosen": 3.223982572555542, "rewards/margins": 0.8284748196601868, "rewards/rejected": 2.395508289337158, "step": 85430 }, { "epoch": 3.966757973907795, "grad_norm": 3.1432173252105713, "learning_rate": 6.210316170667162e-08, "logits/chosen": 
-18.702831268310547, "logits/rejected": -17.071504592895508, "logps/chosen": -428.58416748046875, "logps/rejected": -254.4168701171875, "loss": 0.1828, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.581488609313965, "rewards/margins": 3.467916965484619, "rewards/rejected": 1.1135714054107666, "step": 85440 }, { "epoch": 3.9672222480152284, "grad_norm": 0.004921233281493187, "learning_rate": 6.207530526022563e-08, "logits/chosen": -19.705778121948242, "logits/rejected": -18.491283416748047, "logps/chosen": -420.1832580566406, "logps/rejected": -295.0804138183594, "loss": 0.2408, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.794856071472168, "rewards/margins": 3.1053364276885986, "rewards/rejected": 1.6895195245742798, "step": 85450 }, { "epoch": 3.967686522122661, "grad_norm": 189.69052124023438, "learning_rate": 6.204744881377965e-08, "logits/chosen": -18.420698165893555, "logits/rejected": -18.351383209228516, "logps/chosen": -512.4886474609375, "logps/rejected": -438.8605041503906, "loss": 0.5613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.719334602355957, "rewards/margins": 1.6046346426010132, "rewards/rejected": 3.1147005558013916, "step": 85460 }, { "epoch": 3.9681507962300944, "grad_norm": 116.6161117553711, "learning_rate": 6.201959236733368e-08, "logits/chosen": -19.916072845458984, "logits/rejected": -18.76433753967285, "logps/chosen": -360.23101806640625, "logps/rejected": -266.3527526855469, "loss": 0.539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6760263442993164, "rewards/margins": 1.5391294956207275, "rewards/rejected": 1.1368968486785889, "step": 85470 }, { "epoch": 3.968615070337527, "grad_norm": 9.17127799987793, "learning_rate": 6.199173592088769e-08, "logits/chosen": -19.889583587646484, "logits/rejected": -19.322065353393555, "logps/chosen": -396.57940673828125, "logps/rejected": -392.9962463378906, "loss": 0.9027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
3.448697566986084, "rewards/margins": 0.2415177822113037, "rewards/rejected": 3.207179546356201, "step": 85480 }, { "epoch": 3.9690793444449604, "grad_norm": 128.99832153320312, "learning_rate": 6.19638794744417e-08, "logits/chosen": -18.737890243530273, "logits/rejected": -18.063385009765625, "logps/chosen": -379.1358337402344, "logps/rejected": -302.5445556640625, "loss": 0.9165, "rewards/accuracies": 0.5, "rewards/chosen": 3.5991909503936768, "rewards/margins": 0.9258950352668762, "rewards/rejected": 2.673295736312866, "step": 85490 }, { "epoch": 3.969543618552393, "grad_norm": 15.365880012512207, "learning_rate": 6.193602302799573e-08, "logits/chosen": -20.26514434814453, "logits/rejected": -18.730419158935547, "logps/chosen": -417.7408142089844, "logps/rejected": -280.92626953125, "loss": 0.2518, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.890225410461426, "rewards/margins": 3.07173752784729, "rewards/rejected": 1.8184878826141357, "step": 85500 }, { "epoch": 3.9700078926598263, "grad_norm": 9.306290626525879, "learning_rate": 6.190816658154975e-08, "logits/chosen": -19.46629524230957, "logits/rejected": -18.48971939086914, "logps/chosen": -382.9201354980469, "logps/rejected": -270.21368408203125, "loss": 0.2824, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.606327056884766, "rewards/margins": 2.39998197555542, "rewards/rejected": 2.2063443660736084, "step": 85510 }, { "epoch": 3.9704721667672596, "grad_norm": 0.012015829794108868, "learning_rate": 6.188031013510376e-08, "logits/chosen": -18.875957489013672, "logits/rejected": -17.55351448059082, "logps/chosen": -473.6968688964844, "logps/rejected": -336.16925048828125, "loss": 0.7876, "rewards/accuracies": 0.5, "rewards/chosen": 4.480191707611084, "rewards/margins": 2.1327853202819824, "rewards/rejected": 2.3474063873291016, "step": 85520 }, { "epoch": 3.9709364408746923, "grad_norm": 73.11560821533203, "learning_rate": 6.185245368865778e-08, "logits/chosen": 
-19.014665603637695, "logits/rejected": -18.369243621826172, "logps/chosen": -366.9291687011719, "logps/rejected": -322.5953063964844, "loss": 0.3988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.648190975189209, "rewards/margins": 1.8951857089996338, "rewards/rejected": 1.753005027770996, "step": 85530 }, { "epoch": 3.9714007149821255, "grad_norm": 106.27274322509766, "learning_rate": 6.182459724221179e-08, "logits/chosen": -19.06810760498047, "logits/rejected": -18.200517654418945, "logps/chosen": -346.3553161621094, "logps/rejected": -330.1546936035156, "loss": 0.4567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2206051349639893, "rewards/margins": 1.2977758646011353, "rewards/rejected": 1.922829270362854, "step": 85540 }, { "epoch": 3.9718649890895583, "grad_norm": 33.9152946472168, "learning_rate": 6.179674079576582e-08, "logits/chosen": -19.343490600585938, "logits/rejected": -18.791217803955078, "logps/chosen": -375.5373840332031, "logps/rejected": -297.0636901855469, "loss": 0.8787, "rewards/accuracies": 0.5, "rewards/chosen": 3.6399693489074707, "rewards/margins": 0.5332730412483215, "rewards/rejected": 3.106696367263794, "step": 85550 }, { "epoch": 3.9723292631969915, "grad_norm": 18.960468292236328, "learning_rate": 6.176888434931983e-08, "logits/chosen": -17.953365325927734, "logits/rejected": -17.68062973022461, "logps/chosen": -314.0066833496094, "logps/rejected": -240.48147583007812, "loss": 0.685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.054227590560913, "rewards/margins": 0.9983442425727844, "rewards/rejected": 1.0558836460113525, "step": 85560 }, { "epoch": 3.9727935373044243, "grad_norm": 187.12876892089844, "learning_rate": 6.174102790287385e-08, "logits/chosen": -20.172496795654297, "logits/rejected": -18.94241714477539, "logps/chosen": -515.799560546875, "logps/rejected": -371.1161193847656, "loss": 0.3733, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.771679401397705, 
"rewards/margins": 1.9053337574005127, "rewards/rejected": 2.8663458824157715, "step": 85570 }, { "epoch": 3.9732578114118575, "grad_norm": 84.405029296875, "learning_rate": 6.171317145642788e-08, "logits/chosen": -19.23383140563965, "logits/rejected": -18.171669006347656, "logps/chosen": -533.365234375, "logps/rejected": -289.43792724609375, "loss": 0.4808, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.969442367553711, "rewards/margins": 1.7654567956924438, "rewards/rejected": 1.2039858102798462, "step": 85580 }, { "epoch": 3.9737220855192907, "grad_norm": 159.10751342773438, "learning_rate": 6.168531500998189e-08, "logits/chosen": -18.855802536010742, "logits/rejected": -18.79120445251465, "logps/chosen": -354.3898620605469, "logps/rejected": -325.24957275390625, "loss": 0.7774, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.958437442779541, "rewards/margins": 1.4851168394088745, "rewards/rejected": 1.473320484161377, "step": 85590 }, { "epoch": 3.9741863596267235, "grad_norm": 72.59188079833984, "learning_rate": 6.16574585635359e-08, "logits/chosen": -19.145915985107422, "logits/rejected": -18.19487953186035, "logps/chosen": -401.3539733886719, "logps/rejected": -280.2364501953125, "loss": 0.2483, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2775046825408936, "rewards/margins": 2.3048126697540283, "rewards/rejected": 0.9726920127868652, "step": 85600 }, { "epoch": 3.9746506337341567, "grad_norm": 147.27769470214844, "learning_rate": 6.162960211708993e-08, "logits/chosen": -18.54000473022461, "logits/rejected": -18.535198211669922, "logps/chosen": -334.4247131347656, "logps/rejected": -272.85784912109375, "loss": 0.7497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2361226081848145, "rewards/margins": 0.5534325838088989, "rewards/rejected": 1.6826900243759155, "step": 85610 }, { "epoch": 3.97511490784159, "grad_norm": 4.001386642456055, "learning_rate": 6.160174567064395e-08, "logits/chosen": 
-19.07870101928711, "logits/rejected": -17.639131546020508, "logps/chosen": -469.0028381347656, "logps/rejected": -300.404541015625, "loss": 0.3153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.923341751098633, "rewards/margins": 1.9847252368927002, "rewards/rejected": 1.938616394996643, "step": 85620 }, { "epoch": 3.9755791819490227, "grad_norm": 8.558700561523438, "learning_rate": 6.157388922419796e-08, "logits/chosen": -19.434188842773438, "logits/rejected": -17.745830535888672, "logps/chosen": -396.6700134277344, "logps/rejected": -278.92864990234375, "loss": 0.229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.608338832855225, "rewards/margins": 2.7908761501312256, "rewards/rejected": 1.817462682723999, "step": 85630 }, { "epoch": 3.9760434560564555, "grad_norm": 128.550537109375, "learning_rate": 6.154603277775199e-08, "logits/chosen": -19.0589599609375, "logits/rejected": -18.646717071533203, "logps/chosen": -340.64202880859375, "logps/rejected": -310.24676513671875, "loss": 0.5972, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.349093198776245, "rewards/margins": 1.093117356300354, "rewards/rejected": 2.2559757232666016, "step": 85640 }, { "epoch": 3.9765077301638887, "grad_norm": 232.36508178710938, "learning_rate": 6.1518176331306e-08, "logits/chosen": -18.538043975830078, "logits/rejected": -18.588144302368164, "logps/chosen": -258.5337219238281, "logps/rejected": -266.5849914550781, "loss": 1.1034, "rewards/accuracies": 0.5, "rewards/chosen": 2.1475939750671387, "rewards/margins": 0.26800066232681274, "rewards/rejected": 1.8795932531356812, "step": 85650 }, { "epoch": 3.976972004271322, "grad_norm": 30.865161895751953, "learning_rate": 6.149031988486002e-08, "logits/chosen": -19.239429473876953, "logits/rejected": -18.739137649536133, "logps/chosen": -315.4664001464844, "logps/rejected": -303.9526062011719, "loss": 0.3819, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.846252918243408, 
"rewards/margins": 1.5908457040786743, "rewards/rejected": 1.2554075717926025, "step": 85660 }, { "epoch": 3.9774362783787547, "grad_norm": 37.010040283203125, "learning_rate": 6.146246343841403e-08, "logits/chosen": -19.60382080078125, "logits/rejected": -18.519529342651367, "logps/chosen": -372.9150085449219, "logps/rejected": -375.69915771484375, "loss": 0.762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.026749610900879, "rewards/margins": 0.8710934519767761, "rewards/rejected": 2.155655860900879, "step": 85670 }, { "epoch": 3.977900552486188, "grad_norm": 123.96073913574219, "learning_rate": 6.143460699196806e-08, "logits/chosen": -19.241323471069336, "logits/rejected": -17.9105167388916, "logps/chosen": -307.5005798339844, "logps/rejected": -262.3202819824219, "loss": 0.3167, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2047152519226074, "rewards/margins": 1.9813123941421509, "rewards/rejected": 1.2234026193618774, "step": 85680 }, { "epoch": 3.978364826593621, "grad_norm": 65.28163146972656, "learning_rate": 6.140675054552208e-08, "logits/chosen": -18.571781158447266, "logits/rejected": -18.348907470703125, "logps/chosen": -342.52899169921875, "logps/rejected": -370.6855163574219, "loss": 0.4902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5361578464508057, "rewards/margins": 1.138406753540039, "rewards/rejected": 1.3977510929107666, "step": 85690 }, { "epoch": 3.978829100701054, "grad_norm": 89.97096252441406, "learning_rate": 6.137889409907609e-08, "logits/chosen": -19.93851089477539, "logits/rejected": -18.809566497802734, "logps/chosen": -380.50689697265625, "logps/rejected": -310.86260986328125, "loss": 0.4385, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.106362819671631, "rewards/margins": 1.314994215965271, "rewards/rejected": 1.791369080543518, "step": 85700 }, { "epoch": 3.9792933748084867, "grad_norm": 6.216719150543213, "learning_rate": 6.135103765263012e-08, "logits/chosen": 
-19.163908004760742, "logits/rejected": -18.51702308654785, "logps/chosen": -305.81561279296875, "logps/rejected": -262.6042175292969, "loss": 0.3889, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.025531053543091, "rewards/margins": 1.8453524112701416, "rewards/rejected": 1.1801784038543701, "step": 85710 }, { "epoch": 3.97975764891592, "grad_norm": 106.59211730957031, "learning_rate": 6.132318120618413e-08, "logits/chosen": -18.17080307006836, "logits/rejected": -17.626140594482422, "logps/chosen": -379.7077941894531, "logps/rejected": -289.5619201660156, "loss": 0.6825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6862120628356934, "rewards/margins": 1.2420856952667236, "rewards/rejected": 1.4441264867782593, "step": 85720 }, { "epoch": 3.980221923023353, "grad_norm": 12.270059585571289, "learning_rate": 6.129532475973815e-08, "logits/chosen": -18.620851516723633, "logits/rejected": -18.69316864013672, "logps/chosen": -368.697021484375, "logps/rejected": -364.7361145019531, "loss": 1.0566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5766589641571045, "rewards/margins": 0.2162565290927887, "rewards/rejected": 3.3604018688201904, "step": 85730 }, { "epoch": 3.980686197130786, "grad_norm": 9.603548049926758, "learning_rate": 6.126746831329216e-08, "logits/chosen": -18.71630096435547, "logits/rejected": -17.95653533935547, "logps/chosen": -432.491455078125, "logps/rejected": -416.50335693359375, "loss": 0.4299, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.321910381317139, "rewards/margins": 1.2722049951553345, "rewards/rejected": 3.0497052669525146, "step": 85740 }, { "epoch": 3.981150471238219, "grad_norm": 0.10830007493495941, "learning_rate": 6.123961186684618e-08, "logits/chosen": -19.091907501220703, "logits/rejected": -18.14443588256836, "logps/chosen": -405.685791015625, "logps/rejected": -363.63616943359375, "loss": 0.7944, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
4.143385887145996, "rewards/margins": 1.1961138248443604, "rewards/rejected": 2.9472718238830566, "step": 85750 }, { "epoch": 3.9816147453456523, "grad_norm": 0.5029964447021484, "learning_rate": 6.12117554204002e-08, "logits/chosen": -19.461898803710938, "logits/rejected": -18.052377700805664, "logps/chosen": -438.47198486328125, "logps/rejected": -250.43118286132812, "loss": 0.2361, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.877819061279297, "rewards/margins": 2.9806721210479736, "rewards/rejected": 0.8971472978591919, "step": 85760 }, { "epoch": 3.982079019453085, "grad_norm": 15.870490074157715, "learning_rate": 6.118389897395422e-08, "logits/chosen": -19.495067596435547, "logits/rejected": -19.136444091796875, "logps/chosen": -368.5936584472656, "logps/rejected": -369.93585205078125, "loss": 1.3144, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.540234327316284, "rewards/margins": 0.6134767532348633, "rewards/rejected": 2.926757335662842, "step": 85770 }, { "epoch": 3.9825432935605183, "grad_norm": 109.13871002197266, "learning_rate": 6.115604252750823e-08, "logits/chosen": -19.31312370300293, "logits/rejected": -18.42337417602539, "logps/chosen": -356.0110168457031, "logps/rejected": -319.1285095214844, "loss": 0.6346, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4992644786834717, "rewards/margins": 1.2418848276138306, "rewards/rejected": 2.2573797702789307, "step": 85780 }, { "epoch": 3.983007567667951, "grad_norm": 8.757101058959961, "learning_rate": 6.112818608106226e-08, "logits/chosen": -19.030954360961914, "logits/rejected": -18.04067611694336, "logps/chosen": -344.92376708984375, "logps/rejected": -335.5879211425781, "loss": 0.9129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.817058801651001, "rewards/margins": 0.6838475465774536, "rewards/rejected": 2.133211612701416, "step": 85790 }, { "epoch": 3.9834718417753843, "grad_norm": 167.30905151367188, "learning_rate": 
6.110032963461627e-08, "logits/chosen": -18.446420669555664, "logits/rejected": -18.372413635253906, "logps/chosen": -368.7493591308594, "logps/rejected": -312.8578186035156, "loss": 0.9695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0236878395080566, "rewards/margins": 0.9895267486572266, "rewards/rejected": 2.034161329269409, "step": 85800 }, { "epoch": 3.983936115882817, "grad_norm": 4.579732894897461, "learning_rate": 6.107247318817029e-08, "logits/chosen": -17.882909774780273, "logits/rejected": -17.56096076965332, "logps/chosen": -320.99578857421875, "logps/rejected": -302.14849853515625, "loss": 0.7359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.404207468032837, "rewards/margins": 1.5919358730316162, "rewards/rejected": 0.8122712969779968, "step": 85810 }, { "epoch": 3.9844003899902503, "grad_norm": 58.34621810913086, "learning_rate": 6.104461674172432e-08, "logits/chosen": -19.148714065551758, "logits/rejected": -18.557785034179688, "logps/chosen": -405.5776062011719, "logps/rejected": -369.91741943359375, "loss": 0.8345, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.443598985671997, "rewards/margins": 0.589240550994873, "rewards/rejected": 2.854358434677124, "step": 85820 }, { "epoch": 3.9848646640976835, "grad_norm": 18.57583999633789, "learning_rate": 6.101676029527833e-08, "logits/chosen": -19.174224853515625, "logits/rejected": -18.79338836669922, "logps/chosen": -417.151611328125, "logps/rejected": -355.1629638671875, "loss": 0.3623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5745949745178223, "rewards/margins": 1.419655442237854, "rewards/rejected": 2.1549391746520996, "step": 85830 }, { "epoch": 3.9853289382051162, "grad_norm": 52.97024917602539, "learning_rate": 6.098890384883235e-08, "logits/chosen": -19.004676818847656, "logits/rejected": -18.652284622192383, "logps/chosen": -336.1697082519531, "logps/rejected": -312.2818908691406, "loss": 1.5151, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 2.451853036880493, "rewards/margins": -0.35958337783813477, "rewards/rejected": 2.811436176300049, "step": 85840 }, { "epoch": 3.9857932123125495, "grad_norm": 168.72564697265625, "learning_rate": 6.096104740238637e-08, "logits/chosen": -20.222591400146484, "logits/rejected": -19.414905548095703, "logps/chosen": -463.37646484375, "logps/rejected": -362.54742431640625, "loss": 0.8986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.725294828414917, "rewards/margins": 0.6433233618736267, "rewards/rejected": 3.0819716453552246, "step": 85850 }, { "epoch": 3.9862574864199822, "grad_norm": 0.2941904366016388, "learning_rate": 6.093319095594039e-08, "logits/chosen": -18.34418487548828, "logits/rejected": -18.074996948242188, "logps/chosen": -384.8881530761719, "logps/rejected": -344.26318359375, "loss": 0.7252, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9888243675231934, "rewards/margins": 1.5656640529632568, "rewards/rejected": 2.4231603145599365, "step": 85860 }, { "epoch": 3.9867217605274154, "grad_norm": 3.0774457454681396, "learning_rate": 6.09053345094944e-08, "logits/chosen": -19.397985458374023, "logits/rejected": -17.17258644104004, "logps/chosen": -412.9896545410156, "logps/rejected": -237.8641357421875, "loss": 0.5969, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.564654350280762, "rewards/margins": 3.6203856468200684, "rewards/rejected": 0.9442685842514038, "step": 85870 }, { "epoch": 3.987186034634848, "grad_norm": 99.49996185302734, "learning_rate": 6.087747806304842e-08, "logits/chosen": -18.604721069335938, "logits/rejected": -19.23520278930664, "logps/chosen": -357.3772888183594, "logps/rejected": -388.7425842285156, "loss": 0.8561, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3350977897644043, "rewards/margins": 0.3513413965702057, "rewards/rejected": 2.9837570190429688, "step": 85880 }, { "epoch": 3.9876503087422814, "grad_norm": 254.55291748046875, 
"learning_rate": 6.084962161660245e-08, "logits/chosen": -18.219707489013672, "logits/rejected": -17.749849319458008, "logps/chosen": -354.0780334472656, "logps/rejected": -318.209716796875, "loss": 0.6897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0853357315063477, "rewards/margins": 0.8040071725845337, "rewards/rejected": 2.2813286781311035, "step": 85890 }, { "epoch": 3.9881145828497147, "grad_norm": 0.9851788282394409, "learning_rate": 6.082176517015646e-08, "logits/chosen": -19.159496307373047, "logits/rejected": -17.873489379882812, "logps/chosen": -488.08697509765625, "logps/rejected": -334.62347412109375, "loss": 0.3084, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.412586688995361, "rewards/margins": 2.425637722015381, "rewards/rejected": 1.9869487285614014, "step": 85900 }, { "epoch": 3.9885788569571474, "grad_norm": 4.766965389251709, "learning_rate": 6.079390872371047e-08, "logits/chosen": -18.457481384277344, "logits/rejected": -18.06962013244629, "logps/chosen": -354.91790771484375, "logps/rejected": -323.4766540527344, "loss": 1.1441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8588740825653076, "rewards/margins": 0.1739383041858673, "rewards/rejected": 2.6849358081817627, "step": 85910 }, { "epoch": 3.9890431310645806, "grad_norm": 0.06760457903146744, "learning_rate": 6.076605227726449e-08, "logits/chosen": -19.334758758544922, "logits/rejected": -18.380916595458984, "logps/chosen": -374.3926696777344, "logps/rejected": -284.86993408203125, "loss": 0.2917, "rewards/accuracies": 1.0, "rewards/chosen": 3.476996660232544, "rewards/margins": 2.1725950241088867, "rewards/rejected": 1.3044016361236572, "step": 85920 }, { "epoch": 3.9895074051720134, "grad_norm": 14.64083480834961, "learning_rate": 6.07381958308185e-08, "logits/chosen": -19.530712127685547, "logits/rejected": -18.204404830932617, "logps/chosen": -439.4419860839844, "logps/rejected": -264.1788635253906, "loss": 0.107, 
"rewards/accuracies": 1.0, "rewards/chosen": 4.18912410736084, "rewards/margins": 3.182823896408081, "rewards/rejected": 1.0063002109527588, "step": 85930 }, { "epoch": 3.9899716792794466, "grad_norm": 22.213029861450195, "learning_rate": 6.071033938437253e-08, "logits/chosen": -20.151344299316406, "logits/rejected": -19.689090728759766, "logps/chosen": -370.4812927246094, "logps/rejected": -373.4326171875, "loss": 1.2472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.716648817062378, "rewards/margins": 0.2489163875579834, "rewards/rejected": 3.4677319526672363, "step": 85940 }, { "epoch": 3.9904359533868794, "grad_norm": 114.9783706665039, "learning_rate": 6.068248293792655e-08, "logits/chosen": -19.013242721557617, "logits/rejected": -19.096237182617188, "logps/chosen": -277.62384033203125, "logps/rejected": -279.9267883300781, "loss": 0.5406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.864337205886841, "rewards/margins": 1.2454144954681396, "rewards/rejected": 1.618922472000122, "step": 85950 }, { "epoch": 3.9909002274943126, "grad_norm": 0.020366612821817398, "learning_rate": 6.065462649148056e-08, "logits/chosen": -19.68311309814453, "logits/rejected": -18.897748947143555, "logps/chosen": -337.44366455078125, "logps/rejected": -395.73468017578125, "loss": 1.2657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8641319274902344, "rewards/margins": 1.356379508972168, "rewards/rejected": 2.5077526569366455, "step": 85960 }, { "epoch": 3.991364501601746, "grad_norm": 77.67724609375, "learning_rate": 6.062677004503459e-08, "logits/chosen": -18.32514190673828, "logits/rejected": -17.33155059814453, "logps/chosen": -470.960693359375, "logps/rejected": -298.92669677734375, "loss": 0.3251, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.137560844421387, "rewards/margins": 2.8625380992889404, "rewards/rejected": 1.2750228643417358, "step": 85970 }, { "epoch": 3.9918287757091786, "grad_norm": 52.6295166015625, 
"learning_rate": 6.05989135985886e-08, "logits/chosen": -18.816469192504883, "logits/rejected": -18.79204750061035, "logps/chosen": -442.962158203125, "logps/rejected": -457.3777770996094, "loss": 0.5065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7174384593963623, "rewards/margins": 1.1196990013122559, "rewards/rejected": 2.5977396965026855, "step": 85980 }, { "epoch": 3.992293049816612, "grad_norm": 46.141597747802734, "learning_rate": 6.057105715214262e-08, "logits/chosen": -19.627735137939453, "logits/rejected": -17.90958595275879, "logps/chosen": -351.51434326171875, "logps/rejected": -195.35337829589844, "loss": 0.4026, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9174399375915527, "rewards/margins": 2.2141215801239014, "rewards/rejected": 0.7033182978630066, "step": 85990 }, { "epoch": 3.992757323924045, "grad_norm": 187.74485778808594, "learning_rate": 6.054320070569664e-08, "logits/chosen": -19.04098892211914, "logits/rejected": -19.097461700439453, "logps/chosen": -368.67840576171875, "logps/rejected": -287.147216796875, "loss": 0.7586, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2126617431640625, "rewards/margins": 0.4485917091369629, "rewards/rejected": 2.7640697956085205, "step": 86000 }, { "epoch": 3.993221598031478, "grad_norm": 0.2978731393814087, "learning_rate": 6.051534425925066e-08, "logits/chosen": -19.581165313720703, "logits/rejected": -19.1498966217041, "logps/chosen": -448.61187744140625, "logps/rejected": -312.59527587890625, "loss": 0.396, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8922905921936035, "rewards/margins": 2.2726221084594727, "rewards/rejected": 1.6196680068969727, "step": 86010 }, { "epoch": 3.9936858721389106, "grad_norm": 39.03105163574219, "learning_rate": 6.048748781280467e-08, "logits/chosen": -19.047794342041016, "logits/rejected": -18.2028865814209, "logps/chosen": -366.1371765136719, "logps/rejected": -253.0278778076172, "loss": 0.3433, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1165895462036133, "rewards/margins": 1.8289581537246704, "rewards/rejected": 1.2876315116882324, "step": 86020 }, { "epoch": 3.994150146246344, "grad_norm": 1.311802625656128, "learning_rate": 6.04596313663587e-08, "logits/chosen": -19.749053955078125, "logits/rejected": -18.97982406616211, "logps/chosen": -411.9888610839844, "logps/rejected": -352.5691223144531, "loss": 0.6304, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5675134658813477, "rewards/margins": 1.3678089380264282, "rewards/rejected": 1.1997045278549194, "step": 86030 }, { "epoch": 3.994614420353777, "grad_norm": 90.33289337158203, "learning_rate": 6.043177491991272e-08, "logits/chosen": -19.450597763061523, "logits/rejected": -18.682544708251953, "logps/chosen": -443.6434020996094, "logps/rejected": -362.82208251953125, "loss": 0.6586, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.596363067626953, "rewards/margins": 0.994842529296875, "rewards/rejected": 1.6015207767486572, "step": 86040 }, { "epoch": 3.99507869446121, "grad_norm": 49.40602493286133, "learning_rate": 6.040391847346673e-08, "logits/chosen": -18.609683990478516, "logits/rejected": -18.446048736572266, "logps/chosen": -407.02880859375, "logps/rejected": -369.74432373046875, "loss": 0.4915, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.373657703399658, "rewards/margins": 1.0882453918457031, "rewards/rejected": 3.285412311553955, "step": 86050 }, { "epoch": 3.995542968568643, "grad_norm": 10.596392631530762, "learning_rate": 6.037606202702076e-08, "logits/chosen": -19.06061363220215, "logits/rejected": -17.792470932006836, "logps/chosen": -448.84967041015625, "logps/rejected": -301.2569580078125, "loss": 0.2712, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.691507339477539, "rewards/margins": 2.1646742820739746, "rewards/rejected": 1.5268332958221436, "step": 86060 }, { "epoch": 3.996007242676076, "grad_norm": 
27.345542907714844, "learning_rate": 6.034820558057477e-08, "logits/chosen": -19.83919906616211, "logits/rejected": -18.29206657409668, "logps/chosen": -448.9232482910156, "logps/rejected": -322.9082946777344, "loss": 0.3897, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9636077880859375, "rewards/margins": 2.4143147468566895, "rewards/rejected": 1.5492929220199585, "step": 86070 }, { "epoch": 3.996471516783509, "grad_norm": 3.484313726425171, "learning_rate": 6.032034913412879e-08, "logits/chosen": -20.011852264404297, "logits/rejected": -19.316789627075195, "logps/chosen": -325.7017822265625, "logps/rejected": -287.4249572753906, "loss": 1.2284, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.284989833831787, "rewards/margins": -0.10643668472766876, "rewards/rejected": 2.3914265632629395, "step": 86080 }, { "epoch": 3.9969357908909418, "grad_norm": 0.9627315998077393, "learning_rate": 6.02924926876828e-08, "logits/chosen": -19.96929359436035, "logits/rejected": -19.266735076904297, "logps/chosen": -309.9285888671875, "logps/rejected": -264.83367919921875, "loss": 0.5659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7019920349121094, "rewards/margins": 1.2485802173614502, "rewards/rejected": 1.45341157913208, "step": 86090 }, { "epoch": 3.997400064998375, "grad_norm": 104.6070327758789, "learning_rate": 6.026463624123683e-08, "logits/chosen": -17.891979217529297, "logits/rejected": -19.096839904785156, "logps/chosen": -325.32781982421875, "logps/rejected": -399.713134765625, "loss": 1.87, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2217679023742676, "rewards/margins": -1.215868353843689, "rewards/rejected": 3.437636613845825, "step": 86100 }, { "epoch": 3.997864339105808, "grad_norm": 144.52252197265625, "learning_rate": 6.023677979479084e-08, "logits/chosen": -18.55562973022461, "logits/rejected": -18.640987396240234, "logps/chosen": -392.9922790527344, "logps/rejected": -402.56658935546875, "loss": 
0.9908, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.123368501663208, "rewards/margins": 0.19939443469047546, "rewards/rejected": 2.9239742755889893, "step": 86110 }, { "epoch": 3.998328613213241, "grad_norm": 69.52377319335938, "learning_rate": 6.020892334834486e-08, "logits/chosen": -19.498634338378906, "logits/rejected": -19.198293685913086, "logps/chosen": -325.8841857910156, "logps/rejected": -283.1436462402344, "loss": 0.6115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7757439613342285, "rewards/margins": 1.2321937084197998, "rewards/rejected": 1.5435503721237183, "step": 86120 }, { "epoch": 3.998792887320674, "grad_norm": 231.2386932373047, "learning_rate": 6.018106690189887e-08, "logits/chosen": -18.2478084564209, "logits/rejected": -17.168550491333008, "logps/chosen": -380.83416748046875, "logps/rejected": -232.5230255126953, "loss": 0.9691, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8950603008270264, "rewards/margins": 2.3574202060699463, "rewards/rejected": 1.5376403331756592, "step": 86130 }, { "epoch": 3.9992571614281074, "grad_norm": 61.99256134033203, "learning_rate": 6.015321045545289e-08, "logits/chosen": -19.313364028930664, "logits/rejected": -19.108388900756836, "logps/chosen": -472.90911865234375, "logps/rejected": -457.50164794921875, "loss": 0.3543, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9822170734405518, "rewards/margins": 1.6025068759918213, "rewards/rejected": 2.3797104358673096, "step": 86140 }, { "epoch": 3.99972143553554, "grad_norm": 37.403289794921875, "learning_rate": 6.012535400900692e-08, "logits/chosen": -19.932836532592773, "logits/rejected": -18.242603302001953, "logps/chosen": -401.27044677734375, "logps/rejected": -270.201171875, "loss": 0.2318, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7450389862060547, "rewards/margins": 2.0401129722595215, "rewards/rejected": 1.7049258947372437, "step": 86150 }, { "epoch": 4.000185709642973, 
"grad_norm": 190.29522705078125, "learning_rate": 6.009749756256093e-08, "logits/chosen": -19.000202178955078, "logits/rejected": -18.704387664794922, "logps/chosen": -420.09320068359375, "logps/rejected": -360.67059326171875, "loss": 0.7276, "rewards/accuracies": 0.5, "rewards/chosen": 3.0222015380859375, "rewards/margins": 0.538114070892334, "rewards/rejected": 2.4840874671936035, "step": 86160 }, { "epoch": 4.000649983750407, "grad_norm": 102.33076477050781, "learning_rate": 6.006964111611495e-08, "logits/chosen": -18.237567901611328, "logits/rejected": -17.386280059814453, "logps/chosen": -413.6520080566406, "logps/rejected": -280.13104248046875, "loss": 0.6368, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1859209537506104, "rewards/margins": 1.9118322134017944, "rewards/rejected": 1.2740883827209473, "step": 86170 }, { "epoch": 4.001114257857839, "grad_norm": 5.928825378417969, "learning_rate": 6.004178466966897e-08, "logits/chosen": -17.83127784729004, "logits/rejected": -17.627513885498047, "logps/chosen": -416.3126525878906, "logps/rejected": -327.42498779296875, "loss": 1.2684, "rewards/accuracies": 0.5, "rewards/chosen": 3.0582852363586426, "rewards/margins": 0.7147278785705566, "rewards/rejected": 2.343557596206665, "step": 86180 }, { "epoch": 4.001578531965272, "grad_norm": 30.921358108520508, "learning_rate": 6.001392822322299e-08, "logits/chosen": -19.384174346923828, "logits/rejected": -18.428836822509766, "logps/chosen": -389.8158874511719, "logps/rejected": -303.85870361328125, "loss": 0.3314, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5358314514160156, "rewards/margins": 1.3934623003005981, "rewards/rejected": 2.142369031906128, "step": 86190 }, { "epoch": 4.002042806072705, "grad_norm": 130.98690795898438, "learning_rate": 5.9986071776777e-08, "logits/chosen": -19.97119140625, "logits/rejected": -19.660301208496094, "logps/chosen": -394.3602600097656, "logps/rejected": -383.3748474121094, "loss": 0.6364, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.072067737579346, "rewards/margins": 0.7767651677131653, "rewards/rejected": 3.295302629470825, "step": 86200 }, { "epoch": 4.002507080180139, "grad_norm": 0.11952272057533264, "learning_rate": 5.995821533033103e-08, "logits/chosen": -20.603120803833008, "logits/rejected": -19.444442749023438, "logps/chosen": -472.8717346191406, "logps/rejected": -321.7828674316406, "loss": 0.3496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.846966743469238, "rewards/margins": 2.465402603149414, "rewards/rejected": 2.381563901901245, "step": 86210 }, { "epoch": 4.002971354287571, "grad_norm": 1.8819375038146973, "learning_rate": 5.993035888388504e-08, "logits/chosen": -19.357831954956055, "logits/rejected": -17.896499633789062, "logps/chosen": -379.59674072265625, "logps/rejected": -266.948974609375, "loss": 0.2856, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.378260135650635, "rewards/margins": 2.6608643531799316, "rewards/rejected": 1.7173950672149658, "step": 86220 }, { "epoch": 4.003435628395004, "grad_norm": 134.98773193359375, "learning_rate": 5.990250243743906e-08, "logits/chosen": -20.063419342041016, "logits/rejected": -20.15631675720215, "logps/chosen": -275.12994384765625, "logps/rejected": -325.15496826171875, "loss": 1.4526, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.2338192462921143, "rewards/margins": -0.963427722454071, "rewards/rejected": 3.197246789932251, "step": 86230 }, { "epoch": 4.003899902502438, "grad_norm": 9.079448699951172, "learning_rate": 5.987464599099309e-08, "logits/chosen": -18.91970443725586, "logits/rejected": -18.4132080078125, "logps/chosen": -320.7572021484375, "logps/rejected": -279.4617919921875, "loss": 0.6402, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0746607780456543, "rewards/margins": 0.5208002328872681, "rewards/rejected": 1.5538604259490967, "step": 86240 }, { "epoch": 4.0043641766098705, "grad_norm": 
0.40422311425209045, "learning_rate": 5.98467895445471e-08, "logits/chosen": -18.84494972229004, "logits/rejected": -18.39400863647461, "logps/chosen": -396.89581298828125, "logps/rejected": -377.969970703125, "loss": 1.1576, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.266456127166748, "rewards/margins": 0.770300030708313, "rewards/rejected": 2.4961562156677246, "step": 86250 }, { "epoch": 4.004828450717303, "grad_norm": 285.8179931640625, "learning_rate": 5.981893309810112e-08, "logits/chosen": -19.877708435058594, "logits/rejected": -19.44083023071289, "logps/chosen": -476.27056884765625, "logps/rejected": -498.9649353027344, "loss": 0.5585, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7263214588165283, "rewards/margins": 0.8632593154907227, "rewards/rejected": 2.8630621433258057, "step": 86260 }, { "epoch": 4.005292724824736, "grad_norm": 34.332672119140625, "learning_rate": 5.979107665165514e-08, "logits/chosen": -19.24283218383789, "logits/rejected": -18.348007202148438, "logps/chosen": -509.79913330078125, "logps/rejected": -376.0385437011719, "loss": 0.1953, "rewards/accuracies": 1.0, "rewards/chosen": 5.300324440002441, "rewards/margins": 2.596857786178589, "rewards/rejected": 2.7034668922424316, "step": 86270 }, { "epoch": 4.00575699893217, "grad_norm": 7.8680291175842285, "learning_rate": 5.976322020520916e-08, "logits/chosen": -19.12407875061035, "logits/rejected": -18.61939239501953, "logps/chosen": -295.8497619628906, "logps/rejected": -257.12530517578125, "loss": 0.3919, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.861123561859131, "rewards/margins": 1.1462461948394775, "rewards/rejected": 1.7148773670196533, "step": 86280 }, { "epoch": 4.0062212730396025, "grad_norm": 67.38977813720703, "learning_rate": 5.973536375876317e-08, "logits/chosen": -19.190303802490234, "logits/rejected": -17.9182186126709, "logps/chosen": -456.24761962890625, "logps/rejected": -269.44476318359375, "loss": 0.3873, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.093099594116211, "rewards/margins": 2.537052869796753, "rewards/rejected": 1.5560462474822998, "step": 86290 }, { "epoch": 4.006685547147035, "grad_norm": 72.23764038085938, "learning_rate": 5.970750731231719e-08, "logits/chosen": -19.620004653930664, "logits/rejected": -17.695117950439453, "logps/chosen": -361.2821350097656, "logps/rejected": -194.2471160888672, "loss": 0.1338, "rewards/accuracies": 1.0, "rewards/chosen": 4.036879539489746, "rewards/margins": 3.480534315109253, "rewards/rejected": 0.556344747543335, "step": 86300 }, { "epoch": 4.007149821254469, "grad_norm": 226.84945678710938, "learning_rate": 5.967965086587121e-08, "logits/chosen": -19.190288543701172, "logits/rejected": -18.7410945892334, "logps/chosen": -385.9962463378906, "logps/rejected": -272.3707580566406, "loss": 0.4349, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.379915952682495, "rewards/margins": 1.5613523721694946, "rewards/rejected": 1.8185638189315796, "step": 86310 }, { "epoch": 4.007614095361902, "grad_norm": 5.563143730163574, "learning_rate": 5.965179441942522e-08, "logits/chosen": -19.169750213623047, "logits/rejected": -18.919559478759766, "logps/chosen": -347.8987121582031, "logps/rejected": -264.1069030761719, "loss": 0.6333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.147696018218994, "rewards/margins": 0.6591135859489441, "rewards/rejected": 1.4885826110839844, "step": 86320 }, { "epoch": 4.0080783694693345, "grad_norm": 7.185044288635254, "learning_rate": 5.962393797297924e-08, "logits/chosen": -19.240310668945312, "logits/rejected": -18.54920768737793, "logps/chosen": -359.7542419433594, "logps/rejected": -370.204345703125, "loss": 0.9483, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8159804344177246, "rewards/margins": 0.5580673813819885, "rewards/rejected": 3.257913112640381, "step": 86330 }, { "epoch": 4.008542643576768, "grad_norm": 22.94765281677246, 
"learning_rate": 5.959608152653326e-08, "logits/chosen": -19.104312896728516, "logits/rejected": -17.962871551513672, "logps/chosen": -326.64251708984375, "logps/rejected": -235.8827362060547, "loss": 0.3218, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.992676258087158, "rewards/margins": 1.8756414651870728, "rewards/rejected": 1.117034673690796, "step": 86340 }, { "epoch": 4.009006917684201, "grad_norm": 0.9619469046592712, "learning_rate": 5.956822508008728e-08, "logits/chosen": -20.030223846435547, "logits/rejected": -18.515155792236328, "logps/chosen": -415.29248046875, "logps/rejected": -355.6144104003906, "loss": 0.7201, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8918914794921875, "rewards/margins": 1.1968984603881836, "rewards/rejected": 1.6949927806854248, "step": 86350 }, { "epoch": 4.009471191791634, "grad_norm": 257.0032958984375, "learning_rate": 5.9540368633641294e-08, "logits/chosen": -18.320751190185547, "logits/rejected": -17.758338928222656, "logps/chosen": -321.24072265625, "logps/rejected": -249.05667114257812, "loss": 0.9465, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8620662689208984, "rewards/margins": 1.4713242053985596, "rewards/rejected": 1.390742301940918, "step": 86360 }, { "epoch": 4.0099354658990665, "grad_norm": 147.87799072265625, "learning_rate": 5.9512512187195315e-08, "logits/chosen": -19.886371612548828, "logits/rejected": -18.82220458984375, "logps/chosen": -358.84552001953125, "logps/rejected": -296.3784484863281, "loss": 0.8262, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.046293020248413, "rewards/margins": 0.6790803670883179, "rewards/rejected": 2.3672127723693848, "step": 86370 }, { "epoch": 4.0103997400065, "grad_norm": 0.273024320602417, "learning_rate": 5.9484655740749336e-08, "logits/chosen": -19.629289627075195, "logits/rejected": -18.57735252380371, "logps/chosen": -329.0124206542969, "logps/rejected": -265.3096008300781, "loss": 0.944, 
"rewards/accuracies": 0.5, "rewards/chosen": 2.481194019317627, "rewards/margins": 0.5144203305244446, "rewards/rejected": 1.9667739868164062, "step": 86380 }, { "epoch": 4.010864014113933, "grad_norm": 114.92312622070312, "learning_rate": 5.945679929430335e-08, "logits/chosen": -19.196182250976562, "logits/rejected": -18.492799758911133, "logps/chosen": -318.60052490234375, "logps/rejected": -275.32489013671875, "loss": 0.3651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2861900329589844, "rewards/margins": 1.5820600986480713, "rewards/rejected": 1.7041298151016235, "step": 86390 }, { "epoch": 4.011328288221366, "grad_norm": 245.67074584960938, "learning_rate": 5.942894284785737e-08, "logits/chosen": -18.706451416015625, "logits/rejected": -18.58359146118164, "logps/chosen": -419.2167053222656, "logps/rejected": -414.16314697265625, "loss": 0.6789, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5911312103271484, "rewards/margins": 0.3448841869831085, "rewards/rejected": 3.2462470531463623, "step": 86400 }, { "epoch": 4.011792562328799, "grad_norm": 9.874828338623047, "learning_rate": 5.940108640141139e-08, "logits/chosen": -19.5563907623291, "logits/rejected": -17.906679153442383, "logps/chosen": -395.5209655761719, "logps/rejected": -271.9704895019531, "loss": 0.2162, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.076083183288574, "rewards/margins": 2.5783677101135254, "rewards/rejected": 1.4977154731750488, "step": 86410 }, { "epoch": 4.012256836436232, "grad_norm": 19.111589431762695, "learning_rate": 5.937322995496541e-08, "logits/chosen": -20.058002471923828, "logits/rejected": -18.641618728637695, "logps/chosen": -578.7446899414062, "logps/rejected": -385.44512939453125, "loss": 0.4504, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.924551486968994, "rewards/margins": 2.309337615966797, "rewards/rejected": 1.615213394165039, "step": 86420 }, { "epoch": 4.012721110543665, "grad_norm": 
142.87255859375, "learning_rate": 5.934537350851943e-08, "logits/chosen": -18.8559627532959, "logits/rejected": -18.162809371948242, "logps/chosen": -357.66473388671875, "logps/rejected": -262.47100830078125, "loss": 0.4354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4346091747283936, "rewards/margins": 2.467735767364502, "rewards/rejected": 0.9668736457824707, "step": 86430 }, { "epoch": 4.013185384651098, "grad_norm": 55.58509063720703, "learning_rate": 5.9317517062073443e-08, "logits/chosen": -18.680477142333984, "logits/rejected": -17.87734603881836, "logps/chosen": -337.43243408203125, "logps/rejected": -227.583251953125, "loss": 0.5867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9616329669952393, "rewards/margins": 2.179476737976074, "rewards/rejected": 0.7821559906005859, "step": 86440 }, { "epoch": 4.013649658758531, "grad_norm": 69.65911102294922, "learning_rate": 5.9289660615627465e-08, "logits/chosen": -18.156505584716797, "logits/rejected": -17.172788619995117, "logps/chosen": -375.7960510253906, "logps/rejected": -283.1960144042969, "loss": 0.5665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2837014198303223, "rewards/margins": 2.6982784271240234, "rewards/rejected": 0.5854231119155884, "step": 86450 }, { "epoch": 4.014113932865964, "grad_norm": 163.63198852539062, "learning_rate": 5.9261804169181486e-08, "logits/chosen": -18.94144058227539, "logits/rejected": -19.42070960998535, "logps/chosen": -348.88421630859375, "logps/rejected": -391.97943115234375, "loss": 1.5519, "rewards/accuracies": 0.5, "rewards/chosen": 4.009486675262451, "rewards/margins": 0.214274600148201, "rewards/rejected": 3.795212507247925, "step": 86460 }, { "epoch": 4.014578206973397, "grad_norm": 286.0190124511719, "learning_rate": 5.92339477227355e-08, "logits/chosen": -19.194778442382812, "logits/rejected": -18.57473373413086, "logps/chosen": -384.3171081542969, "logps/rejected": -370.2940368652344, "loss": 0.8927, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5224926471710205, "rewards/margins": 0.7267628908157349, "rewards/rejected": 2.795729637145996, "step": 86470 }, { "epoch": 4.0150424810808305, "grad_norm": 158.48812866210938, "learning_rate": 5.920609127628952e-08, "logits/chosen": -19.0926513671875, "logits/rejected": -18.660221099853516, "logps/chosen": -391.9421691894531, "logps/rejected": -320.6728515625, "loss": 0.7613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9927010536193848, "rewards/margins": 1.0458347797393799, "rewards/rejected": 1.9468663930892944, "step": 86480 }, { "epoch": 4.015506755188263, "grad_norm": 47.79999923706055, "learning_rate": 5.917823482984354e-08, "logits/chosen": -18.75197982788086, "logits/rejected": -18.294342041015625, "logps/chosen": -322.8780212402344, "logps/rejected": -265.6164855957031, "loss": 0.7726, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9581589698791504, "rewards/margins": 1.0804858207702637, "rewards/rejected": 1.8776733875274658, "step": 86490 }, { "epoch": 4.015971029295696, "grad_norm": 17.45279884338379, "learning_rate": 5.915037838339756e-08, "logits/chosen": -18.482833862304688, "logits/rejected": -17.75796890258789, "logps/chosen": -344.77313232421875, "logps/rejected": -321.42486572265625, "loss": 0.4173, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.151164531707764, "rewards/margins": 1.7921817302703857, "rewards/rejected": 2.358983278274536, "step": 86500 }, { "epoch": 4.016435303403129, "grad_norm": 46.09317398071289, "learning_rate": 5.912252193695158e-08, "logits/chosen": -18.9683837890625, "logits/rejected": -18.55569839477539, "logps/chosen": -430.9732360839844, "logps/rejected": -369.65093994140625, "loss": 0.5973, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4986140727996826, "rewards/margins": 1.3068797588348389, "rewards/rejected": 2.1917343139648438, "step": 86510 }, { "epoch": 4.0168995775105625, "grad_norm": 
20.3602352142334, "learning_rate": 5.9094665490505586e-08, "logits/chosen": -19.571203231811523, "logits/rejected": -18.74297332763672, "logps/chosen": -441.02069091796875, "logps/rejected": -352.91845703125, "loss": 0.5421, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.36402702331543, "rewards/margins": 1.2971290349960327, "rewards/rejected": 3.0668981075286865, "step": 86520 }, { "epoch": 4.017363851617995, "grad_norm": 53.414913177490234, "learning_rate": 5.906680904405961e-08, "logits/chosen": -18.86233139038086, "logits/rejected": -18.713134765625, "logps/chosen": -347.9136657714844, "logps/rejected": -330.84326171875, "loss": 0.9093, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3136816024780273, "rewards/margins": 0.07378053665161133, "rewards/rejected": 2.239901065826416, "step": 86530 }, { "epoch": 4.017828125725428, "grad_norm": 102.58209228515625, "learning_rate": 5.903895259761363e-08, "logits/chosen": -19.228723526000977, "logits/rejected": -17.953691482543945, "logps/chosen": -457.69189453125, "logps/rejected": -326.90277099609375, "loss": 0.4189, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.345427513122559, "rewards/margins": 2.1389498710632324, "rewards/rejected": 2.206477403640747, "step": 86540 }, { "epoch": 4.018292399832862, "grad_norm": 32.9239616394043, "learning_rate": 5.901109615116764e-08, "logits/chosen": -18.457611083984375, "logits/rejected": -18.133329391479492, "logps/chosen": -458.0185546875, "logps/rejected": -384.34368896484375, "loss": 0.9806, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.011793613433838, "rewards/margins": 0.26763954758644104, "rewards/rejected": 3.7441534996032715, "step": 86550 }, { "epoch": 4.0187566739402945, "grad_norm": 11.08713150024414, "learning_rate": 5.8983239704721664e-08, "logits/chosen": -18.7208251953125, "logits/rejected": -17.763206481933594, "logps/chosen": -363.81072998046875, "logps/rejected": -269.78448486328125, "loss": 0.5518, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.481941223144531, "rewards/margins": 2.6826841831207275, "rewards/rejected": 1.7992570400238037, "step": 86560 }, { "epoch": 4.019220948047727, "grad_norm": 16.13037872314453, "learning_rate": 5.895538325827568e-08, "logits/chosen": -19.05914878845215, "logits/rejected": -18.38387680053711, "logps/chosen": -391.23016357421875, "logps/rejected": -255.35147094726562, "loss": 0.4846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4847755432128906, "rewards/margins": 1.0287364721298218, "rewards/rejected": 2.4560389518737793, "step": 86570 }, { "epoch": 4.01968522215516, "grad_norm": 10.858599662780762, "learning_rate": 5.89275268118297e-08, "logits/chosen": -19.242048263549805, "logits/rejected": -18.55633544921875, "logps/chosen": -355.7055969238281, "logps/rejected": -235.13162231445312, "loss": 0.5512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.95173716545105, "rewards/margins": 1.1541955471038818, "rewards/rejected": 1.797541618347168, "step": 86580 }, { "epoch": 4.020149496262594, "grad_norm": 157.32073974609375, "learning_rate": 5.889967036538372e-08, "logits/chosen": -19.924335479736328, "logits/rejected": -18.25330352783203, "logps/chosen": -438.7872009277344, "logps/rejected": -296.29205322265625, "loss": 0.5701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7545013427734375, "rewards/margins": 1.8306547403335571, "rewards/rejected": 1.923846960067749, "step": 86590 }, { "epoch": 4.020613770370026, "grad_norm": 50.97209167480469, "learning_rate": 5.8871813918937736e-08, "logits/chosen": -19.354496002197266, "logits/rejected": -18.751338958740234, "logps/chosen": -551.9800415039062, "logps/rejected": -471.5250549316406, "loss": 0.451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.464121341705322, "rewards/margins": 1.5583913326263428, "rewards/rejected": 2.9057297706604004, "step": 86600 }, { "epoch": 4.021078044477459, "grad_norm": 
4.5401458740234375, "learning_rate": 5.884395747249176e-08, "logits/chosen": -19.376636505126953, "logits/rejected": -18.426395416259766, "logps/chosen": -383.4201965332031, "logps/rejected": -321.2237854003906, "loss": 0.666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.30239200592041, "rewards/margins": 1.3436553478240967, "rewards/rejected": 2.9587368965148926, "step": 86610 }, { "epoch": 4.021542318584893, "grad_norm": 0.15004482865333557, "learning_rate": 5.881610102604578e-08, "logits/chosen": -19.0623836517334, "logits/rejected": -17.861862182617188, "logps/chosen": -437.7777404785156, "logps/rejected": -358.7422180175781, "loss": 0.356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.8142170906066895, "rewards/margins": 2.3667354583740234, "rewards/rejected": 2.447481632232666, "step": 86620 }, { "epoch": 4.022006592692326, "grad_norm": 58.920108795166016, "learning_rate": 5.878824457959979e-08, "logits/chosen": -19.542057037353516, "logits/rejected": -18.440275192260742, "logps/chosen": -350.17132568359375, "logps/rejected": -297.26177978515625, "loss": 0.203, "rewards/accuracies": 1.0, "rewards/chosen": 4.352479457855225, "rewards/margins": 2.1431891918182373, "rewards/rejected": 2.2092905044555664, "step": 86630 }, { "epoch": 4.022470866799758, "grad_norm": 2.1535513401031494, "learning_rate": 5.8760388133153814e-08, "logits/chosen": -17.85682487487793, "logits/rejected": -17.888166427612305, "logps/chosen": -390.5859069824219, "logps/rejected": -384.71649169921875, "loss": 1.654, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.856478452682495, "rewards/margins": 0.010704517364501953, "rewards/rejected": 3.845773696899414, "step": 86640 }, { "epoch": 4.022935140907191, "grad_norm": 147.00851440429688, "learning_rate": 5.873253168670783e-08, "logits/chosen": -19.6892032623291, "logits/rejected": -18.65181541442871, "logps/chosen": -389.08233642578125, "logps/rejected": -430.354736328125, "loss": 1.0148, 
"rewards/accuracies": 0.5, "rewards/chosen": 4.429165840148926, "rewards/margins": 0.9076863527297974, "rewards/rejected": 3.521479845046997, "step": 86650 }, { "epoch": 4.023399415014625, "grad_norm": 99.78143310546875, "learning_rate": 5.870467524026185e-08, "logits/chosen": -19.60755729675293, "logits/rejected": -18.440309524536133, "logps/chosen": -456.275390625, "logps/rejected": -406.13409423828125, "loss": 0.4845, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.418444633483887, "rewards/margins": 1.4689922332763672, "rewards/rejected": 2.9494528770446777, "step": 86660 }, { "epoch": 4.023863689122058, "grad_norm": 25.90447425842285, "learning_rate": 5.867681879381587e-08, "logits/chosen": -18.901966094970703, "logits/rejected": -17.665912628173828, "logps/chosen": -532.9896240234375, "logps/rejected": -369.47198486328125, "loss": 0.4037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.147792339324951, "rewards/margins": 1.9654327630996704, "rewards/rejected": 2.1823596954345703, "step": 86670 }, { "epoch": 4.02432796322949, "grad_norm": 5.317455768585205, "learning_rate": 5.8648962347369885e-08, "logits/chosen": -18.751907348632812, "logits/rejected": -18.645830154418945, "logps/chosen": -374.40130615234375, "logps/rejected": -344.8503723144531, "loss": 0.378, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.395573377609253, "rewards/margins": 1.2030222415924072, "rewards/rejected": 1.1925512552261353, "step": 86680 }, { "epoch": 4.024792237336924, "grad_norm": 258.2686462402344, "learning_rate": 5.8621105900923906e-08, "logits/chosen": -19.366111755371094, "logits/rejected": -18.35915184020996, "logps/chosen": -436.07861328125, "logps/rejected": -351.93133544921875, "loss": 0.3957, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.656315803527832, "rewards/margins": 2.154604434967041, "rewards/rejected": 2.5017106533050537, "step": 86690 }, { "epoch": 4.025256511444357, "grad_norm": 75.61048126220703, 
"learning_rate": 5.859324945447793e-08, "logits/chosen": -18.99990463256836, "logits/rejected": -17.47157096862793, "logps/chosen": -500.26055908203125, "logps/rejected": -336.89959716796875, "loss": 0.6471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.857661485671997, "rewards/margins": 1.533653974533081, "rewards/rejected": 1.3240077495574951, "step": 86700 }, { "epoch": 4.02572078555179, "grad_norm": 49.94235610961914, "learning_rate": 5.8565393008031935e-08, "logits/chosen": -19.44124984741211, "logits/rejected": -18.898361206054688, "logps/chosen": -364.24993896484375, "logps/rejected": -328.74993896484375, "loss": 1.0665, "rewards/accuracies": 0.5, "rewards/chosen": 3.2731919288635254, "rewards/margins": 0.359661728143692, "rewards/rejected": 2.913529872894287, "step": 86710 }, { "epoch": 4.026185059659223, "grad_norm": 19.551374435424805, "learning_rate": 5.8537536561585956e-08, "logits/chosen": -19.945621490478516, "logits/rejected": -17.93789291381836, "logps/chosen": -408.55389404296875, "logps/rejected": -191.85464477539062, "loss": 0.2189, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2295455932617188, "rewards/margins": 2.5891170501708984, "rewards/rejected": 0.6404282450675964, "step": 86720 }, { "epoch": 4.026649333766656, "grad_norm": 4.580090522766113, "learning_rate": 5.850968011513997e-08, "logits/chosen": -18.733901977539062, "logits/rejected": -17.768756866455078, "logps/chosen": -426.0768127441406, "logps/rejected": -318.1281433105469, "loss": 0.9108, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.951298236846924, "rewards/margins": 1.5579168796539307, "rewards/rejected": 2.393381118774414, "step": 86730 }, { "epoch": 4.027113607874089, "grad_norm": 57.81524658203125, "learning_rate": 5.848182366869399e-08, "logits/chosen": -19.53370475769043, "logits/rejected": -19.2930965423584, "logps/chosen": -416.9153747558594, "logps/rejected": -377.0180969238281, "loss": 0.5271, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.523869752883911, "rewards/margins": 1.2393481731414795, "rewards/rejected": 2.2845215797424316, "step": 86740 }, { "epoch": 4.027577881981522, "grad_norm": 95.2295913696289, "learning_rate": 5.845396722224801e-08, "logits/chosen": -20.247512817382812, "logits/rejected": -19.294376373291016, "logps/chosen": -377.93695068359375, "logps/rejected": -270.64422607421875, "loss": 0.4945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.975520133972168, "rewards/margins": 1.0019464492797852, "rewards/rejected": 1.973573923110962, "step": 86750 }, { "epoch": 4.028042156088955, "grad_norm": 1.2499438524246216, "learning_rate": 5.842611077580203e-08, "logits/chosen": -20.198131561279297, "logits/rejected": -19.131010055541992, "logps/chosen": -350.15093994140625, "logps/rejected": -274.7777099609375, "loss": 0.4545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6939990520477295, "rewards/margins": 1.816017746925354, "rewards/rejected": 1.8779809474945068, "step": 86760 }, { "epoch": 4.028506430196388, "grad_norm": 17.052034378051758, "learning_rate": 5.839825432935605e-08, "logits/chosen": -19.579120635986328, "logits/rejected": -18.95492172241211, "logps/chosen": -410.36724853515625, "logps/rejected": -374.90850830078125, "loss": 0.477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7764334678649902, "rewards/margins": 0.7028528451919556, "rewards/rejected": 3.0735812187194824, "step": 86770 }, { "epoch": 4.028970704303821, "grad_norm": 105.93791198730469, "learning_rate": 5.8370397882910063e-08, "logits/chosen": -19.185901641845703, "logits/rejected": -17.93982696533203, "logps/chosen": -403.15533447265625, "logps/rejected": -270.67230224609375, "loss": 0.4019, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.334550380706787, "rewards/margins": 1.3327146768569946, "rewards/rejected": 2.001835584640503, "step": 86780 }, { "epoch": 4.029434978411254, "grad_norm": 3.8092994689941406, 
"learning_rate": 5.8342541436464085e-08, "logits/chosen": -19.014347076416016, "logits/rejected": -17.58388900756836, "logps/chosen": -442.218017578125, "logps/rejected": -360.2268371582031, "loss": 0.5947, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.505340576171875, "rewards/margins": 1.6777302026748657, "rewards/rejected": 2.827610731124878, "step": 86790 }, { "epoch": 4.029899252518687, "grad_norm": 0.13772517442703247, "learning_rate": 5.8314684990018106e-08, "logits/chosen": -19.26247787475586, "logits/rejected": -18.83391761779785, "logps/chosen": -417.290771484375, "logps/rejected": -296.64031982421875, "loss": 0.5943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.295073986053467, "rewards/margins": 1.5512919425964355, "rewards/rejected": 1.7437822818756104, "step": 86800 }, { "epoch": 4.03036352662612, "grad_norm": 139.6085205078125, "learning_rate": 5.828682854357212e-08, "logits/chosen": -18.961467742919922, "logits/rejected": -18.667970657348633, "logps/chosen": -404.8894348144531, "logps/rejected": -358.305419921875, "loss": 0.643, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.253742218017578, "rewards/margins": 0.815379798412323, "rewards/rejected": 2.4383623600006104, "step": 86810 }, { "epoch": 4.030827800733553, "grad_norm": 7.333747386932373, "learning_rate": 5.825897209712614e-08, "logits/chosen": -19.58840560913086, "logits/rejected": -16.961660385131836, "logps/chosen": -533.9058837890625, "logps/rejected": -228.47592163085938, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 5.6107659339904785, "rewards/margins": 4.229740619659424, "rewards/rejected": 1.3810253143310547, "step": 86820 }, { "epoch": 4.031292074840986, "grad_norm": 2.613154411315918, "learning_rate": 5.823111565068016e-08, "logits/chosen": -19.30575180053711, "logits/rejected": -18.579666137695312, "logps/chosen": -414.9869689941406, "logps/rejected": -350.0556640625, "loss": 0.4464, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 4.782958984375, "rewards/margins": 1.8143947124481201, "rewards/rejected": 2.96856427192688, "step": 86830 }, { "epoch": 4.031756348948419, "grad_norm": 186.2588348388672, "learning_rate": 5.820325920423418e-08, "logits/chosen": -19.717498779296875, "logits/rejected": -18.933006286621094, "logps/chosen": -245.304931640625, "logps/rejected": -188.15077209472656, "loss": 0.4773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9704234600067139, "rewards/margins": 0.9740737080574036, "rewards/rejected": 0.9963496923446655, "step": 86840 }, { "epoch": 4.032220623055852, "grad_norm": 127.55741882324219, "learning_rate": 5.81754027577882e-08, "logits/chosen": -19.489307403564453, "logits/rejected": -18.459291458129883, "logps/chosen": -423.87872314453125, "logps/rejected": -330.3861999511719, "loss": 0.7014, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6218955516815186, "rewards/margins": 0.8822606205940247, "rewards/rejected": 2.7396352291107178, "step": 86850 }, { "epoch": 4.032684897163286, "grad_norm": 94.89970397949219, "learning_rate": 5.814754631134221e-08, "logits/chosen": -19.964256286621094, "logits/rejected": -18.377155303955078, "logps/chosen": -400.71844482421875, "logps/rejected": -332.1771545410156, "loss": 0.6946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3341808319091797, "rewards/margins": 1.2160063982009888, "rewards/rejected": 2.1181750297546387, "step": 86860 }, { "epoch": 4.033149171270718, "grad_norm": 5.721868515014648, "learning_rate": 5.8119689864896234e-08, "logits/chosen": -18.103181838989258, "logits/rejected": -18.007997512817383, "logps/chosen": -292.1429748535156, "logps/rejected": -303.93109130859375, "loss": 0.9681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.737227201461792, "rewards/margins": 0.2961595952510834, "rewards/rejected": 1.4410674571990967, "step": 86870 }, { "epoch": 4.033613445378151, "grad_norm": 7.9358296394348145, 
"learning_rate": 5.8091833418450255e-08, "logits/chosen": -20.3646240234375, "logits/rejected": -18.853633880615234, "logps/chosen": -358.99114990234375, "logps/rejected": -261.1426086425781, "loss": 0.3051, "rewards/accuracies": 1.0, "rewards/chosen": 3.830907106399536, "rewards/margins": 1.294131875038147, "rewards/rejected": 2.5367753505706787, "step": 86880 }, { "epoch": 4.034077719485584, "grad_norm": 1.958028793334961, "learning_rate": 5.806397697200427e-08, "logits/chosen": -19.02128791809082, "logits/rejected": -17.60270118713379, "logps/chosen": -430.54046630859375, "logps/rejected": -289.814453125, "loss": 0.245, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0428466796875, "rewards/margins": 1.9351446628570557, "rewards/rejected": 1.1077018976211548, "step": 86890 }, { "epoch": 4.034541993593018, "grad_norm": 17.001964569091797, "learning_rate": 5.803612052555829e-08, "logits/chosen": -19.341575622558594, "logits/rejected": -18.916934967041016, "logps/chosen": -334.73065185546875, "logps/rejected": -316.7342529296875, "loss": 0.772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.682248830795288, "rewards/margins": 0.7800281643867493, "rewards/rejected": 1.9022207260131836, "step": 86900 }, { "epoch": 4.03500626770045, "grad_norm": 248.2738037109375, "learning_rate": 5.80082640791123e-08, "logits/chosen": -18.15428352355957, "logits/rejected": -17.814380645751953, "logps/chosen": -318.1865539550781, "logps/rejected": -265.1943359375, "loss": 0.7758, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0725677013397217, "rewards/margins": 0.6668955683708191, "rewards/rejected": 1.4056721925735474, "step": 86910 }, { "epoch": 4.035470541807883, "grad_norm": 60.48345947265625, "learning_rate": 5.798040763266632e-08, "logits/chosen": -19.06312370300293, "logits/rejected": -18.59206199645996, "logps/chosen": -334.44793701171875, "logps/rejected": -287.94866943359375, "loss": 0.6976, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 4.23014497756958, "rewards/margins": 1.8561166524887085, "rewards/rejected": 2.374027729034424, "step": 86920 }, { "epoch": 4.035934815915317, "grad_norm": 250.8170166015625, "learning_rate": 5.795255118622034e-08, "logits/chosen": -18.4517765045166, "logits/rejected": -19.00493812561035, "logps/chosen": -414.271240234375, "logps/rejected": -436.77166748046875, "loss": 1.7784, "rewards/accuracies": 0.5, "rewards/chosen": 3.226710796356201, "rewards/margins": -0.5878416299819946, "rewards/rejected": 3.8145523071289062, "step": 86930 }, { "epoch": 4.0363990900227495, "grad_norm": 87.90239715576172, "learning_rate": 5.7924694739774356e-08, "logits/chosen": -19.578594207763672, "logits/rejected": -18.430744171142578, "logps/chosen": -367.62335205078125, "logps/rejected": -285.68182373046875, "loss": 0.5255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.598665952682495, "rewards/margins": 1.464648962020874, "rewards/rejected": 2.134016990661621, "step": 86940 }, { "epoch": 4.036863364130182, "grad_norm": 31.763935089111328, "learning_rate": 5.789683829332838e-08, "logits/chosen": -19.429719924926758, "logits/rejected": -18.127805709838867, "logps/chosen": -344.6983947753906, "logps/rejected": -263.8550109863281, "loss": 0.4637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.376593589782715, "rewards/margins": 1.5455173254013062, "rewards/rejected": 0.8310762643814087, "step": 86950 }, { "epoch": 4.037327638237615, "grad_norm": 211.67373657226562, "learning_rate": 5.78689818468824e-08, "logits/chosen": -19.809059143066406, "logits/rejected": -18.818767547607422, "logps/chosen": -361.2579040527344, "logps/rejected": -279.30963134765625, "loss": 0.5101, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4617786407470703, "rewards/margins": 1.2780975103378296, "rewards/rejected": 2.1836812496185303, "step": 86960 }, { "epoch": 4.037791912345049, "grad_norm": 11.711913108825684, "learning_rate": 5.784112540043641e-08, 
"logits/chosen": -18.106964111328125, "logits/rejected": -17.457605361938477, "logps/chosen": -274.31964111328125, "logps/rejected": -234.18264770507812, "loss": 0.3373, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.726889133453369, "rewards/margins": 1.4543979167938232, "rewards/rejected": 1.2724910974502563, "step": 86970 }, { "epoch": 4.0382561864524815, "grad_norm": 202.79800415039062, "learning_rate": 5.7813268953990434e-08, "logits/chosen": -19.226123809814453, "logits/rejected": -18.80881118774414, "logps/chosen": -331.3252258300781, "logps/rejected": -332.4290466308594, "loss": 0.9748, "rewards/accuracies": 0.5, "rewards/chosen": 2.286367893218994, "rewards/margins": 0.5847350358963013, "rewards/rejected": 1.7016328573226929, "step": 86980 }, { "epoch": 4.038720460559914, "grad_norm": 51.06646728515625, "learning_rate": 5.778541250754445e-08, "logits/chosen": -19.56638526916504, "logits/rejected": -18.626483917236328, "logps/chosen": -420.763427734375, "logps/rejected": -322.8121643066406, "loss": 0.5081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8171985149383545, "rewards/margins": 1.1289392709732056, "rewards/rejected": 1.6882593631744385, "step": 86990 }, { "epoch": 4.039184734667348, "grad_norm": 3.151857614517212, "learning_rate": 5.775755606109847e-08, "logits/chosen": -19.775697708129883, "logits/rejected": -18.537324905395508, "logps/chosen": -441.1764221191406, "logps/rejected": -349.7193908691406, "loss": 0.6858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.478023052215576, "rewards/margins": 1.5446780920028687, "rewards/rejected": 2.933344841003418, "step": 87000 }, { "epoch": 4.039649008774781, "grad_norm": 279.9143981933594, "learning_rate": 5.772969961465249e-08, "logits/chosen": -18.71364974975586, "logits/rejected": -18.123966217041016, "logps/chosen": -317.18719482421875, "logps/rejected": -303.07830810546875, "loss": 0.9019, "rewards/accuracies": 0.5, "rewards/chosen": 2.6778388023376465, 
"rewards/margins": 0.33095473051071167, "rewards/rejected": 2.34688401222229, "step": 87010 }, { "epoch": 4.0401132828822135, "grad_norm": 139.89234924316406, "learning_rate": 5.7701843168206505e-08, "logits/chosen": -18.328136444091797, "logits/rejected": -18.76688575744629, "logps/chosen": -376.2419128417969, "logps/rejected": -434.59716796875, "loss": 1.3888, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.7172305583953857, "rewards/margins": -0.03986043855547905, "rewards/rejected": 3.7570910453796387, "step": 87020 }, { "epoch": 4.040577556989646, "grad_norm": 2.3890349864959717, "learning_rate": 5.7673986721760526e-08, "logits/chosen": -20.526100158691406, "logits/rejected": -20.00657081604004, "logps/chosen": -402.04644775390625, "logps/rejected": -326.42205810546875, "loss": 0.5191, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.619938850402832, "rewards/margins": 1.4019962549209595, "rewards/rejected": 3.217942714691162, "step": 87030 }, { "epoch": 4.04104183109708, "grad_norm": 18.360591888427734, "learning_rate": 5.764613027531455e-08, "logits/chosen": -19.675073623657227, "logits/rejected": -19.151424407958984, "logps/chosen": -365.2886657714844, "logps/rejected": -390.180419921875, "loss": 1.2451, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1338438987731934, "rewards/margins": -0.05974381044507027, "rewards/rejected": 3.1935880184173584, "step": 87040 }, { "epoch": 4.041506105204513, "grad_norm": 45.510406494140625, "learning_rate": 5.761827382886856e-08, "logits/chosen": -19.632064819335938, "logits/rejected": -19.342533111572266, "logps/chosen": -542.2838745117188, "logps/rejected": -421.921142578125, "loss": 0.6159, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.07789945602417, "rewards/margins": 1.0999910831451416, "rewards/rejected": 2.9779086112976074, "step": 87050 }, { "epoch": 4.0419703793119455, "grad_norm": 146.88099670410156, "learning_rate": 5.759041738242258e-08, "logits/chosen": 
-19.80344581604004, "logits/rejected": -19.02688217163086, "logps/chosen": -317.55322265625, "logps/rejected": -276.8060302734375, "loss": 0.3366, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5221545696258545, "rewards/margins": 2.142429828643799, "rewards/rejected": 1.3797252178192139, "step": 87060 }, { "epoch": 4.042434653419379, "grad_norm": 32.14464569091797, "learning_rate": 5.75625609359766e-08, "logits/chosen": -18.890329360961914, "logits/rejected": -18.692092895507812, "logps/chosen": -403.85089111328125, "logps/rejected": -429.80047607421875, "loss": 1.0433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9960391521453857, "rewards/margins": 0.6115080714225769, "rewards/rejected": 2.384531021118164, "step": 87070 }, { "epoch": 4.042898927526812, "grad_norm": 28.283842086791992, "learning_rate": 5.753470448953062e-08, "logits/chosen": -19.01900291442871, "logits/rejected": -18.251428604125977, "logps/chosen": -346.91571044921875, "logps/rejected": -278.6708984375, "loss": 0.5684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8045852184295654, "rewards/margins": 1.4355907440185547, "rewards/rejected": 2.3689944744110107, "step": 87080 }, { "epoch": 4.043363201634245, "grad_norm": 66.75089263916016, "learning_rate": 5.750684804308464e-08, "logits/chosen": -19.167678833007812, "logits/rejected": -18.700586318969727, "logps/chosen": -430.57330322265625, "logps/rejected": -349.30865478515625, "loss": 0.6914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9757163524627686, "rewards/margins": 1.3073450326919556, "rewards/rejected": 2.6683712005615234, "step": 87090 }, { "epoch": 4.0438274757416774, "grad_norm": 9.60236930847168, "learning_rate": 5.7478991596638654e-08, "logits/chosen": -18.720590591430664, "logits/rejected": -17.5693416595459, "logps/chosen": -401.8218688964844, "logps/rejected": -294.0787353515625, "loss": 0.7665, "rewards/accuracies": 0.5, "rewards/chosen": 3.2374367713928223, 
"rewards/margins": 0.9333726167678833, "rewards/rejected": 2.304063558578491, "step": 87100 }, { "epoch": 4.044291749849111, "grad_norm": 135.05233764648438, "learning_rate": 5.745113515019267e-08, "logits/chosen": -20.000885009765625, "logits/rejected": -18.61240005493164, "logps/chosen": -433.150146484375, "logps/rejected": -294.58929443359375, "loss": 0.3938, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.994842052459717, "rewards/margins": 2.3389391899108887, "rewards/rejected": 1.6559034585952759, "step": 87110 }, { "epoch": 4.044756023956544, "grad_norm": 9.265419006347656, "learning_rate": 5.7423278703746683e-08, "logits/chosen": -19.369491577148438, "logits/rejected": -18.006927490234375, "logps/chosen": -327.0283203125, "logps/rejected": -262.7679748535156, "loss": 0.4663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6417908668518066, "rewards/margins": 1.5744459629058838, "rewards/rejected": 1.0673446655273438, "step": 87120 }, { "epoch": 4.045220298063977, "grad_norm": 201.13192749023438, "learning_rate": 5.7395422257300705e-08, "logits/chosen": -20.836559295654297, "logits/rejected": -19.056602478027344, "logps/chosen": -476.40484619140625, "logps/rejected": -361.8719177246094, "loss": 0.183, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.440699577331543, "rewards/margins": 2.806769847869873, "rewards/rejected": 2.63392972946167, "step": 87130 }, { "epoch": 4.04568457217141, "grad_norm": 5.548619270324707, "learning_rate": 5.7367565810854726e-08, "logits/chosen": -19.43436050415039, "logits/rejected": -18.926753997802734, "logps/chosen": -385.82110595703125, "logps/rejected": -345.3716735839844, "loss": 0.3385, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0297040939331055, "rewards/margins": 1.693931221961975, "rewards/rejected": 2.335772752761841, "step": 87140 }, { "epoch": 4.046148846278843, "grad_norm": 8.623592376708984, "learning_rate": 5.733970936440874e-08, "logits/chosen": 
-18.5909366607666, "logits/rejected": -17.519367218017578, "logps/chosen": -384.92547607421875, "logps/rejected": -318.9737854003906, "loss": 0.7275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0502514839172363, "rewards/margins": 1.281408429145813, "rewards/rejected": 1.7688430547714233, "step": 87150 }, { "epoch": 4.046613120386276, "grad_norm": 169.80189514160156, "learning_rate": 5.731185291796276e-08, "logits/chosen": -19.332122802734375, "logits/rejected": -17.31495475769043, "logps/chosen": -461.62152099609375, "logps/rejected": -313.6221618652344, "loss": 0.8129, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.841426372528076, "rewards/margins": 1.9083114862442017, "rewards/rejected": 2.933115243911743, "step": 87160 }, { "epoch": 4.0470773944937095, "grad_norm": 28.628921508789062, "learning_rate": 5.728399647151678e-08, "logits/chosen": -18.749475479125977, "logits/rejected": -18.721254348754883, "logps/chosen": -384.0887145996094, "logps/rejected": -416.67138671875, "loss": 0.7299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.17962646484375, "rewards/margins": 0.5782098770141602, "rewards/rejected": 3.601416826248169, "step": 87170 }, { "epoch": 4.047541668601142, "grad_norm": 0.9758105874061584, "learning_rate": 5.72561400250708e-08, "logits/chosen": -19.719053268432617, "logits/rejected": -18.166730880737305, "logps/chosen": -397.6524963378906, "logps/rejected": -266.75628662109375, "loss": 0.1985, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.100186824798584, "rewards/margins": 2.9158096313476562, "rewards/rejected": 1.1843773126602173, "step": 87180 }, { "epoch": 4.048005942708575, "grad_norm": 43.62083435058594, "learning_rate": 5.722828357862482e-08, "logits/chosen": -18.95638656616211, "logits/rejected": -18.17106056213379, "logps/chosen": -420.8868713378906, "logps/rejected": -351.5390625, "loss": 0.3522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.4674482345581055, 
"rewards/margins": 1.6493217945098877, "rewards/rejected": 2.818126678466797, "step": 87190 }, { "epoch": 4.048470216816008, "grad_norm": 78.1231689453125, "learning_rate": 5.720042713217883e-08, "logits/chosen": -19.383148193359375, "logits/rejected": -18.676654815673828, "logps/chosen": -445.3125, "logps/rejected": -369.61614990234375, "loss": 0.6642, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.78308367729187, "rewards/margins": 1.2670302391052246, "rewards/rejected": 2.5160536766052246, "step": 87200 }, { "epoch": 4.0489344909234415, "grad_norm": 246.1943817138672, "learning_rate": 5.7172570685732854e-08, "logits/chosen": -19.170700073242188, "logits/rejected": -18.5225830078125, "logps/chosen": -388.19140625, "logps/rejected": -358.9541931152344, "loss": 0.4774, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2945923805236816, "rewards/margins": 1.1985597610473633, "rewards/rejected": 2.09603214263916, "step": 87210 }, { "epoch": 4.049398765030874, "grad_norm": 29.07253074645996, "learning_rate": 5.7144714239286875e-08, "logits/chosen": -18.726242065429688, "logits/rejected": -18.12411117553711, "logps/chosen": -468.77056884765625, "logps/rejected": -414.66204833984375, "loss": 0.7529, "rewards/accuracies": 0.5, "rewards/chosen": 4.100502014160156, "rewards/margins": 1.099944829940796, "rewards/rejected": 3.0005574226379395, "step": 87220 }, { "epoch": 4.049863039138307, "grad_norm": 211.0045166015625, "learning_rate": 5.711685779284089e-08, "logits/chosen": -18.918004989624023, "logits/rejected": -18.476383209228516, "logps/chosen": -362.666015625, "logps/rejected": -338.55450439453125, "loss": 0.6193, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.809771776199341, "rewards/margins": 1.1663841009140015, "rewards/rejected": 1.6433875560760498, "step": 87230 }, { "epoch": 4.050327313245741, "grad_norm": 207.85662841796875, "learning_rate": 5.708900134639491e-08, "logits/chosen": -19.557361602783203, "logits/rejected": 
-18.212217330932617, "logps/chosen": -508.7528381347656, "logps/rejected": -361.55816650390625, "loss": 0.5766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.467570781707764, "rewards/margins": 1.943658471107483, "rewards/rejected": 2.523912191390991, "step": 87240 }, { "epoch": 4.0507915873531735, "grad_norm": 149.6630096435547, "learning_rate": 5.706114489994893e-08, "logits/chosen": -19.47466468811035, "logits/rejected": -18.180644989013672, "logps/chosen": -289.4476318359375, "logps/rejected": -244.5034942626953, "loss": 0.5249, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.800903081893921, "rewards/margins": 1.3138164281845093, "rewards/rejected": 1.4870867729187012, "step": 87250 }, { "epoch": 4.051255861460606, "grad_norm": 20.492225646972656, "learning_rate": 5.7033288453502946e-08, "logits/chosen": -18.993412017822266, "logits/rejected": -18.878145217895508, "logps/chosen": -364.11492919921875, "logps/rejected": -309.96484375, "loss": 0.849, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5571398735046387, "rewards/margins": 0.5227766036987305, "rewards/rejected": 3.0343635082244873, "step": 87260 }, { "epoch": 4.051720135568039, "grad_norm": 0.3072318732738495, "learning_rate": 5.700543200705697e-08, "logits/chosen": -19.579587936401367, "logits/rejected": -18.437185287475586, "logps/chosen": -446.97210693359375, "logps/rejected": -296.72357177734375, "loss": 0.1645, "rewards/accuracies": 1.0, "rewards/chosen": 4.861237525939941, "rewards/margins": 3.1206536293029785, "rewards/rejected": 1.7405836582183838, "step": 87270 }, { "epoch": 4.052184409675473, "grad_norm": 9.330314636230469, "learning_rate": 5.697757556061098e-08, "logits/chosen": -19.137065887451172, "logits/rejected": -18.910226821899414, "logps/chosen": -327.4445495605469, "logps/rejected": -283.85882568359375, "loss": 0.7363, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.788373351097107, "rewards/margins": 0.13128450512886047, 
"rewards/rejected": 1.6570888757705688, "step": 87280 }, { "epoch": 4.052648683782905, "grad_norm": 0.0771927610039711, "learning_rate": 5.6949719114165003e-08, "logits/chosen": -18.29897689819336, "logits/rejected": -17.7298641204834, "logps/chosen": -262.81463623046875, "logps/rejected": -191.523681640625, "loss": 0.4807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7051677703857422, "rewards/margins": 1.8447277545928955, "rewards/rejected": -0.1395597904920578, "step": 87290 }, { "epoch": 4.053112957890338, "grad_norm": 71.41354370117188, "learning_rate": 5.6921862667719024e-08, "logits/chosen": -19.870464324951172, "logits/rejected": -18.369855880737305, "logps/chosen": -425.2633361816406, "logps/rejected": -384.7801818847656, "loss": 0.3956, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.2327494621276855, "rewards/margins": 2.115122079849243, "rewards/rejected": 2.1176276206970215, "step": 87300 }, { "epoch": 4.053577231997772, "grad_norm": 9.036701202392578, "learning_rate": 5.689400622127303e-08, "logits/chosen": -19.455224990844727, "logits/rejected": -18.313396453857422, "logps/chosen": -406.2422180175781, "logps/rejected": -298.3741760253906, "loss": 0.852, "rewards/accuracies": 0.5, "rewards/chosen": 2.944495677947998, "rewards/margins": 0.924758791923523, "rewards/rejected": 2.0197370052337646, "step": 87310 }, { "epoch": 4.054041506105205, "grad_norm": 49.01313018798828, "learning_rate": 5.6866149774827054e-08, "logits/chosen": -18.7692813873291, "logits/rejected": -18.659809112548828, "logps/chosen": -339.43463134765625, "logps/rejected": -301.4951477050781, "loss": 1.0035, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5351955890655518, "rewards/margins": 0.7790995240211487, "rewards/rejected": 1.7560962438583374, "step": 87320 }, { "epoch": 4.054505780212637, "grad_norm": 9.034286499023438, "learning_rate": 5.683829332838107e-08, "logits/chosen": -19.16042137145996, "logits/rejected": -18.56359100341797, 
"logps/chosen": -496.2809143066406, "logps/rejected": -396.73870849609375, "loss": 0.3153, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7410106658935547, "rewards/margins": 1.2941573858261108, "rewards/rejected": 2.4468531608581543, "step": 87330 }, { "epoch": 4.05497005432007, "grad_norm": 210.35964965820312, "learning_rate": 5.681043688193509e-08, "logits/chosen": -19.52132797241211, "logits/rejected": -18.310474395751953, "logps/chosen": -358.7845764160156, "logps/rejected": -348.592041015625, "loss": 1.4472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9342819452285767, "rewards/margins": 0.315573513507843, "rewards/rejected": 1.6187082529067993, "step": 87340 }, { "epoch": 4.055434328427504, "grad_norm": 140.56375122070312, "learning_rate": 5.678258043548911e-08, "logits/chosen": -18.498268127441406, "logits/rejected": -17.96572494506836, "logps/chosen": -450.0735778808594, "logps/rejected": -355.7183837890625, "loss": 0.7007, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.044173002243042, "rewards/margins": 0.7698177695274353, "rewards/rejected": 2.274355173110962, "step": 87350 }, { "epoch": 4.055898602534937, "grad_norm": 0.6320651769638062, "learning_rate": 5.6754723989043125e-08, "logits/chosen": -19.858287811279297, "logits/rejected": -19.10796356201172, "logps/chosen": -425.0870056152344, "logps/rejected": -323.0134582519531, "loss": 0.4936, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.828496217727661, "rewards/margins": 1.438510537147522, "rewards/rejected": 2.3899855613708496, "step": 87360 }, { "epoch": 4.056362876642369, "grad_norm": 41.03743362426758, "learning_rate": 5.6726867542597146e-08, "logits/chosen": -18.821487426757812, "logits/rejected": -18.53122901916504, "logps/chosen": -399.8667907714844, "logps/rejected": -349.2838134765625, "loss": 0.7614, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6329872608184814, "rewards/margins": 1.2467429637908936, 
"rewards/rejected": 2.386244058609009, "step": 87370 }, { "epoch": 4.056827150749803, "grad_norm": 0.20450446009635925, "learning_rate": 5.669901109615117e-08, "logits/chosen": -19.219322204589844, "logits/rejected": -18.020750045776367, "logps/chosen": -440.50274658203125, "logps/rejected": -309.1390686035156, "loss": 0.8293, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.410545825958252, "rewards/margins": 1.8385931253433228, "rewards/rejected": 1.5719525814056396, "step": 87380 }, { "epoch": 4.057291424857236, "grad_norm": 269.5750732421875, "learning_rate": 5.667115464970518e-08, "logits/chosen": -18.18511199951172, "logits/rejected": -18.098804473876953, "logps/chosen": -332.88128662109375, "logps/rejected": -250.73440551757812, "loss": 1.0774, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9233472347259521, "rewards/margins": 0.3128509521484375, "rewards/rejected": 1.610496163368225, "step": 87390 }, { "epoch": 4.057755698964669, "grad_norm": 19.253299713134766, "learning_rate": 5.66432982032592e-08, "logits/chosen": -18.340694427490234, "logits/rejected": -17.4678955078125, "logps/chosen": -309.4512634277344, "logps/rejected": -249.2924346923828, "loss": 0.282, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7610509395599365, "rewards/margins": 1.570130467414856, "rewards/rejected": 1.190920352935791, "step": 87400 }, { "epoch": 4.058219973072101, "grad_norm": 140.08396911621094, "learning_rate": 5.661544175681322e-08, "logits/chosen": -18.284374237060547, "logits/rejected": -18.178510665893555, "logps/chosen": -344.03436279296875, "logps/rejected": -298.29156494140625, "loss": 1.0759, "rewards/accuracies": 0.5, "rewards/chosen": 2.129830837249756, "rewards/margins": -0.11376331001520157, "rewards/rejected": 2.2435946464538574, "step": 87410 }, { "epoch": 4.058684247179535, "grad_norm": 19.205087661743164, "learning_rate": 5.658758531036724e-08, "logits/chosen": -19.131221771240234, "logits/rejected": 
-17.861225128173828, "logps/chosen": -304.63189697265625, "logps/rejected": -213.2861785888672, "loss": 0.3785, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.261340379714966, "rewards/margins": 2.075805425643921, "rewards/rejected": 0.18553511798381805, "step": 87420 }, { "epoch": 4.059148521286968, "grad_norm": 80.72798156738281, "learning_rate": 5.655972886392126e-08, "logits/chosen": -19.67494773864746, "logits/rejected": -19.49575424194336, "logps/chosen": -398.9700622558594, "logps/rejected": -394.2217712402344, "loss": 1.152, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.237837553024292, "rewards/margins": 0.63570636510849, "rewards/rejected": 2.6021313667297363, "step": 87430 }, { "epoch": 4.059612795394401, "grad_norm": 31.314437866210938, "learning_rate": 5.6531872417475274e-08, "logits/chosen": -18.496225357055664, "logits/rejected": -17.815635681152344, "logps/chosen": -367.5648193359375, "logps/rejected": -280.36981201171875, "loss": 0.4662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5479865074157715, "rewards/margins": 1.1011251211166382, "rewards/rejected": 1.4468615055084229, "step": 87440 }, { "epoch": 4.060077069501834, "grad_norm": 2.6557981967926025, "learning_rate": 5.6504015971029295e-08, "logits/chosen": -18.273448944091797, "logits/rejected": -18.068218231201172, "logps/chosen": -320.5269470214844, "logps/rejected": -298.6838073730469, "loss": 0.9009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5490247011184692, "rewards/margins": 0.40831655263900757, "rewards/rejected": 1.1407082080841064, "step": 87450 }, { "epoch": 4.060541343609267, "grad_norm": 15.533596992492676, "learning_rate": 5.6476159524583317e-08, "logits/chosen": -20.132001876831055, "logits/rejected": -18.565425872802734, "logps/chosen": -496.9877014160156, "logps/rejected": -283.24957275390625, "loss": 0.4117, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.025815010070801, "rewards/margins": 
2.5389342308044434, "rewards/rejected": 2.4868807792663574, "step": 87460 }, { "epoch": 4.0610056177167, "grad_norm": 43.363704681396484, "learning_rate": 5.644830307813733e-08, "logits/chosen": -18.620838165283203, "logits/rejected": -18.45609474182129, "logps/chosen": -364.65948486328125, "logps/rejected": -330.9296569824219, "loss": 1.7435, "rewards/accuracies": 0.5, "rewards/chosen": 2.5201799869537354, "rewards/margins": 0.021522855386137962, "rewards/rejected": 2.498656749725342, "step": 87470 }, { "epoch": 4.0614698918241325, "grad_norm": 85.39899444580078, "learning_rate": 5.642044663169135e-08, "logits/chosen": -20.198589324951172, "logits/rejected": -18.661556243896484, "logps/chosen": -451.7313537597656, "logps/rejected": -352.1598205566406, "loss": 0.3598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8257598876953125, "rewards/margins": 1.746239423751831, "rewards/rejected": 2.0795204639434814, "step": 87480 }, { "epoch": 4.061934165931566, "grad_norm": 231.2143096923828, "learning_rate": 5.639259018524537e-08, "logits/chosen": -19.50413703918457, "logits/rejected": -19.550395965576172, "logps/chosen": -490.8904724121094, "logps/rejected": -408.51678466796875, "loss": 1.0431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.180773735046387, "rewards/margins": 0.12835979461669922, "rewards/rejected": 4.0524139404296875, "step": 87490 }, { "epoch": 4.062398440038999, "grad_norm": 48.70881652832031, "learning_rate": 5.636473373879939e-08, "logits/chosen": -17.910648345947266, "logits/rejected": -18.181903839111328, "logps/chosen": -233.50637817382812, "logps/rejected": -288.4431457519531, "loss": 1.2507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7918297052383423, "rewards/margins": 0.4678448736667633, "rewards/rejected": 1.3239848613739014, "step": 87500 }, { "epoch": 4.062862714146432, "grad_norm": 31.30073356628418, "learning_rate": 5.6336877292353396e-08, "logits/chosen": -18.643545150756836, 
"logits/rejected": -17.844758987426758, "logps/chosen": -447.7708435058594, "logps/rejected": -365.2940979003906, "loss": 0.4991, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.886486530303955, "rewards/margins": 1.0200493335723877, "rewards/rejected": 1.8664371967315674, "step": 87510 }, { "epoch": 4.063326988253865, "grad_norm": 65.3670883178711, "learning_rate": 5.630902084590742e-08, "logits/chosen": -17.779460906982422, "logits/rejected": -17.714393615722656, "logps/chosen": -318.3744201660156, "logps/rejected": -307.84814453125, "loss": 0.7934, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8779408931732178, "rewards/margins": 1.4193941354751587, "rewards/rejected": 1.4585468769073486, "step": 87520 }, { "epoch": 4.063791262361298, "grad_norm": 79.18240356445312, "learning_rate": 5.628116439946144e-08, "logits/chosen": -19.97661781311035, "logits/rejected": -19.577674865722656, "logps/chosen": -394.0165710449219, "logps/rejected": -366.40289306640625, "loss": 0.9987, "rewards/accuracies": 0.5, "rewards/chosen": 3.7947769165039062, "rewards/margins": 0.6346448063850403, "rewards/rejected": 3.1601321697235107, "step": 87530 }, { "epoch": 4.064255536468731, "grad_norm": 39.2580451965332, "learning_rate": 5.625330795301545e-08, "logits/chosen": -19.344158172607422, "logits/rejected": -18.74003028869629, "logps/chosen": -405.9359436035156, "logps/rejected": -353.1212158203125, "loss": 0.5289, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5114264488220215, "rewards/margins": 1.0878127813339233, "rewards/rejected": 1.4236137866973877, "step": 87540 }, { "epoch": 4.064719810576165, "grad_norm": 35.617610931396484, "learning_rate": 5.6225451506569474e-08, "logits/chosen": -19.330406188964844, "logits/rejected": -18.53956413269043, "logps/chosen": -488.3570861816406, "logps/rejected": -377.4252624511719, "loss": 0.4541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.422123432159424, "rewards/margins": 
1.636939287185669, "rewards/rejected": 2.785184144973755, "step": 87550 }, { "epoch": 4.065184084683597, "grad_norm": 33.672489166259766, "learning_rate": 5.6197595060123495e-08, "logits/chosen": -18.96245765686035, "logits/rejected": -17.23552131652832, "logps/chosen": -394.71502685546875, "logps/rejected": -220.8215789794922, "loss": 0.3531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3684535026550293, "rewards/margins": 2.4819836616516113, "rewards/rejected": 0.8864700198173523, "step": 87560 }, { "epoch": 4.06564835879103, "grad_norm": 30.44987678527832, "learning_rate": 5.616973861367751e-08, "logits/chosen": -19.312599182128906, "logits/rejected": -18.54973030090332, "logps/chosen": -404.59588623046875, "logps/rejected": -292.1833801269531, "loss": 0.4033, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.517761707305908, "rewards/margins": 1.8035951852798462, "rewards/rejected": 1.7141664028167725, "step": 87570 }, { "epoch": 4.066112632898463, "grad_norm": 58.6794319152832, "learning_rate": 5.614188216723153e-08, "logits/chosen": -20.106346130371094, "logits/rejected": -18.59073257446289, "logps/chosen": -328.30499267578125, "logps/rejected": -297.62274169921875, "loss": 1.0145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7474286556243896, "rewards/margins": 1.0900294780731201, "rewards/rejected": 1.6573994159698486, "step": 87580 }, { "epoch": 4.066576907005897, "grad_norm": 159.16725158691406, "learning_rate": 5.611402572078555e-08, "logits/chosen": -19.5003662109375, "logits/rejected": -18.663436889648438, "logps/chosen": -403.8298034667969, "logps/rejected": -344.4601135253906, "loss": 0.7869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.474912166595459, "rewards/margins": 1.7338321208953857, "rewards/rejected": 2.741079807281494, "step": 87590 }, { "epoch": 4.067041181113329, "grad_norm": 2.5978751182556152, "learning_rate": 5.6086169274339566e-08, "logits/chosen": -18.755502700805664, 
"logits/rejected": -18.133899688720703, "logps/chosen": -416.2071228027344, "logps/rejected": -439.5376892089844, "loss": 0.7596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1706833839416504, "rewards/margins": 1.1867362260818481, "rewards/rejected": 1.983947515487671, "step": 87600 }, { "epoch": 4.067505455220762, "grad_norm": 10.737504959106445, "learning_rate": 5.605831282789359e-08, "logits/chosen": -19.015283584594727, "logits/rejected": -18.493831634521484, "logps/chosen": -399.49969482421875, "logps/rejected": -300.38055419921875, "loss": 0.356, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.250061511993408, "rewards/margins": 2.0282912254333496, "rewards/rejected": 2.2217702865600586, "step": 87610 }, { "epoch": 4.067969729328196, "grad_norm": 174.72789001464844, "learning_rate": 5.60304563814476e-08, "logits/chosen": -18.630327224731445, "logits/rejected": -17.734418869018555, "logps/chosen": -491.60491943359375, "logps/rejected": -309.84124755859375, "loss": 0.6161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.325530052185059, "rewards/margins": 2.0522732734680176, "rewards/rejected": 2.273256301879883, "step": 87620 }, { "epoch": 4.0684340034356286, "grad_norm": 14.335982322692871, "learning_rate": 5.600259993500162e-08, "logits/chosen": -18.681884765625, "logits/rejected": -18.441387176513672, "logps/chosen": -448.65069580078125, "logps/rejected": -414.249267578125, "loss": 0.2859, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.298918724060059, "rewards/margins": 1.3691591024398804, "rewards/rejected": 2.9297597408294678, "step": 87630 }, { "epoch": 4.068898277543061, "grad_norm": 140.6710662841797, "learning_rate": 5.5974743488555644e-08, "logits/chosen": -18.679712295532227, "logits/rejected": -18.94037628173828, "logps/chosen": -468.39227294921875, "logps/rejected": -489.10906982421875, "loss": 1.1866, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.056729316711426, 
"rewards/margins": -0.01961391046643257, "rewards/rejected": 4.076342582702637, "step": 87640 }, { "epoch": 4.069362551650494, "grad_norm": 121.9793472290039, "learning_rate": 5.594688704210966e-08, "logits/chosen": -19.34244728088379, "logits/rejected": -18.49839973449707, "logps/chosen": -410.67669677734375, "logps/rejected": -320.11981201171875, "loss": 0.3652, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4241843223571777, "rewards/margins": 1.7603893280029297, "rewards/rejected": 1.6637952327728271, "step": 87650 }, { "epoch": 4.069826825757928, "grad_norm": 1.0180944204330444, "learning_rate": 5.591903059566368e-08, "logits/chosen": -18.648244857788086, "logits/rejected": -17.52213478088379, "logps/chosen": -342.81610107421875, "logps/rejected": -224.864501953125, "loss": 0.5662, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.660407304763794, "rewards/margins": 1.9339497089385986, "rewards/rejected": 1.7264578342437744, "step": 87660 }, { "epoch": 4.0702910998653605, "grad_norm": 0.5750589966773987, "learning_rate": 5.58911741492177e-08, "logits/chosen": -19.004880905151367, "logits/rejected": -17.869613647460938, "logps/chosen": -238.52456665039062, "logps/rejected": -222.88552856445312, "loss": 1.0878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0871119499206543, "rewards/margins": 1.4828191995620728, "rewards/rejected": 0.6042929887771606, "step": 87670 }, { "epoch": 4.070755373972793, "grad_norm": 65.92361450195312, "learning_rate": 5.5863317702771716e-08, "logits/chosen": -20.14861488342285, "logits/rejected": -19.227893829345703, "logps/chosen": -496.9474182128906, "logps/rejected": -443.69525146484375, "loss": 0.5158, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.911444664001465, "rewards/margins": 1.3451707363128662, "rewards/rejected": 3.5662739276885986, "step": 87680 }, { "epoch": 4.071219648080227, "grad_norm": 99.64634704589844, "learning_rate": 5.583546125632574e-08, "logits/chosen": 
-19.7453670501709, "logits/rejected": -19.367727279663086, "logps/chosen": -341.8854675292969, "logps/rejected": -288.9708251953125, "loss": 0.5677, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5608508586883545, "rewards/margins": 1.7166531085968018, "rewards/rejected": 1.8441978693008423, "step": 87690 }, { "epoch": 4.07168392218766, "grad_norm": 37.63563919067383, "learning_rate": 5.5807604809879745e-08, "logits/chosen": -19.262069702148438, "logits/rejected": -19.233095169067383, "logps/chosen": -337.9617004394531, "logps/rejected": -250.3741912841797, "loss": 0.4683, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.210409641265869, "rewards/margins": 1.7632354497909546, "rewards/rejected": 1.4471739530563354, "step": 87700 }, { "epoch": 4.0721481962950925, "grad_norm": 24.2924747467041, "learning_rate": 5.5779748363433766e-08, "logits/chosen": -18.844194412231445, "logits/rejected": -18.442214965820312, "logps/chosen": -414.3478088378906, "logps/rejected": -400.85040283203125, "loss": 0.5687, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.711933135986328, "rewards/margins": 1.2545889616012573, "rewards/rejected": 2.4573440551757812, "step": 87710 }, { "epoch": 4.072612470402525, "grad_norm": 201.6494140625, "learning_rate": 5.575189191698778e-08, "logits/chosen": -18.57561492919922, "logits/rejected": -18.62394142150879, "logps/chosen": -334.2697448730469, "logps/rejected": -332.8799743652344, "loss": 0.7773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5557701587677, "rewards/margins": 0.9700374603271484, "rewards/rejected": 2.5857319831848145, "step": 87720 }, { "epoch": 4.073076744509959, "grad_norm": 79.34321594238281, "learning_rate": 5.57240354705418e-08, "logits/chosen": -19.667673110961914, "logits/rejected": -18.52129554748535, "logps/chosen": -375.38568115234375, "logps/rejected": -337.09393310546875, "loss": 0.9611, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
3.5165436267852783, "rewards/margins": 0.502556324005127, "rewards/rejected": 3.0139873027801514, "step": 87730 }, { "epoch": 4.073541018617392, "grad_norm": 0.866783857345581, "learning_rate": 5.569617902409582e-08, "logits/chosen": -18.797191619873047, "logits/rejected": -18.575307846069336, "logps/chosen": -331.5875549316406, "logps/rejected": -313.4136657714844, "loss": 0.8699, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6118438243865967, "rewards/margins": 1.3036667108535767, "rewards/rejected": 2.3081772327423096, "step": 87740 }, { "epoch": 4.0740052927248245, "grad_norm": 20.001789093017578, "learning_rate": 5.566832257764984e-08, "logits/chosen": -19.538333892822266, "logits/rejected": -18.613420486450195, "logps/chosen": -364.5213928222656, "logps/rejected": -332.7635498046875, "loss": 0.7556, "rewards/accuracies": 0.5, "rewards/chosen": 2.704418420791626, "rewards/margins": 1.0050852298736572, "rewards/rejected": 1.6993334293365479, "step": 87750 }, { "epoch": 4.074469566832258, "grad_norm": 161.09539794921875, "learning_rate": 5.564046613120386e-08, "logits/chosen": -19.41069984436035, "logits/rejected": -17.967557907104492, "logps/chosen": -477.7857360839844, "logps/rejected": -395.5375671386719, "loss": 0.8776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.282955169677734, "rewards/margins": 1.3453229665756226, "rewards/rejected": 2.9376320838928223, "step": 87760 }, { "epoch": 4.074933840939691, "grad_norm": 127.81067657470703, "learning_rate": 5.561260968475788e-08, "logits/chosen": -20.351835250854492, "logits/rejected": -19.2919864654541, "logps/chosen": -373.95477294921875, "logps/rejected": -333.47930908203125, "loss": 0.9162, "rewards/accuracies": 0.5, "rewards/chosen": 3.8681578636169434, "rewards/margins": 0.20026342570781708, "rewards/rejected": 3.667893886566162, "step": 87770 }, { "epoch": 4.075398115047124, "grad_norm": 89.61116027832031, "learning_rate": 5.5584753238311894e-08, "logits/chosen": 
-19.04054069519043, "logits/rejected": -18.47650718688965, "logps/chosen": -387.2305603027344, "logps/rejected": -305.07098388671875, "loss": 1.2416, "rewards/accuracies": 0.5, "rewards/chosen": 3.244345188140869, "rewards/margins": 0.8260015249252319, "rewards/rejected": 2.4183435440063477, "step": 87780 }, { "epoch": 4.0758623891545565, "grad_norm": 33.221519470214844, "learning_rate": 5.5556896791865915e-08, "logits/chosen": -19.393695831298828, "logits/rejected": -18.163623809814453, "logps/chosen": -495.5147399902344, "logps/rejected": -274.78271484375, "loss": 0.2333, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.042170524597168, "rewards/margins": 2.5706942081451416, "rewards/rejected": 1.4714761972427368, "step": 87790 }, { "epoch": 4.07632666326199, "grad_norm": 114.65737915039062, "learning_rate": 5.5529040345419937e-08, "logits/chosen": -19.154781341552734, "logits/rejected": -18.134885787963867, "logps/chosen": -402.08465576171875, "logps/rejected": -264.9625244140625, "loss": 0.2641, "rewards/accuracies": 1.0, "rewards/chosen": 3.790098190307617, "rewards/margins": 1.830198049545288, "rewards/rejected": 1.959900140762329, "step": 87800 }, { "epoch": 4.076790937369423, "grad_norm": 139.1973114013672, "learning_rate": 5.550118389897395e-08, "logits/chosen": -19.40189552307129, "logits/rejected": -18.914691925048828, "logps/chosen": -432.8274841308594, "logps/rejected": -365.57574462890625, "loss": 1.3764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.26495623588562, "rewards/margins": 0.41328272223472595, "rewards/rejected": 2.8516736030578613, "step": 87810 }, { "epoch": 4.077255211476856, "grad_norm": 0.43154487013816833, "learning_rate": 5.547332745252797e-08, "logits/chosen": -19.474008560180664, "logits/rejected": -18.37594985961914, "logps/chosen": -438.27081298828125, "logps/rejected": -315.80706787109375, "loss": 0.6108, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.593997955322266, 
"rewards/margins": 2.3937132358551025, "rewards/rejected": 2.200284957885742, "step": 87820 }, { "epoch": 4.077719485584289, "grad_norm": 32.27836990356445, "learning_rate": 5.544547100608199e-08, "logits/chosen": -19.80099105834961, "logits/rejected": -18.99314308166504, "logps/chosen": -385.4771728515625, "logps/rejected": -286.4971923828125, "loss": 0.5709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.425050258636475, "rewards/margins": 2.285243511199951, "rewards/rejected": 2.1398067474365234, "step": 87830 }, { "epoch": 4.078183759691722, "grad_norm": 18.108837127685547, "learning_rate": 5.541761455963601e-08, "logits/chosen": -19.416553497314453, "logits/rejected": -18.92857551574707, "logps/chosen": -374.91400146484375, "logps/rejected": -300.1807556152344, "loss": 0.1826, "rewards/accuracies": 1.0, "rewards/chosen": 4.90163516998291, "rewards/margins": 2.265044689178467, "rewards/rejected": 2.6365907192230225, "step": 87840 }, { "epoch": 4.078648033799155, "grad_norm": 0.0928613543510437, "learning_rate": 5.538975811319003e-08, "logits/chosen": -19.94672393798828, "logits/rejected": -18.08053207397461, "logps/chosen": -346.218505859375, "logps/rejected": -252.98403930664062, "loss": 0.2646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.995903491973877, "rewards/margins": 3.3136062622070312, "rewards/rejected": 0.6822972893714905, "step": 87850 }, { "epoch": 4.079112307906588, "grad_norm": 85.71143341064453, "learning_rate": 5.5361901666744044e-08, "logits/chosen": -20.12564468383789, "logits/rejected": -19.394399642944336, "logps/chosen": -321.87725830078125, "logps/rejected": -314.5555114746094, "loss": 0.5428, "rewards/accuracies": 0.5, "rewards/chosen": 3.3958232402801514, "rewards/margins": 0.61128169298172, "rewards/rejected": 2.784541368484497, "step": 87860 }, { "epoch": 4.079576582014021, "grad_norm": 62.20464324951172, "learning_rate": 5.5334045220298065e-08, "logits/chosen": -18.83570671081543, "logits/rejected": 
-19.030582427978516, "logps/chosen": -427.26251220703125, "logps/rejected": -394.884765625, "loss": 1.0803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.126378059387207, "rewards/margins": 0.04074794054031372, "rewards/rejected": 4.085629940032959, "step": 87870 }, { "epoch": 4.080040856121454, "grad_norm": 6.3558669090271, "learning_rate": 5.5306188773852086e-08, "logits/chosen": -18.733327865600586, "logits/rejected": -18.459606170654297, "logps/chosen": -390.6214599609375, "logps/rejected": -430.01763916015625, "loss": 0.9643, "rewards/accuracies": 0.5, "rewards/chosen": 3.0887269973754883, "rewards/margins": -0.017253613099455833, "rewards/rejected": 3.105980396270752, "step": 87880 }, { "epoch": 4.080505130228887, "grad_norm": 0.2326568067073822, "learning_rate": 5.52783323274061e-08, "logits/chosen": -19.266910552978516, "logits/rejected": -17.958133697509766, "logps/chosen": -481.9195251464844, "logps/rejected": -334.8204345703125, "loss": 0.2923, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.306671142578125, "rewards/margins": 2.8673629760742188, "rewards/rejected": 2.439307928085327, "step": 87890 }, { "epoch": 4.0809694043363205, "grad_norm": 4.432521820068359, "learning_rate": 5.5250475880960115e-08, "logits/chosen": -18.38235855102539, "logits/rejected": -17.994644165039062, "logps/chosen": -320.3966369628906, "logps/rejected": -322.0006103515625, "loss": 0.7298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.162208080291748, "rewards/margins": 0.5858222246170044, "rewards/rejected": 1.5763860940933228, "step": 87900 }, { "epoch": 4.081433678443753, "grad_norm": 0.0057304720394313335, "learning_rate": 5.522261943451413e-08, "logits/chosen": -18.147335052490234, "logits/rejected": -16.795705795288086, "logps/chosen": -435.3943786621094, "logps/rejected": -342.5865173339844, "loss": 0.543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.468027591705322, "rewards/margins": 2.9984238147735596, 
"rewards/rejected": 1.4696038961410522, "step": 87910 }, { "epoch": 4.081897952551186, "grad_norm": 302.1156311035156, "learning_rate": 5.519476298806815e-08, "logits/chosen": -18.268455505371094, "logits/rejected": -18.408878326416016, "logps/chosen": -454.9427185058594, "logps/rejected": -473.957275390625, "loss": 1.2915, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.414991855621338, "rewards/margins": 0.6085599064826965, "rewards/rejected": 3.806431531906128, "step": 87920 }, { "epoch": 4.082362226658619, "grad_norm": 53.97417449951172, "learning_rate": 5.5166906541622165e-08, "logits/chosen": -20.58296775817871, "logits/rejected": -20.54330825805664, "logps/chosen": -453.19891357421875, "logps/rejected": -373.6977844238281, "loss": 0.4625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.024196147918701, "rewards/margins": 0.8587077856063843, "rewards/rejected": 2.1654882431030273, "step": 87930 }, { "epoch": 4.0828265007660525, "grad_norm": 20.13783836364746, "learning_rate": 5.5139050095176186e-08, "logits/chosen": -18.788288116455078, "logits/rejected": -17.94107437133789, "logps/chosen": -467.13592529296875, "logps/rejected": -355.3260192871094, "loss": 1.0021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.806366920471191, "rewards/margins": 1.170140027999878, "rewards/rejected": 3.6362271308898926, "step": 87940 }, { "epoch": 4.083290774873485, "grad_norm": 75.29762268066406, "learning_rate": 5.511119364873021e-08, "logits/chosen": -19.19321632385254, "logits/rejected": -18.346458435058594, "logps/chosen": -422.25726318359375, "logps/rejected": -341.0212097167969, "loss": 0.7144, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8075528144836426, "rewards/margins": 1.2525094747543335, "rewards/rejected": 2.5550427436828613, "step": 87950 }, { "epoch": 4.083755048980918, "grad_norm": 0.0032191902864724398, "learning_rate": 5.508333720228422e-08, "logits/chosen": -19.53531265258789, "logits/rejected": 
-18.727384567260742, "logps/chosen": -390.9161682128906, "logps/rejected": -334.10711669921875, "loss": 0.572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6886868476867676, "rewards/margins": 1.8234612941741943, "rewards/rejected": 1.8652255535125732, "step": 87960 }, { "epoch": 4.084219323088352, "grad_norm": 143.98541259765625, "learning_rate": 5.505548075583824e-08, "logits/chosen": -20.246156692504883, "logits/rejected": -18.5081787109375, "logps/chosen": -500.03814697265625, "logps/rejected": -390.7248229980469, "loss": 0.2937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.894174575805664, "rewards/margins": 2.624654531478882, "rewards/rejected": 2.2695200443267822, "step": 87970 }, { "epoch": 4.084683597195784, "grad_norm": 236.45040893554688, "learning_rate": 5.5027624309392264e-08, "logits/chosen": -19.150264739990234, "logits/rejected": -18.885778427124023, "logps/chosen": -364.93646240234375, "logps/rejected": -345.92303466796875, "loss": 0.7874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3059113025665283, "rewards/margins": 0.6436936259269714, "rewards/rejected": 2.662217855453491, "step": 87980 }, { "epoch": 4.085147871303217, "grad_norm": 28.170875549316406, "learning_rate": 5.499976786294628e-08, "logits/chosen": -19.23358917236328, "logits/rejected": -18.727924346923828, "logps/chosen": -515.8404541015625, "logps/rejected": -408.8056640625, "loss": 0.4254, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.732184410095215, "rewards/margins": 2.158013343811035, "rewards/rejected": 3.574171543121338, "step": 87990 }, { "epoch": 4.085612145410651, "grad_norm": 2.3421037197113037, "learning_rate": 5.49719114165003e-08, "logits/chosen": -19.38817024230957, "logits/rejected": -17.834609985351562, "logps/chosen": -438.1905822753906, "logps/rejected": -290.07745361328125, "loss": 0.251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.250463008880615, "rewards/margins": 
2.9899728298187256, "rewards/rejected": 1.2604906558990479, "step": 88000 }, { "epoch": 4.086076419518084, "grad_norm": 144.255615234375, "learning_rate": 5.494405497005432e-08, "logits/chosen": -18.631460189819336, "logits/rejected": -19.317764282226562, "logps/chosen": -269.72698974609375, "logps/rejected": -314.8492126464844, "loss": 0.9041, "rewards/accuracies": 0.5, "rewards/chosen": 2.797593116760254, "rewards/margins": 0.11226880550384521, "rewards/rejected": 2.6853246688842773, "step": 88010 }, { "epoch": 4.086540693625516, "grad_norm": 8.544687271118164, "learning_rate": 5.4916198523608336e-08, "logits/chosen": -20.116405487060547, "logits/rejected": -18.449207305908203, "logps/chosen": -429.8416442871094, "logps/rejected": -290.742431640625, "loss": 0.2808, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.881538391113281, "rewards/margins": 2.5173277854919434, "rewards/rejected": 2.364210844039917, "step": 88020 }, { "epoch": 4.087004967732949, "grad_norm": 53.23345947265625, "learning_rate": 5.488834207716236e-08, "logits/chosen": -18.840404510498047, "logits/rejected": -18.30527687072754, "logps/chosen": -301.8161926269531, "logps/rejected": -258.51385498046875, "loss": 0.6527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7051331996917725, "rewards/margins": 0.9132998585700989, "rewards/rejected": 1.791833519935608, "step": 88030 }, { "epoch": 4.087469241840383, "grad_norm": 59.644779205322266, "learning_rate": 5.486048563071637e-08, "logits/chosen": -18.44943618774414, "logits/rejected": -18.227041244506836, "logps/chosen": -400.3034362792969, "logps/rejected": -311.2657775878906, "loss": 0.846, "rewards/accuracies": 0.5, "rewards/chosen": 3.216027021408081, "rewards/margins": 0.6488113403320312, "rewards/rejected": 2.5672154426574707, "step": 88040 }, { "epoch": 4.087933515947816, "grad_norm": 35.95929718017578, "learning_rate": 5.483262918427039e-08, "logits/chosen": -19.44203758239746, "logits/rejected": 
-18.523164749145508, "logps/chosen": -319.80328369140625, "logps/rejected": -261.76763916015625, "loss": 0.4022, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.586613893508911, "rewards/margins": 1.241946816444397, "rewards/rejected": 1.3446667194366455, "step": 88050 }, { "epoch": 4.088397790055248, "grad_norm": 72.1394271850586, "learning_rate": 5.4804772737824414e-08, "logits/chosen": -18.224346160888672, "logits/rejected": -17.947566986083984, "logps/chosen": -237.16262817382812, "logps/rejected": -221.79452514648438, "loss": 0.7294, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.418217658996582, "rewards/margins": 0.3318791091442108, "rewards/rejected": 1.086338758468628, "step": 88060 }, { "epoch": 4.088862064162682, "grad_norm": 58.15019989013672, "learning_rate": 5.477691629137843e-08, "logits/chosen": -18.77075958251953, "logits/rejected": -17.96084213256836, "logps/chosen": -329.4635925292969, "logps/rejected": -287.3272399902344, "loss": 0.6325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6943328380584717, "rewards/margins": 0.636849045753479, "rewards/rejected": 2.0574839115142822, "step": 88070 }, { "epoch": 4.089326338270115, "grad_norm": 224.8909912109375, "learning_rate": 5.474905984493245e-08, "logits/chosen": -19.751705169677734, "logits/rejected": -18.79134750366211, "logps/chosen": -331.7220764160156, "logps/rejected": -323.80731201171875, "loss": 0.7879, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.759753704071045, "rewards/margins": 1.0233848094940186, "rewards/rejected": 1.7363688945770264, "step": 88080 }, { "epoch": 4.089790612377548, "grad_norm": 52.29473114013672, "learning_rate": 5.472120339848647e-08, "logits/chosen": -20.658634185791016, "logits/rejected": -19.236652374267578, "logps/chosen": -489.56988525390625, "logps/rejected": -376.2237548828125, "loss": 0.4304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.81577730178833, "rewards/margins": 
2.011746406555176, "rewards/rejected": 2.8040313720703125, "step": 88090 }, { "epoch": 4.09025488648498, "grad_norm": 37.03621292114258, "learning_rate": 5.469334695204048e-08, "logits/chosen": -20.042150497436523, "logits/rejected": -19.448190689086914, "logps/chosen": -385.970947265625, "logps/rejected": -367.9952697753906, "loss": 0.9279, "rewards/accuracies": 0.5, "rewards/chosen": 3.259890079498291, "rewards/margins": 0.10153564065694809, "rewards/rejected": 3.1583540439605713, "step": 88100 }, { "epoch": 4.090719160592414, "grad_norm": 51.595428466796875, "learning_rate": 5.46654905055945e-08, "logits/chosen": -19.226415634155273, "logits/rejected": -18.313325881958008, "logps/chosen": -440.4613342285156, "logps/rejected": -366.01336669921875, "loss": 0.4669, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4935479164123535, "rewards/margins": 1.3538776636123657, "rewards/rejected": 2.1396703720092773, "step": 88110 }, { "epoch": 4.091183434699847, "grad_norm": 144.90072631835938, "learning_rate": 5.4637634059148514e-08, "logits/chosen": -18.998876571655273, "logits/rejected": -18.779909133911133, "logps/chosen": -410.39959716796875, "logps/rejected": -409.9397888183594, "loss": 1.3252, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5663819313049316, "rewards/margins": -0.501998782157898, "rewards/rejected": 3.068380355834961, "step": 88120 }, { "epoch": 4.09164770880728, "grad_norm": 8.739027976989746, "learning_rate": 5.4609777612702535e-08, "logits/chosen": -18.892789840698242, "logits/rejected": -17.444799423217773, "logps/chosen": -434.3921813964844, "logps/rejected": -240.49984741210938, "loss": 0.4749, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.899557590484619, "rewards/margins": 2.1171295642852783, "rewards/rejected": 0.7824279069900513, "step": 88130 }, { "epoch": 4.092111982914713, "grad_norm": 18.898590087890625, "learning_rate": 5.458192116625655e-08, "logits/chosen": -19.397048950195312, 
"logits/rejected": -19.04665756225586, "logps/chosen": -381.91790771484375, "logps/rejected": -374.06390380859375, "loss": 0.7674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9828152656555176, "rewards/margins": 0.9729375839233398, "rewards/rejected": 3.009878158569336, "step": 88140 }, { "epoch": 4.092576257022146, "grad_norm": 4.084582805633545, "learning_rate": 5.455406471981057e-08, "logits/chosen": -18.87971305847168, "logits/rejected": -18.230117797851562, "logps/chosen": -321.8685302734375, "logps/rejected": -299.7957458496094, "loss": 1.0391, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.535064697265625, "rewards/margins": 1.0174052715301514, "rewards/rejected": 1.5176595449447632, "step": 88150 }, { "epoch": 4.093040531129579, "grad_norm": 35.25815200805664, "learning_rate": 5.452620827336459e-08, "logits/chosen": -18.908342361450195, "logits/rejected": -17.99601173400879, "logps/chosen": -300.3124084472656, "logps/rejected": -224.9798583984375, "loss": 0.7111, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4235646724700928, "rewards/margins": 1.1792737245559692, "rewards/rejected": 1.2442914247512817, "step": 88160 }, { "epoch": 4.0935048052370115, "grad_norm": 63.38849639892578, "learning_rate": 5.449835182691861e-08, "logits/chosen": -19.088228225708008, "logits/rejected": -17.576248168945312, "logps/chosen": -413.4610900878906, "logps/rejected": -325.37017822265625, "loss": 0.5046, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8975651264190674, "rewards/margins": 2.286135673522949, "rewards/rejected": 1.611429214477539, "step": 88170 }, { "epoch": 4.093969079344445, "grad_norm": 160.1681365966797, "learning_rate": 5.447049538047263e-08, "logits/chosen": -18.067625045776367, "logits/rejected": -17.52825927734375, "logps/chosen": -321.07440185546875, "logps/rejected": -261.221923828125, "loss": 0.8248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1547117233276367, 
"rewards/margins": 1.5847809314727783, "rewards/rejected": 1.5699307918548584, "step": 88180 }, { "epoch": 4.094433353451878, "grad_norm": 3.2313380241394043, "learning_rate": 5.444263893402665e-08, "logits/chosen": -17.890417098999023, "logits/rejected": -17.32740020751953, "logps/chosen": -350.09967041015625, "logps/rejected": -216.21255493164062, "loss": 1.0179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5192104578018188, "rewards/margins": 0.3143690228462219, "rewards/rejected": 1.2048414945602417, "step": 88190 }, { "epoch": 4.094897627559311, "grad_norm": 22.21270751953125, "learning_rate": 5.4414782487580664e-08, "logits/chosen": -19.069570541381836, "logits/rejected": -17.560192108154297, "logps/chosen": -399.66058349609375, "logps/rejected": -272.68939208984375, "loss": 0.4854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6652920246124268, "rewards/margins": 2.131655693054199, "rewards/rejected": 1.5336360931396484, "step": 88200 }, { "epoch": 4.095361901666744, "grad_norm": 0.9910971522331238, "learning_rate": 5.4386926041134685e-08, "logits/chosen": -19.652652740478516, "logits/rejected": -19.066715240478516, "logps/chosen": -459.47686767578125, "logps/rejected": -300.8116760253906, "loss": 0.7304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.695674180984497, "rewards/margins": 1.2974958419799805, "rewards/rejected": 2.398178815841675, "step": 88210 }, { "epoch": 4.095826175774177, "grad_norm": 26.52846908569336, "learning_rate": 5.4359069594688706e-08, "logits/chosen": -19.00197410583496, "logits/rejected": -17.336681365966797, "logps/chosen": -422.97808837890625, "logps/rejected": -205.4238739013672, "loss": 0.1913, "rewards/accuracies": 1.0, "rewards/chosen": 3.1307284832000732, "rewards/margins": 3.1189959049224854, "rewards/rejected": 0.011732960119843483, "step": 88220 }, { "epoch": 4.09629044988161, "grad_norm": 20.666099548339844, "learning_rate": 5.433121314824272e-08, "logits/chosen": 
-19.71413803100586, "logits/rejected": -18.84322166442871, "logps/chosen": -433.06988525390625, "logps/rejected": -293.1703186035156, "loss": 0.4653, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.162147045135498, "rewards/margins": 1.5715296268463135, "rewards/rejected": 2.5906176567077637, "step": 88230 }, { "epoch": 4.096754723989043, "grad_norm": 0.22311994433403015, "learning_rate": 5.430335670179674e-08, "logits/chosen": -18.814178466796875, "logits/rejected": -17.7157039642334, "logps/chosen": -332.3865966796875, "logps/rejected": -242.6892852783203, "loss": 0.6347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4224343299865723, "rewards/margins": 1.2391002178192139, "rewards/rejected": 1.1833341121673584, "step": 88240 }, { "epoch": 4.097218998096476, "grad_norm": 59.30512237548828, "learning_rate": 5.4275500255350756e-08, "logits/chosen": -18.12374496459961, "logits/rejected": -17.94425392150879, "logps/chosen": -368.7588806152344, "logps/rejected": -345.603271484375, "loss": 0.9486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6868488788604736, "rewards/margins": 0.4738657474517822, "rewards/rejected": 2.2129831314086914, "step": 88250 }, { "epoch": 4.097683272203909, "grad_norm": 53.659446716308594, "learning_rate": 5.424764380890478e-08, "logits/chosen": -19.345680236816406, "logits/rejected": -18.826316833496094, "logps/chosen": -527.1710205078125, "logps/rejected": -416.09259033203125, "loss": 0.3298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.121662616729736, "rewards/margins": 1.700918436050415, "rewards/rejected": 2.420743703842163, "step": 88260 }, { "epoch": 4.098147546311342, "grad_norm": 96.07408142089844, "learning_rate": 5.42197873624588e-08, "logits/chosen": -19.314167022705078, "logits/rejected": -18.02425765991211, "logps/chosen": -380.16571044921875, "logps/rejected": -298.57513427734375, "loss": 0.6336, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
3.0738348960876465, "rewards/margins": 1.691976547241211, "rewards/rejected": 1.3818585872650146, "step": 88270 }, { "epoch": 4.098611820418776, "grad_norm": 90.1679458618164, "learning_rate": 5.419193091601281e-08, "logits/chosen": -18.980533599853516, "logits/rejected": -17.984806060791016, "logps/chosen": -463.7259216308594, "logps/rejected": -336.7532043457031, "loss": 0.6286, "rewards/accuracies": 0.5, "rewards/chosen": 3.5193276405334473, "rewards/margins": 1.395076036453247, "rewards/rejected": 2.1242518424987793, "step": 88280 }, { "epoch": 4.099076094526208, "grad_norm": 0.9022465348243713, "learning_rate": 5.4164074469566834e-08, "logits/chosen": -20.311948776245117, "logits/rejected": -18.629474639892578, "logps/chosen": -512.7236938476562, "logps/rejected": -422.2606506347656, "loss": 0.6188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.99652361869812, "rewards/margins": 1.3833727836608887, "rewards/rejected": 2.6131508350372314, "step": 88290 }, { "epoch": 4.099540368633641, "grad_norm": 47.7110595703125, "learning_rate": 5.413621802312084e-08, "logits/chosen": -19.981704711914062, "logits/rejected": -19.706737518310547, "logps/chosen": -403.52777099609375, "logps/rejected": -384.065673828125, "loss": 1.0481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.95717191696167, "rewards/margins": 0.16300591826438904, "rewards/rejected": 2.794166088104248, "step": 88300 }, { "epoch": 4.100004642741074, "grad_norm": 3.2056052684783936, "learning_rate": 5.410836157667486e-08, "logits/chosen": -19.3927001953125, "logits/rejected": -17.793315887451172, "logps/chosen": -406.8897399902344, "logps/rejected": -278.6191711425781, "loss": 0.2558, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6441256999969482, "rewards/margins": 2.340200901031494, "rewards/rejected": 1.3039250373840332, "step": 88310 }, { "epoch": 4.100468916848508, "grad_norm": 280.3338623046875, "learning_rate": 5.4080505130228884e-08, "logits/chosen": 
-18.619958877563477, "logits/rejected": -17.90292739868164, "logps/chosen": -439.20947265625, "logps/rejected": -349.09722900390625, "loss": 0.6926, "rewards/accuracies": 0.5, "rewards/chosen": 4.6394362449646, "rewards/margins": 1.7243725061416626, "rewards/rejected": 2.9150640964508057, "step": 88320 }, { "epoch": 4.10093319095594, "grad_norm": 14.514494895935059, "learning_rate": 5.40526486837829e-08, "logits/chosen": -19.35147476196289, "logits/rejected": -18.057334899902344, "logps/chosen": -481.84527587890625, "logps/rejected": -343.4397888183594, "loss": 0.6625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.660612106323242, "rewards/margins": 2.7018160820007324, "rewards/rejected": 1.9587961435317993, "step": 88330 }, { "epoch": 4.101397465063373, "grad_norm": 131.57199096679688, "learning_rate": 5.402479223733692e-08, "logits/chosen": -19.70395278930664, "logits/rejected": -19.708797454833984, "logps/chosen": -424.2637634277344, "logps/rejected": -407.070556640625, "loss": 0.9319, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.6327056884765625, "rewards/margins": 0.5526095628738403, "rewards/rejected": 4.080096244812012, "step": 88340 }, { "epoch": 4.101861739170807, "grad_norm": 0.36624354124069214, "learning_rate": 5.3996935790890935e-08, "logits/chosen": -19.50882339477539, "logits/rejected": -17.359569549560547, "logps/chosen": -489.2362365722656, "logps/rejected": -320.594970703125, "loss": 0.1803, "rewards/accuracies": 1.0, "rewards/chosen": 5.281741142272949, "rewards/margins": 3.455531597137451, "rewards/rejected": 1.8262100219726562, "step": 88350 }, { "epoch": 4.1023260132782395, "grad_norm": 55.88205337524414, "learning_rate": 5.3969079344444956e-08, "logits/chosen": -18.876842498779297, "logits/rejected": -17.11149024963379, "logps/chosen": -396.650146484375, "logps/rejected": -217.4137420654297, "loss": 0.4591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6799724102020264, "rewards/margins": 
2.0565202236175537, "rewards/rejected": 1.6234524250030518, "step": 88360 }, { "epoch": 4.102790287385672, "grad_norm": 0.5403894186019897, "learning_rate": 5.394122289799898e-08, "logits/chosen": -18.7640438079834, "logits/rejected": -17.994258880615234, "logps/chosen": -432.6797790527344, "logps/rejected": -395.1539001464844, "loss": 0.4943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.063136100769043, "rewards/margins": 2.3036108016967773, "rewards/rejected": 1.7595250606536865, "step": 88370 }, { "epoch": 4.103254561493106, "grad_norm": 14.209012031555176, "learning_rate": 5.391336645155299e-08, "logits/chosen": -19.34400177001953, "logits/rejected": -17.56753158569336, "logps/chosen": -362.5127868652344, "logps/rejected": -299.61248779296875, "loss": 0.8066, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.793496608734131, "rewards/margins": 2.570648431777954, "rewards/rejected": 1.2228481769561768, "step": 88380 }, { "epoch": 4.103718835600539, "grad_norm": 160.54769897460938, "learning_rate": 5.388551000510701e-08, "logits/chosen": -18.82890510559082, "logits/rejected": -18.21249008178711, "logps/chosen": -325.913818359375, "logps/rejected": -410.95928955078125, "loss": 1.1915, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5020933151245117, "rewards/margins": 0.14278434216976166, "rewards/rejected": 2.3593087196350098, "step": 88390 }, { "epoch": 4.1041831097079715, "grad_norm": 4.237366676330566, "learning_rate": 5.3857653558661034e-08, "logits/chosen": -19.025245666503906, "logits/rejected": -18.1850528717041, "logps/chosen": -354.93707275390625, "logps/rejected": -356.11260986328125, "loss": 0.5181, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5265583992004395, "rewards/margins": 1.1534733772277832, "rewards/rejected": 2.373084545135498, "step": 88400 }, { "epoch": 4.104647383815404, "grad_norm": 39.78645706176758, "learning_rate": 5.382979711221505e-08, "logits/chosen": -19.666690826416016, 
"logits/rejected": -17.826709747314453, "logps/chosen": -392.7994384765625, "logps/rejected": -232.38534545898438, "loss": 0.72, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6897971630096436, "rewards/margins": 1.7647273540496826, "rewards/rejected": 1.9250694513320923, "step": 88410 }, { "epoch": 4.105111657922838, "grad_norm": 48.62060546875, "learning_rate": 5.380194066576907e-08, "logits/chosen": -19.271970748901367, "logits/rejected": -18.60378074645996, "logps/chosen": -385.1769104003906, "logps/rejected": -401.696044921875, "loss": 1.5897, "rewards/accuracies": 0.5, "rewards/chosen": 2.3990702629089355, "rewards/margins": -0.6320046782493591, "rewards/rejected": 3.0310750007629395, "step": 88420 }, { "epoch": 4.105575932030271, "grad_norm": 103.37901306152344, "learning_rate": 5.377408421932309e-08, "logits/chosen": -19.371620178222656, "logits/rejected": -18.34624671936035, "logps/chosen": -429.2911071777344, "logps/rejected": -291.1756896972656, "loss": 0.2382, "rewards/accuracies": 1.0, "rewards/chosen": 4.632590293884277, "rewards/margins": 3.0345444679260254, "rewards/rejected": 1.5980459451675415, "step": 88430 }, { "epoch": 4.1060402061377035, "grad_norm": 51.967647552490234, "learning_rate": 5.3746227772877105e-08, "logits/chosen": -18.583660125732422, "logits/rejected": -17.629276275634766, "logps/chosen": -395.7762145996094, "logps/rejected": -312.6534118652344, "loss": 0.3914, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.480551242828369, "rewards/margins": 1.8063023090362549, "rewards/rejected": 1.674249291419983, "step": 88440 }, { "epoch": 4.106504480245137, "grad_norm": 14.890382766723633, "learning_rate": 5.3718371326431126e-08, "logits/chosen": -20.268314361572266, "logits/rejected": -19.423439025878906, "logps/chosen": -483.9119567871094, "logps/rejected": -476.1749572753906, "loss": 0.4221, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9345791339874268, "rewards/margins": 1.0970414876937866, 
"rewards/rejected": 2.8375377655029297, "step": 88450 }, { "epoch": 4.10696875435257, "grad_norm": 14.557973861694336, "learning_rate": 5.369051487998514e-08, "logits/chosen": -18.89270782470703, "logits/rejected": -17.989538192749023, "logps/chosen": -430.58026123046875, "logps/rejected": -313.004150390625, "loss": 0.5912, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3366196155548096, "rewards/margins": 1.5019153356552124, "rewards/rejected": 1.8347041606903076, "step": 88460 }, { "epoch": 4.107433028460003, "grad_norm": 1.3299978971481323, "learning_rate": 5.366265843353916e-08, "logits/chosen": -19.888282775878906, "logits/rejected": -18.5421085357666, "logps/chosen": -405.3365478515625, "logps/rejected": -333.80084228515625, "loss": 0.5422, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.901689529418945, "rewards/margins": 2.3883538246154785, "rewards/rejected": 2.513335704803467, "step": 88470 }, { "epoch": 4.1078973025674355, "grad_norm": 128.22398376464844, "learning_rate": 5.363480198709318e-08, "logits/chosen": -19.02066421508789, "logits/rejected": -17.998363494873047, "logps/chosen": -463.3197326660156, "logps/rejected": -311.65216064453125, "loss": 0.5797, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.200467109680176, "rewards/margins": 3.140442371368408, "rewards/rejected": 2.0600247383117676, "step": 88480 }, { "epoch": 4.108361576674869, "grad_norm": 32.38543701171875, "learning_rate": 5.360694554064719e-08, "logits/chosen": -19.753053665161133, "logits/rejected": -18.340669631958008, "logps/chosen": -347.0214538574219, "logps/rejected": -226.45608520507812, "loss": 0.2848, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.001660346984863, "rewards/margins": 2.8592171669006348, "rewards/rejected": 1.1424429416656494, "step": 88490 }, { "epoch": 4.108825850782302, "grad_norm": 9.70044994354248, "learning_rate": 5.357908909420121e-08, "logits/chosen": -19.351959228515625, "logits/rejected": 
-18.40936279296875, "logps/chosen": -460.792724609375, "logps/rejected": -316.7472229003906, "loss": 0.3543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.377838134765625, "rewards/margins": 2.310495615005493, "rewards/rejected": 2.06734299659729, "step": 88500 }, { "epoch": 4.109290124889735, "grad_norm": 215.34884643554688, "learning_rate": 5.355123264775523e-08, "logits/chosen": -19.752094268798828, "logits/rejected": -19.22397232055664, "logps/chosen": -350.3782043457031, "logps/rejected": -376.00341796875, "loss": 0.5275, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8233606815338135, "rewards/margins": 0.7725729942321777, "rewards/rejected": 2.0507874488830566, "step": 88510 }, { "epoch": 4.109754398997168, "grad_norm": 55.32825469970703, "learning_rate": 5.352337620130925e-08, "logits/chosen": -20.569351196289062, "logits/rejected": -20.12851905822754, "logps/chosen": -351.5078430175781, "logps/rejected": -386.8692932128906, "loss": 1.2058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.143270969390869, "rewards/margins": 0.3219192922115326, "rewards/rejected": 2.8213515281677246, "step": 88520 }, { "epoch": 4.110218673104601, "grad_norm": 0.8257951736450195, "learning_rate": 5.349551975486327e-08, "logits/chosen": -19.382291793823242, "logits/rejected": -18.374608993530273, "logps/chosen": -404.88800048828125, "logps/rejected": -275.53118896484375, "loss": 0.4439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9418082237243652, "rewards/margins": 2.724088191986084, "rewards/rejected": 1.2177202701568604, "step": 88530 }, { "epoch": 4.110682947212034, "grad_norm": 85.19377136230469, "learning_rate": 5.3467663308417284e-08, "logits/chosen": -19.152482986450195, "logits/rejected": -18.893054962158203, "logps/chosen": -413.22119140625, "logps/rejected": -349.79461669921875, "loss": 0.5076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3651256561279297, "rewards/margins": 
1.1737494468688965, "rewards/rejected": 2.1913764476776123, "step": 88540 }, { "epoch": 4.111147221319467, "grad_norm": 42.63365936279297, "learning_rate": 5.3439806861971305e-08, "logits/chosen": -19.37012481689453, "logits/rejected": -18.372913360595703, "logps/chosen": -354.72344970703125, "logps/rejected": -302.3307800292969, "loss": 0.561, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1991994380950928, "rewards/margins": 1.3062316179275513, "rewards/rejected": 1.8929675817489624, "step": 88550 }, { "epoch": 4.1116114954269, "grad_norm": 1.4493823051452637, "learning_rate": 5.341195041552532e-08, "logits/chosen": -20.18426513671875, "logits/rejected": -18.65890884399414, "logps/chosen": -407.6827087402344, "logps/rejected": -242.60009765625, "loss": 0.1916, "rewards/accuracies": 1.0, "rewards/chosen": 4.408477783203125, "rewards/margins": 3.103731155395508, "rewards/rejected": 1.3047466278076172, "step": 88560 }, { "epoch": 4.112075769534333, "grad_norm": 290.9344482421875, "learning_rate": 5.338409396907934e-08, "logits/chosen": -19.140531539916992, "logits/rejected": -18.124156951904297, "logps/chosen": -432.1747131347656, "logps/rejected": -360.3928527832031, "loss": 0.7223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.372803688049316, "rewards/margins": 1.5704396963119507, "rewards/rejected": 2.802363872528076, "step": 88570 }, { "epoch": 4.112540043641766, "grad_norm": 8.548849105834961, "learning_rate": 5.335623752263336e-08, "logits/chosen": -19.615428924560547, "logits/rejected": -17.809062957763672, "logps/chosen": -365.2621765136719, "logps/rejected": -238.4878692626953, "loss": 0.2993, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4747796058654785, "rewards/margins": 2.365705966949463, "rewards/rejected": 1.109073281288147, "step": 88580 }, { "epoch": 4.1130043177491995, "grad_norm": 31.47927474975586, "learning_rate": 5.3328381076187376e-08, "logits/chosen": -19.099042892456055, "logits/rejected": 
-18.84910774230957, "logps/chosen": -296.7079162597656, "logps/rejected": -286.18817138671875, "loss": 0.8315, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7523818016052246, "rewards/margins": 0.5463134050369263, "rewards/rejected": 2.206068515777588, "step": 88590 }, { "epoch": 4.113468591856632, "grad_norm": 177.063720703125, "learning_rate": 5.33005246297414e-08, "logits/chosen": -18.60434913635254, "logits/rejected": -18.387216567993164, "logps/chosen": -416.0039978027344, "logps/rejected": -387.86102294921875, "loss": 0.9505, "rewards/accuracies": 0.5, "rewards/chosen": 3.715134859085083, "rewards/margins": 1.0106613636016846, "rewards/rejected": 2.7044734954833984, "step": 88600 }, { "epoch": 4.113932865964065, "grad_norm": 10.709646224975586, "learning_rate": 5.327266818329542e-08, "logits/chosen": -19.247201919555664, "logits/rejected": -17.31396484375, "logps/chosen": -392.3731384277344, "logps/rejected": -293.66943359375, "loss": 0.8011, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4661312103271484, "rewards/margins": 2.085860013961792, "rewards/rejected": 1.3802717924118042, "step": 88610 }, { "epoch": 4.114397140071498, "grad_norm": 205.3813018798828, "learning_rate": 5.324481173684943e-08, "logits/chosen": -18.07761573791504, "logits/rejected": -18.60724639892578, "logps/chosen": -293.68499755859375, "logps/rejected": -305.1242980957031, "loss": 1.4443, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9947789907455444, "rewards/margins": -0.548763632774353, "rewards/rejected": 2.5435426235198975, "step": 88620 }, { "epoch": 4.1148614141789315, "grad_norm": 109.1440658569336, "learning_rate": 5.3216955290403454e-08, "logits/chosen": -19.448007583618164, "logits/rejected": -19.053035736083984, "logps/chosen": -403.81195068359375, "logps/rejected": -296.2432861328125, "loss": 0.6337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5640172958374023, "rewards/margins": 0.6945110559463501, 
"rewards/rejected": 1.8695061206817627, "step": 88630 }, { "epoch": 4.115325688286364, "grad_norm": 38.47166442871094, "learning_rate": 5.3189098843957475e-08, "logits/chosen": -20.176349639892578, "logits/rejected": -19.12328338623047, "logps/chosen": -463.80389404296875, "logps/rejected": -381.0209045410156, "loss": 0.6315, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.630761623382568, "rewards/margins": 1.4717979431152344, "rewards/rejected": 3.158964157104492, "step": 88640 }, { "epoch": 4.115789962393797, "grad_norm": 48.23757553100586, "learning_rate": 5.316124239751149e-08, "logits/chosen": -19.501977920532227, "logits/rejected": -18.413644790649414, "logps/chosen": -397.6639404296875, "logps/rejected": -285.4499816894531, "loss": 0.2943, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.535728931427002, "rewards/margins": 2.02016282081604, "rewards/rejected": 1.5155662298202515, "step": 88650 }, { "epoch": 4.116254236501231, "grad_norm": 161.9502716064453, "learning_rate": 5.313338595106551e-08, "logits/chosen": -19.369455337524414, "logits/rejected": -19.172725677490234, "logps/chosen": -321.21405029296875, "logps/rejected": -264.53936767578125, "loss": 0.7706, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4032156467437744, "rewards/margins": 0.4385804533958435, "rewards/rejected": 1.9646351337432861, "step": 88660 }, { "epoch": 4.1167185106086635, "grad_norm": 22.113889694213867, "learning_rate": 5.3105529504619526e-08, "logits/chosen": -18.99016761779785, "logits/rejected": -18.93968391418457, "logps/chosen": -394.26416015625, "logps/rejected": -342.82098388671875, "loss": 0.3809, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5371780395507812, "rewards/margins": 1.2366105318069458, "rewards/rejected": 1.3005675077438354, "step": 88670 }, { "epoch": 4.117182784716096, "grad_norm": 194.0795135498047, "learning_rate": 5.307767305817355e-08, "logits/chosen": -19.129833221435547, "logits/rejected": 
-18.0144100189209, "logps/chosen": -395.78436279296875, "logps/rejected": -309.6180419921875, "loss": 0.6631, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4516360759735107, "rewards/margins": 1.3452911376953125, "rewards/rejected": 2.1063449382781982, "step": 88680 }, { "epoch": 4.117647058823529, "grad_norm": 0.5953729748725891, "learning_rate": 5.3049816611727555e-08, "logits/chosen": -18.41403579711914, "logits/rejected": -17.824787139892578, "logps/chosen": -383.95904541015625, "logps/rejected": -308.23638916015625, "loss": 0.6929, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1050403118133545, "rewards/margins": 1.4025623798370361, "rewards/rejected": 1.702478051185608, "step": 88690 }, { "epoch": 4.118111332930963, "grad_norm": 40.604530334472656, "learning_rate": 5.3021960165281576e-08, "logits/chosen": -19.273754119873047, "logits/rejected": -19.02073860168457, "logps/chosen": -354.9163513183594, "logps/rejected": -298.3055114746094, "loss": 1.2145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3986401557922363, "rewards/margins": 0.21471023559570312, "rewards/rejected": 3.183929920196533, "step": 88700 }, { "epoch": 4.118575607038395, "grad_norm": 172.6907958984375, "learning_rate": 5.29941037188356e-08, "logits/chosen": -18.75275421142578, "logits/rejected": -18.418724060058594, "logps/chosen": -427.92547607421875, "logps/rejected": -318.66632080078125, "loss": 1.3171, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.027648448944092, "rewards/margins": 0.6513289213180542, "rewards/rejected": 2.376319408416748, "step": 88710 }, { "epoch": 4.119039881145828, "grad_norm": 247.60610961914062, "learning_rate": 5.296624727238961e-08, "logits/chosen": -18.38798713684082, "logits/rejected": -18.653026580810547, "logps/chosen": -411.77301025390625, "logps/rejected": -452.12744140625, "loss": 1.3801, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.977586269378662, "rewards/margins": 
-0.4046275019645691, "rewards/rejected": 4.382213592529297, "step": 88720 }, { "epoch": 4.119504155253262, "grad_norm": 37.115020751953125, "learning_rate": 5.293839082594363e-08, "logits/chosen": -19.46668243408203, "logits/rejected": -18.313608169555664, "logps/chosen": -405.0603332519531, "logps/rejected": -386.814453125, "loss": 0.7788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1279103755950928, "rewards/margins": 1.1267175674438477, "rewards/rejected": 2.001192808151245, "step": 88730 }, { "epoch": 4.119968429360695, "grad_norm": 56.924407958984375, "learning_rate": 5.2910534379497654e-08, "logits/chosen": -19.225202560424805, "logits/rejected": -18.859140396118164, "logps/chosen": -400.63720703125, "logps/rejected": -354.2302551269531, "loss": 0.7572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5579957962036133, "rewards/margins": 1.0050790309906006, "rewards/rejected": 1.5529167652130127, "step": 88740 }, { "epoch": 4.120432703468127, "grad_norm": 120.0875244140625, "learning_rate": 5.288267793305167e-08, "logits/chosen": -19.038074493408203, "logits/rejected": -18.92507553100586, "logps/chosen": -382.0807189941406, "logps/rejected": -298.5713195800781, "loss": 1.1619, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.455228805541992, "rewards/margins": 0.532710611820221, "rewards/rejected": 2.922518491744995, "step": 88750 }, { "epoch": 4.12089697757556, "grad_norm": 32.48278045654297, "learning_rate": 5.285482148660569e-08, "logits/chosen": -19.542818069458008, "logits/rejected": -19.086748123168945, "logps/chosen": -340.90472412109375, "logps/rejected": -291.7933654785156, "loss": 0.5615, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.329270124435425, "rewards/margins": 1.5996049642562866, "rewards/rejected": 1.7296651601791382, "step": 88760 }, { "epoch": 4.121361251682994, "grad_norm": 181.23374938964844, "learning_rate": 5.2826965040159704e-08, "logits/chosen": -19.849714279174805, 
"logits/rejected": -19.22154998779297, "logps/chosen": -385.0262145996094, "logps/rejected": -226.4681854248047, "loss": 0.4435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7462515830993652, "rewards/margins": 2.18867826461792, "rewards/rejected": 1.5575731992721558, "step": 88770 }, { "epoch": 4.121825525790427, "grad_norm": 3.674933910369873, "learning_rate": 5.2799108593713725e-08, "logits/chosen": -18.078702926635742, "logits/rejected": -17.841489791870117, "logps/chosen": -347.87091064453125, "logps/rejected": -412.4283752441406, "loss": 0.9284, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.442969799041748, "rewards/margins": 0.47652092576026917, "rewards/rejected": 1.9664487838745117, "step": 88780 }, { "epoch": 4.122289799897859, "grad_norm": 16.250770568847656, "learning_rate": 5.2771252147267746e-08, "logits/chosen": -19.756099700927734, "logits/rejected": -18.696653366088867, "logps/chosen": -360.27142333984375, "logps/rejected": -365.42413330078125, "loss": 0.4307, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9386062622070312, "rewards/margins": 1.8441352844238281, "rewards/rejected": 2.094470500946045, "step": 88790 }, { "epoch": 4.122754074005293, "grad_norm": 48.349002838134766, "learning_rate": 5.274339570082176e-08, "logits/chosen": -19.00011444091797, "logits/rejected": -17.975902557373047, "logps/chosen": -356.48797607421875, "logps/rejected": -256.32342529296875, "loss": 0.364, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.01627779006958, "rewards/margins": 1.8458446264266968, "rewards/rejected": 1.1704330444335938, "step": 88800 }, { "epoch": 4.123218348112726, "grad_norm": 173.219970703125, "learning_rate": 5.271553925437578e-08, "logits/chosen": -18.9211483001709, "logits/rejected": -18.10727310180664, "logps/chosen": -347.64373779296875, "logps/rejected": -338.04962158203125, "loss": 0.648, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.180253505706787, 
"rewards/margins": 1.2843902111053467, "rewards/rejected": 1.8958631753921509, "step": 88810 }, { "epoch": 4.123682622220159, "grad_norm": 42.598968505859375, "learning_rate": 5.26876828079298e-08, "logits/chosen": -18.301130294799805, "logits/rejected": -17.669509887695312, "logps/chosen": -330.190673828125, "logps/rejected": -286.0214538574219, "loss": 0.8646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.061734199523926, "rewards/margins": 0.853603720664978, "rewards/rejected": 1.2081302404403687, "step": 88820 }, { "epoch": 4.124146896327592, "grad_norm": 58.43550109863281, "learning_rate": 5.265982636148382e-08, "logits/chosen": -18.51238441467285, "logits/rejected": -18.31302261352539, "logps/chosen": -316.1868591308594, "logps/rejected": -255.7566375732422, "loss": 0.3528, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2183480262756348, "rewards/margins": 2.807015895843506, "rewards/rejected": 0.4113319516181946, "step": 88830 }, { "epoch": 4.124611170435025, "grad_norm": 11.677511215209961, "learning_rate": 5.263196991503784e-08, "logits/chosen": -19.061080932617188, "logits/rejected": -17.90506935119629, "logps/chosen": -380.8970947265625, "logps/rejected": -211.54708862304688, "loss": 0.314, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8031840324401855, "rewards/margins": 2.092090129852295, "rewards/rejected": 1.7110939025878906, "step": 88840 }, { "epoch": 4.125075444542458, "grad_norm": 12.007513999938965, "learning_rate": 5.260411346859186e-08, "logits/chosen": -19.50541114807129, "logits/rejected": -18.879192352294922, "logps/chosen": -345.452880859375, "logps/rejected": -287.45404052734375, "loss": 0.5547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2979094982147217, "rewards/margins": 1.5406994819641113, "rewards/rejected": 1.7572100162506104, "step": 88850 }, { "epoch": 4.1255397186498906, "grad_norm": 58.24189758300781, "learning_rate": 5.2576257022145875e-08, "logits/chosen": 
-18.600788116455078, "logits/rejected": -17.102460861206055, "logps/chosen": -316.66278076171875, "logps/rejected": -173.6294403076172, "loss": 0.3615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9344978332519531, "rewards/margins": 1.7886377573013306, "rewards/rejected": 0.14586010575294495, "step": 88860 }, { "epoch": 4.126003992757324, "grad_norm": 39.95395278930664, "learning_rate": 5.2548400575699896e-08, "logits/chosen": -18.879737854003906, "logits/rejected": -19.410680770874023, "logps/chosen": -288.30072021484375, "logps/rejected": -308.5834045410156, "loss": 1.2715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.443164587020874, "rewards/margins": -0.5345865488052368, "rewards/rejected": 2.9777512550354004, "step": 88870 }, { "epoch": 4.126468266864757, "grad_norm": 9.081633567810059, "learning_rate": 5.252054412925391e-08, "logits/chosen": -18.0844669342041, "logits/rejected": -17.064815521240234, "logps/chosen": -392.176025390625, "logps/rejected": -309.5068359375, "loss": 0.9493, "rewards/accuracies": 0.5, "rewards/chosen": 2.745607376098633, "rewards/margins": 0.8634132146835327, "rewards/rejected": 1.8821942806243896, "step": 88880 }, { "epoch": 4.12693254097219, "grad_norm": 7.030005931854248, "learning_rate": 5.2492687682807925e-08, "logits/chosen": -19.605701446533203, "logits/rejected": -18.806045532226562, "logps/chosen": -396.5187072753906, "logps/rejected": -308.83203125, "loss": 0.9185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1820666790008545, "rewards/margins": 0.5578907132148743, "rewards/rejected": 2.624176263809204, "step": 88890 }, { "epoch": 4.127396815079623, "grad_norm": 32.86577224731445, "learning_rate": 5.246483123636194e-08, "logits/chosen": -18.98586654663086, "logits/rejected": -18.509784698486328, "logps/chosen": -373.8885498046875, "logps/rejected": -323.5055236816406, "loss": 0.8534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.384697437286377, 
"rewards/margins": 1.6668474674224854, "rewards/rejected": 1.7178500890731812, "step": 88900 }, { "epoch": 4.127861089187056, "grad_norm": 128.5223388671875, "learning_rate": 5.243697478991596e-08, "logits/chosen": -19.125621795654297, "logits/rejected": -19.224571228027344, "logps/chosen": -402.82000732421875, "logps/rejected": -409.62188720703125, "loss": 0.9847, "rewards/accuracies": 0.5, "rewards/chosen": 3.3179619312286377, "rewards/margins": -0.10825216770172119, "rewards/rejected": 3.4262137413024902, "step": 88910 }, { "epoch": 4.128325363294489, "grad_norm": 36.1657600402832, "learning_rate": 5.240911834346998e-08, "logits/chosen": -18.580652236938477, "logits/rejected": -18.737829208374023, "logps/chosen": -447.93194580078125, "logps/rejected": -439.65093994140625, "loss": 0.8526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.698859691619873, "rewards/margins": 0.7447272539138794, "rewards/rejected": 2.954132556915283, "step": 88920 }, { "epoch": 4.128789637401922, "grad_norm": 58.242984771728516, "learning_rate": 5.2381261897023996e-08, "logits/chosen": -18.715164184570312, "logits/rejected": -17.954544067382812, "logps/chosen": -419.0223693847656, "logps/rejected": -340.2294006347656, "loss": 0.695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2306969165802, "rewards/margins": 1.264202356338501, "rewards/rejected": 1.9664943218231201, "step": 88930 }, { "epoch": 4.129253911509355, "grad_norm": 0.04537982866168022, "learning_rate": 5.235340545057802e-08, "logits/chosen": -19.67743492126465, "logits/rejected": -18.731094360351562, "logps/chosen": -473.4878845214844, "logps/rejected": -386.93023681640625, "loss": 0.6469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.76345682144165, "rewards/margins": 1.4232597351074219, "rewards/rejected": 3.3401970863342285, "step": 88940 }, { "epoch": 4.129718185616788, "grad_norm": 107.19410705566406, "learning_rate": 5.232554900413204e-08, "logits/chosen": 
-19.953256607055664, "logits/rejected": -17.789026260375977, "logps/chosen": -427.4280700683594, "logps/rejected": -232.8319091796875, "loss": 0.3345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.19892692565918, "rewards/margins": 3.9116370677948, "rewards/rejected": 1.2872897386550903, "step": 88950 }, { "epoch": 4.130182459724221, "grad_norm": 239.73561096191406, "learning_rate": 5.229769255768605e-08, "logits/chosen": -18.77011489868164, "logits/rejected": -18.261749267578125, "logps/chosen": -427.3424377441406, "logps/rejected": -433.9278869628906, "loss": 0.8939, "rewards/accuracies": 0.5, "rewards/chosen": 2.875460624694824, "rewards/margins": 0.8713468313217163, "rewards/rejected": 2.0041136741638184, "step": 88960 }, { "epoch": 4.130646733831655, "grad_norm": 64.13668060302734, "learning_rate": 5.2269836111240074e-08, "logits/chosen": -19.568485260009766, "logits/rejected": -19.3327693939209, "logps/chosen": -412.23699951171875, "logps/rejected": -366.2776794433594, "loss": 0.6608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.761157274246216, "rewards/margins": 0.5454975962638855, "rewards/rejected": 3.2156596183776855, "step": 88970 }, { "epoch": 4.131111007939087, "grad_norm": 76.78627014160156, "learning_rate": 5.224197966479409e-08, "logits/chosen": -20.455699920654297, "logits/rejected": -19.508298873901367, "logps/chosen": -451.838623046875, "logps/rejected": -371.095947265625, "loss": 0.4994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.199286460876465, "rewards/margins": 0.7629138827323914, "rewards/rejected": 3.4363720417022705, "step": 88980 }, { "epoch": 4.13157528204652, "grad_norm": 89.32598114013672, "learning_rate": 5.221412321834811e-08, "logits/chosen": -19.711162567138672, "logits/rejected": -18.546857833862305, "logps/chosen": -512.6446533203125, "logps/rejected": -349.6772155761719, "loss": 0.5521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.41552472114563, 
"rewards/margins": 1.5063319206237793, "rewards/rejected": 1.909192681312561, "step": 88990 }, { "epoch": 4.132039556153953, "grad_norm": 83.16764831542969, "learning_rate": 5.218626677190213e-08, "logits/chosen": -19.282054901123047, "logits/rejected": -18.80026626586914, "logps/chosen": -474.97552490234375, "logps/rejected": -412.16375732421875, "loss": 0.757, "rewards/accuracies": 0.5, "rewards/chosen": 3.8839004039764404, "rewards/margins": 0.8663544654846191, "rewards/rejected": 3.017545461654663, "step": 89000 }, { "epoch": 4.132503830261387, "grad_norm": 113.0005111694336, "learning_rate": 5.2158410325456146e-08, "logits/chosen": -19.383352279663086, "logits/rejected": -18.482418060302734, "logps/chosen": -422.869384765625, "logps/rejected": -327.9219665527344, "loss": 0.6826, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4772255420684814, "rewards/margins": 1.2371782064437866, "rewards/rejected": 2.240046977996826, "step": 89010 }, { "epoch": 4.132968104368819, "grad_norm": 154.22523498535156, "learning_rate": 5.213055387901017e-08, "logits/chosen": -18.581218719482422, "logits/rejected": -17.75235366821289, "logps/chosen": -305.58978271484375, "logps/rejected": -250.326416015625, "loss": 0.615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5632333755493164, "rewards/margins": 1.7754398584365845, "rewards/rejected": 0.787793755531311, "step": 89020 }, { "epoch": 4.133432378476252, "grad_norm": 35.26101303100586, "learning_rate": 5.210269743256419e-08, "logits/chosen": -19.78811264038086, "logits/rejected": -19.17715072631836, "logps/chosen": -463.3602600097656, "logps/rejected": -404.0315246582031, "loss": 0.1963, "rewards/accuracies": 1.0, "rewards/chosen": 5.463122367858887, "rewards/margins": 1.9663368463516235, "rewards/rejected": 3.496786594390869, "step": 89030 }, { "epoch": 4.133896652583686, "grad_norm": 26.630088806152344, "learning_rate": 5.20748409861182e-08, "logits/chosen": -19.905841827392578, "logits/rejected": 
-18.164342880249023, "logps/chosen": -360.18341064453125, "logps/rejected": -204.54478454589844, "loss": 0.2524, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.958540678024292, "rewards/margins": 2.4343018531799316, "rewards/rejected": 1.524239182472229, "step": 89040 }, { "epoch": 4.1343609266911185, "grad_norm": 2.036555051803589, "learning_rate": 5.2046984539672224e-08, "logits/chosen": -19.24447250366211, "logits/rejected": -18.041595458984375, "logps/chosen": -436.5062561035156, "logps/rejected": -306.83599853515625, "loss": 0.506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.668480396270752, "rewards/margins": 1.4318180084228516, "rewards/rejected": 2.2366621494293213, "step": 89050 }, { "epoch": 4.134825200798551, "grad_norm": 73.47915649414062, "learning_rate": 5.2019128093226245e-08, "logits/chosen": -18.795516967773438, "logits/rejected": -17.597742080688477, "logps/chosen": -415.347900390625, "logps/rejected": -246.51931762695312, "loss": 0.3236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7388572692871094, "rewards/margins": 2.762377977371216, "rewards/rejected": 0.9764793515205383, "step": 89060 }, { "epoch": 4.135289474905984, "grad_norm": 27.443483352661133, "learning_rate": 5.199127164678026e-08, "logits/chosen": -18.12419319152832, "logits/rejected": -18.150556564331055, "logps/chosen": -369.18701171875, "logps/rejected": -337.02740478515625, "loss": 1.5965, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4189400672912598, "rewards/margins": 0.2485344409942627, "rewards/rejected": 2.170405626296997, "step": 89070 }, { "epoch": 4.135753749013418, "grad_norm": 42.54096603393555, "learning_rate": 5.196341520033428e-08, "logits/chosen": -19.691322326660156, "logits/rejected": -19.07232666015625, "logps/chosen": -344.6796875, "logps/rejected": -262.35064697265625, "loss": 0.4819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.555948257446289, "rewards/margins": 
1.809281587600708, "rewards/rejected": 0.746666669845581, "step": 89080 }, { "epoch": 4.1362180231208505, "grad_norm": 52.7508430480957, "learning_rate": 5.193555875388829e-08, "logits/chosen": -18.732221603393555, "logits/rejected": -18.50367546081543, "logps/chosen": -419.2069396972656, "logps/rejected": -350.1866455078125, "loss": 0.2334, "rewards/accuracies": 1.0, "rewards/chosen": 4.001763820648193, "rewards/margins": 1.9784963130950928, "rewards/rejected": 2.0232675075531006, "step": 89090 }, { "epoch": 4.136682297228283, "grad_norm": 42.44437026977539, "learning_rate": 5.190770230744231e-08, "logits/chosen": -20.85328483581543, "logits/rejected": -19.300230026245117, "logps/chosen": -508.29083251953125, "logps/rejected": -490.2599182128906, "loss": 0.5689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.494567394256592, "rewards/margins": 1.2118775844573975, "rewards/rejected": 3.2826900482177734, "step": 89100 }, { "epoch": 4.137146571335717, "grad_norm": 5.189941883087158, "learning_rate": 5.1879845860996324e-08, "logits/chosen": -18.93368148803711, "logits/rejected": -16.965408325195312, "logps/chosen": -519.0262451171875, "logps/rejected": -223.36514282226562, "loss": 0.3018, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.71917724609375, "rewards/margins": 2.73860239982605, "rewards/rejected": 0.9805749654769897, "step": 89110 }, { "epoch": 4.13761084544315, "grad_norm": 166.66397094726562, "learning_rate": 5.1851989414550345e-08, "logits/chosen": -18.650800704956055, "logits/rejected": -18.05275535583496, "logps/chosen": -402.56695556640625, "logps/rejected": -326.6853332519531, "loss": 0.4687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9998602867126465, "rewards/margins": 1.120926856994629, "rewards/rejected": 1.878933310508728, "step": 89120 }, { "epoch": 4.1380751195505825, "grad_norm": 57.73348617553711, "learning_rate": 5.1824132968104366e-08, "logits/chosen": -19.72516441345215, "logits/rejected": 
-17.70686912536621, "logps/chosen": -433.62933349609375, "logps/rejected": -317.65106201171875, "loss": 0.2948, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.773598670959473, "rewards/margins": 2.7421023845672607, "rewards/rejected": 2.031496286392212, "step": 89130 }, { "epoch": 4.138539393658015, "grad_norm": 21.586074829101562, "learning_rate": 5.179627652165838e-08, "logits/chosen": -18.90658187866211, "logits/rejected": -18.297405242919922, "logps/chosen": -481.211669921875, "logps/rejected": -417.07672119140625, "loss": 0.8532, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.029944896697998, "rewards/margins": 1.393332839012146, "rewards/rejected": 3.6366124153137207, "step": 89140 }, { "epoch": 4.139003667765449, "grad_norm": 63.9537353515625, "learning_rate": 5.17684200752124e-08, "logits/chosen": -18.964832305908203, "logits/rejected": -18.44825553894043, "logps/chosen": -550.3328857421875, "logps/rejected": -373.510009765625, "loss": 0.6839, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.10707426071167, "rewards/margins": 2.11431622505188, "rewards/rejected": 2.99275803565979, "step": 89150 }, { "epoch": 4.139467941872882, "grad_norm": 46.81494903564453, "learning_rate": 5.174056362876642e-08, "logits/chosen": -18.871374130249023, "logits/rejected": -17.569721221923828, "logps/chosen": -460.70794677734375, "logps/rejected": -324.87274169921875, "loss": 0.979, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.29229211807251, "rewards/margins": 1.6451860666275024, "rewards/rejected": 2.647106170654297, "step": 89160 }, { "epoch": 4.1399322159803145, "grad_norm": 260.18218994140625, "learning_rate": 5.171270718232044e-08, "logits/chosen": -18.861431121826172, "logits/rejected": -17.041149139404297, "logps/chosen": -392.0663757324219, "logps/rejected": -246.4253692626953, "loss": 0.9366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.064730167388916, "rewards/margins": 1.4569095373153687, 
"rewards/rejected": 1.6078208684921265, "step": 89170 }, { "epoch": 4.140396490087748, "grad_norm": 29.93349838256836, "learning_rate": 5.168485073587446e-08, "logits/chosen": -19.044658660888672, "logits/rejected": -18.897769927978516, "logps/chosen": -307.26226806640625, "logps/rejected": -354.41363525390625, "loss": 0.7879, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8510985374450684, "rewards/margins": 0.4889706075191498, "rewards/rejected": 2.3621277809143066, "step": 89180 }, { "epoch": 4.140860764195181, "grad_norm": 45.648319244384766, "learning_rate": 5.1656994289428473e-08, "logits/chosen": -19.576278686523438, "logits/rejected": -17.756624221801758, "logps/chosen": -539.1444702148438, "logps/rejected": -298.40240478515625, "loss": 0.119, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.0573625564575195, "rewards/margins": 3.659151077270508, "rewards/rejected": 2.3982110023498535, "step": 89190 }, { "epoch": 4.141325038302614, "grad_norm": 239.9373321533203, "learning_rate": 5.1629137842982495e-08, "logits/chosen": -19.726198196411133, "logits/rejected": -18.84209632873535, "logps/chosen": -410.4373474121094, "logps/rejected": -331.88690185546875, "loss": 0.692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.171675205230713, "rewards/margins": 1.4925822019577026, "rewards/rejected": 2.6790928840637207, "step": 89200 }, { "epoch": 4.141789312410047, "grad_norm": 54.36091232299805, "learning_rate": 5.1601281396536516e-08, "logits/chosen": -18.835229873657227, "logits/rejected": -17.70586585998535, "logps/chosen": -398.3929138183594, "logps/rejected": -305.3699035644531, "loss": 0.4431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2112765312194824, "rewards/margins": 1.6984504461288452, "rewards/rejected": 1.5128259658813477, "step": 89210 }, { "epoch": 4.14225358651748, "grad_norm": 3.8755953311920166, "learning_rate": 5.157342495009053e-08, "logits/chosen": -19.29363250732422, "logits/rejected": 
-17.85250473022461, "logps/chosen": -385.3446350097656, "logps/rejected": -262.19329833984375, "loss": 0.5423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1816444396972656, "rewards/margins": 1.4684280157089233, "rewards/rejected": 1.7132164239883423, "step": 89220 }, { "epoch": 4.142717860624913, "grad_norm": 0.28278833627700806, "learning_rate": 5.154556850364455e-08, "logits/chosen": -18.96072006225586, "logits/rejected": -17.677030563354492, "logps/chosen": -433.42303466796875, "logps/rejected": -330.5916748046875, "loss": 0.7264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.296782970428467, "rewards/margins": 2.2628731727600098, "rewards/rejected": 2.033910036087036, "step": 89230 }, { "epoch": 4.143182134732346, "grad_norm": 214.29718017578125, "learning_rate": 5.151771205719857e-08, "logits/chosen": -19.42730712890625, "logits/rejected": -19.479549407958984, "logps/chosen": -403.9513244628906, "logps/rejected": -399.06866455078125, "loss": 0.8757, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.504610538482666, "rewards/margins": 0.3329820930957794, "rewards/rejected": 3.17162823677063, "step": 89240 }, { "epoch": 4.143646408839779, "grad_norm": 5.275020599365234, "learning_rate": 5.148985561075259e-08, "logits/chosen": -19.153154373168945, "logits/rejected": -19.402618408203125, "logps/chosen": -426.6490173339844, "logps/rejected": -402.86846923828125, "loss": 1.1007, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7106499671936035, "rewards/margins": 0.6621707677841187, "rewards/rejected": 3.048478603363037, "step": 89250 }, { "epoch": 4.144110682947212, "grad_norm": 294.2132873535156, "learning_rate": 5.146199916430661e-08, "logits/chosen": -19.446531295776367, "logits/rejected": -18.9122314453125, "logps/chosen": -397.80841064453125, "logps/rejected": -368.78448486328125, "loss": 0.581, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.290792465209961, "rewards/margins": 
1.2528026103973389, "rewards/rejected": 2.037989854812622, "step": 89260 }, { "epoch": 4.144574957054645, "grad_norm": 79.36689758300781, "learning_rate": 5.143414271786063e-08, "logits/chosen": -18.86180877685547, "logits/rejected": -18.922536849975586, "logps/chosen": -448.95758056640625, "logps/rejected": -374.79010009765625, "loss": 1.588, "rewards/accuracies": 0.5, "rewards/chosen": 3.6305480003356934, "rewards/margins": 0.14489276707172394, "rewards/rejected": 3.4856555461883545, "step": 89270 }, { "epoch": 4.1450392311620785, "grad_norm": 53.30229187011719, "learning_rate": 5.1406286271414644e-08, "logits/chosen": -19.303483963012695, "logits/rejected": -18.138914108276367, "logps/chosen": -515.7252197265625, "logps/rejected": -385.1897277832031, "loss": 0.8797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.462899684906006, "rewards/margins": 1.7996103763580322, "rewards/rejected": 2.6632890701293945, "step": 89280 }, { "epoch": 4.145503505269511, "grad_norm": 52.290855407714844, "learning_rate": 5.137842982496866e-08, "logits/chosen": -19.127805709838867, "logits/rejected": -18.141557693481445, "logps/chosen": -471.90704345703125, "logps/rejected": -340.9212951660156, "loss": 0.9684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.263642311096191, "rewards/margins": 1.911259651184082, "rewards/rejected": 3.352382183074951, "step": 89290 }, { "epoch": 4.145967779376944, "grad_norm": 26.446924209594727, "learning_rate": 5.135057337852267e-08, "logits/chosen": -18.803789138793945, "logits/rejected": -18.41986846923828, "logps/chosen": -458.66387939453125, "logps/rejected": -413.3355407714844, "loss": 0.7112, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.478448390960693, "rewards/margins": 0.7793253660202026, "rewards/rejected": 3.699122905731201, "step": 89300 }, { "epoch": 4.146432053484377, "grad_norm": 57.03623962402344, "learning_rate": 5.1322716932076694e-08, "logits/chosen": -19.265146255493164, 
"logits/rejected": -18.657623291015625, "logps/chosen": -373.07769775390625, "logps/rejected": -353.34832763671875, "loss": 0.5375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.553769588470459, "rewards/margins": 0.9912382960319519, "rewards/rejected": 2.5625317096710205, "step": 89310 }, { "epoch": 4.1468963275918105, "grad_norm": 32.41063690185547, "learning_rate": 5.129486048563071e-08, "logits/chosen": -19.581483840942383, "logits/rejected": -18.654672622680664, "logps/chosen": -334.931396484375, "logps/rejected": -317.66357421875, "loss": 0.8873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7594573497772217, "rewards/margins": 0.3506627380847931, "rewards/rejected": 2.40879487991333, "step": 89320 }, { "epoch": 4.147360601699243, "grad_norm": 84.49919891357422, "learning_rate": 5.126700403918473e-08, "logits/chosen": -18.786945343017578, "logits/rejected": -17.69216537475586, "logps/chosen": -559.68212890625, "logps/rejected": -387.9374084472656, "loss": 0.3878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.286181449890137, "rewards/margins": 2.287810802459717, "rewards/rejected": 1.9983704090118408, "step": 89330 }, { "epoch": 4.147824875806676, "grad_norm": 116.05602264404297, "learning_rate": 5.123914759273875e-08, "logits/chosen": -19.104068756103516, "logits/rejected": -18.38600730895996, "logps/chosen": -448.79364013671875, "logps/rejected": -402.9901428222656, "loss": 0.8909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.570284366607666, "rewards/margins": 1.0600379705429077, "rewards/rejected": 3.5102462768554688, "step": 89340 }, { "epoch": 4.14828914991411, "grad_norm": 121.4468994140625, "learning_rate": 5.1211291146292766e-08, "logits/chosen": -18.80489730834961, "logits/rejected": -19.480335235595703, "logps/chosen": -377.2759094238281, "logps/rejected": -368.80291748046875, "loss": 1.0076, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4482109546661377, 
"rewards/margins": 0.06413020938634872, "rewards/rejected": 3.3840808868408203, "step": 89350 }, { "epoch": 4.1487534240215425, "grad_norm": 193.63796997070312, "learning_rate": 5.1183434699846787e-08, "logits/chosen": -19.394718170166016, "logits/rejected": -19.125621795654297, "logps/chosen": -478.82110595703125, "logps/rejected": -344.2921447753906, "loss": 0.8303, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.985456943511963, "rewards/margins": 1.2277281284332275, "rewards/rejected": 3.7577285766601562, "step": 89360 }, { "epoch": 4.149217698128975, "grad_norm": 11.51877212524414, "learning_rate": 5.115557825340081e-08, "logits/chosen": -19.632165908813477, "logits/rejected": -19.19562339782715, "logps/chosen": -360.801025390625, "logps/rejected": -274.83770751953125, "loss": 0.6489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.715763568878174, "rewards/margins": 0.966840386390686, "rewards/rejected": 1.7489233016967773, "step": 89370 }, { "epoch": 4.149681972236408, "grad_norm": 5.717001914978027, "learning_rate": 5.112772180695482e-08, "logits/chosen": -19.60377311706543, "logits/rejected": -19.00723648071289, "logps/chosen": -424.41455078125, "logps/rejected": -403.6580505371094, "loss": 0.2945, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4733593463897705, "rewards/margins": 1.4124778509140015, "rewards/rejected": 2.0608813762664795, "step": 89380 }, { "epoch": 4.150146246343842, "grad_norm": 25.00177574157715, "learning_rate": 5.1099865360508844e-08, "logits/chosen": -18.75312614440918, "logits/rejected": -18.223806381225586, "logps/chosen": -389.2271423339844, "logps/rejected": -331.3255615234375, "loss": 0.4289, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7819857597351074, "rewards/margins": 1.4054962396621704, "rewards/rejected": 1.376489520072937, "step": 89390 }, { "epoch": 4.150610520451274, "grad_norm": 33.68556594848633, "learning_rate": 5.107200891406286e-08, "logits/chosen": 
-19.123064041137695, "logits/rejected": -17.63791275024414, "logps/chosen": -511.80023193359375, "logps/rejected": -335.59930419921875, "loss": 0.4511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.313432216644287, "rewards/margins": 1.3927114009857178, "rewards/rejected": 1.9207210540771484, "step": 89400 }, { "epoch": 4.151074794558707, "grad_norm": 10.279759407043457, "learning_rate": 5.104415246761688e-08, "logits/chosen": -18.4545955657959, "logits/rejected": -18.508546829223633, "logps/chosen": -348.8484802246094, "logps/rejected": -376.2117614746094, "loss": 1.016, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.735976457595825, "rewards/margins": 0.5539836287498474, "rewards/rejected": 3.181992769241333, "step": 89410 }, { "epoch": 4.151539068666141, "grad_norm": null, "learning_rate": 5.1019081665815494e-08, "logits/chosen": -19.235795974731445, "logits/rejected": -19.42760467529297, "logps/chosen": -412.8470153808594, "logps/rejected": -346.9790954589844, "loss": 1.2823, "rewards/accuracies": 0.5, "rewards/chosen": 3.1366705894470215, "rewards/margins": 0.6899770498275757, "rewards/rejected": 2.4466938972473145, "step": 89420 }, { "epoch": 4.152003342773574, "grad_norm": 0.07301021367311478, "learning_rate": 5.0991225219369515e-08, "logits/chosen": -18.492197036743164, "logits/rejected": -17.266761779785156, "logps/chosen": -398.5809020996094, "logps/rejected": -277.7100830078125, "loss": 1.1551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.583268642425537, "rewards/margins": 3.0304412841796875, "rewards/rejected": 2.552827835083008, "step": 89430 }, { "epoch": 4.152467616881006, "grad_norm": 103.62205505371094, "learning_rate": 5.096336877292353e-08, "logits/chosen": -18.34760093688965, "logits/rejected": -17.571552276611328, "logps/chosen": -257.88092041015625, "logps/rejected": -231.8563995361328, "loss": 0.8516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.74723482131958, "rewards/margins": 
1.7530028820037842, "rewards/rejected": 0.994232177734375, "step": 89440 }, { "epoch": 4.152931890988439, "grad_norm": 133.11965942382812, "learning_rate": 5.093551232647755e-08, "logits/chosen": -19.012924194335938, "logits/rejected": -18.804136276245117, "logps/chosen": -428.4287109375, "logps/rejected": -392.5130310058594, "loss": 0.7507, "rewards/accuracies": 0.5, "rewards/chosen": 3.1222729682922363, "rewards/margins": 0.39976105093955994, "rewards/rejected": 2.7225117683410645, "step": 89450 }, { "epoch": 4.153396165095873, "grad_norm": 220.77520751953125, "learning_rate": 5.090765588003157e-08, "logits/chosen": -19.129344940185547, "logits/rejected": -19.500146865844727, "logps/chosen": -475.8290100097656, "logps/rejected": -442.9376525878906, "loss": 1.1925, "rewards/accuracies": 0.5, "rewards/chosen": 5.2372236251831055, "rewards/margins": 0.331295907497406, "rewards/rejected": 4.905927658081055, "step": 89460 }, { "epoch": 4.153860439203306, "grad_norm": 96.1175537109375, "learning_rate": 5.0879799433585586e-08, "logits/chosen": -17.518997192382812, "logits/rejected": -18.18172836303711, "logps/chosen": -331.15338134765625, "logps/rejected": -385.3338623046875, "loss": 1.3938, "rewards/accuracies": 0.5, "rewards/chosen": 2.4348535537719727, "rewards/margins": -0.4473504424095154, "rewards/rejected": 2.8822038173675537, "step": 89470 }, { "epoch": 4.154324713310738, "grad_norm": 41.055389404296875, "learning_rate": 5.085194298713961e-08, "logits/chosen": -19.241859436035156, "logits/rejected": -19.112215042114258, "logps/chosen": -340.6648864746094, "logps/rejected": -301.3813171386719, "loss": 0.9268, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5159592628479004, "rewards/margins": 0.6249912977218628, "rewards/rejected": 1.8909679651260376, "step": 89480 }, { "epoch": 4.154788987418172, "grad_norm": 23.984350204467773, "learning_rate": 5.082408654069363e-08, "logits/chosen": -19.71950912475586, "logits/rejected": -19.41020393371582, 
"logps/chosen": -457.8124084472656, "logps/rejected": -336.26666259765625, "loss": 0.3784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.830313205718994, "rewards/margins": 1.936884880065918, "rewards/rejected": 2.8934288024902344, "step": 89490 }, { "epoch": 4.155253261525605, "grad_norm": 133.41783142089844, "learning_rate": 5.079623009424764e-08, "logits/chosen": -18.835224151611328, "logits/rejected": -19.61508560180664, "logps/chosen": -316.2579345703125, "logps/rejected": -326.7911071777344, "loss": 1.6646, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4139702320098877, "rewards/margins": -0.8301488161087036, "rewards/rejected": 3.2441189289093018, "step": 89500 }, { "epoch": 4.155717535633038, "grad_norm": 120.26545715332031, "learning_rate": 5.0768373647801664e-08, "logits/chosen": -19.105514526367188, "logits/rejected": -18.367542266845703, "logps/chosen": -504.207275390625, "logps/rejected": -406.35406494140625, "loss": 0.7409, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.023579120635986, "rewards/margins": 2.5631344318389893, "rewards/rejected": 1.4604442119598389, "step": 89510 }, { "epoch": 4.15618180974047, "grad_norm": 4.583002090454102, "learning_rate": 5.074051720135568e-08, "logits/chosen": -18.720251083374023, "logits/rejected": -17.635738372802734, "logps/chosen": -350.1143798828125, "logps/rejected": -231.5545654296875, "loss": 0.3598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.079747200012207, "rewards/margins": 1.5361357927322388, "rewards/rejected": 1.5436112880706787, "step": 89520 }, { "epoch": 4.156646083847904, "grad_norm": 1.197655200958252, "learning_rate": 5.07126607549097e-08, "logits/chosen": -20.094772338867188, "logits/rejected": -19.502519607543945, "logps/chosen": -412.5359802246094, "logps/rejected": -391.3789367675781, "loss": 0.2117, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.090060234069824, "rewards/margins": 2.244048833847046, 
"rewards/rejected": 1.8460114002227783, "step": 89530 }, { "epoch": 4.157110357955337, "grad_norm": 34.45088577270508, "learning_rate": 5.068480430846371e-08, "logits/chosen": -18.94968032836914, "logits/rejected": -18.229928970336914, "logps/chosen": -413.9356384277344, "logps/rejected": -324.4281005859375, "loss": 0.672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4963879585266113, "rewards/margins": 0.687584638595581, "rewards/rejected": 2.808803081512451, "step": 89540 }, { "epoch": 4.15757463206277, "grad_norm": 33.041419982910156, "learning_rate": 5.065694786201773e-08, "logits/chosen": -17.967708587646484, "logits/rejected": -18.797199249267578, "logps/chosen": -251.13461303710938, "logps/rejected": -369.194091796875, "loss": 1.8928, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.9388377070426941, "rewards/margins": -1.3002901077270508, "rewards/rejected": 2.2391281127929688, "step": 89550 }, { "epoch": 4.158038906170203, "grad_norm": 84.10984802246094, "learning_rate": 5.062909141557175e-08, "logits/chosen": -19.455814361572266, "logits/rejected": -19.146013259887695, "logps/chosen": -360.6484375, "logps/rejected": -317.22088623046875, "loss": 0.7444, "rewards/accuracies": 0.5, "rewards/chosen": 3.3488757610321045, "rewards/margins": 0.4742390513420105, "rewards/rejected": 2.8746368885040283, "step": 89560 }, { "epoch": 4.158503180277636, "grad_norm": 8.0608491897583, "learning_rate": 5.0601234969125765e-08, "logits/chosen": -19.575143814086914, "logits/rejected": -18.40121078491211, "logps/chosen": -482.482421875, "logps/rejected": -379.41107177734375, "loss": 0.251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.295586585998535, "rewards/margins": 2.066641330718994, "rewards/rejected": 2.228944778442383, "step": 89570 }, { "epoch": 4.158967454385069, "grad_norm": 1.3029659986495972, "learning_rate": 5.0573378522679786e-08, "logits/chosen": -19.032224655151367, "logits/rejected": -16.9852294921875, 
"logps/chosen": -457.7259826660156, "logps/rejected": -227.6142578125, "loss": 0.3509, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.621194362640381, "rewards/margins": 2.9240200519561768, "rewards/rejected": 0.6971741318702698, "step": 89580 }, { "epoch": 4.1594317284925015, "grad_norm": 14.845064163208008, "learning_rate": 5.054552207623381e-08, "logits/chosen": -19.10127067565918, "logits/rejected": -18.233665466308594, "logps/chosen": -430.9060974121094, "logps/rejected": -320.6781005859375, "loss": 0.409, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.00919246673584, "rewards/margins": 2.227426052093506, "rewards/rejected": 1.7817662954330444, "step": 89590 }, { "epoch": 4.159896002599935, "grad_norm": 125.13778686523438, "learning_rate": 5.051766562978782e-08, "logits/chosen": -19.046491622924805, "logits/rejected": -17.818096160888672, "logps/chosen": -400.3031311035156, "logps/rejected": -261.97430419921875, "loss": 0.8139, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0951926708221436, "rewards/margins": 1.4503326416015625, "rewards/rejected": 1.644860029220581, "step": 89600 }, { "epoch": 4.160360276707368, "grad_norm": 25.77845573425293, "learning_rate": 5.048980918334184e-08, "logits/chosen": -18.848941802978516, "logits/rejected": -18.178829193115234, "logps/chosen": -339.51483154296875, "logps/rejected": -316.58001708984375, "loss": 0.4602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2601306438446045, "rewards/margins": 1.1117888689041138, "rewards/rejected": 2.1483421325683594, "step": 89610 }, { "epoch": 4.160824550814801, "grad_norm": 20.205617904663086, "learning_rate": 5.046195273689586e-08, "logits/chosen": -19.45825958251953, "logits/rejected": -17.859554290771484, "logps/chosen": -398.2908630371094, "logps/rejected": -301.4209899902344, "loss": 0.164, "rewards/accuracies": 1.0, "rewards/chosen": 3.6697421073913574, "rewards/margins": 2.6262354850769043, "rewards/rejected": 
1.0435067415237427, "step": 89620 }, { "epoch": 4.161288824922234, "grad_norm": 61.64310073852539, "learning_rate": 5.043409629044988e-08, "logits/chosen": -19.381139755249023, "logits/rejected": -19.019527435302734, "logps/chosen": -494.8349609375, "logps/rejected": -387.65386962890625, "loss": 0.8426, "rewards/accuracies": 0.5, "rewards/chosen": 2.8389434814453125, "rewards/margins": 0.1794436126947403, "rewards/rejected": 2.6594996452331543, "step": 89630 }, { "epoch": 4.161753099029667, "grad_norm": 53.507904052734375, "learning_rate": 5.04062398440039e-08, "logits/chosen": -19.49805450439453, "logits/rejected": -19.019752502441406, "logps/chosen": -369.4169616699219, "logps/rejected": -353.62542724609375, "loss": 0.6838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.274693250656128, "rewards/margins": 1.305904746055603, "rewards/rejected": 1.968788743019104, "step": 89640 }, { "epoch": 4.1622173731371, "grad_norm": 102.24465942382812, "learning_rate": 5.0378383397557914e-08, "logits/chosen": -19.358970642089844, "logits/rejected": -19.323978424072266, "logps/chosen": -373.95831298828125, "logps/rejected": -414.4940490722656, "loss": 0.3216, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7855467796325684, "rewards/margins": 1.5435779094696045, "rewards/rejected": 2.241968870162964, "step": 89650 }, { "epoch": 4.162681647244534, "grad_norm": 121.51136779785156, "learning_rate": 5.0350526951111935e-08, "logits/chosen": -19.823673248291016, "logits/rejected": -17.535818099975586, "logps/chosen": -469.3788146972656, "logps/rejected": -335.9039001464844, "loss": 0.2511, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.171635150909424, "rewards/margins": 2.0456185340881348, "rewards/rejected": 1.1260168552398682, "step": 89660 }, { "epoch": 4.163145921351966, "grad_norm": 57.21350860595703, "learning_rate": 5.0322670504665956e-08, "logits/chosen": -19.37105941772461, "logits/rejected": -18.805946350097656, "logps/chosen": 
-381.1181945800781, "logps/rejected": -276.39080810546875, "loss": 0.1989, "rewards/accuracies": 1.0, "rewards/chosen": 2.6784605979919434, "rewards/margins": 1.9123731851577759, "rewards/rejected": 0.7660874128341675, "step": 89670 }, { "epoch": 4.163610195459399, "grad_norm": 14.837015151977539, "learning_rate": 5.029481405821997e-08, "logits/chosen": -18.494497299194336, "logits/rejected": -17.496273040771484, "logps/chosen": -348.1418151855469, "logps/rejected": -302.6583557128906, "loss": 0.7929, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.311924457550049, "rewards/margins": 0.9213699102401733, "rewards/rejected": 1.3905545473098755, "step": 89680 }, { "epoch": 4.164074469566832, "grad_norm": 29.645496368408203, "learning_rate": 5.026695761177399e-08, "logits/chosen": -19.654592514038086, "logits/rejected": -19.00068473815918, "logps/chosen": -378.341796875, "logps/rejected": -387.87164306640625, "loss": 0.4856, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.498957395553589, "rewards/margins": 0.9834991693496704, "rewards/rejected": 2.51545786857605, "step": 89690 }, { "epoch": 4.164538743674266, "grad_norm": 3.260497570037842, "learning_rate": 5.023910116532801e-08, "logits/chosen": -18.874723434448242, "logits/rejected": -18.492786407470703, "logps/chosen": -384.15863037109375, "logps/rejected": -315.4490661621094, "loss": 1.0079, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6992244720458984, "rewards/margins": 1.075331449508667, "rewards/rejected": 2.6238930225372314, "step": 89700 }, { "epoch": 4.165003017781698, "grad_norm": 61.658851623535156, "learning_rate": 5.021124471888203e-08, "logits/chosen": -18.670137405395508, "logits/rejected": -18.485797882080078, "logps/chosen": -407.051025390625, "logps/rejected": -384.08636474609375, "loss": 0.7485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.62926983833313, "rewards/margins": 0.9519180059432983, "rewards/rejected": 2.6773521900177, "step": 
89710 }, { "epoch": 4.165467291889131, "grad_norm": 167.13609313964844, "learning_rate": 5.018338827243605e-08, "logits/chosen": -18.815235137939453, "logits/rejected": -18.611705780029297, "logps/chosen": -309.08453369140625, "logps/rejected": -276.91278076171875, "loss": 0.6563, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9056898355484009, "rewards/margins": 0.6994833946228027, "rewards/rejected": 1.2062066793441772, "step": 89720 }, { "epoch": 4.165931565996565, "grad_norm": 40.54217529296875, "learning_rate": 5.0155531825990063e-08, "logits/chosen": -19.574115753173828, "logits/rejected": -17.67346954345703, "logps/chosen": -454.80120849609375, "logps/rejected": -259.8367614746094, "loss": 0.3671, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6240429878234863, "rewards/margins": 2.352114200592041, "rewards/rejected": 1.2719284296035767, "step": 89730 }, { "epoch": 4.1663958401039975, "grad_norm": 0.11058198660612106, "learning_rate": 5.012767537954408e-08, "logits/chosen": -20.271265029907227, "logits/rejected": -18.555360794067383, "logps/chosen": -333.0160217285156, "logps/rejected": -224.97463989257812, "loss": 0.7571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.005767822265625, "rewards/margins": 2.357637405395508, "rewards/rejected": 0.6481305956840515, "step": 89740 }, { "epoch": 4.16686011421143, "grad_norm": 39.14134979248047, "learning_rate": 5.009981893309809e-08, "logits/chosen": -18.39376449584961, "logits/rejected": -18.15258026123047, "logps/chosen": -402.0079040527344, "logps/rejected": -319.63360595703125, "loss": 0.5944, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3431313037872314, "rewards/margins": 1.879828691482544, "rewards/rejected": 1.4633026123046875, "step": 89750 }, { "epoch": 4.167324388318863, "grad_norm": 51.14677047729492, "learning_rate": 5.0071962486652114e-08, "logits/chosen": -19.526409149169922, "logits/rejected": -19.330759048461914, "logps/chosen": 
-306.66876220703125, "logps/rejected": -273.6763610839844, "loss": 0.8003, "rewards/accuracies": 0.5, "rewards/chosen": 1.632331132888794, "rewards/margins": 0.48075300455093384, "rewards/rejected": 1.1515779495239258, "step": 89760 }, { "epoch": 4.167788662426297, "grad_norm": 34.888545989990234, "learning_rate": 5.0044106040206135e-08, "logits/chosen": -19.344303131103516, "logits/rejected": -18.646259307861328, "logps/chosen": -462.0850524902344, "logps/rejected": -433.3407287597656, "loss": 0.6811, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.48673152923584, "rewards/margins": 1.0476982593536377, "rewards/rejected": 3.439033031463623, "step": 89770 }, { "epoch": 4.1682529365337295, "grad_norm": 36.30490493774414, "learning_rate": 5.001624959376015e-08, "logits/chosen": -18.514196395874023, "logits/rejected": -17.705812454223633, "logps/chosen": -319.69158935546875, "logps/rejected": -268.7651062011719, "loss": 0.4414, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.854943037033081, "rewards/margins": 1.816234827041626, "rewards/rejected": 1.0387083292007446, "step": 89780 }, { "epoch": 4.168717210641162, "grad_norm": 44.89519119262695, "learning_rate": 4.998839314731417e-08, "logits/chosen": -19.086856842041016, "logits/rejected": -18.919076919555664, "logps/chosen": -409.05462646484375, "logps/rejected": -372.1469421386719, "loss": 0.7234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.038462162017822, "rewards/margins": 0.7653480768203735, "rewards/rejected": 3.273113250732422, "step": 89790 }, { "epoch": 4.169181484748596, "grad_norm": 12.720513343811035, "learning_rate": 4.996053670086819e-08, "logits/chosen": -19.514238357543945, "logits/rejected": -18.97647476196289, "logps/chosen": -387.99560546875, "logps/rejected": -316.1314392089844, "loss": 0.7102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.095031261444092, "rewards/margins": 1.1443501710891724, "rewards/rejected": 2.950681209564209, 
"step": 89800 }, { "epoch": 4.169645758856029, "grad_norm": 45.758567810058594, "learning_rate": 4.9932680254422206e-08, "logits/chosen": -19.23423957824707, "logits/rejected": -18.85115623474121, "logps/chosen": -352.59637451171875, "logps/rejected": -260.8873291015625, "loss": 0.7443, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4635074138641357, "rewards/margins": 0.9763820767402649, "rewards/rejected": 1.4871253967285156, "step": 89810 }, { "epoch": 4.1701100329634615, "grad_norm": 176.13282775878906, "learning_rate": 4.990482380797623e-08, "logits/chosen": -19.362953186035156, "logits/rejected": -18.423124313354492, "logps/chosen": -355.3238220214844, "logps/rejected": -231.44985961914062, "loss": 0.737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.962268829345703, "rewards/margins": 1.4584895372390747, "rewards/rejected": 1.5037792921066284, "step": 89820 }, { "epoch": 4.170574307070894, "grad_norm": 0.3349805474281311, "learning_rate": 4.987696736153024e-08, "logits/chosen": -18.262598037719727, "logits/rejected": -17.337730407714844, "logps/chosen": -331.0685119628906, "logps/rejected": -314.05645751953125, "loss": 0.7183, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.991645097732544, "rewards/margins": 1.4858052730560303, "rewards/rejected": 1.5058397054672241, "step": 89830 }, { "epoch": 4.171038581178328, "grad_norm": 193.4601287841797, "learning_rate": 4.984911091508426e-08, "logits/chosen": -20.130483627319336, "logits/rejected": -19.351299285888672, "logps/chosen": -454.20751953125, "logps/rejected": -357.10272216796875, "loss": 0.5683, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.831632137298584, "rewards/margins": 1.1522457599639893, "rewards/rejected": 2.6793863773345947, "step": 89840 }, { "epoch": 4.171502855285761, "grad_norm": 35.34150695800781, "learning_rate": 4.9821254468638284e-08, "logits/chosen": -19.2434139251709, "logits/rejected": -18.794836044311523, "logps/chosen": 
-330.5625, "logps/rejected": -330.18768310546875, "loss": 0.5184, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4122676849365234, "rewards/margins": 0.7843897938728333, "rewards/rejected": 1.627877950668335, "step": 89850 }, { "epoch": 4.1719671293931935, "grad_norm": 12.346696853637695, "learning_rate": 4.97933980221923e-08, "logits/chosen": -19.581632614135742, "logits/rejected": -18.9069881439209, "logps/chosen": -393.0591735839844, "logps/rejected": -311.6795349121094, "loss": 0.579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.302148342132568, "rewards/margins": 2.5286600589752197, "rewards/rejected": 2.7734885215759277, "step": 89860 }, { "epoch": 4.172431403500627, "grad_norm": 238.47483825683594, "learning_rate": 4.976554157574632e-08, "logits/chosen": -19.442550659179688, "logits/rejected": -18.672344207763672, "logps/chosen": -474.8871154785156, "logps/rejected": -448.24896240234375, "loss": 0.5869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.352710247039795, "rewards/margins": 1.2695895433425903, "rewards/rejected": 3.083120822906494, "step": 89870 }, { "epoch": 4.17289567760806, "grad_norm": 241.05938720703125, "learning_rate": 4.973768512930034e-08, "logits/chosen": -19.485883712768555, "logits/rejected": -18.37393569946289, "logps/chosen": -391.18719482421875, "logps/rejected": -304.1924743652344, "loss": 1.1768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.558753252029419, "rewards/margins": 1.3450170755386353, "rewards/rejected": 2.2137362957000732, "step": 89880 }, { "epoch": 4.173359951715493, "grad_norm": 16.426668167114258, "learning_rate": 4.9709828682854356e-08, "logits/chosen": -19.71074104309082, "logits/rejected": -18.84389877319336, "logps/chosen": -434.1625061035156, "logps/rejected": -318.8984375, "loss": 0.443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.548051357269287, "rewards/margins": 1.0362430810928345, "rewards/rejected": 2.511808395385742, "step": 
89890 }, { "epoch": 4.1738242258229254, "grad_norm": 131.9426727294922, "learning_rate": 4.9681972236408377e-08, "logits/chosen": -20.006786346435547, "logits/rejected": -19.219472885131836, "logps/chosen": -374.57940673828125, "logps/rejected": -396.69390869140625, "loss": 1.2846, "rewards/accuracies": 0.5, "rewards/chosen": 2.9199540615081787, "rewards/margins": 0.18901792168617249, "rewards/rejected": 2.730936050415039, "step": 89900 }, { "epoch": 4.174288499930359, "grad_norm": 8.897499084472656, "learning_rate": 4.96541157899624e-08, "logits/chosen": -19.05855369567871, "logits/rejected": -18.203163146972656, "logps/chosen": -435.5335388183594, "logps/rejected": -295.0204772949219, "loss": 0.3929, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.3254547119140625, "rewards/margins": 2.1761069297790527, "rewards/rejected": 2.1493477821350098, "step": 89910 }, { "epoch": 4.174752774037792, "grad_norm": 168.592041015625, "learning_rate": 4.962625934351641e-08, "logits/chosen": -18.796585083007812, "logits/rejected": -18.754539489746094, "logps/chosen": -342.6094665527344, "logps/rejected": -264.3675231933594, "loss": 0.5725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7510294914245605, "rewards/margins": 1.8294368982315063, "rewards/rejected": 1.9215927124023438, "step": 89920 }, { "epoch": 4.175217048145225, "grad_norm": 21.589601516723633, "learning_rate": 4.9598402897070434e-08, "logits/chosen": -19.72081184387207, "logits/rejected": -19.159704208374023, "logps/chosen": -403.5296630859375, "logps/rejected": -401.7209777832031, "loss": 0.5491, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.69646954536438, "rewards/margins": 0.9656375646591187, "rewards/rejected": 2.7308316230773926, "step": 89930 }, { "epoch": 4.175681322252658, "grad_norm": 34.31351852416992, "learning_rate": 4.957054645062444e-08, "logits/chosen": -19.562923431396484, "logits/rejected": -18.646554946899414, "logps/chosen": -432.71295166015625, 
"logps/rejected": -387.72894287109375, "loss": 0.3007, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.924592971801758, "rewards/margins": 2.0171761512756348, "rewards/rejected": 1.9074169397354126, "step": 89940 }, { "epoch": 4.176145596360091, "grad_norm": 10.161785125732422, "learning_rate": 4.954269000417846e-08, "logits/chosen": -18.317943572998047, "logits/rejected": -17.854761123657227, "logps/chosen": -311.32073974609375, "logps/rejected": -285.1163024902344, "loss": 1.104, "rewards/accuracies": 0.5, "rewards/chosen": 1.5197060108184814, "rewards/margins": 0.19316554069519043, "rewards/rejected": 1.326540470123291, "step": 89950 }, { "epoch": 4.176609870467524, "grad_norm": 0.30362623929977417, "learning_rate": 4.951483355773248e-08, "logits/chosen": -19.174467086791992, "logits/rejected": -18.78152847290039, "logps/chosen": -335.6309814453125, "logps/rejected": -268.7201232910156, "loss": 0.3887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.629143238067627, "rewards/margins": 1.9236301183700562, "rewards/rejected": 0.7055128216743469, "step": 89960 }, { "epoch": 4.177074144574957, "grad_norm": 6.589104175567627, "learning_rate": 4.94869771112865e-08, "logits/chosen": -19.272634506225586, "logits/rejected": -18.459463119506836, "logps/chosen": -373.63763427734375, "logps/rejected": -348.1776428222656, "loss": 0.4004, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.903519868850708, "rewards/margins": 1.1534111499786377, "rewards/rejected": 2.750108480453491, "step": 89970 }, { "epoch": 4.17753841868239, "grad_norm": 34.40972900390625, "learning_rate": 4.945912066484052e-08, "logits/chosen": -18.217790603637695, "logits/rejected": -17.9515323638916, "logps/chosen": -404.0450744628906, "logps/rejected": -346.8494567871094, "loss": 0.9556, "rewards/accuracies": 0.5, "rewards/chosen": 2.372389316558838, "rewards/margins": 0.07112616300582886, "rewards/rejected": 2.3012630939483643, "step": 89980 }, { "epoch": 
4.178002692789823, "grad_norm": 85.1019287109375, "learning_rate": 4.9431264218394534e-08, "logits/chosen": -19.14455795288086, "logits/rejected": -18.16118812561035, "logps/chosen": -430.56121826171875, "logps/rejected": -310.9874572753906, "loss": 0.4344, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5062904357910156, "rewards/margins": 1.8400652408599854, "rewards/rejected": 1.6662254333496094, "step": 89990 }, { "epoch": 4.178466966897256, "grad_norm": 8.267544746398926, "learning_rate": 4.9403407771948555e-08, "logits/chosen": -18.475765228271484, "logits/rejected": -19.089521408081055, "logps/chosen": -346.39215087890625, "logps/rejected": -357.461669921875, "loss": 1.4782, "rewards/accuracies": 0.5, "rewards/chosen": 3.23835825920105, "rewards/margins": 0.1398102343082428, "rewards/rejected": 3.0985474586486816, "step": 90000 }, { "epoch": 4.1789312410046895, "grad_norm": 1.8970063924789429, "learning_rate": 4.9375551325502576e-08, "logits/chosen": -18.36180305480957, "logits/rejected": -16.552715301513672, "logps/chosen": -446.6854553222656, "logps/rejected": -221.560546875, "loss": 0.3331, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.661165714263916, "rewards/margins": 2.361416816711426, "rewards/rejected": 0.29974883794784546, "step": 90010 }, { "epoch": 4.179395515112122, "grad_norm": 96.4898452758789, "learning_rate": 4.934769487905659e-08, "logits/chosen": -19.463655471801758, "logits/rejected": -17.72833251953125, "logps/chosen": -353.60858154296875, "logps/rejected": -211.89004516601562, "loss": 0.3682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.409560441970825, "rewards/margins": 1.8418718576431274, "rewards/rejected": 1.5676885843276978, "step": 90020 }, { "epoch": 4.179859789219555, "grad_norm": 28.03484535217285, "learning_rate": 4.931983843261061e-08, "logits/chosen": -18.810420989990234, "logits/rejected": -18.207561492919922, "logps/chosen": -493.372314453125, "logps/rejected": 
-370.15411376953125, "loss": 0.6239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.000469207763672, "rewards/margins": 0.7582486867904663, "rewards/rejected": 2.242220401763916, "step": 90030 }, { "epoch": 4.180324063326989, "grad_norm": 10.714192390441895, "learning_rate": 4.9291981986164627e-08, "logits/chosen": -19.365266799926758, "logits/rejected": -19.1287784576416, "logps/chosen": -363.6427307128906, "logps/rejected": -353.8389892578125, "loss": 0.6407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4835352897644043, "rewards/margins": 1.0389589071273804, "rewards/rejected": 1.4445762634277344, "step": 90040 }, { "epoch": 4.1807883374344215, "grad_norm": 16.846250534057617, "learning_rate": 4.926412553971865e-08, "logits/chosen": -19.797534942626953, "logits/rejected": -18.955524444580078, "logps/chosen": -348.72869873046875, "logps/rejected": -298.60174560546875, "loss": 0.7583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7794384956359863, "rewards/margins": 0.3425436317920685, "rewards/rejected": 2.4368948936462402, "step": 90050 }, { "epoch": 4.181252611541854, "grad_norm": 4.825197219848633, "learning_rate": 4.923626909327267e-08, "logits/chosen": -18.391469955444336, "logits/rejected": -17.844409942626953, "logps/chosen": -328.228759765625, "logps/rejected": -274.10296630859375, "loss": 0.7206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.009185791015625, "rewards/margins": 1.357356309890747, "rewards/rejected": 1.651829719543457, "step": 90060 }, { "epoch": 4.181716885649287, "grad_norm": 87.26184844970703, "learning_rate": 4.9208412646826683e-08, "logits/chosen": -19.261951446533203, "logits/rejected": -18.53480339050293, "logps/chosen": -514.2503662109375, "logps/rejected": -387.3310546875, "loss": 0.6482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.842443466186523, "rewards/margins": 1.9185497760772705, "rewards/rejected": 2.923894166946411, "step": 90070 }, { "epoch": 
4.182181159756721, "grad_norm": 235.95655822753906, "learning_rate": 4.9180556200380705e-08, "logits/chosen": -20.80623435974121, "logits/rejected": -19.9365234375, "logps/chosen": -538.190185546875, "logps/rejected": -487.94879150390625, "loss": 0.5507, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.348828315734863, "rewards/margins": 1.355339765548706, "rewards/rejected": 2.993488311767578, "step": 90080 }, { "epoch": 4.182645433864153, "grad_norm": 45.52949142456055, "learning_rate": 4.9152699753934726e-08, "logits/chosen": -19.56769371032715, "logits/rejected": -18.56062126159668, "logps/chosen": -374.97198486328125, "logps/rejected": -243.47055053710938, "loss": 0.8349, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.27227520942688, "rewards/margins": 1.4474207162857056, "rewards/rejected": 1.8248544931411743, "step": 90090 }, { "epoch": 4.183109707971586, "grad_norm": 62.37263488769531, "learning_rate": 4.912484330748874e-08, "logits/chosen": -18.70210838317871, "logits/rejected": -18.2418212890625, "logps/chosen": -274.5529479980469, "logps/rejected": -227.03091430664062, "loss": 0.6203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.404191732406616, "rewards/margins": 1.1338516473770142, "rewards/rejected": 1.2703402042388916, "step": 90100 }, { "epoch": 4.18357398207902, "grad_norm": 156.02273559570312, "learning_rate": 4.909698686104276e-08, "logits/chosen": -18.782011032104492, "logits/rejected": -18.4873046875, "logps/chosen": -508.4000549316406, "logps/rejected": -411.0029296875, "loss": 0.6819, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8893635272979736, "rewards/margins": 1.0722767114639282, "rewards/rejected": 2.817086696624756, "step": 90110 }, { "epoch": 4.184038256186453, "grad_norm": 30.920244216918945, "learning_rate": 4.906913041459678e-08, "logits/chosen": -19.7825870513916, "logits/rejected": -18.57101821899414, "logps/chosen": -338.21636962890625, "logps/rejected": 
-289.1182861328125, "loss": 0.5705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.296696186065674, "rewards/margins": 0.7382818460464478, "rewards/rejected": 2.5584144592285156, "step": 90120 }, { "epoch": 4.184502530293885, "grad_norm": 114.13662719726562, "learning_rate": 4.90412739681508e-08, "logits/chosen": -20.277507781982422, "logits/rejected": -19.359764099121094, "logps/chosen": -309.4605407714844, "logps/rejected": -275.2114562988281, "loss": 0.5266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.008619785308838, "rewards/margins": 1.2561036348342896, "rewards/rejected": 1.7525163888931274, "step": 90130 }, { "epoch": 4.184966804401318, "grad_norm": 11.691131591796875, "learning_rate": 4.901341752170481e-08, "logits/chosen": -18.674266815185547, "logits/rejected": -18.295093536376953, "logps/chosen": -402.67449951171875, "logps/rejected": -353.4432067871094, "loss": 0.539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.98419189453125, "rewards/margins": 1.0237963199615479, "rewards/rejected": 2.9603958129882812, "step": 90140 }, { "epoch": 4.185431078508752, "grad_norm": 33.017417907714844, "learning_rate": 4.8985561075258826e-08, "logits/chosen": -19.259201049804688, "logits/rejected": -18.230031967163086, "logps/chosen": -378.8180847167969, "logps/rejected": -253.07858276367188, "loss": 0.4495, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6120445728302, "rewards/margins": 1.9287347793579102, "rewards/rejected": 0.68330979347229, "step": 90150 }, { "epoch": 4.185895352616185, "grad_norm": 138.23399353027344, "learning_rate": 4.895770462881285e-08, "logits/chosen": -19.02962875366211, "logits/rejected": -18.90699577331543, "logps/chosen": -390.62127685546875, "logps/rejected": -357.68011474609375, "loss": 0.4492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.526369333267212, "rewards/margins": 0.8434098958969116, "rewards/rejected": 1.6829591989517212, "step": 90160 }, { "epoch": 
4.186359626723617, "grad_norm": 47.222232818603516, "learning_rate": 4.892984818236686e-08, "logits/chosen": -19.168901443481445, "logits/rejected": -17.48848533630371, "logps/chosen": -460.3016662597656, "logps/rejected": -315.603515625, "loss": 0.1674, "rewards/accuracies": 1.0, "rewards/chosen": 4.573580741882324, "rewards/margins": 2.6168200969696045, "rewards/rejected": 1.9567598104476929, "step": 90170 }, { "epoch": 4.186823900831051, "grad_norm": 1.271165370941162, "learning_rate": 4.890199173592088e-08, "logits/chosen": -18.936683654785156, "logits/rejected": -17.597000122070312, "logps/chosen": -451.59722900390625, "logps/rejected": -347.8204650878906, "loss": 0.346, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.022426605224609, "rewards/margins": 1.8218063116073608, "rewards/rejected": 3.200620174407959, "step": 90180 }, { "epoch": 4.187288174938484, "grad_norm": 54.30198669433594, "learning_rate": 4.8874135289474904e-08, "logits/chosen": -19.33310317993164, "logits/rejected": -18.908416748046875, "logps/chosen": -429.32421875, "logps/rejected": -373.0062561035156, "loss": 0.9129, "rewards/accuracies": 0.5, "rewards/chosen": 3.2121520042419434, "rewards/margins": 0.5178099870681763, "rewards/rejected": 2.6943423748016357, "step": 90190 }, { "epoch": 4.187752449045917, "grad_norm": 0.2855672240257263, "learning_rate": 4.884627884302892e-08, "logits/chosen": -19.7526912689209, "logits/rejected": -18.281658172607422, "logps/chosen": -580.5154418945312, "logps/rejected": -337.92535400390625, "loss": 0.1713, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.114989280700684, "rewards/margins": 4.104074954986572, "rewards/rejected": 2.0109143257141113, "step": 90200 }, { "epoch": 4.188216723153349, "grad_norm": 66.94577026367188, "learning_rate": 4.881842239658294e-08, "logits/chosen": -19.295459747314453, "logits/rejected": -19.078208923339844, "logps/chosen": -373.9283752441406, "logps/rejected": -297.78741455078125, "loss": 0.421, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2728919982910156, "rewards/margins": 0.8793681859970093, "rewards/rejected": 2.393523693084717, "step": 90210 }, { "epoch": 4.188680997260783, "grad_norm": 71.20223999023438, "learning_rate": 4.879056595013696e-08, "logits/chosen": -18.97005844116211, "logits/rejected": -18.613744735717773, "logps/chosen": -422.4619140625, "logps/rejected": -380.1159362792969, "loss": 0.9221, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.276566982269287, "rewards/margins": 0.5893941521644592, "rewards/rejected": 2.6871726512908936, "step": 90220 }, { "epoch": 4.189145271368216, "grad_norm": 5.375184059143066, "learning_rate": 4.8762709503690976e-08, "logits/chosen": -19.813411712646484, "logits/rejected": -18.988544464111328, "logps/chosen": -391.6762390136719, "logps/rejected": -355.00750732421875, "loss": 1.0714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.695887804031372, "rewards/margins": 1.698704719543457, "rewards/rejected": 1.997183084487915, "step": 90230 }, { "epoch": 4.189609545475649, "grad_norm": 48.94646072387695, "learning_rate": 4.8734853057244997e-08, "logits/chosen": -18.849170684814453, "logits/rejected": -17.834657669067383, "logps/chosen": -369.3785705566406, "logps/rejected": -320.8152770996094, "loss": 0.4016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7831263542175293, "rewards/margins": 1.9690879583358765, "rewards/rejected": 1.814038634300232, "step": 90240 }, { "epoch": 4.190073819583082, "grad_norm": 2.9951069355010986, "learning_rate": 4.870699661079901e-08, "logits/chosen": -19.582931518554688, "logits/rejected": -18.192163467407227, "logps/chosen": -342.5020446777344, "logps/rejected": -201.62538146972656, "loss": 0.3738, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4239840507507324, "rewards/margins": 2.4504029750823975, "rewards/rejected": 0.9735806584358215, "step": 90250 }, { "epoch": 4.190538093690515, "grad_norm": 
79.20894622802734, "learning_rate": 4.867914016435303e-08, "logits/chosen": -20.43124008178711, "logits/rejected": -18.834957122802734, "logps/chosen": -455.46038818359375, "logps/rejected": -303.74029541015625, "loss": 0.2812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.613389015197754, "rewards/margins": 1.9249656200408936, "rewards/rejected": 2.6884236335754395, "step": 90260 }, { "epoch": 4.191002367797948, "grad_norm": 37.04096221923828, "learning_rate": 4.8651283717907054e-08, "logits/chosen": -18.849254608154297, "logits/rejected": -18.42376708984375, "logps/chosen": -266.53070068359375, "logps/rejected": -258.3210144042969, "loss": 0.6105, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8185733556747437, "rewards/margins": 1.08482027053833, "rewards/rejected": 0.7337530851364136, "step": 90270 }, { "epoch": 4.1914666419053805, "grad_norm": 0.8092012405395508, "learning_rate": 4.862342727146107e-08, "logits/chosen": -19.562414169311523, "logits/rejected": -17.905851364135742, "logps/chosen": -451.59185791015625, "logps/rejected": -268.3448791503906, "loss": 0.2476, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9118733406066895, "rewards/margins": 2.4512064456939697, "rewards/rejected": 1.4606667757034302, "step": 90280 }, { "epoch": 4.191930916012814, "grad_norm": 34.04755401611328, "learning_rate": 4.859557082501509e-08, "logits/chosen": -19.036163330078125, "logits/rejected": -19.370161056518555, "logps/chosen": -392.46240234375, "logps/rejected": -415.64678955078125, "loss": 1.5034, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5810654163360596, "rewards/margins": 0.48270025849342346, "rewards/rejected": 3.098365306854248, "step": 90290 }, { "epoch": 4.192395190120247, "grad_norm": 112.2305679321289, "learning_rate": 4.856771437856911e-08, "logits/chosen": -20.070144653320312, "logits/rejected": -19.376089096069336, "logps/chosen": -514.0543212890625, "logps/rejected": -374.3387145996094, 
"loss": 0.4832, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3011345863342285, "rewards/margins": 0.8900471925735474, "rewards/rejected": 2.4110872745513916, "step": 90300 }, { "epoch": 4.19285946422768, "grad_norm": 27.360795974731445, "learning_rate": 4.8539857932123125e-08, "logits/chosen": -18.40951156616211, "logits/rejected": -17.742084503173828, "logps/chosen": -343.5780029296875, "logps/rejected": -245.940185546875, "loss": 0.4477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.259812593460083, "rewards/margins": 1.883978247642517, "rewards/rejected": 1.3758342266082764, "step": 90310 }, { "epoch": 4.193323738335113, "grad_norm": 101.48442840576172, "learning_rate": 4.8512001485677146e-08, "logits/chosen": -18.231319427490234, "logits/rejected": -17.999881744384766, "logps/chosen": -284.8700866699219, "logps/rejected": -209.49960327148438, "loss": 0.4354, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9734967947006226, "rewards/margins": 1.6351616382598877, "rewards/rejected": 0.3383350968360901, "step": 90320 }, { "epoch": 4.193788012442546, "grad_norm": 268.3858337402344, "learning_rate": 4.8484145039231154e-08, "logits/chosen": -19.970443725585938, "logits/rejected": -19.45672035217285, "logps/chosen": -407.0632629394531, "logps/rejected": -376.1329040527344, "loss": 0.3892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.398630142211914, "rewards/margins": 2.4626543521881104, "rewards/rejected": 1.935975432395935, "step": 90330 }, { "epoch": 4.194252286549979, "grad_norm": 79.64154815673828, "learning_rate": 4.8456288592785175e-08, "logits/chosen": -18.905916213989258, "logits/rejected": -17.026241302490234, "logps/chosen": -440.13824462890625, "logps/rejected": -219.37759399414062, "loss": 0.4912, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.493640899658203, "rewards/margins": 3.8542773723602295, "rewards/rejected": 0.639363706111908, "step": 90340 }, { "epoch": 
4.194716560657412, "grad_norm": 12.238899230957031, "learning_rate": 4.8428432146339196e-08, "logits/chosen": -18.3000545501709, "logits/rejected": -17.44643783569336, "logps/chosen": -386.1941833496094, "logps/rejected": -257.12591552734375, "loss": 0.2372, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.860738515853882, "rewards/margins": 1.9809579849243164, "rewards/rejected": 0.8797805905342102, "step": 90350 }, { "epoch": 4.195180834764845, "grad_norm": 87.34355926513672, "learning_rate": 4.840057569989321e-08, "logits/chosen": -19.48404312133789, "logits/rejected": -19.448226928710938, "logps/chosen": -312.20330810546875, "logps/rejected": -278.04833984375, "loss": 0.7575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8533401489257812, "rewards/margins": 0.27377909421920776, "rewards/rejected": 2.579561233520508, "step": 90360 }, { "epoch": 4.195645108872278, "grad_norm": 31.46500015258789, "learning_rate": 4.837271925344723e-08, "logits/chosen": -19.121841430664062, "logits/rejected": -17.55229377746582, "logps/chosen": -420.14031982421875, "logps/rejected": -275.45751953125, "loss": 0.1422, "rewards/accuracies": 1.0, "rewards/chosen": 4.259149074554443, "rewards/margins": 2.7417659759521484, "rewards/rejected": 1.5173829793930054, "step": 90370 }, { "epoch": 4.196109382979711, "grad_norm": 44.146148681640625, "learning_rate": 4.8344862807001247e-08, "logits/chosen": -19.29245948791504, "logits/rejected": -18.146093368530273, "logps/chosen": -391.94805908203125, "logps/rejected": -280.03265380859375, "loss": 0.3557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5446267127990723, "rewards/margins": 1.553147792816162, "rewards/rejected": 1.991478681564331, "step": 90380 }, { "epoch": 4.196573657087145, "grad_norm": 112.3954086303711, "learning_rate": 4.831700636055527e-08, "logits/chosen": -18.179718017578125, "logits/rejected": -17.821374893188477, "logps/chosen": -359.26995849609375, "logps/rejected": 
-347.1463317871094, "loss": 1.3622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1028847694396973, "rewards/margins": -0.1276136338710785, "rewards/rejected": 2.2304985523223877, "step": 90390 }, { "epoch": 4.197037931194577, "grad_norm": 122.0141372680664, "learning_rate": 4.828914991410929e-08, "logits/chosen": -18.924779891967773, "logits/rejected": -18.053239822387695, "logps/chosen": -520.4381713867188, "logps/rejected": -346.09649658203125, "loss": 0.3874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.207906723022461, "rewards/margins": 1.8481136560440063, "rewards/rejected": 3.359793186187744, "step": 90400 }, { "epoch": 4.19750220530201, "grad_norm": 77.10848999023438, "learning_rate": 4.8261293467663303e-08, "logits/chosen": -19.86056137084961, "logits/rejected": -17.83859634399414, "logps/chosen": -501.32891845703125, "logps/rejected": -254.4861297607422, "loss": 0.2124, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.809676647186279, "rewards/margins": 2.9690380096435547, "rewards/rejected": 1.8406387567520142, "step": 90410 }, { "epoch": 4.197966479409443, "grad_norm": 20.186527252197266, "learning_rate": 4.8233437021217325e-08, "logits/chosen": -18.5673885345459, "logits/rejected": -17.670799255371094, "logps/chosen": -385.79266357421875, "logps/rejected": -265.37567138671875, "loss": 0.8046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.053530216217041, "rewards/margins": 0.8710254430770874, "rewards/rejected": 1.1825047731399536, "step": 90420 }, { "epoch": 4.198430753516877, "grad_norm": 2.22817063331604, "learning_rate": 4.8205580574771346e-08, "logits/chosen": -19.442594528198242, "logits/rejected": -18.612577438354492, "logps/chosen": -432.6722106933594, "logps/rejected": -353.74261474609375, "loss": 0.5134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8656787872314453, "rewards/margins": 1.816092848777771, "rewards/rejected": 2.0495858192443848, "step": 90430 }, { 
"epoch": 4.198895027624309, "grad_norm": 35.93241882324219, "learning_rate": 4.817772412832536e-08, "logits/chosen": -18.75245475769043, "logits/rejected": -18.198366165161133, "logps/chosen": -272.71240234375, "logps/rejected": -248.25625610351562, "loss": 0.5587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9742529392242432, "rewards/margins": 0.5831796526908875, "rewards/rejected": 1.391073226928711, "step": 90440 }, { "epoch": 4.199359301731742, "grad_norm": 111.19039916992188, "learning_rate": 4.814986768187938e-08, "logits/chosen": -20.20950698852539, "logits/rejected": -19.106571197509766, "logps/chosen": -449.234375, "logps/rejected": -401.4031982421875, "loss": 0.5686, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.395150661468506, "rewards/margins": 0.7293238639831543, "rewards/rejected": 2.6658270359039307, "step": 90450 }, { "epoch": 4.199823575839176, "grad_norm": 22.568374633789062, "learning_rate": 4.8122011235433396e-08, "logits/chosen": -17.53472137451172, "logits/rejected": -17.680986404418945, "logps/chosen": -246.7906951904297, "logps/rejected": -264.44281005859375, "loss": 1.7138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.469727635383606, "rewards/margins": -0.339131623506546, "rewards/rejected": 1.80885910987854, "step": 90460 }, { "epoch": 4.2002878499466085, "grad_norm": 1.202694058418274, "learning_rate": 4.809415478898742e-08, "logits/chosen": -18.51500129699707, "logits/rejected": -17.850370407104492, "logps/chosen": -309.1619567871094, "logps/rejected": -273.2765808105469, "loss": 0.5801, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5549428462982178, "rewards/margins": 1.236580491065979, "rewards/rejected": 1.3183624744415283, "step": 90470 }, { "epoch": 4.200752124054041, "grad_norm": 119.43879699707031, "learning_rate": 4.806629834254144e-08, "logits/chosen": -19.69173240661621, "logits/rejected": -18.85834503173828, "logps/chosen": -365.6618347167969, "logps/rejected": 
-331.017822265625, "loss": 0.5676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9718399047851562, "rewards/margins": 2.1266109943389893, "rewards/rejected": 1.845228910446167, "step": 90480 }, { "epoch": 4.201216398161475, "grad_norm": 77.1358642578125, "learning_rate": 4.803844189609545e-08, "logits/chosen": -19.047212600708008, "logits/rejected": -18.925939559936523, "logps/chosen": -368.8360290527344, "logps/rejected": -312.56390380859375, "loss": 0.7366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.567995548248291, "rewards/margins": 0.34386616945266724, "rewards/rejected": 2.2241296768188477, "step": 90490 }, { "epoch": 4.201680672268908, "grad_norm": 12.413910865783691, "learning_rate": 4.8010585449649474e-08, "logits/chosen": -19.034860610961914, "logits/rejected": -18.961000442504883, "logps/chosen": -322.57159423828125, "logps/rejected": -293.21533203125, "loss": 0.6445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7809932231903076, "rewards/margins": 0.7499221563339233, "rewards/rejected": 2.031071186065674, "step": 90500 }, { "epoch": 4.2021449463763405, "grad_norm": 44.87733840942383, "learning_rate": 4.7982729003203495e-08, "logits/chosen": -18.473255157470703, "logits/rejected": -18.180789947509766, "logps/chosen": -332.287109375, "logps/rejected": -333.3358459472656, "loss": 0.7697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.238659143447876, "rewards/margins": 1.234134554862976, "rewards/rejected": 1.0045242309570312, "step": 90510 }, { "epoch": 4.202609220483773, "grad_norm": 4.7008748054504395, "learning_rate": 4.795487255675751e-08, "logits/chosen": -18.698820114135742, "logits/rejected": -18.023569107055664, "logps/chosen": -389.4950256347656, "logps/rejected": -232.65478515625, "loss": 0.4023, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4077274799346924, "rewards/margins": 2.0019781589508057, "rewards/rejected": 1.4057496786117554, "step": 90520 }, { "epoch": 
4.203073494591207, "grad_norm": 3.715601921081543, "learning_rate": 4.7927016110311524e-08, "logits/chosen": -18.54043197631836, "logits/rejected": -17.248912811279297, "logps/chosen": -483.58984375, "logps/rejected": -277.4148254394531, "loss": 0.4055, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.034916877746582, "rewards/margins": 1.994150161743164, "rewards/rejected": 2.040766716003418, "step": 90530 }, { "epoch": 4.20353776869864, "grad_norm": 347.7180480957031, "learning_rate": 4.789915966386554e-08, "logits/chosen": -18.647903442382812, "logits/rejected": -18.29827308654785, "logps/chosen": -448.0304260253906, "logps/rejected": -407.21307373046875, "loss": 1.3468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.514697313308716, "rewards/margins": 0.6363996267318726, "rewards/rejected": 2.8782975673675537, "step": 90540 }, { "epoch": 4.2040020428060725, "grad_norm": 16.309194564819336, "learning_rate": 4.787130321741956e-08, "logits/chosen": -18.165006637573242, "logits/rejected": -17.369762420654297, "logps/chosen": -414.78912353515625, "logps/rejected": -309.5235290527344, "loss": 0.617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1835126876831055, "rewards/margins": 1.1344316005706787, "rewards/rejected": 2.0490808486938477, "step": 90550 }, { "epoch": 4.204466316913506, "grad_norm": 4.452674388885498, "learning_rate": 4.784344677097358e-08, "logits/chosen": -19.363445281982422, "logits/rejected": -18.818815231323242, "logps/chosen": -300.4841613769531, "logps/rejected": -275.5030517578125, "loss": 0.7325, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9930362701416016, "rewards/margins": 1.0319544076919556, "rewards/rejected": 1.9610817432403564, "step": 90560 }, { "epoch": 4.204930591020939, "grad_norm": 146.45700073242188, "learning_rate": 4.7815590324527596e-08, "logits/chosen": -18.675432205200195, "logits/rejected": -18.471160888671875, "logps/chosen": -464.5235290527344, "logps/rejected": 
-315.7784423828125, "loss": 1.1832, "rewards/accuracies": 0.5, "rewards/chosen": 3.706235408782959, "rewards/margins": 0.24180534482002258, "rewards/rejected": 3.464430332183838, "step": 90570 }, { "epoch": 4.205394865128372, "grad_norm": 21.824859619140625, "learning_rate": 4.7787733878081617e-08, "logits/chosen": -18.412723541259766, "logits/rejected": -17.7574520111084, "logps/chosen": -377.1300048828125, "logps/rejected": -301.7153625488281, "loss": 0.6811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1491751670837402, "rewards/margins": 1.5217092037200928, "rewards/rejected": 1.6274659633636475, "step": 90580 }, { "epoch": 4.2058591392358045, "grad_norm": 55.329437255859375, "learning_rate": 4.775987743163563e-08, "logits/chosen": -17.842273712158203, "logits/rejected": -18.047100067138672, "logps/chosen": -275.07696533203125, "logps/rejected": -310.40875244140625, "loss": 1.7737, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8185592889785767, "rewards/margins": -0.6228623986244202, "rewards/rejected": 2.4414219856262207, "step": 90590 }, { "epoch": 4.206323413343238, "grad_norm": 69.52351379394531, "learning_rate": 4.773202098518965e-08, "logits/chosen": -19.374996185302734, "logits/rejected": -18.736194610595703, "logps/chosen": -375.93853759765625, "logps/rejected": -283.7166442871094, "loss": 0.6018, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0969302654266357, "rewards/margins": 0.7364567518234253, "rewards/rejected": 1.3604735136032104, "step": 90600 }, { "epoch": 4.206787687450671, "grad_norm": 0.013670134358108044, "learning_rate": 4.7704164538743674e-08, "logits/chosen": -18.784687042236328, "logits/rejected": -18.421422958374023, "logps/chosen": -355.8338623046875, "logps/rejected": -333.4654846191406, "loss": 0.4761, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8271002769470215, "rewards/margins": 1.8582919836044312, "rewards/rejected": 1.9688084125518799, "step": 90610 }, { "epoch": 
4.207251961558104, "grad_norm": 0.019304897636175156, "learning_rate": 4.767630809229769e-08, "logits/chosen": -18.648258209228516, "logits/rejected": -18.06863021850586, "logps/chosen": -372.85577392578125, "logps/rejected": -324.0585021972656, "loss": 0.7436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.045748710632324, "rewards/margins": 1.562000036239624, "rewards/rejected": 1.4837486743927002, "step": 90620 }, { "epoch": 4.207716235665537, "grad_norm": 97.4951400756836, "learning_rate": 4.764845164585171e-08, "logits/chosen": -18.29931640625, "logits/rejected": -18.257253646850586, "logps/chosen": -352.41082763671875, "logps/rejected": -306.4787292480469, "loss": 0.6235, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.959728956222534, "rewards/margins": 1.5841825008392334, "rewards/rejected": 2.37554669380188, "step": 90630 }, { "epoch": 4.20818050977297, "grad_norm": 41.44553756713867, "learning_rate": 4.762059519940573e-08, "logits/chosen": -19.10658836364746, "logits/rejected": -18.648727416992188, "logps/chosen": -478.55682373046875, "logps/rejected": -365.07977294921875, "loss": 0.4555, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.404097080230713, "rewards/margins": 1.5694608688354492, "rewards/rejected": 2.8346362113952637, "step": 90640 }, { "epoch": 4.208644783880403, "grad_norm": 252.82362365722656, "learning_rate": 4.7592738752959745e-08, "logits/chosen": -20.187105178833008, "logits/rejected": -19.7814884185791, "logps/chosen": -414.32965087890625, "logps/rejected": -379.58782958984375, "loss": 0.9095, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.515036106109619, "rewards/margins": 0.7171918153762817, "rewards/rejected": 2.797844409942627, "step": 90650 }, { "epoch": 4.209109057987836, "grad_norm": 96.2318344116211, "learning_rate": 4.7564882306513766e-08, "logits/chosen": -19.13650894165039, "logits/rejected": -18.027557373046875, "logps/chosen": -370.31396484375, "logps/rejected": 
-240.2583465576172, "loss": 0.7852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.389146327972412, "rewards/margins": 1.2621480226516724, "rewards/rejected": 2.1269984245300293, "step": 90660 }, { "epoch": 4.209573332095269, "grad_norm": 17.652481079101562, "learning_rate": 4.753702586006778e-08, "logits/chosen": -18.998226165771484, "logits/rejected": -17.837657928466797, "logps/chosen": -557.7767944335938, "logps/rejected": -372.1009216308594, "loss": 0.3685, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.8877363204956055, "rewards/margins": 2.785254955291748, "rewards/rejected": 2.102480888366699, "step": 90670 }, { "epoch": 4.210037606202702, "grad_norm": 112.50420379638672, "learning_rate": 4.75091694136218e-08, "logits/chosen": -19.754981994628906, "logits/rejected": -19.707279205322266, "logps/chosen": -460.7625427246094, "logps/rejected": -343.6656188964844, "loss": 0.573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9564778804779053, "rewards/margins": 0.8076931238174438, "rewards/rejected": 2.14878511428833, "step": 90680 }, { "epoch": 4.210501880310135, "grad_norm": 8.637808799743652, "learning_rate": 4.748131296717582e-08, "logits/chosen": -19.747779846191406, "logits/rejected": -18.85748863220215, "logps/chosen": -387.5625, "logps/rejected": -328.3453674316406, "loss": 0.5752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.529040813446045, "rewards/margins": 1.3385639190673828, "rewards/rejected": 3.190476894378662, "step": 90690 }, { "epoch": 4.2109661544175685, "grad_norm": 18.905014038085938, "learning_rate": 4.745345652072984e-08, "logits/chosen": -18.183637619018555, "logits/rejected": -18.134965896606445, "logps/chosen": -365.38031005859375, "logps/rejected": -367.81170654296875, "loss": 1.0824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.298887252807617, "rewards/margins": 0.0033564926125109196, "rewards/rejected": 2.2955310344696045, "step": 90700 }, { "epoch": 
4.211430428525001, "grad_norm": 20.325908660888672, "learning_rate": 4.742560007428386e-08, "logits/chosen": -18.87276840209961, "logits/rejected": -17.970985412597656, "logps/chosen": -440.2890625, "logps/rejected": -313.83447265625, "loss": 0.2763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.6415605545043945, "rewards/margins": 2.0892317295074463, "rewards/rejected": 2.5523290634155273, "step": 90710 }, { "epoch": 4.211894702632434, "grad_norm": 37.09942626953125, "learning_rate": 4.739774362783788e-08, "logits/chosen": -19.97238540649414, "logits/rejected": -18.98056983947754, "logps/chosen": -565.2752685546875, "logps/rejected": -389.03338623046875, "loss": 0.4599, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.036366939544678, "rewards/margins": 2.2086730003356934, "rewards/rejected": 2.827693462371826, "step": 90720 }, { "epoch": 4.212358976739867, "grad_norm": 75.46810913085938, "learning_rate": 4.736988718139189e-08, "logits/chosen": -18.83806800842285, "logits/rejected": -18.559289932250977, "logps/chosen": -346.80548095703125, "logps/rejected": -297.9422607421875, "loss": 0.6931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.953291177749634, "rewards/margins": 0.5901080965995789, "rewards/rejected": 2.36318302154541, "step": 90730 }, { "epoch": 4.2128232508473005, "grad_norm": 3.760836362838745, "learning_rate": 4.734203073494591e-08, "logits/chosen": -18.806140899658203, "logits/rejected": -17.957679748535156, "logps/chosen": -361.078125, "logps/rejected": -308.1474304199219, "loss": 0.7525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3540139198303223, "rewards/margins": 0.9734077453613281, "rewards/rejected": 1.3806062936782837, "step": 90740 }, { "epoch": 4.213287524954733, "grad_norm": 264.04364013671875, "learning_rate": 4.7314174288499923e-08, "logits/chosen": -19.47104835510254, "logits/rejected": -18.36477279663086, "logps/chosen": -486.679931640625, "logps/rejected": 
-435.26629638671875, "loss": 0.4928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.566128253936768, "rewards/margins": 1.5596497058868408, "rewards/rejected": 3.0064785480499268, "step": 90750 }, { "epoch": 4.213751799062166, "grad_norm": 18.358318328857422, "learning_rate": 4.7286317842053944e-08, "logits/chosen": -19.997652053833008, "logits/rejected": -17.85466957092285, "logps/chosen": -513.4759521484375, "logps/rejected": -321.9110412597656, "loss": 0.1854, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.259252071380615, "rewards/margins": 3.8062057495117188, "rewards/rejected": 1.4530463218688965, "step": 90760 }, { "epoch": 4.2142160731696, "grad_norm": 1.0784789323806763, "learning_rate": 4.7258461395607966e-08, "logits/chosen": -18.77780532836914, "logits/rejected": -18.946264266967773, "logps/chosen": -279.0140686035156, "logps/rejected": -256.29644775390625, "loss": 0.5888, "rewards/accuracies": 0.5, "rewards/chosen": 2.2219111919403076, "rewards/margins": 0.7304224371910095, "rewards/rejected": 1.4914889335632324, "step": 90770 }, { "epoch": 4.2146803472770324, "grad_norm": 83.15593719482422, "learning_rate": 4.723060494916198e-08, "logits/chosen": -19.288787841796875, "logits/rejected": -18.71284294128418, "logps/chosen": -297.9713439941406, "logps/rejected": -276.64056396484375, "loss": 0.4794, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3906381130218506, "rewards/margins": 0.907815158367157, "rewards/rejected": 1.482823133468628, "step": 90780 }, { "epoch": 4.215144621384465, "grad_norm": 55.02098846435547, "learning_rate": 4.7202748502716e-08, "logits/chosen": -19.24702262878418, "logits/rejected": -19.27754020690918, "logps/chosen": -346.6573791503906, "logps/rejected": -437.24853515625, "loss": 1.1964, "rewards/accuracies": 0.5, "rewards/chosen": 2.441512107849121, "rewards/margins": -0.36571913957595825, "rewards/rejected": 2.8072311878204346, "step": 90790 }, { "epoch": 4.215608895491898, 
"grad_norm": 8.868854522705078, "learning_rate": 4.7174892056270016e-08, "logits/chosen": -18.440826416015625, "logits/rejected": -17.424531936645508, "logps/chosen": -419.31817626953125, "logps/rejected": -266.08935546875, "loss": 0.5581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7286884784698486, "rewards/margins": 1.214561104774475, "rewards/rejected": 1.5141278505325317, "step": 90800 }, { "epoch": 4.216073169599332, "grad_norm": 0.14384955167770386, "learning_rate": 4.714703560982404e-08, "logits/chosen": -19.3228759765625, "logits/rejected": -18.07611083984375, "logps/chosen": -394.94573974609375, "logps/rejected": -374.1216735839844, "loss": 0.5042, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.807135105133057, "rewards/margins": 2.2742905616760254, "rewards/rejected": 2.5328447818756104, "step": 90810 }, { "epoch": 4.216537443706764, "grad_norm": 8.400872230529785, "learning_rate": 4.711917916337806e-08, "logits/chosen": -18.875417709350586, "logits/rejected": -17.967945098876953, "logps/chosen": -337.9806823730469, "logps/rejected": -235.5722198486328, "loss": 0.657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8903532028198242, "rewards/margins": 1.2359604835510254, "rewards/rejected": 0.654392421245575, "step": 90820 }, { "epoch": 4.217001717814197, "grad_norm": 29.031421661376953, "learning_rate": 4.709132271693207e-08, "logits/chosen": -18.816802978515625, "logits/rejected": -18.53038215637207, "logps/chosen": -475.44830322265625, "logps/rejected": -396.7113342285156, "loss": 0.6343, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8905277252197266, "rewards/margins": 0.7965447306632996, "rewards/rejected": 3.0939829349517822, "step": 90830 }, { "epoch": 4.217465991921631, "grad_norm": 94.70745849609375, "learning_rate": 4.7063466270486094e-08, "logits/chosen": -18.363971710205078, "logits/rejected": -17.87232208251953, "logps/chosen": -436.753173828125, "logps/rejected": -370.78515625, 
"loss": 0.5274, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.144567012786865, "rewards/margins": 1.3683452606201172, "rewards/rejected": 3.776221752166748, "step": 90840 }, { "epoch": 4.217930266029064, "grad_norm": 146.8087615966797, "learning_rate": 4.7035609824040115e-08, "logits/chosen": -19.237003326416016, "logits/rejected": -18.0980167388916, "logps/chosen": -472.36053466796875, "logps/rejected": -385.60211181640625, "loss": 1.051, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.693640947341919, "rewards/margins": 0.3237842619419098, "rewards/rejected": 3.369856357574463, "step": 90850 }, { "epoch": 4.218394540136496, "grad_norm": 10.041218757629395, "learning_rate": 4.700775337759413e-08, "logits/chosen": -19.624343872070312, "logits/rejected": -18.319049835205078, "logps/chosen": -346.82147216796875, "logps/rejected": -275.78558349609375, "loss": 0.8288, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6286377906799316, "rewards/margins": 1.4638824462890625, "rewards/rejected": 2.16475510597229, "step": 90860 }, { "epoch": 4.21885881424393, "grad_norm": 5.778942584991455, "learning_rate": 4.697989693114815e-08, "logits/chosen": -19.112030029296875, "logits/rejected": -17.79578971862793, "logps/chosen": -477.5668029785156, "logps/rejected": -319.0517578125, "loss": 0.4241, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4529945850372314, "rewards/margins": 1.8327182531356812, "rewards/rejected": 1.6202762126922607, "step": 90870 }, { "epoch": 4.219323088351363, "grad_norm": 48.80067825317383, "learning_rate": 4.6952040484702165e-08, "logits/chosen": -19.00521469116211, "logits/rejected": -19.056072235107422, "logps/chosen": -326.02911376953125, "logps/rejected": -338.637451171875, "loss": 0.6842, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3805415630340576, "rewards/margins": 0.6070400476455688, "rewards/rejected": 2.7735016345977783, "step": 90880 }, { "epoch": 4.219787362458796, 
"grad_norm": 25.74448585510254, "learning_rate": 4.6924184038256186e-08, "logits/chosen": -19.843412399291992, "logits/rejected": -18.958913803100586, "logps/chosen": -369.1263732910156, "logps/rejected": -369.4439392089844, "loss": 1.2862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8939318656921387, "rewards/margins": 1.092523455619812, "rewards/rejected": 2.801408290863037, "step": 90890 }, { "epoch": 4.220251636566228, "grad_norm": 7.642306804656982, "learning_rate": 4.689632759181021e-08, "logits/chosen": -19.383745193481445, "logits/rejected": -17.833187103271484, "logps/chosen": -542.1586303710938, "logps/rejected": -387.7848815917969, "loss": 0.4991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.236830234527588, "rewards/margins": 2.434657096862793, "rewards/rejected": 2.8021726608276367, "step": 90900 }, { "epoch": 4.220715910673662, "grad_norm": 0.9441813230514526, "learning_rate": 4.686847114536422e-08, "logits/chosen": -18.28963851928711, "logits/rejected": -18.273441314697266, "logps/chosen": -252.6802215576172, "logps/rejected": -326.03424072265625, "loss": 1.4226, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6776535511016846, "rewards/margins": -0.32305908203125, "rewards/rejected": 2.0007128715515137, "step": 90910 }, { "epoch": 4.221180184781095, "grad_norm": 17.019365310668945, "learning_rate": 4.684061469891824e-08, "logits/chosen": -19.67268180847168, "logits/rejected": -18.382020950317383, "logps/chosen": -365.5966796875, "logps/rejected": -295.7524108886719, "loss": 0.3205, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1836256980896, "rewards/margins": 2.207237720489502, "rewards/rejected": 1.9763882160186768, "step": 90920 }, { "epoch": 4.221644458888528, "grad_norm": 64.29460906982422, "learning_rate": 4.681275825247225e-08, "logits/chosen": -19.273542404174805, "logits/rejected": -18.17849349975586, "logps/chosen": -337.9616394042969, "logps/rejected": -287.6932067871094, 
"loss": 0.5775, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.167463779449463, "rewards/margins": 1.0464380979537964, "rewards/rejected": 1.1210256814956665, "step": 90930 }, { "epoch": 4.222108732995961, "grad_norm": 46.820831298828125, "learning_rate": 4.678490180602627e-08, "logits/chosen": -19.235780715942383, "logits/rejected": -18.648860931396484, "logps/chosen": -344.72442626953125, "logps/rejected": -243.61978149414062, "loss": 0.7915, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.139265537261963, "rewards/margins": 0.6801475882530212, "rewards/rejected": 1.4591180086135864, "step": 90940 }, { "epoch": 4.222573007103394, "grad_norm": 180.842529296875, "learning_rate": 4.6757045359580293e-08, "logits/chosen": -19.411956787109375, "logits/rejected": -18.78753662109375, "logps/chosen": -550.6369018554688, "logps/rejected": -432.6112365722656, "loss": 1.0114, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.103224754333496, "rewards/margins": 0.573147177696228, "rewards/rejected": 4.530077934265137, "step": 90950 }, { "epoch": 4.223037281210827, "grad_norm": 100.93927764892578, "learning_rate": 4.672918891313431e-08, "logits/chosen": -18.762123107910156, "logits/rejected": -16.971336364746094, "logps/chosen": -384.3551025390625, "logps/rejected": -266.72344970703125, "loss": 0.3885, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.772899627685547, "rewards/margins": 2.342134475708008, "rewards/rejected": 1.4307653903961182, "step": 90960 }, { "epoch": 4.2235015553182595, "grad_norm": 24.22425651550293, "learning_rate": 4.670133246668833e-08, "logits/chosen": -19.321285247802734, "logits/rejected": -18.050582885742188, "logps/chosen": -432.359619140625, "logps/rejected": -264.13763427734375, "loss": 0.6062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.150030612945557, "rewards/margins": 2.4814982414245605, "rewards/rejected": 1.668532133102417, "step": 90970 }, { "epoch": 4.223965829425693, 
"grad_norm": 0.020713601261377335, "learning_rate": 4.667347602024235e-08, "logits/chosen": -19.178089141845703, "logits/rejected": -18.441261291503906, "logps/chosen": -425.7528381347656, "logps/rejected": -354.92828369140625, "loss": 0.6152, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.151332378387451, "rewards/margins": 1.8895502090454102, "rewards/rejected": 1.2617824077606201, "step": 90980 }, { "epoch": 4.224430103533126, "grad_norm": 8.921747207641602, "learning_rate": 4.6645619573796365e-08, "logits/chosen": -19.065887451171875, "logits/rejected": -17.796201705932617, "logps/chosen": -460.4906311035156, "logps/rejected": -386.36181640625, "loss": 0.4783, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.359857082366943, "rewards/margins": 1.8621852397918701, "rewards/rejected": 2.4976720809936523, "step": 90990 }, { "epoch": 4.224894377640559, "grad_norm": 44.31586456298828, "learning_rate": 4.6617763127350386e-08, "logits/chosen": -19.60972023010254, "logits/rejected": -18.712520599365234, "logps/chosen": -451.2738342285156, "logps/rejected": -402.85076904296875, "loss": 0.5441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.41169548034668, "rewards/margins": 1.389885425567627, "rewards/rejected": 3.0218100547790527, "step": 91000 }, { "epoch": 4.225358651747992, "grad_norm": 0.5976126194000244, "learning_rate": 4.65899066809044e-08, "logits/chosen": -18.668304443359375, "logits/rejected": -17.91488265991211, "logps/chosen": -333.51220703125, "logps/rejected": -242.52670288085938, "loss": 0.6701, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4005789756774902, "rewards/margins": 1.7547729015350342, "rewards/rejected": 1.645806074142456, "step": 91010 }, { "epoch": 4.225822925855425, "grad_norm": 22.782926559448242, "learning_rate": 4.656205023445842e-08, "logits/chosen": -20.333473205566406, "logits/rejected": -18.346155166625977, "logps/chosen": -408.50628662109375, "logps/rejected": 
-322.3382263183594, "loss": 0.3887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.455696105957031, "rewards/margins": 2.3975112438201904, "rewards/rejected": 2.058184862136841, "step": 91020 }, { "epoch": 4.226287199962858, "grad_norm": 158.46177673339844, "learning_rate": 4.653419378801244e-08, "logits/chosen": -18.997079849243164, "logits/rejected": -19.01699447631836, "logps/chosen": -280.14117431640625, "logps/rejected": -287.64862060546875, "loss": 0.7203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1042568683624268, "rewards/margins": 0.17377647757530212, "rewards/rejected": 1.9304805994033813, "step": 91030 }, { "epoch": 4.226751474070291, "grad_norm": 3.7338778972625732, "learning_rate": 4.650633734156646e-08, "logits/chosen": -18.664173126220703, "logits/rejected": -18.14206314086914, "logps/chosen": -397.28912353515625, "logps/rejected": -290.47210693359375, "loss": 0.6379, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9584922790527344, "rewards/margins": 1.6793277263641357, "rewards/rejected": 2.2791643142700195, "step": 91040 }, { "epoch": 4.227215748177724, "grad_norm": 195.02484130859375, "learning_rate": 4.647848089512048e-08, "logits/chosen": -19.179052352905273, "logits/rejected": -18.85349464416504, "logps/chosen": -391.80633544921875, "logps/rejected": -307.98834228515625, "loss": 0.8433, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.630528450012207, "rewards/margins": 2.233025074005127, "rewards/rejected": 2.3975040912628174, "step": 91050 }, { "epoch": 4.227680022285157, "grad_norm": 51.31046676635742, "learning_rate": 4.64506244486745e-08, "logits/chosen": -18.302066802978516, "logits/rejected": -17.342426300048828, "logps/chosen": -396.0137939453125, "logps/rejected": -297.93023681640625, "loss": 0.5567, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.096696376800537, "rewards/margins": 0.9255781173706055, "rewards/rejected": 2.1711184978485107, "step": 91060 }, { 
"epoch": 4.22814429639259, "grad_norm": 18.685550689697266, "learning_rate": 4.6422768002228514e-08, "logits/chosen": -18.83915901184082, "logits/rejected": -18.716182708740234, "logps/chosen": -382.77410888671875, "logps/rejected": -393.358154296875, "loss": 1.0368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.360969066619873, "rewards/margins": 0.2424609661102295, "rewards/rejected": 3.1185078620910645, "step": 91070 }, { "epoch": 4.228608570500024, "grad_norm": 118.69481658935547, "learning_rate": 4.6394911555782535e-08, "logits/chosen": -19.083656311035156, "logits/rejected": -17.99698257446289, "logps/chosen": -485.98095703125, "logps/rejected": -371.64764404296875, "loss": 0.4681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.299786567687988, "rewards/margins": 2.713240146636963, "rewards/rejected": 2.5865464210510254, "step": 91080 }, { "epoch": 4.229072844607456, "grad_norm": 259.08465576171875, "learning_rate": 4.636705510933655e-08, "logits/chosen": -19.74495506286621, "logits/rejected": -19.589635848999023, "logps/chosen": -464.9593200683594, "logps/rejected": -487.44451904296875, "loss": 1.4984, "rewards/accuracies": 0.5, "rewards/chosen": 3.472398042678833, "rewards/margins": -0.4327371120452881, "rewards/rejected": 3.9051353931427, "step": 91090 }, { "epoch": 4.229537118714889, "grad_norm": 52.49370193481445, "learning_rate": 4.633919866289057e-08, "logits/chosen": -18.42361831665039, "logits/rejected": -17.53841209411621, "logps/chosen": -352.8894348144531, "logps/rejected": -336.070556640625, "loss": 0.9988, "rewards/accuracies": 0.5, "rewards/chosen": 3.054326057434082, "rewards/margins": 0.7440798878669739, "rewards/rejected": 2.310245990753174, "step": 91100 }, { "epoch": 4.230001392822322, "grad_norm": 64.61882781982422, "learning_rate": 4.631134221644459e-08, "logits/chosen": -19.487869262695312, "logits/rejected": -17.777729034423828, "logps/chosen": -423.74786376953125, "logps/rejected": -281.47308349609375, 
"loss": 0.1975, "rewards/accuracies": 1.0, "rewards/chosen": 3.9695937633514404, "rewards/margins": 2.6650233268737793, "rewards/rejected": 1.3045704364776611, "step": 91110 }, { "epoch": 4.230465666929756, "grad_norm": 12.543967247009277, "learning_rate": 4.628348576999861e-08, "logits/chosen": -18.69150733947754, "logits/rejected": -18.798038482666016, "logps/chosen": -342.6654052734375, "logps/rejected": -368.6287536621094, "loss": 0.983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1790244579315186, "rewards/margins": 0.10390494763851166, "rewards/rejected": 2.0751194953918457, "step": 91120 }, { "epoch": 4.230929941037188, "grad_norm": 17.370702743530273, "learning_rate": 4.625562932355262e-08, "logits/chosen": -19.612773895263672, "logits/rejected": -19.37466812133789, "logps/chosen": -304.75238037109375, "logps/rejected": -297.84234619140625, "loss": 0.634, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1815011501312256, "rewards/margins": 0.4972701668739319, "rewards/rejected": 2.6842310428619385, "step": 91130 }, { "epoch": 4.231394215144621, "grad_norm": 146.43898010253906, "learning_rate": 4.6227772877106636e-08, "logits/chosen": -18.584299087524414, "logits/rejected": -17.95993995666504, "logps/chosen": -470.18682861328125, "logps/rejected": -348.64990234375, "loss": 0.3516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.285965919494629, "rewards/margins": 1.8174464702606201, "rewards/rejected": 2.468519926071167, "step": 91140 }, { "epoch": 4.231858489252055, "grad_norm": 174.7151336669922, "learning_rate": 4.619991643066066e-08, "logits/chosen": -18.50779151916504, "logits/rejected": -18.216243743896484, "logps/chosen": -341.3935546875, "logps/rejected": -316.64886474609375, "loss": 1.0608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9190573692321777, "rewards/margins": 0.04208286851644516, "rewards/rejected": 2.87697434425354, "step": 91150 }, { "epoch": 4.2323227633594875, "grad_norm": 
68.0452651977539, "learning_rate": 4.617205998421468e-08, "logits/chosen": -19.83287239074707, "logits/rejected": -19.00933265686035, "logps/chosen": -381.07086181640625, "logps/rejected": -353.5503845214844, "loss": 0.6323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8091118335723877, "rewards/margins": 1.8953416347503662, "rewards/rejected": 1.9137704372406006, "step": 91160 }, { "epoch": 4.23278703746692, "grad_norm": 4.855118274688721, "learning_rate": 4.614420353776869e-08, "logits/chosen": -18.273786544799805, "logits/rejected": -17.983484268188477, "logps/chosen": -315.443603515625, "logps/rejected": -252.44668579101562, "loss": 0.9722, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.0989248752593994, "rewards/margins": -0.05804933235049248, "rewards/rejected": 1.156974196434021, "step": 91170 }, { "epoch": 4.233251311574353, "grad_norm": 5.169061183929443, "learning_rate": 4.6116347091322714e-08, "logits/chosen": -19.551654815673828, "logits/rejected": -19.330446243286133, "logps/chosen": -480.4502868652344, "logps/rejected": -510.26629638671875, "loss": 1.6615, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 4.848957061767578, "rewards/margins": -0.5501668453216553, "rewards/rejected": 5.399124622344971, "step": 91180 }, { "epoch": 4.233715585681787, "grad_norm": 1.020719289779663, "learning_rate": 4.6088490644876735e-08, "logits/chosen": -17.728588104248047, "logits/rejected": -16.779666900634766, "logps/chosen": -380.02093505859375, "logps/rejected": -258.34686279296875, "loss": 0.5004, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8167927265167236, "rewards/margins": 1.7771459817886353, "rewards/rejected": 1.039646863937378, "step": 91190 }, { "epoch": 4.2341798597892195, "grad_norm": 2.5054116249084473, "learning_rate": 4.606063419843075e-08, "logits/chosen": -18.155221939086914, "logits/rejected": -18.022401809692383, "logps/chosen": -404.22625732421875, "logps/rejected": -300.75091552734375, 
"loss": 0.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.83242130279541, "rewards/margins": 0.7959728240966797, "rewards/rejected": 2.0364482402801514, "step": 91200 }, { "epoch": 4.234644133896652, "grad_norm": 11.17708969116211, "learning_rate": 4.603277775198477e-08, "logits/chosen": -19.601360321044922, "logits/rejected": -18.667861938476562, "logps/chosen": -368.54229736328125, "logps/rejected": -235.3421173095703, "loss": 1.0249, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.570735454559326, "rewards/margins": 1.300594449043274, "rewards/rejected": 2.270141124725342, "step": 91210 }, { "epoch": 4.235108408004086, "grad_norm": 18.119680404663086, "learning_rate": 4.6004921305538785e-08, "logits/chosen": -18.543033599853516, "logits/rejected": -18.357128143310547, "logps/chosen": -303.47467041015625, "logps/rejected": -303.1141662597656, "loss": 0.5152, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1993587017059326, "rewards/margins": 1.2843778133392334, "rewards/rejected": 0.9149808883666992, "step": 91220 }, { "epoch": 4.235572682111519, "grad_norm": 0.6659437417984009, "learning_rate": 4.5977064859092806e-08, "logits/chosen": -18.806034088134766, "logits/rejected": -17.654678344726562, "logps/chosen": -392.61834716796875, "logps/rejected": -232.58828735351562, "loss": 0.7792, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.765712261199951, "rewards/margins": 1.7505680322647095, "rewards/rejected": 1.0151441097259521, "step": 91230 }, { "epoch": 4.2360369562189515, "grad_norm": 10.979975700378418, "learning_rate": 4.594920841264683e-08, "logits/chosen": -18.928558349609375, "logits/rejected": -18.21592140197754, "logps/chosen": -384.3036193847656, "logps/rejected": -285.00421142578125, "loss": 0.6377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.937352180480957, "rewards/margins": 1.4610254764556885, "rewards/rejected": 1.4763269424438477, "step": 91240 }, { "epoch": 
4.236501230326384, "grad_norm": 185.8109130859375, "learning_rate": 4.592135196620084e-08, "logits/chosen": -19.575742721557617, "logits/rejected": -18.754573822021484, "logps/chosen": -414.8995056152344, "logps/rejected": -410.07489013671875, "loss": 0.5238, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5335304737091064, "rewards/margins": 0.9411764144897461, "rewards/rejected": 2.592353582382202, "step": 91250 }, { "epoch": 4.236965504433818, "grad_norm": 104.10775756835938, "learning_rate": 4.589349551975486e-08, "logits/chosen": -18.57204818725586, "logits/rejected": -18.37864112854004, "logps/chosen": -427.5567932128906, "logps/rejected": -380.51361083984375, "loss": 1.3554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.031785488128662, "rewards/margins": 0.14109714329242706, "rewards/rejected": 2.890688180923462, "step": 91260 }, { "epoch": 4.237429778541251, "grad_norm": 97.5390625, "learning_rate": 4.5865639073308884e-08, "logits/chosen": -19.413490295410156, "logits/rejected": -19.266794204711914, "logps/chosen": -411.9200134277344, "logps/rejected": -353.06402587890625, "loss": 0.6918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.815011501312256, "rewards/margins": 1.09855055809021, "rewards/rejected": 2.716460704803467, "step": 91270 }, { "epoch": 4.2378940526486835, "grad_norm": 19.872007369995117, "learning_rate": 4.58377826268629e-08, "logits/chosen": -19.041927337646484, "logits/rejected": -18.17331314086914, "logps/chosen": -404.4216003417969, "logps/rejected": -308.51995849609375, "loss": 0.6935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3234424591064453, "rewards/margins": 1.1161664724349976, "rewards/rejected": 2.207275867462158, "step": 91280 }, { "epoch": 4.238358326756117, "grad_norm": 28.719045639038086, "learning_rate": 4.580992618041692e-08, "logits/chosen": -19.56598663330078, "logits/rejected": -18.27996253967285, "logps/chosen": -413.27294921875, "logps/rejected": 
-212.4622039794922, "loss": 0.2182, "rewards/accuracies": 1.0, "rewards/chosen": 3.2170956134796143, "rewards/margins": 2.5861055850982666, "rewards/rejected": 0.6309901475906372, "step": 91290 }, { "epoch": 4.23882260086355, "grad_norm": 0.8929728269577026, "learning_rate": 4.5782069733970935e-08, "logits/chosen": -19.03334617614746, "logits/rejected": -18.626522064208984, "logps/chosen": -279.8782043457031, "logps/rejected": -269.73663330078125, "loss": 1.3159, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4122371673583984, "rewards/margins": 0.12125267833471298, "rewards/rejected": 2.2909843921661377, "step": 91300 }, { "epoch": 4.239286874970983, "grad_norm": 30.114355087280273, "learning_rate": 4.5754213287524956e-08, "logits/chosen": -20.13689613342285, "logits/rejected": -19.413232803344727, "logps/chosen": -326.9876708984375, "logps/rejected": -296.72125244140625, "loss": 0.6832, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.050255537033081, "rewards/margins": 0.7626117467880249, "rewards/rejected": 2.2876439094543457, "step": 91310 }, { "epoch": 4.239751149078416, "grad_norm": 0.5567470192909241, "learning_rate": 4.5726356841078964e-08, "logits/chosen": -19.907962799072266, "logits/rejected": -18.723310470581055, "logps/chosen": -379.42059326171875, "logps/rejected": -236.25375366210938, "loss": 0.2822, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.62475848197937, "rewards/margins": 1.8786922693252563, "rewards/rejected": 1.7460664510726929, "step": 91320 }, { "epoch": 4.240215423185849, "grad_norm": 24.16061782836914, "learning_rate": 4.5698500394632985e-08, "logits/chosen": -18.551036834716797, "logits/rejected": -18.129114151000977, "logps/chosen": -324.31890869140625, "logps/rejected": -323.93133544921875, "loss": 1.277, "rewards/accuracies": 0.5, "rewards/chosen": 3.4958221912384033, "rewards/margins": 0.852552592754364, "rewards/rejected": 2.6432695388793945, "step": 91330 }, { "epoch": 
4.240679697293282, "grad_norm": 211.07080078125, "learning_rate": 4.5670643948187006e-08, "logits/chosen": -19.60370445251465, "logits/rejected": -19.273597717285156, "logps/chosen": -398.87701416015625, "logps/rejected": -413.93316650390625, "loss": 0.6987, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.788687705993652, "rewards/margins": 1.3851994276046753, "rewards/rejected": 3.4034881591796875, "step": 91340 }, { "epoch": 4.241143971400715, "grad_norm": 49.66450500488281, "learning_rate": 4.564278750174102e-08, "logits/chosen": -18.956275939941406, "logits/rejected": -18.19994354248047, "logps/chosen": -367.59979248046875, "logps/rejected": -304.9655456542969, "loss": 1.1223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.085601329803467, "rewards/margins": 0.3957671523094177, "rewards/rejected": 1.6898343563079834, "step": 91350 }, { "epoch": 4.241608245508148, "grad_norm": 1.6215931177139282, "learning_rate": 4.561493105529504e-08, "logits/chosen": -19.179264068603516, "logits/rejected": -18.253894805908203, "logps/chosen": -347.283203125, "logps/rejected": -254.1152801513672, "loss": 0.7661, "rewards/accuracies": 0.5, "rewards/chosen": 3.0653610229492188, "rewards/margins": 1.1294201612472534, "rewards/rejected": 1.9359405040740967, "step": 91360 }, { "epoch": 4.242072519615581, "grad_norm": 1.42222261428833, "learning_rate": 4.558707460884906e-08, "logits/chosen": -18.97266387939453, "logits/rejected": -18.988632202148438, "logps/chosen": -337.1206970214844, "logps/rejected": -369.2087707519531, "loss": 1.2174, "rewards/accuracies": 0.5, "rewards/chosen": 2.4818809032440186, "rewards/margins": -0.09616043418645859, "rewards/rejected": 2.5780415534973145, "step": 91370 }, { "epoch": 4.242536793723014, "grad_norm": 24.81922149658203, "learning_rate": 4.555921816240308e-08, "logits/chosen": -19.29474449157715, "logits/rejected": -18.613117218017578, "logps/chosen": -406.28204345703125, "logps/rejected": -286.8460998535156, "loss": 
0.7982, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.961451292037964, "rewards/margins": 1.5411603450775146, "rewards/rejected": 2.420290946960449, "step": 91380 }, { "epoch": 4.2430010678304475, "grad_norm": 179.31988525390625, "learning_rate": 4.55313617159571e-08, "logits/chosen": -18.527708053588867, "logits/rejected": -17.9935245513916, "logps/chosen": -315.5970764160156, "logps/rejected": -277.0919494628906, "loss": 1.0413, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.816490411758423, "rewards/margins": 0.782403826713562, "rewards/rejected": 2.0340867042541504, "step": 91390 }, { "epoch": 4.24346534193788, "grad_norm": 175.7489013671875, "learning_rate": 4.550350526951112e-08, "logits/chosen": -18.911346435546875, "logits/rejected": -18.16840362548828, "logps/chosen": -356.6961669921875, "logps/rejected": -288.3775939941406, "loss": 0.8072, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4141876697540283, "rewards/margins": 1.6704124212265015, "rewards/rejected": 1.7437750101089478, "step": 91400 }, { "epoch": 4.243929616045313, "grad_norm": 169.88967895507812, "learning_rate": 4.5475648823065134e-08, "logits/chosen": -19.256927490234375, "logits/rejected": -18.805259704589844, "logps/chosen": -372.0986328125, "logps/rejected": -247.61483764648438, "loss": 0.6902, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.332986831665039, "rewards/margins": 1.398874282836914, "rewards/rejected": 0.9341124296188354, "step": 91410 }, { "epoch": 4.244393890152746, "grad_norm": 215.29852294921875, "learning_rate": 4.5447792376619155e-08, "logits/chosen": -20.07822036743164, "logits/rejected": -19.624073028564453, "logps/chosen": -487.06597900390625, "logps/rejected": -468.738037109375, "loss": 0.69, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.0803070068359375, "rewards/margins": 1.0085585117340088, "rewards/rejected": 4.071748733520508, "step": 91420 }, { "epoch": 4.2448581642601795, "grad_norm": 
63.387691497802734, "learning_rate": 4.541993593017317e-08, "logits/chosen": -19.72756576538086, "logits/rejected": -18.295886993408203, "logps/chosen": -393.60614013671875, "logps/rejected": -305.45062255859375, "loss": 0.4964, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.570528745651245, "rewards/margins": 1.8438221216201782, "rewards/rejected": 1.726707100868225, "step": 91430 }, { "epoch": 4.245322438367612, "grad_norm": 248.99435424804688, "learning_rate": 4.539207948372719e-08, "logits/chosen": -19.611095428466797, "logits/rejected": -18.592121124267578, "logps/chosen": -388.3838806152344, "logps/rejected": -260.0669860839844, "loss": 0.7486, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.896127700805664, "rewards/margins": 0.7034005522727966, "rewards/rejected": 3.1927270889282227, "step": 91440 }, { "epoch": 4.245786712475045, "grad_norm": 116.66375732421875, "learning_rate": 4.536422303728121e-08, "logits/chosen": -19.966541290283203, "logits/rejected": -19.147151947021484, "logps/chosen": -429.34326171875, "logps/rejected": -360.9996032714844, "loss": 0.5159, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.070499420166016, "rewards/margins": 0.8092284202575684, "rewards/rejected": 3.2612712383270264, "step": 91450 }, { "epoch": 4.246250986582479, "grad_norm": 102.18065643310547, "learning_rate": 4.533636659083523e-08, "logits/chosen": -20.197834014892578, "logits/rejected": -19.430797576904297, "logps/chosen": -304.77874755859375, "logps/rejected": -249.96676635742188, "loss": 0.629, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6934776306152344, "rewards/margins": 1.315798044204712, "rewards/rejected": 2.3776795864105225, "step": 91460 }, { "epoch": 4.2467152606899115, "grad_norm": 7.609517574310303, "learning_rate": 4.530851014438925e-08, "logits/chosen": -18.799667358398438, "logits/rejected": -17.831649780273438, "logps/chosen": -447.1241149902344, "logps/rejected": -362.3298645019531, 
"loss": 1.0119, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.823452949523926, "rewards/margins": 2.1172537803649902, "rewards/rejected": 2.7061991691589355, "step": 91470 }, { "epoch": 4.247179534797344, "grad_norm": 0.0638447105884552, "learning_rate": 4.528343934258786e-08, "logits/chosen": -19.44613265991211, "logits/rejected": -18.18874740600586, "logps/chosen": -434.65435791015625, "logps/rejected": -372.5564880371094, "loss": 0.5449, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.092154026031494, "rewards/margins": 2.027583360671997, "rewards/rejected": 2.064570903778076, "step": 91480 }, { "epoch": 4.247643808904777, "grad_norm": 5.386616230010986, "learning_rate": 4.5255582896141883e-08, "logits/chosen": -18.699453353881836, "logits/rejected": -17.81205177307129, "logps/chosen": -395.62286376953125, "logps/rejected": -259.8141174316406, "loss": 1.0273, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.06723690032959, "rewards/margins": 0.5758160948753357, "rewards/rejected": 2.4914205074310303, "step": 91490 }, { "epoch": 4.248108083012211, "grad_norm": 23.07762336730957, "learning_rate": 4.52277264496959e-08, "logits/chosen": -18.685678482055664, "logits/rejected": -17.95375633239746, "logps/chosen": -393.39678955078125, "logps/rejected": -291.5150451660156, "loss": 0.6128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.023263454437256, "rewards/margins": 1.2306381464004517, "rewards/rejected": 1.7926256656646729, "step": 91500 }, { "epoch": 4.248572357119643, "grad_norm": 2.728839874267578, "learning_rate": 4.519987000324992e-08, "logits/chosen": -20.02509880065918, "logits/rejected": -19.333288192749023, "logps/chosen": -410.03155517578125, "logps/rejected": -394.65618896484375, "loss": 0.4544, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.982728004455566, "rewards/margins": 2.3100690841674805, "rewards/rejected": 3.6726584434509277, "step": 91510 }, { "epoch": 4.249036631227076, 
"grad_norm": 102.43653106689453, "learning_rate": 4.5172013556803934e-08, "logits/chosen": -19.247669219970703, "logits/rejected": -19.37120246887207, "logps/chosen": -398.07916259765625, "logps/rejected": -397.2884521484375, "loss": 1.1584, "rewards/accuracies": 0.5, "rewards/chosen": 3.569382905960083, "rewards/margins": 0.24254071712493896, "rewards/rejected": 3.3268425464630127, "step": 91520 }, { "epoch": 4.24950090533451, "grad_norm": 16.209144592285156, "learning_rate": 4.5144157110357955e-08, "logits/chosen": -19.028911590576172, "logits/rejected": -17.619972229003906, "logps/chosen": -493.46502685546875, "logps/rejected": -322.16314697265625, "loss": 0.1462, "rewards/accuracies": 1.0, "rewards/chosen": 4.990200042724609, "rewards/margins": 3.1002018451690674, "rewards/rejected": 1.8899986743927002, "step": 91530 }, { "epoch": 4.249965179441943, "grad_norm": 61.3048095703125, "learning_rate": 4.5116300663911976e-08, "logits/chosen": -18.555070877075195, "logits/rejected": -18.75286102294922, "logps/chosen": -412.93914794921875, "logps/rejected": -404.044189453125, "loss": 0.5724, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.677832841873169, "rewards/margins": 0.8179639577865601, "rewards/rejected": 2.8598685264587402, "step": 91540 }, { "epoch": 4.250429453549375, "grad_norm": 58.066070556640625, "learning_rate": 4.508844421746599e-08, "logits/chosen": -19.43537139892578, "logits/rejected": -17.913869857788086, "logps/chosen": -377.2034912109375, "logps/rejected": -328.75653076171875, "loss": 0.2383, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.149567604064941, "rewards/margins": 2.9546375274658203, "rewards/rejected": 1.1949299573898315, "step": 91550 }, { "epoch": 4.250893727656808, "grad_norm": 35.8174934387207, "learning_rate": 4.506058777102001e-08, "logits/chosen": -19.627155303955078, "logits/rejected": -18.93625831604004, "logps/chosen": -428.35906982421875, "logps/rejected": -310.3154602050781, "loss": 0.6857, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.569263458251953, "rewards/margins": 1.6980655193328857, "rewards/rejected": 2.8711981773376465, "step": 91560 }, { "epoch": 4.251358001764242, "grad_norm": 114.33514404296875, "learning_rate": 4.503273132457403e-08, "logits/chosen": -18.446975708007812, "logits/rejected": -17.52351188659668, "logps/chosen": -329.31610107421875, "logps/rejected": -231.9880828857422, "loss": 0.3763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.712385654449463, "rewards/margins": 1.7318731546401978, "rewards/rejected": 0.9805127382278442, "step": 91570 }, { "epoch": 4.251822275871675, "grad_norm": 156.97914123535156, "learning_rate": 4.500487487812804e-08, "logits/chosen": -18.42117691040039, "logits/rejected": -18.327774047851562, "logps/chosen": -431.4354553222656, "logps/rejected": -341.0213928222656, "loss": 1.0257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.191896677017212, "rewards/margins": 0.24486994743347168, "rewards/rejected": 2.9470272064208984, "step": 91580 }, { "epoch": 4.252286549979107, "grad_norm": 6.928290843963623, "learning_rate": 4.497701843168206e-08, "logits/chosen": -20.024578094482422, "logits/rejected": -18.806835174560547, "logps/chosen": -424.162841796875, "logps/rejected": -316.2453918457031, "loss": 0.6321, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.708035945892334, "rewards/margins": 1.1927459239959717, "rewards/rejected": 2.5152900218963623, "step": 91590 }, { "epoch": 4.252750824086541, "grad_norm": 46.69368362426758, "learning_rate": 4.4949161985236076e-08, "logits/chosen": -19.450464248657227, "logits/rejected": -18.595844268798828, "logps/chosen": -371.5208435058594, "logps/rejected": -348.13543701171875, "loss": 0.7324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.276318073272705, "rewards/margins": 0.7730053067207336, "rewards/rejected": 2.503312587738037, "step": 91600 }, { "epoch": 4.253215098193974, "grad_norm": 
0.21497419476509094, "learning_rate": 4.49213055387901e-08, "logits/chosen": -19.604867935180664, "logits/rejected": -18.423147201538086, "logps/chosen": -341.95672607421875, "logps/rejected": -256.6178283691406, "loss": 0.4748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3981664180755615, "rewards/margins": 2.2253077030181885, "rewards/rejected": 0.17285893857479095, "step": 91610 }, { "epoch": 4.253679372301407, "grad_norm": 74.23690032958984, "learning_rate": 4.489344909234412e-08, "logits/chosen": -19.499649047851562, "logits/rejected": -19.763172149658203, "logps/chosen": -326.71734619140625, "logps/rejected": -410.13250732421875, "loss": 1.3427, "rewards/accuracies": 0.5, "rewards/chosen": 3.148108959197998, "rewards/margins": 0.17632044851779938, "rewards/rejected": 2.9717884063720703, "step": 91620 }, { "epoch": 4.25414364640884, "grad_norm": 15.78417682647705, "learning_rate": 4.4865592645898133e-08, "logits/chosen": -18.989181518554688, "logits/rejected": -18.70269775390625, "logps/chosen": -294.0169982910156, "logps/rejected": -304.8018798828125, "loss": 1.088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3788704872131348, "rewards/margins": -0.3489709496498108, "rewards/rejected": 2.72784161567688, "step": 91630 }, { "epoch": 4.254607920516273, "grad_norm": 126.61157989501953, "learning_rate": 4.4837736199452154e-08, "logits/chosen": -19.57539176940918, "logits/rejected": -18.755672454833984, "logps/chosen": -293.03143310546875, "logps/rejected": -276.87255859375, "loss": 0.4855, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5538010597229004, "rewards/margins": 1.5540931224822998, "rewards/rejected": 1.999707818031311, "step": 91640 }, { "epoch": 4.255072194623706, "grad_norm": 79.11195373535156, "learning_rate": 4.480987975300617e-08, "logits/chosen": -18.48005485534668, "logits/rejected": -17.853713989257812, "logps/chosen": -361.6229553222656, "logps/rejected": -336.8634948730469, "loss": 0.2654, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6304774284362793, "rewards/margins": 1.9734258651733398, "rewards/rejected": 1.6570520401000977, "step": 91650 }, { "epoch": 4.2555364687311386, "grad_norm": 0.6345047354698181, "learning_rate": 4.478202330656019e-08, "logits/chosen": -20.686079025268555, "logits/rejected": -19.674745559692383, "logps/chosen": -487.24920654296875, "logps/rejected": -318.4050598144531, "loss": 0.6981, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.339522361755371, "rewards/margins": 1.8594194650650024, "rewards/rejected": 2.4801034927368164, "step": 91660 }, { "epoch": 4.256000742838572, "grad_norm": 121.54231262207031, "learning_rate": 4.475416686011421e-08, "logits/chosen": -18.573284149169922, "logits/rejected": -17.771650314331055, "logps/chosen": -382.27703857421875, "logps/rejected": -314.64715576171875, "loss": 0.5456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.6184821128845215, "rewards/margins": 1.7159936428070068, "rewards/rejected": 2.9024882316589355, "step": 91670 }, { "epoch": 4.256465016946005, "grad_norm": 68.25302124023438, "learning_rate": 4.4726310413668226e-08, "logits/chosen": -18.962915420532227, "logits/rejected": -19.50296974182129, "logps/chosen": -311.75323486328125, "logps/rejected": -325.9771423339844, "loss": 1.6149, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3942224979400635, "rewards/margins": -0.5077983140945435, "rewards/rejected": 2.9020209312438965, "step": 91680 }, { "epoch": 4.256929291053438, "grad_norm": 18.31875228881836, "learning_rate": 4.469845396722225e-08, "logits/chosen": -18.84903335571289, "logits/rejected": -18.356239318847656, "logps/chosen": -380.28021240234375, "logps/rejected": -328.53265380859375, "loss": 1.0965, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5794785022735596, "rewards/margins": 0.8569984436035156, "rewards/rejected": 2.722480058670044, "step": 91690 }, { "epoch": 4.257393565160871, 
"grad_norm": 21.09662437438965, "learning_rate": 4.467059752077627e-08, "logits/chosen": -19.26439666748047, "logits/rejected": -18.27585792541504, "logps/chosen": -483.724365234375, "logps/rejected": -294.44891357421875, "loss": 0.6125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.580689430236816, "rewards/margins": 2.279775619506836, "rewards/rejected": 2.3009133338928223, "step": 91700 }, { "epoch": 4.257857839268304, "grad_norm": 6.951843738555908, "learning_rate": 4.464274107433028e-08, "logits/chosen": -18.918163299560547, "logits/rejected": -18.90993881225586, "logps/chosen": -324.69598388671875, "logps/rejected": -371.22503662109375, "loss": 1.6743, "rewards/accuracies": 0.5, "rewards/chosen": 2.4718925952911377, "rewards/margins": -0.4011545181274414, "rewards/rejected": 2.873047113418579, "step": 91710 }, { "epoch": 4.258322113375737, "grad_norm": 77.68352508544922, "learning_rate": 4.4614884627884304e-08, "logits/chosen": -18.766597747802734, "logits/rejected": -18.132898330688477, "logps/chosen": -393.4736022949219, "logps/rejected": -359.04498291015625, "loss": 0.9195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8816131353378296, "rewards/margins": 0.22264154255390167, "rewards/rejected": 1.6589715480804443, "step": 91720 }, { "epoch": 4.25878638748317, "grad_norm": 1.4662526845932007, "learning_rate": 4.458702818143832e-08, "logits/chosen": -19.119945526123047, "logits/rejected": -18.764806747436523, "logps/chosen": -302.3379821777344, "logps/rejected": -239.05270385742188, "loss": 0.7998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.149752616882324, "rewards/margins": 1.288059115409851, "rewards/rejected": 0.861693263053894, "step": 91730 }, { "epoch": 4.259250661590603, "grad_norm": 28.03834342956543, "learning_rate": 4.455917173499234e-08, "logits/chosen": -17.880718231201172, "logits/rejected": -17.966442108154297, "logps/chosen": -224.48098754882812, "logps/rejected": -312.7751770019531, "loss": 
1.4383, "rewards/accuracies": 0.5, "rewards/chosen": 1.4779977798461914, "rewards/margins": -0.3403502106666565, "rewards/rejected": 1.8183481693267822, "step": 91740 }, { "epoch": 4.259714935698036, "grad_norm": 40.25392150878906, "learning_rate": 4.453131528854636e-08, "logits/chosen": -18.762264251708984, "logits/rejected": -17.194042205810547, "logps/chosen": -460.1161193847656, "logps/rejected": -274.09808349609375, "loss": 0.3003, "rewards/accuracies": 1.0, "rewards/chosen": 4.082399368286133, "rewards/margins": 2.439115285873413, "rewards/rejected": 1.6432840824127197, "step": 91750 }, { "epoch": 4.260179209805469, "grad_norm": 92.78874206542969, "learning_rate": 4.4503458842100375e-08, "logits/chosen": -18.259841918945312, "logits/rejected": -18.299320220947266, "logps/chosen": -273.1926574707031, "logps/rejected": -322.2672119140625, "loss": 1.747, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.2115541696548462, "rewards/margins": -0.899685263633728, "rewards/rejected": 2.111239433288574, "step": 91760 }, { "epoch": 4.260643483912903, "grad_norm": 110.90646362304688, "learning_rate": 4.4475602395654396e-08, "logits/chosen": -19.420635223388672, "logits/rejected": -18.784481048583984, "logps/chosen": -379.61212158203125, "logps/rejected": -288.9812316894531, "loss": 0.4369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2761268615722656, "rewards/margins": 1.615465521812439, "rewards/rejected": 1.6606614589691162, "step": 91770 }, { "epoch": 4.261107758020335, "grad_norm": 1.291689395904541, "learning_rate": 4.4447745949208404e-08, "logits/chosen": -19.2098331451416, "logits/rejected": -18.572811126708984, "logps/chosen": -329.4047546386719, "logps/rejected": -313.295166015625, "loss": 0.5686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.177967071533203, "rewards/margins": 1.2203764915466309, "rewards/rejected": 1.9575906991958618, "step": 91780 }, { "epoch": 4.261572032127768, "grad_norm": 10.648293495178223, 
"learning_rate": 4.4419889502762425e-08, "logits/chosen": -18.825815200805664, "logits/rejected": -18.68552589416504, "logps/chosen": -337.21649169921875, "logps/rejected": -287.08050537109375, "loss": 0.5124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.248035430908203, "rewards/margins": 1.3427642583847046, "rewards/rejected": 0.9052711725234985, "step": 91790 }, { "epoch": 4.262036306235201, "grad_norm": 217.83139038085938, "learning_rate": 4.4392033056316447e-08, "logits/chosen": -19.24087142944336, "logits/rejected": -18.843557357788086, "logps/chosen": -389.70037841796875, "logps/rejected": -285.680419921875, "loss": 0.7533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.705881357192993, "rewards/margins": 1.075538158416748, "rewards/rejected": 1.6303430795669556, "step": 91800 }, { "epoch": 4.262500580342635, "grad_norm": 2.337357997894287, "learning_rate": 4.436417660987046e-08, "logits/chosen": -19.405200958251953, "logits/rejected": -18.350940704345703, "logps/chosen": -304.843017578125, "logps/rejected": -233.7706756591797, "loss": 0.5046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2629518508911133, "rewards/margins": 1.1323635578155518, "rewards/rejected": 1.1305882930755615, "step": 91810 }, { "epoch": 4.262964854450067, "grad_norm": 134.09312438964844, "learning_rate": 4.433632016342448e-08, "logits/chosen": -18.35280990600586, "logits/rejected": -18.091487884521484, "logps/chosen": -301.4294128417969, "logps/rejected": -317.10247802734375, "loss": 0.9182, "rewards/accuracies": 0.5, "rewards/chosen": 2.1947567462921143, "rewards/margins": 0.3845091462135315, "rewards/rejected": 1.8102474212646484, "step": 91820 }, { "epoch": 4.2634291285575, "grad_norm": 150.45220947265625, "learning_rate": 4.4308463716978503e-08, "logits/chosen": -18.88761329650879, "logits/rejected": -18.608461380004883, "logps/chosen": -245.95571899414062, "logps/rejected": -236.83499145507812, "loss": 1.8371, "rewards/accuracies": 
0.5, "rewards/chosen": 2.161182403564453, "rewards/margins": -0.3413996994495392, "rewards/rejected": 2.50258207321167, "step": 91830 }, { "epoch": 4.263893402664934, "grad_norm": 77.6512222290039, "learning_rate": 4.428060727053252e-08, "logits/chosen": -19.014530181884766, "logits/rejected": -18.165983200073242, "logps/chosen": -388.76043701171875, "logps/rejected": -371.8187255859375, "loss": 0.2325, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7214157581329346, "rewards/margins": 2.2617576122283936, "rewards/rejected": 1.4596582651138306, "step": 91840 }, { "epoch": 4.2643576767723665, "grad_norm": 20.564064025878906, "learning_rate": 4.425275082408654e-08, "logits/chosen": -20.291568756103516, "logits/rejected": -18.73765754699707, "logps/chosen": -450.5863342285156, "logps/rejected": -288.41009521484375, "loss": 0.3318, "rewards/accuracies": 1.0, "rewards/chosen": 3.6147255897521973, "rewards/margins": 1.3598997592926025, "rewards/rejected": 2.2548258304595947, "step": 91850 }, { "epoch": 4.264821950879799, "grad_norm": 74.19982147216797, "learning_rate": 4.4224894377640554e-08, "logits/chosen": -20.647775650024414, "logits/rejected": -18.82252311706543, "logps/chosen": -476.0104064941406, "logps/rejected": -286.8443298339844, "loss": 0.488, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.620655059814453, "rewards/margins": 1.7335937023162842, "rewards/rejected": 1.8870614767074585, "step": 91860 }, { "epoch": 4.265286224987232, "grad_norm": 2.5178680419921875, "learning_rate": 4.4197037931194575e-08, "logits/chosen": -19.59352684020996, "logits/rejected": -18.830224990844727, "logps/chosen": -332.2882385253906, "logps/rejected": -287.4774169921875, "loss": 0.6926, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.523202657699585, "rewards/margins": 0.8975931406021118, "rewards/rejected": 1.6256097555160522, "step": 91870 }, { "epoch": 4.265750499094666, "grad_norm": 8.819303512573242, "learning_rate": 
4.4169181484748596e-08, "logits/chosen": -18.894350051879883, "logits/rejected": -17.70177459716797, "logps/chosen": -337.8277893066406, "logps/rejected": -228.0452423095703, "loss": 0.5412, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.26115083694458, "rewards/margins": 1.9780561923980713, "rewards/rejected": 1.2830946445465088, "step": 91880 }, { "epoch": 4.2662147732020985, "grad_norm": 0.41707295179367065, "learning_rate": 4.414132503830261e-08, "logits/chosen": -19.478242874145508, "logits/rejected": -19.336666107177734, "logps/chosen": -309.588623046875, "logps/rejected": -337.19268798828125, "loss": 0.7194, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.895613193511963, "rewards/margins": 1.2677886486053467, "rewards/rejected": 2.6278247833251953, "step": 91890 }, { "epoch": 4.266679047309531, "grad_norm": 88.05752563476562, "learning_rate": 4.411346859185663e-08, "logits/chosen": -19.35908317565918, "logits/rejected": -18.186506271362305, "logps/chosen": -364.08660888671875, "logps/rejected": -286.5653076171875, "loss": 0.8998, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6316475868225098, "rewards/margins": 0.5141406059265137, "rewards/rejected": 2.117506980895996, "step": 91900 }, { "epoch": 4.267143321416965, "grad_norm": 192.05445861816406, "learning_rate": 4.408561214541065e-08, "logits/chosen": -19.69080924987793, "logits/rejected": -19.12803077697754, "logps/chosen": -385.5901794433594, "logps/rejected": -312.1201171875, "loss": 0.6312, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.057272434234619, "rewards/margins": 1.4609975814819336, "rewards/rejected": 2.5962748527526855, "step": 91910 }, { "epoch": 4.267607595524398, "grad_norm": 217.557861328125, "learning_rate": 4.405775569896467e-08, "logits/chosen": -18.962356567382812, "logits/rejected": -18.00642967224121, "logps/chosen": -357.06048583984375, "logps/rejected": -288.24761962890625, "loss": 0.4852, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.3699958324432373, "rewards/margins": 1.5377196073532104, "rewards/rejected": 1.8322759866714478, "step": 91920 }, { "epoch": 4.2680718696318305, "grad_norm": 0.6290515065193176, "learning_rate": 4.402989925251869e-08, "logits/chosen": -18.33404541015625, "logits/rejected": -17.281673431396484, "logps/chosen": -372.89691162109375, "logps/rejected": -271.00439453125, "loss": 0.2354, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7989723682403564, "rewards/margins": 2.7457847595214844, "rewards/rejected": 1.0531878471374512, "step": 91930 }, { "epoch": 4.268536143739263, "grad_norm": 180.26995849609375, "learning_rate": 4.40020428060727e-08, "logits/chosen": -19.908283233642578, "logits/rejected": -19.703012466430664, "logps/chosen": -307.44757080078125, "logps/rejected": -274.7554626464844, "loss": 1.0917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1409268379211426, "rewards/margins": 0.41810616850852966, "rewards/rejected": 1.7228206396102905, "step": 91940 }, { "epoch": 4.269000417846697, "grad_norm": 55.82682800292969, "learning_rate": 4.3974186359626724e-08, "logits/chosen": -18.610980987548828, "logits/rejected": -18.970619201660156, "logps/chosen": -333.6612243652344, "logps/rejected": -377.7287292480469, "loss": 1.0955, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1138699054718018, "rewards/margins": 0.4779386520385742, "rewards/rejected": 2.6359310150146484, "step": 91950 }, { "epoch": 4.26946469195413, "grad_norm": 20.001035690307617, "learning_rate": 4.3946329913180745e-08, "logits/chosen": -19.101259231567383, "logits/rejected": -18.747840881347656, "logps/chosen": -437.74725341796875, "logps/rejected": -395.29962158203125, "loss": 0.3826, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7255547046661377, "rewards/margins": 1.4416701793670654, "rewards/rejected": 2.2838845252990723, "step": 91960 }, { "epoch": 4.2699289660615625, "grad_norm": 
16.57699203491211, "learning_rate": 4.391847346673476e-08, "logits/chosen": -19.071643829345703, "logits/rejected": -18.917232513427734, "logps/chosen": -414.8268127441406, "logps/rejected": -376.0085754394531, "loss": 1.2163, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2247283458709717, "rewards/margins": 0.412494421005249, "rewards/rejected": 2.8122339248657227, "step": 91970 }, { "epoch": 4.270393240168996, "grad_norm": 1.06455659866333, "learning_rate": 4.3890617020288774e-08, "logits/chosen": -18.71094512939453, "logits/rejected": -18.264482498168945, "logps/chosen": -309.0437316894531, "logps/rejected": -246.5086669921875, "loss": 0.891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6241819858551025, "rewards/margins": 1.7046372890472412, "rewards/rejected": 0.919544517993927, "step": 91980 }, { "epoch": 4.270857514276429, "grad_norm": 27.098047256469727, "learning_rate": 4.386276057384279e-08, "logits/chosen": -20.194034576416016, "logits/rejected": -19.440778732299805, "logps/chosen": -394.5059814453125, "logps/rejected": -341.9000549316406, "loss": 0.4512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.916935682296753, "rewards/margins": 1.1316370964050293, "rewards/rejected": 2.7852983474731445, "step": 91990 }, { "epoch": 4.271321788383862, "grad_norm": 1.9368902444839478, "learning_rate": 4.383490412739681e-08, "logits/chosen": -18.261722564697266, "logits/rejected": -18.199459075927734, "logps/chosen": -364.22900390625, "logps/rejected": -294.9311218261719, "loss": 0.6699, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.303401231765747, "rewards/margins": 1.0664762258529663, "rewards/rejected": 2.236924886703491, "step": 92000 }, { "epoch": 4.2717860624912944, "grad_norm": 158.5536346435547, "learning_rate": 4.380704768095083e-08, "logits/chosen": -18.87150764465332, "logits/rejected": -18.65612030029297, "logps/chosen": -409.56219482421875, "logps/rejected": -345.12420654296875, "loss": 
0.7947, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7610907554626465, "rewards/margins": 0.9833106994628906, "rewards/rejected": 2.7777810096740723, "step": 92010 }, { "epoch": 4.272250336598728, "grad_norm": 154.66885375976562, "learning_rate": 4.3779191234504846e-08, "logits/chosen": -17.755796432495117, "logits/rejected": -19.24130630493164, "logps/chosen": -287.1372985839844, "logps/rejected": -386.73394775390625, "loss": 2.0435, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.467452049255371, "rewards/margins": -1.0983362197875977, "rewards/rejected": 3.5657882690429688, "step": 92020 }, { "epoch": 4.272714610706161, "grad_norm": 303.1099853515625, "learning_rate": 4.375133478805887e-08, "logits/chosen": -19.45574951171875, "logits/rejected": -19.00299644470215, "logps/chosen": -420.8418884277344, "logps/rejected": -432.28741455078125, "loss": 0.5536, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.049335956573486, "rewards/margins": 1.634749412536621, "rewards/rejected": 2.414586067199707, "step": 92030 }, { "epoch": 4.273178884813594, "grad_norm": 37.77621078491211, "learning_rate": 4.372347834161289e-08, "logits/chosen": -18.646549224853516, "logits/rejected": -18.485637664794922, "logps/chosen": -334.48541259765625, "logps/rejected": -309.1721496582031, "loss": 0.7498, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3006529808044434, "rewards/margins": 0.2915478050708771, "rewards/rejected": 2.0091052055358887, "step": 92040 }, { "epoch": 4.273643158921027, "grad_norm": 149.77987670898438, "learning_rate": 4.36956218951669e-08, "logits/chosen": -18.320232391357422, "logits/rejected": -17.198875427246094, "logps/chosen": -354.2453918457031, "logps/rejected": -217.94705200195312, "loss": 0.2391, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.057379961013794, "rewards/margins": 2.85921311378479, "rewards/rejected": 0.1981668621301651, "step": 92050 }, { "epoch": 4.27410743302846, 
"grad_norm": 100.3774642944336, "learning_rate": 4.3667765448720924e-08, "logits/chosen": -18.809345245361328, "logits/rejected": -18.889474868774414, "logps/chosen": -338.6969909667969, "logps/rejected": -327.9302673339844, "loss": 1.1351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8208975791931152, "rewards/margins": 0.5669852495193481, "rewards/rejected": 2.253912925720215, "step": 92060 }, { "epoch": 4.274571707135893, "grad_norm": 147.54591369628906, "learning_rate": 4.363990900227494e-08, "logits/chosen": -18.684520721435547, "logits/rejected": -18.339000701904297, "logps/chosen": -329.4175720214844, "logps/rejected": -311.32958984375, "loss": 1.0655, "rewards/accuracies": 0.5, "rewards/chosen": 3.4844765663146973, "rewards/margins": 1.0476653575897217, "rewards/rejected": 2.4368112087249756, "step": 92070 }, { "epoch": 4.275035981243326, "grad_norm": 36.81665802001953, "learning_rate": 4.361205255582896e-08, "logits/chosen": -19.265501022338867, "logits/rejected": -17.612977981567383, "logps/chosen": -339.0679931640625, "logps/rejected": -250.2661590576172, "loss": 0.4927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2212581634521484, "rewards/margins": 1.529961347579956, "rewards/rejected": 1.691296935081482, "step": 92080 }, { "epoch": 4.275500255350759, "grad_norm": 0.00110256252810359, "learning_rate": 4.358419610938298e-08, "logits/chosen": -20.29096221923828, "logits/rejected": -18.99314308166504, "logps/chosen": -569.0048217773438, "logps/rejected": -362.24237060546875, "loss": 0.6157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.515772819519043, "rewards/margins": 2.6176815032958984, "rewards/rejected": 2.8980913162231445, "step": 92090 }, { "epoch": 4.275964529458192, "grad_norm": 17.751659393310547, "learning_rate": 4.3556339662936995e-08, "logits/chosen": -19.73456573486328, "logits/rejected": -18.994808197021484, "logps/chosen": -367.5579528808594, "logps/rejected": -338.9683837890625, "loss": 
0.3558, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.529742240905762, "rewards/margins": 1.5377384424209595, "rewards/rejected": 2.992004156112671, "step": 92100 }, { "epoch": 4.276428803565625, "grad_norm": 231.44906616210938, "learning_rate": 4.3528483216491016e-08, "logits/chosen": -19.233612060546875, "logits/rejected": -18.768217086791992, "logps/chosen": -462.9668884277344, "logps/rejected": -338.5240783691406, "loss": 0.8411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.95733642578125, "rewards/margins": 1.2325081825256348, "rewards/rejected": 2.724828004837036, "step": 92110 }, { "epoch": 4.2768930776730585, "grad_norm": 66.6068344116211, "learning_rate": 4.350062677004504e-08, "logits/chosen": -18.782428741455078, "logits/rejected": -18.496686935424805, "logps/chosen": -316.95001220703125, "logps/rejected": -269.870361328125, "loss": 0.537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.327402114868164, "rewards/margins": 1.5186777114868164, "rewards/rejected": 1.8087241649627686, "step": 92120 }, { "epoch": 4.277357351780491, "grad_norm": 14.921317100524902, "learning_rate": 4.347277032359905e-08, "logits/chosen": -18.486095428466797, "logits/rejected": -18.542490005493164, "logps/chosen": -202.27163696289062, "logps/rejected": -262.24237060546875, "loss": 1.0996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0021775960922241, "rewards/margins": 0.11911288648843765, "rewards/rejected": 0.8830649256706238, "step": 92130 }, { "epoch": 4.277821625887924, "grad_norm": 65.41403198242188, "learning_rate": 4.344491387715307e-08, "logits/chosen": -18.39827537536621, "logits/rejected": -19.214427947998047, "logps/chosen": -348.11163330078125, "logps/rejected": -418.85894775390625, "loss": 1.3183, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.65175724029541, "rewards/margins": -0.3273158073425293, "rewards/rejected": 2.9790730476379395, "step": 92140 }, { "epoch": 4.278285899995357, 
"grad_norm": 64.71631622314453, "learning_rate": 4.341705743070709e-08, "logits/chosen": -20.615100860595703, "logits/rejected": -20.29680061340332, "logps/chosen": -399.1516418457031, "logps/rejected": -358.16876220703125, "loss": 0.5072, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.12298583984375, "rewards/margins": 1.3105520009994507, "rewards/rejected": 2.812434196472168, "step": 92150 }, { "epoch": 4.2787501741027905, "grad_norm": 77.14695739746094, "learning_rate": 4.338920098426111e-08, "logits/chosen": -18.765140533447266, "logits/rejected": -17.69308853149414, "logps/chosen": -354.3468322753906, "logps/rejected": -286.15765380859375, "loss": 0.7943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8116517066955566, "rewards/margins": 1.568084955215454, "rewards/rejected": 2.2435662746429443, "step": 92160 }, { "epoch": 4.279214448210223, "grad_norm": 0.15119114518165588, "learning_rate": 4.3364130182459723e-08, "logits/chosen": -20.04388427734375, "logits/rejected": -19.925607681274414, "logps/chosen": -406.77777099609375, "logps/rejected": -326.838134765625, "loss": 0.9109, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6779720783233643, "rewards/margins": 0.888076663017273, "rewards/rejected": 1.7898952960968018, "step": 92170 }, { "epoch": 4.279678722317656, "grad_norm": 47.9483528137207, "learning_rate": 4.3336273736013744e-08, "logits/chosen": -19.90290069580078, "logits/rejected": -19.046781539916992, "logps/chosen": -389.18560791015625, "logps/rejected": -304.5006408691406, "loss": 0.8521, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.774160861968994, "rewards/margins": 1.524064064025879, "rewards/rejected": 2.2500972747802734, "step": 92180 }, { "epoch": 4.28014299642509, "grad_norm": 30.055370330810547, "learning_rate": 4.330841728956776e-08, "logits/chosen": -19.530109405517578, "logits/rejected": -18.866811752319336, "logps/chosen": -447.7786560058594, "logps/rejected": 
-349.2926025390625, "loss": 0.3874, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.097872734069824, "rewards/margins": 2.318058490753174, "rewards/rejected": 1.7798141241073608, "step": 92190 }, { "epoch": 4.280607270532522, "grad_norm": 271.3500061035156, "learning_rate": 4.328056084312178e-08, "logits/chosen": -18.811378479003906, "logits/rejected": -19.197696685791016, "logps/chosen": -341.37353515625, "logps/rejected": -424.6407775878906, "loss": 2.1558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9617576599121094, "rewards/margins": -0.8543357849121094, "rewards/rejected": 3.816093921661377, "step": 92200 }, { "epoch": 4.281071544639955, "grad_norm": 90.87190246582031, "learning_rate": 4.32527043966758e-08, "logits/chosen": -19.940706253051758, "logits/rejected": -17.82915496826172, "logps/chosen": -418.5038146972656, "logps/rejected": -319.463623046875, "loss": 0.232, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.455717086791992, "rewards/margins": 2.5329196453094482, "rewards/rejected": 1.9227972030639648, "step": 92210 }, { "epoch": 4.281535818747389, "grad_norm": 9.95495319366455, "learning_rate": 4.3224847950229816e-08, "logits/chosen": -18.434934616088867, "logits/rejected": -18.553836822509766, "logps/chosen": -272.8802795410156, "logps/rejected": -295.29327392578125, "loss": 1.1105, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9586299657821655, "rewards/margins": 0.3624489903450012, "rewards/rejected": 1.5961809158325195, "step": 92220 }, { "epoch": 4.282000092854822, "grad_norm": 134.25527954101562, "learning_rate": 4.319699150378384e-08, "logits/chosen": -18.247051239013672, "logits/rejected": -18.796802520751953, "logps/chosen": -351.88970947265625, "logps/rejected": -448.93487548828125, "loss": 0.9748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7372822761535645, "rewards/margins": -0.08459079265594482, "rewards/rejected": 2.8218724727630615, "step": 92230 }, { 
"epoch": 4.282464366962254, "grad_norm": 1.940742015838623, "learning_rate": 4.3169135057337845e-08, "logits/chosen": -19.971324920654297, "logits/rejected": -18.07992935180664, "logps/chosen": -389.0445251464844, "logps/rejected": -274.83514404296875, "loss": 0.7194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.801233291625977, "rewards/margins": 2.3788235187530518, "rewards/rejected": 2.4224095344543457, "step": 92240 }, { "epoch": 4.282928641069687, "grad_norm": 2.0256776809692383, "learning_rate": 4.3141278610891866e-08, "logits/chosen": -18.637008666992188, "logits/rejected": -18.41743278503418, "logps/chosen": -435.8614807128906, "logps/rejected": -391.0311584472656, "loss": 0.6405, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.841241359710693, "rewards/margins": 1.2718006372451782, "rewards/rejected": 3.5694408416748047, "step": 92250 }, { "epoch": 4.283392915177121, "grad_norm": 45.03915786743164, "learning_rate": 4.311342216444589e-08, "logits/chosen": -19.069171905517578, "logits/rejected": -18.64614486694336, "logps/chosen": -434.4481506347656, "logps/rejected": -381.04864501953125, "loss": 0.4069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.637274742126465, "rewards/margins": 1.6477601528167725, "rewards/rejected": 2.9895143508911133, "step": 92260 }, { "epoch": 4.283857189284554, "grad_norm": 31.936033248901367, "learning_rate": 4.30855657179999e-08, "logits/chosen": -18.892681121826172, "logits/rejected": -17.72800636291504, "logps/chosen": -304.91290283203125, "logps/rejected": -181.0338592529297, "loss": 0.3712, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.591200351715088, "rewards/margins": 1.9203048944473267, "rewards/rejected": 0.6708954572677612, "step": 92270 }, { "epoch": 4.284321463391986, "grad_norm": 0.6684888601303101, "learning_rate": 4.305770927155392e-08, "logits/chosen": -19.315322875976562, "logits/rejected": -18.524520874023438, "logps/chosen": -367.37176513671875, 
"logps/rejected": -339.394287109375, "loss": 0.6722, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.183228015899658, "rewards/margins": 1.0220528841018677, "rewards/rejected": 2.16117525100708, "step": 92280 }, { "epoch": 4.28478573749942, "grad_norm": 144.63626098632812, "learning_rate": 4.302985282510794e-08, "logits/chosen": -18.75617027282715, "logits/rejected": -18.193119049072266, "logps/chosen": -361.7481384277344, "logps/rejected": -304.434814453125, "loss": 0.969, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0837645530700684, "rewards/margins": 0.5649714469909668, "rewards/rejected": 2.5187933444976807, "step": 92290 }, { "epoch": 4.285250011606853, "grad_norm": 44.94047546386719, "learning_rate": 4.300199637866196e-08, "logits/chosen": -19.30399513244629, "logits/rejected": -19.163593292236328, "logps/chosen": -317.7526550292969, "logps/rejected": -305.1700744628906, "loss": 0.9022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0088961124420166, "rewards/margins": 0.8414028882980347, "rewards/rejected": 1.1674931049346924, "step": 92300 }, { "epoch": 4.285714285714286, "grad_norm": 41.527034759521484, "learning_rate": 4.297413993221598e-08, "logits/chosen": -18.680362701416016, "logits/rejected": -16.41905403137207, "logps/chosen": -416.47320556640625, "logps/rejected": -187.06629943847656, "loss": 0.2232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.623198509216309, "rewards/margins": 4.434924125671387, "rewards/rejected": 0.18827475607395172, "step": 92310 }, { "epoch": 4.286178559821718, "grad_norm": 1.2230812311172485, "learning_rate": 4.2946283485769994e-08, "logits/chosen": -19.820756912231445, "logits/rejected": -18.935625076293945, "logps/chosen": -415.22979736328125, "logps/rejected": -337.3978576660156, "loss": 0.8468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5906100273132324, "rewards/margins": 1.4127646684646606, "rewards/rejected": 2.1778452396392822, "step": 
92320 }, { "epoch": 4.286642833929152, "grad_norm": 86.52669525146484, "learning_rate": 4.2918427039324015e-08, "logits/chosen": -18.740779876708984, "logits/rejected": -18.26470375061035, "logps/chosen": -449.26507568359375, "logps/rejected": -378.67138671875, "loss": 0.2697, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.667445182800293, "rewards/margins": 2.1750950813293457, "rewards/rejected": 2.492349624633789, "step": 92330 }, { "epoch": 4.287107108036585, "grad_norm": 7.539647579193115, "learning_rate": 4.2890570592878037e-08, "logits/chosen": -19.468320846557617, "logits/rejected": -19.585758209228516, "logps/chosen": -381.54193115234375, "logps/rejected": -312.089111328125, "loss": 0.5283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7641067504882812, "rewards/margins": 1.4543049335479736, "rewards/rejected": 1.3098018169403076, "step": 92340 }, { "epoch": 4.287571382144018, "grad_norm": 242.4417724609375, "learning_rate": 4.286271414643205e-08, "logits/chosen": -19.53565216064453, "logits/rejected": -18.439367294311523, "logps/chosen": -494.411865234375, "logps/rejected": -342.1657409667969, "loss": 0.5214, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2952775955200195, "rewards/margins": 2.069422721862793, "rewards/rejected": 2.2258546352386475, "step": 92350 }, { "epoch": 4.288035656251451, "grad_norm": 20.182836532592773, "learning_rate": 4.283485769998607e-08, "logits/chosen": -19.796422958374023, "logits/rejected": -18.441509246826172, "logps/chosen": -477.6612243652344, "logps/rejected": -378.0491638183594, "loss": 0.1752, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.008783340454102, "rewards/margins": 2.5825958251953125, "rewards/rejected": 2.42618727684021, "step": 92360 }, { "epoch": 4.288499930358884, "grad_norm": 46.63572692871094, "learning_rate": 4.280700125354009e-08, "logits/chosen": -19.548892974853516, "logits/rejected": -19.611774444580078, "logps/chosen": -445.4267578125, 
"logps/rejected": -466.62017822265625, "loss": 0.8366, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.344097137451172, "rewards/margins": 0.6605812907218933, "rewards/rejected": 3.6835150718688965, "step": 92370 }, { "epoch": 4.288964204466317, "grad_norm": 118.40210723876953, "learning_rate": 4.277914480709411e-08, "logits/chosen": -19.89655303955078, "logits/rejected": -19.155405044555664, "logps/chosen": -556.2747802734375, "logps/rejected": -441.95391845703125, "loss": 0.4922, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.388640880584717, "rewards/margins": 1.9034430980682373, "rewards/rejected": 2.4851980209350586, "step": 92380 }, { "epoch": 4.2894284785737495, "grad_norm": 78.58675384521484, "learning_rate": 4.275128836064813e-08, "logits/chosen": -19.56235694885254, "logits/rejected": -18.068796157836914, "logps/chosen": -404.4647216796875, "logps/rejected": -283.79571533203125, "loss": 0.2018, "rewards/accuracies": 1.0, "rewards/chosen": 3.2958552837371826, "rewards/margins": 2.3766627311706543, "rewards/rejected": 0.9191926717758179, "step": 92390 }, { "epoch": 4.289892752681183, "grad_norm": 45.157470703125, "learning_rate": 4.2723431914202144e-08, "logits/chosen": -19.473344802856445, "logits/rejected": -18.367944717407227, "logps/chosen": -367.5877380371094, "logps/rejected": -302.5841064453125, "loss": 0.5472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.998209238052368, "rewards/margins": 0.9033526182174683, "rewards/rejected": 2.0948567390441895, "step": 92400 }, { "epoch": 4.290357026788616, "grad_norm": 60.377689361572266, "learning_rate": 4.2695575467756165e-08, "logits/chosen": -20.114086151123047, "logits/rejected": -18.92279624938965, "logps/chosen": -418.78082275390625, "logps/rejected": -300.68365478515625, "loss": 0.2286, "rewards/accuracies": 1.0, "rewards/chosen": 3.943150043487549, "rewards/margins": 2.144430160522461, "rewards/rejected": 1.7987200021743774, "step": 92410 }, { "epoch": 
4.290821300896049, "grad_norm": 39.48383712768555, "learning_rate": 4.2667719021310186e-08, "logits/chosen": -18.61513900756836, "logits/rejected": -18.1428279876709, "logps/chosen": -413.20745849609375, "logps/rejected": -338.754638671875, "loss": 0.5454, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4498324394226074, "rewards/margins": 1.1627180576324463, "rewards/rejected": 2.287114381790161, "step": 92420 }, { "epoch": 4.291285575003482, "grad_norm": 207.9248046875, "learning_rate": 4.2639862574864194e-08, "logits/chosen": -19.819259643554688, "logits/rejected": -17.726118087768555, "logps/chosen": -425.6898498535156, "logps/rejected": -276.76007080078125, "loss": 0.6603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.968493938446045, "rewards/margins": 2.8362269401550293, "rewards/rejected": 1.1322672367095947, "step": 92430 }, { "epoch": 4.291749849110915, "grad_norm": 39.95941162109375, "learning_rate": 4.2612006128418215e-08, "logits/chosen": -19.798816680908203, "logits/rejected": -17.67626190185547, "logps/chosen": -484.05291748046875, "logps/rejected": -297.5453186035156, "loss": 0.1607, "rewards/accuracies": 1.0, "rewards/chosen": 4.693778038024902, "rewards/margins": 2.7212724685668945, "rewards/rejected": 1.9725052118301392, "step": 92440 }, { "epoch": 4.292214123218348, "grad_norm": 294.3890075683594, "learning_rate": 4.258414968197223e-08, "logits/chosen": -19.753877639770508, "logits/rejected": -19.86876678466797, "logps/chosen": -398.618896484375, "logps/rejected": -410.08575439453125, "loss": 1.0726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3382256031036377, "rewards/margins": 0.25753748416900635, "rewards/rejected": 3.0806884765625, "step": 92450 }, { "epoch": 4.292678397325782, "grad_norm": 171.22283935546875, "learning_rate": 4.255629323552625e-08, "logits/chosen": -18.228404998779297, "logits/rejected": -18.0081729888916, "logps/chosen": -397.3135070800781, "logps/rejected": 
-339.85748291015625, "loss": 0.9012, "rewards/accuracies": 0.5, "rewards/chosen": 4.17803430557251, "rewards/margins": 1.37868332862854, "rewards/rejected": 2.7993509769439697, "step": 92460 }, { "epoch": 4.293142671433214, "grad_norm": 0.3489125669002533, "learning_rate": 4.252843678908027e-08, "logits/chosen": -19.69770050048828, "logits/rejected": -18.151012420654297, "logps/chosen": -453.8106384277344, "logps/rejected": -290.72100830078125, "loss": 0.4539, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.540166854858398, "rewards/margins": 2.169879198074341, "rewards/rejected": 2.3702874183654785, "step": 92470 }, { "epoch": 4.293606945540647, "grad_norm": 102.84867095947266, "learning_rate": 4.2500580342634286e-08, "logits/chosen": -19.31998062133789, "logits/rejected": -18.65237808227539, "logps/chosen": -393.84002685546875, "logps/rejected": -363.8658142089844, "loss": 0.9358, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9575085639953613, "rewards/margins": 0.2944723665714264, "rewards/rejected": 2.663036584854126, "step": 92480 }, { "epoch": 4.29407121964808, "grad_norm": 0.1088581308722496, "learning_rate": 4.247272389618831e-08, "logits/chosen": -17.746068954467773, "logits/rejected": -17.268638610839844, "logps/chosen": -384.6995849609375, "logps/rejected": -331.32928466796875, "loss": 0.5342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.022497653961182, "rewards/margins": 2.2668375968933105, "rewards/rejected": 1.755659818649292, "step": 92490 }, { "epoch": 4.294535493755514, "grad_norm": 129.67739868164062, "learning_rate": 4.244486744974232e-08, "logits/chosen": -19.31695556640625, "logits/rejected": -19.355484008789062, "logps/chosen": -388.5227355957031, "logps/rejected": -392.7948913574219, "loss": 1.0998, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.975569725036621, "rewards/margins": -0.06482130289077759, "rewards/rejected": 3.040391206741333, "step": 92500 }, { "epoch": 
4.294999767862946, "grad_norm": 46.33793258666992, "learning_rate": 4.241701100329634e-08, "logits/chosen": -19.09907341003418, "logits/rejected": -18.08881187438965, "logps/chosen": -390.35943603515625, "logps/rejected": -279.0063781738281, "loss": 0.726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.21887469291687, "rewards/margins": 1.0753791332244873, "rewards/rejected": 2.143496036529541, "step": 92510 }, { "epoch": 4.295464041970379, "grad_norm": 81.88103485107422, "learning_rate": 4.2389154556850364e-08, "logits/chosen": -19.103914260864258, "logits/rejected": -17.94217300415039, "logps/chosen": -374.6304931640625, "logps/rejected": -263.46319580078125, "loss": 0.7012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.742326259613037, "rewards/margins": 1.5417497158050537, "rewards/rejected": 1.2005763053894043, "step": 92520 }, { "epoch": 4.295928316077813, "grad_norm": 65.04448699951172, "learning_rate": 4.236129811040438e-08, "logits/chosen": -18.91000747680664, "logits/rejected": -18.440452575683594, "logps/chosen": -314.4693298339844, "logps/rejected": -276.8553466796875, "loss": 0.8639, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0275440216064453, "rewards/margins": 0.9785538911819458, "rewards/rejected": 2.048989772796631, "step": 92530 }, { "epoch": 4.2963925901852456, "grad_norm": 2.200259208679199, "learning_rate": 4.23334416639584e-08, "logits/chosen": -17.858924865722656, "logits/rejected": -17.41892433166504, "logps/chosen": -362.8664245605469, "logps/rejected": -300.4850769042969, "loss": 1.0692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6058127880096436, "rewards/margins": 0.8331985473632812, "rewards/rejected": 1.7726142406463623, "step": 92540 }, { "epoch": 4.296856864292678, "grad_norm": 42.272640228271484, "learning_rate": 4.230558521751242e-08, "logits/chosen": -20.02280616760254, "logits/rejected": -18.984872817993164, "logps/chosen": -447.17584228515625, "logps/rejected": 
-295.08270263671875, "loss": 0.3336, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.505133628845215, "rewards/margins": 1.562799334526062, "rewards/rejected": 2.9423346519470215, "step": 92550 }, { "epoch": 4.297321138400111, "grad_norm": 356.3213806152344, "learning_rate": 4.2277728771066436e-08, "logits/chosen": -19.515121459960938, "logits/rejected": -18.596405029296875, "logps/chosen": -401.5731201171875, "logps/rejected": -384.0799560546875, "loss": 0.7073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2403292655944824, "rewards/margins": 0.609491229057312, "rewards/rejected": 2.63083815574646, "step": 92560 }, { "epoch": 4.297785412507545, "grad_norm": 192.22886657714844, "learning_rate": 4.224987232462046e-08, "logits/chosen": -19.407550811767578, "logits/rejected": -18.610599517822266, "logps/chosen": -324.52490234375, "logps/rejected": -307.20147705078125, "loss": 0.9876, "rewards/accuracies": 0.5, "rewards/chosen": 2.7797985076904297, "rewards/margins": 0.7293664813041687, "rewards/rejected": 2.050431728363037, "step": 92570 }, { "epoch": 4.2982496866149775, "grad_norm": 1.629251480102539, "learning_rate": 4.222201587817447e-08, "logits/chosen": -19.143966674804688, "logits/rejected": -18.022140502929688, "logps/chosen": -370.5220031738281, "logps/rejected": -256.25213623046875, "loss": 0.3832, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.920804262161255, "rewards/margins": 2.1479599475860596, "rewards/rejected": 1.7728445529937744, "step": 92580 }, { "epoch": 4.29871396072241, "grad_norm": 18.395414352416992, "learning_rate": 4.219415943172849e-08, "logits/chosen": -19.93448257446289, "logits/rejected": -19.126667022705078, "logps/chosen": -397.92254638671875, "logps/rejected": -320.0943603515625, "loss": 0.3439, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.969999313354492, "rewards/margins": 1.6839683055877686, "rewards/rejected": 2.2860307693481445, "step": 92590 }, { "epoch": 
4.299178234829844, "grad_norm": 81.69377899169922, "learning_rate": 4.2166302985282514e-08, "logits/chosen": -18.60087013244629, "logits/rejected": -19.33324432373047, "logps/chosen": -381.5593566894531, "logps/rejected": -458.90399169921875, "loss": 1.5996, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.22145938873291, "rewards/margins": -0.4204902648925781, "rewards/rejected": 2.6419498920440674, "step": 92600 }, { "epoch": 4.299642508937277, "grad_norm": 1.3887766599655151, "learning_rate": 4.213844653883653e-08, "logits/chosen": -19.017358779907227, "logits/rejected": -18.039112091064453, "logps/chosen": -425.03680419921875, "logps/rejected": -270.451904296875, "loss": 0.5184, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6675045490264893, "rewards/margins": 2.1894009113311768, "rewards/rejected": 1.4781038761138916, "step": 92610 }, { "epoch": 4.3001067830447095, "grad_norm": 30.99752426147461, "learning_rate": 4.211059009239055e-08, "logits/chosen": -18.11542320251465, "logits/rejected": -17.5059871673584, "logps/chosen": -291.77630615234375, "logps/rejected": -230.26437377929688, "loss": 0.5459, "rewards/accuracies": 0.5, "rewards/chosen": 2.3272266387939453, "rewards/margins": 1.2765766382217407, "rewards/rejected": 1.0506501197814941, "step": 92620 }, { "epoch": 4.300571057152142, "grad_norm": 4.165722370147705, "learning_rate": 4.208273364594456e-08, "logits/chosen": -18.505714416503906, "logits/rejected": -17.88125228881836, "logps/chosen": -394.7973327636719, "logps/rejected": -333.04412841796875, "loss": 0.7837, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.513686418533325, "rewards/margins": 0.870710015296936, "rewards/rejected": 2.6429762840270996, "step": 92630 }, { "epoch": 4.301035331259576, "grad_norm": 184.85626220703125, "learning_rate": 4.205487719949858e-08, "logits/chosen": -19.370967864990234, "logits/rejected": -19.302663803100586, "logps/chosen": -457.03057861328125, "logps/rejected": 
-347.4073181152344, "loss": 1.0377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.172612190246582, "rewards/margins": 0.8872568011283875, "rewards/rejected": 3.28535532951355, "step": 92640 }, { "epoch": 4.301499605367009, "grad_norm": 0.660510241985321, "learning_rate": 4.20270207530526e-08, "logits/chosen": -18.761852264404297, "logits/rejected": -18.98719024658203, "logps/chosen": -425.50384521484375, "logps/rejected": -440.11444091796875, "loss": 0.7728, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.212192058563232, "rewards/margins": 0.9987473487854004, "rewards/rejected": 3.2134456634521484, "step": 92650 }, { "epoch": 4.3019638794744415, "grad_norm": 152.05848693847656, "learning_rate": 4.1999164306606614e-08, "logits/chosen": -19.687847137451172, "logits/rejected": -19.029006958007812, "logps/chosen": -336.14312744140625, "logps/rejected": -415.3955078125, "loss": 1.1908, "rewards/accuracies": 0.5, "rewards/chosen": 2.6922380924224854, "rewards/margins": 0.10862083733081818, "rewards/rejected": 2.5836167335510254, "step": 92660 }, { "epoch": 4.302428153581875, "grad_norm": 33.74885940551758, "learning_rate": 4.1971307860160635e-08, "logits/chosen": -18.993558883666992, "logits/rejected": -18.162403106689453, "logps/chosen": -529.173095703125, "logps/rejected": -388.51922607421875, "loss": 0.3598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.093804359436035, "rewards/margins": 2.409074306488037, "rewards/rejected": 2.68472957611084, "step": 92670 }, { "epoch": 4.302892427689308, "grad_norm": 58.70831298828125, "learning_rate": 4.1943451413714657e-08, "logits/chosen": -19.986080169677734, "logits/rejected": -18.281373977661133, "logps/chosen": -447.02520751953125, "logps/rejected": -335.87176513671875, "loss": 0.4633, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.768494606018066, "rewards/margins": 2.6587605476379395, "rewards/rejected": 2.1097350120544434, "step": 92680 }, { "epoch": 
4.303356701796741, "grad_norm": 36.267120361328125, "learning_rate": 4.191559496726867e-08, "logits/chosen": -19.53912353515625, "logits/rejected": -17.70025634765625, "logps/chosen": -421.4658203125, "logps/rejected": -290.55670166015625, "loss": 0.7076, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6310043334960938, "rewards/margins": 1.9352996349334717, "rewards/rejected": 1.6957050561904907, "step": 92690 }, { "epoch": 4.3038209759041735, "grad_norm": 48.049102783203125, "learning_rate": 4.188773852082269e-08, "logits/chosen": -18.918930053710938, "logits/rejected": -17.657421112060547, "logps/chosen": -385.54156494140625, "logps/rejected": -301.1449279785156, "loss": 0.3391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.248892307281494, "rewards/margins": 1.9582058191299438, "rewards/rejected": 1.2906866073608398, "step": 92700 }, { "epoch": 4.304285250011607, "grad_norm": 41.62747573852539, "learning_rate": 4.185988207437671e-08, "logits/chosen": -19.207609176635742, "logits/rejected": -18.605239868164062, "logps/chosen": -315.0862731933594, "logps/rejected": -261.09564208984375, "loss": 0.9794, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5508893728256226, "rewards/margins": -0.08913187682628632, "rewards/rejected": 1.6400213241577148, "step": 92710 }, { "epoch": 4.30474952411904, "grad_norm": 0.2610771954059601, "learning_rate": 4.183202562793073e-08, "logits/chosen": -19.208087921142578, "logits/rejected": -18.228410720825195, "logps/chosen": -494.66265869140625, "logps/rejected": -443.66973876953125, "loss": 0.6872, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.467042922973633, "rewards/margins": 1.0938903093338013, "rewards/rejected": 3.3731529712677, "step": 92720 }, { "epoch": 4.305213798226473, "grad_norm": 4.060891151428223, "learning_rate": 4.180416918148475e-08, "logits/chosen": -19.41905403137207, "logits/rejected": -19.09603500366211, "logps/chosen": -352.22393798828125, 
"logps/rejected": -312.88848876953125, "loss": 0.4399, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.945033311843872, "rewards/margins": 1.870027780532837, "rewards/rejected": 2.075005292892456, "step": 92730 }, { "epoch": 4.305678072333906, "grad_norm": 76.69451904296875, "learning_rate": 4.1776312735038764e-08, "logits/chosen": -20.308244705200195, "logits/rejected": -19.812541961669922, "logps/chosen": -451.659912109375, "logps/rejected": -388.9427795410156, "loss": 0.4775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.293005466461182, "rewards/margins": 1.6623932123184204, "rewards/rejected": 2.630612373352051, "step": 92740 }, { "epoch": 4.306142346441339, "grad_norm": 101.12459564208984, "learning_rate": 4.1748456288592785e-08, "logits/chosen": -18.526165008544922, "logits/rejected": -18.67892074584961, "logps/chosen": -445.65496826171875, "logps/rejected": -390.17041015625, "loss": 0.4262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1913161277771, "rewards/margins": 1.1374702453613281, "rewards/rejected": 3.0538458824157715, "step": 92750 }, { "epoch": 4.306606620548772, "grad_norm": 113.03633117675781, "learning_rate": 4.1720599842146806e-08, "logits/chosen": -19.57365608215332, "logits/rejected": -19.049968719482422, "logps/chosen": -353.4892272949219, "logps/rejected": -321.993408203125, "loss": 0.5226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.468111038208008, "rewards/margins": 1.3027048110961914, "rewards/rejected": 2.1654062271118164, "step": 92760 }, { "epoch": 4.307070894656205, "grad_norm": 78.89020538330078, "learning_rate": 4.169274339570082e-08, "logits/chosen": -18.253314971923828, "logits/rejected": -18.1839599609375, "logps/chosen": -460.58734130859375, "logps/rejected": -481.22235107421875, "loss": 0.7178, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4022281169891357, "rewards/margins": 0.30482617020606995, "rewards/rejected": 3.097402334213257, "step": 92770 
}, { "epoch": 4.307535168763638, "grad_norm": 174.78993225097656, "learning_rate": 4.166488694925484e-08, "logits/chosen": -18.635894775390625, "logits/rejected": -17.780376434326172, "logps/chosen": -502.5057067871094, "logps/rejected": -411.8916015625, "loss": 1.0371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8058524131774902, "rewards/margins": 1.2618242502212524, "rewards/rejected": 2.5440287590026855, "step": 92780 }, { "epoch": 4.307999442871071, "grad_norm": 266.94891357421875, "learning_rate": 4.1637030502808856e-08, "logits/chosen": -19.623517990112305, "logits/rejected": -17.343456268310547, "logps/chosen": -486.07354736328125, "logps/rejected": -278.46893310546875, "loss": 0.5578, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.797965049743652, "rewards/margins": 2.813044309616089, "rewards/rejected": 1.9849210977554321, "step": 92790 }, { "epoch": 4.308463716978504, "grad_norm": 39.03122329711914, "learning_rate": 4.160917405636288e-08, "logits/chosen": -18.937602996826172, "logits/rejected": -18.688920974731445, "logps/chosen": -325.04486083984375, "logps/rejected": -278.4075622558594, "loss": 0.7848, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0532312393188477, "rewards/margins": 1.1855766773223877, "rewards/rejected": 0.8676546216011047, "step": 92800 }, { "epoch": 4.3089279910859375, "grad_norm": 75.7339096069336, "learning_rate": 4.15813176099169e-08, "logits/chosen": -18.45509910583496, "logits/rejected": -17.657588958740234, "logps/chosen": -429.06353759765625, "logps/rejected": -268.631591796875, "loss": 0.6812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9989089965820312, "rewards/margins": 2.741055488586426, "rewards/rejected": 1.2578541040420532, "step": 92810 }, { "epoch": 4.30939226519337, "grad_norm": 7.66445255279541, "learning_rate": 4.155346116347091e-08, "logits/chosen": -19.26790428161621, "logits/rejected": -19.383258819580078, "logps/chosen": -419.2525939941406, 
"logps/rejected": -367.89312744140625, "loss": 0.6377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.507419586181641, "rewards/margins": 1.4633694887161255, "rewards/rejected": 3.0440499782562256, "step": 92820 }, { "epoch": 4.309856539300803, "grad_norm": 228.91639709472656, "learning_rate": 4.152560471702493e-08, "logits/chosen": -19.651567459106445, "logits/rejected": -18.65859603881836, "logps/chosen": -468.3970642089844, "logps/rejected": -362.237060546875, "loss": 0.577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7660160064697266, "rewards/margins": 0.9952473640441895, "rewards/rejected": 2.770768642425537, "step": 92830 }, { "epoch": 4.310320813408236, "grad_norm": 168.37472534179688, "learning_rate": 4.149774827057894e-08, "logits/chosen": -18.5378475189209, "logits/rejected": -17.839176177978516, "logps/chosen": -367.56036376953125, "logps/rejected": -294.130615234375, "loss": 1.1132, "rewards/accuracies": 0.5, "rewards/chosen": 3.2062671184539795, "rewards/margins": 0.7532078623771667, "rewards/rejected": 2.453059196472168, "step": 92840 }, { "epoch": 4.3107850875156695, "grad_norm": 53.86921691894531, "learning_rate": 4.146989182413296e-08, "logits/chosen": -19.697860717773438, "logits/rejected": -19.344955444335938, "logps/chosen": -393.2989501953125, "logps/rejected": -369.3340148925781, "loss": 0.5233, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.516592025756836, "rewards/margins": 1.0336782932281494, "rewards/rejected": 2.482914447784424, "step": 92850 }, { "epoch": 4.311249361623102, "grad_norm": 150.70106506347656, "learning_rate": 4.1442035377686984e-08, "logits/chosen": -18.381587982177734, "logits/rejected": -17.642526626586914, "logps/chosen": -337.849609375, "logps/rejected": -290.41143798828125, "loss": 0.6667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0037715435028076, "rewards/margins": 1.0966100692749023, "rewards/rejected": 0.9071610569953918, "step": 92860 }, { "epoch": 
4.311713635730535, "grad_norm": 217.4234619140625, "learning_rate": 4.1414178931241e-08, "logits/chosen": -20.554534912109375, "logits/rejected": -19.50310707092285, "logps/chosen": -409.3260803222656, "logps/rejected": -319.3079833984375, "loss": 0.8709, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.658774375915527, "rewards/margins": 1.4789012670516968, "rewards/rejected": 3.17987322807312, "step": 92870 }, { "epoch": 4.312177909837969, "grad_norm": 143.50025939941406, "learning_rate": 4.138632248479502e-08, "logits/chosen": -18.392417907714844, "logits/rejected": -18.88075828552246, "logps/chosen": -305.21600341796875, "logps/rejected": -332.69158935546875, "loss": 1.1856, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2438740730285645, "rewards/margins": -0.46260619163513184, "rewards/rejected": 2.7064802646636963, "step": 92880 }, { "epoch": 4.312642183945401, "grad_norm": 186.4858856201172, "learning_rate": 4.135846603834904e-08, "logits/chosen": -19.180458068847656, "logits/rejected": -17.700050354003906, "logps/chosen": -489.55242919921875, "logps/rejected": -355.13287353515625, "loss": 0.4386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.4384965896606445, "rewards/margins": 2.5466132164001465, "rewards/rejected": 1.8918834924697876, "step": 92890 }, { "epoch": 4.313106458052834, "grad_norm": 42.638675689697266, "learning_rate": 4.1330609591903056e-08, "logits/chosen": -19.069730758666992, "logits/rejected": -19.256418228149414, "logps/chosen": -267.5383605957031, "logps/rejected": -339.7416076660156, "loss": 1.0801, "rewards/accuracies": 0.5, "rewards/chosen": 2.4434056282043457, "rewards/margins": -0.09574093669652939, "rewards/rejected": 2.539146900177002, "step": 92900 }, { "epoch": 4.313570732160267, "grad_norm": 60.24160385131836, "learning_rate": 4.130275314545708e-08, "logits/chosen": -18.332624435424805, "logits/rejected": -17.81031608581543, "logps/chosen": -412.1671447753906, "logps/rejected": 
-386.78106689453125, "loss": 0.9633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4087023735046387, "rewards/margins": 0.7889897227287292, "rewards/rejected": 2.619713068008423, "step": 92910 }, { "epoch": 4.314035006267701, "grad_norm": 70.26634216308594, "learning_rate": 4.127489669901109e-08, "logits/chosen": -18.890949249267578, "logits/rejected": -17.567020416259766, "logps/chosen": -396.88494873046875, "logps/rejected": -226.55752563476562, "loss": 0.5017, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.538855791091919, "rewards/margins": 2.3836638927459717, "rewards/rejected": 1.155192255973816, "step": 92920 }, { "epoch": 4.314499280375133, "grad_norm": 73.46302795410156, "learning_rate": 4.124704025256511e-08, "logits/chosen": -17.856008529663086, "logits/rejected": -17.19756317138672, "logps/chosen": -419.3335876464844, "logps/rejected": -350.13104248046875, "loss": 1.3388, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.2209553718566895, "rewards/margins": 1.5213117599487305, "rewards/rejected": 2.699643611907959, "step": 92930 }, { "epoch": 4.314963554482566, "grad_norm": 29.711851119995117, "learning_rate": 4.1219183806119134e-08, "logits/chosen": -18.694900512695312, "logits/rejected": -17.961854934692383, "logps/chosen": -420.4366149902344, "logps/rejected": -294.2245178222656, "loss": 0.2125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.722844123840332, "rewards/margins": 2.2642834186553955, "rewards/rejected": 2.4585609436035156, "step": 92940 }, { "epoch": 4.31542782859, "grad_norm": 10.901695251464844, "learning_rate": 4.119132735967315e-08, "logits/chosen": -19.024581909179688, "logits/rejected": -18.035358428955078, "logps/chosen": -369.97393798828125, "logps/rejected": -255.93222045898438, "loss": 0.9853, "rewards/accuracies": 0.5, "rewards/chosen": 3.152575731277466, "rewards/margins": 1.2153207063674927, "rewards/rejected": 1.9372546672821045, "step": 92950 }, { "epoch": 
4.315892102697433, "grad_norm": 6.486767768859863, "learning_rate": 4.116347091322717e-08, "logits/chosen": -18.461772918701172, "logits/rejected": -18.42705535888672, "logps/chosen": -401.6391906738281, "logps/rejected": -360.3217468261719, "loss": 0.7905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.960779905319214, "rewards/margins": 0.7733871936798096, "rewards/rejected": 2.1873929500579834, "step": 92960 }, { "epoch": 4.316356376804865, "grad_norm": 6.85089111328125, "learning_rate": 4.113561446678119e-08, "logits/chosen": -19.403791427612305, "logits/rejected": -18.919239044189453, "logps/chosen": -398.218017578125, "logps/rejected": -358.4748840332031, "loss": 1.039, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0419788360595703, "rewards/margins": 1.1888577938079834, "rewards/rejected": 1.8531211614608765, "step": 92970 }, { "epoch": 4.316820650912298, "grad_norm": 162.47023010253906, "learning_rate": 4.1107758020335205e-08, "logits/chosen": -18.677274703979492, "logits/rejected": -18.523662567138672, "logps/chosen": -346.99261474609375, "logps/rejected": -243.9227294921875, "loss": 0.5961, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.383157253265381, "rewards/margins": 1.2541905641555786, "rewards/rejected": 2.128967046737671, "step": 92980 }, { "epoch": 4.317284925019732, "grad_norm": 68.239013671875, "learning_rate": 4.1079901573889226e-08, "logits/chosen": -18.461009979248047, "logits/rejected": -18.509597778320312, "logps/chosen": -382.65008544921875, "logps/rejected": -397.71002197265625, "loss": 1.1113, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5407073497772217, "rewards/margins": 0.6711317896842957, "rewards/rejected": 2.8695759773254395, "step": 92990 }, { "epoch": 4.317749199127165, "grad_norm": 1.9660351276397705, "learning_rate": 4.105204512744324e-08, "logits/chosen": -18.462753295898438, "logits/rejected": -17.939186096191406, "logps/chosen": -449.80181884765625, 
"logps/rejected": -345.29327392578125, "loss": 0.4017, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.630187034606934, "rewards/margins": 3.080294132232666, "rewards/rejected": 1.5498929023742676, "step": 93000 }, { "epoch": 4.318213473234597, "grad_norm": 119.84510040283203, "learning_rate": 4.102418868099726e-08, "logits/chosen": -19.001462936401367, "logits/rejected": -17.578166961669922, "logps/chosen": -390.10748291015625, "logps/rejected": -310.1749267578125, "loss": 0.4995, "rewards/accuracies": 0.5, "rewards/chosen": 3.3038153648376465, "rewards/margins": 1.2129181623458862, "rewards/rejected": 2.09089732170105, "step": 93010 }, { "epoch": 4.318677747342031, "grad_norm": 124.11543273925781, "learning_rate": 4.099633223455128e-08, "logits/chosen": -19.379039764404297, "logits/rejected": -19.554370880126953, "logps/chosen": -363.69091796875, "logps/rejected": -388.66851806640625, "loss": 0.8413, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.851701498031616, "rewards/margins": 0.41214504837989807, "rewards/rejected": 2.43955659866333, "step": 93020 }, { "epoch": 4.319142021449464, "grad_norm": 178.5792236328125, "learning_rate": 4.096847578810529e-08, "logits/chosen": -18.564029693603516, "logits/rejected": -17.106060028076172, "logps/chosen": -401.32000732421875, "logps/rejected": -258.93072509765625, "loss": 0.4117, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.956306219100952, "rewards/margins": 2.4492809772491455, "rewards/rejected": 0.5070253014564514, "step": 93030 }, { "epoch": 4.319606295556897, "grad_norm": 101.14920806884766, "learning_rate": 4.094061934165931e-08, "logits/chosen": -19.397916793823242, "logits/rejected": -18.866708755493164, "logps/chosen": -390.9000549316406, "logps/rejected": -387.039794921875, "loss": 0.8149, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4169764518737793, "rewards/margins": 0.7895519733428955, "rewards/rejected": 2.627424716949463, "step": 93040 }, { 
"epoch": 4.32007056966433, "grad_norm": 2.222585678100586, "learning_rate": 4.091276289521333e-08, "logits/chosen": -18.563547134399414, "logits/rejected": -17.988672256469727, "logps/chosen": -329.42242431640625, "logps/rejected": -250.15689086914062, "loss": 1.3781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3791158199310303, "rewards/margins": 0.8926130533218384, "rewards/rejected": 1.486503005027771, "step": 93050 }, { "epoch": 4.320534843771763, "grad_norm": 149.69386291503906, "learning_rate": 4.088490644876735e-08, "logits/chosen": -18.660097122192383, "logits/rejected": -17.992544174194336, "logps/chosen": -326.74908447265625, "logps/rejected": -280.143310546875, "loss": 1.1739, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7235631942749023, "rewards/margins": 0.7360435724258423, "rewards/rejected": 1.9875198602676392, "step": 93060 }, { "epoch": 4.320999117879196, "grad_norm": 20.53948402404785, "learning_rate": 4.085705000232137e-08, "logits/chosen": -19.214107513427734, "logits/rejected": -18.569713592529297, "logps/chosen": -333.59344482421875, "logps/rejected": -272.74627685546875, "loss": 0.5319, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0740554332733154, "rewards/margins": 1.5637798309326172, "rewards/rejected": 1.5102754831314087, "step": 93070 }, { "epoch": 4.3214633919866285, "grad_norm": 3.390279531478882, "learning_rate": 4.0829193555875384e-08, "logits/chosen": -18.786500930786133, "logits/rejected": -18.257938385009766, "logps/chosen": -330.6172180175781, "logps/rejected": -297.28302001953125, "loss": 0.808, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1638574600219727, "rewards/margins": 1.0774261951446533, "rewards/rejected": 2.0864315032958984, "step": 93080 }, { "epoch": 4.321927666094062, "grad_norm": 288.1827392578125, "learning_rate": 4.0801337109429405e-08, "logits/chosen": -18.810924530029297, "logits/rejected": -18.691883087158203, "logps/chosen": 
-338.1260681152344, "logps/rejected": -313.70343017578125, "loss": 1.4335, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.845054030418396, "rewards/margins": 0.08704452961683273, "rewards/rejected": 1.7580093145370483, "step": 93090 }, { "epoch": 4.322391940201495, "grad_norm": 154.26376342773438, "learning_rate": 4.0773480662983426e-08, "logits/chosen": -18.920621871948242, "logits/rejected": -18.343425750732422, "logps/chosen": -404.532958984375, "logps/rejected": -330.76507568359375, "loss": 0.8039, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.922694683074951, "rewards/margins": 0.4395938515663147, "rewards/rejected": 2.483100652694702, "step": 93100 }, { "epoch": 4.322856214308928, "grad_norm": 37.97964096069336, "learning_rate": 4.074562421653744e-08, "logits/chosen": -18.584941864013672, "logits/rejected": -17.85849952697754, "logps/chosen": -401.2164306640625, "logps/rejected": -306.65240478515625, "loss": 0.5832, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.695420026779175, "rewards/margins": 1.558415412902832, "rewards/rejected": 2.1370043754577637, "step": 93110 }, { "epoch": 4.323320488416361, "grad_norm": 197.9167022705078, "learning_rate": 4.071776777009146e-08, "logits/chosen": -18.847454071044922, "logits/rejected": -19.219280242919922, "logps/chosen": -426.59429931640625, "logps/rejected": -360.1644287109375, "loss": 0.3736, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.655831336975098, "rewards/margins": 2.456373691558838, "rewards/rejected": 2.199457883834839, "step": 93120 }, { "epoch": 4.323784762523794, "grad_norm": 43.51596450805664, "learning_rate": 4.0689911323645476e-08, "logits/chosen": -18.526287078857422, "logits/rejected": -17.492733001708984, "logps/chosen": -425.868896484375, "logps/rejected": -351.6937561035156, "loss": 0.3089, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.909245729446411, "rewards/margins": 2.0815396308898926, "rewards/rejected": 
1.8277060985565186, "step": 93130 }, { "epoch": 4.324249036631227, "grad_norm": 16.255783081054688, "learning_rate": 4.06620548771995e-08, "logits/chosen": -18.39443588256836, "logits/rejected": -17.508930206298828, "logps/chosen": -417.2732849121094, "logps/rejected": -361.2594909667969, "loss": 0.5337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.676058292388916, "rewards/margins": 1.5974856615066528, "rewards/rejected": 2.0785727500915527, "step": 93140 }, { "epoch": 4.32471331073866, "grad_norm": 264.65057373046875, "learning_rate": 4.063419843075352e-08, "logits/chosen": -17.95650863647461, "logits/rejected": -17.803194046020508, "logps/chosen": -362.79931640625, "logps/rejected": -336.40240478515625, "loss": 0.5534, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1698527336120605, "rewards/margins": 0.8004299402236938, "rewards/rejected": 1.3694229125976562, "step": 93150 }, { "epoch": 4.325177584846093, "grad_norm": 1.4560316801071167, "learning_rate": 4.060634198430753e-08, "logits/chosen": -20.098678588867188, "logits/rejected": -19.596538543701172, "logps/chosen": -460.9505310058594, "logps/rejected": -300.1920471191406, "loss": 0.4677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5681679248809814, "rewards/margins": 1.620449423789978, "rewards/rejected": 1.9477183818817139, "step": 93160 }, { "epoch": 4.325641858953526, "grad_norm": 36.29951477050781, "learning_rate": 4.0578485537861554e-08, "logits/chosen": -18.676528930664062, "logits/rejected": -17.92498207092285, "logps/chosen": -391.84375, "logps/rejected": -311.39581298828125, "loss": 0.6694, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2396788597106934, "rewards/margins": 1.2425892353057861, "rewards/rejected": 1.9970899820327759, "step": 93170 }, { "epoch": 4.326106133060959, "grad_norm": 62.36702346801758, "learning_rate": 4.0550629091415575e-08, "logits/chosen": -19.267070770263672, "logits/rejected": -18.403913497924805, 
"logps/chosen": -525.2960205078125, "logps/rejected": -421.70233154296875, "loss": 1.1981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9595322608947754, "rewards/margins": 0.7176046967506409, "rewards/rejected": 3.2419273853302, "step": 93180 }, { "epoch": 4.326570407168393, "grad_norm": 117.84962463378906, "learning_rate": 4.052277264496959e-08, "logits/chosen": -18.892135620117188, "logits/rejected": -17.712627410888672, "logps/chosen": -453.68603515625, "logps/rejected": -291.6827087402344, "loss": 0.4582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.468438148498535, "rewards/margins": 2.578535556793213, "rewards/rejected": 1.8899030685424805, "step": 93190 }, { "epoch": 4.327034681275825, "grad_norm": 15.052035331726074, "learning_rate": 4.049491619852361e-08, "logits/chosen": -18.247961044311523, "logits/rejected": -18.031986236572266, "logps/chosen": -317.462890625, "logps/rejected": -297.3900451660156, "loss": 0.5001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.971898078918457, "rewards/margins": 1.0807418823242188, "rewards/rejected": 0.8911563158035278, "step": 93200 }, { "epoch": 4.327498955383258, "grad_norm": 7.341726303100586, "learning_rate": 4.0467059752077626e-08, "logits/chosen": -19.6492977142334, "logits/rejected": -18.717849731445312, "logps/chosen": -434.46124267578125, "logps/rejected": -339.722412109375, "loss": 0.2749, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.681735992431641, "rewards/margins": 1.8359863758087158, "rewards/rejected": 2.845750093460083, "step": 93210 }, { "epoch": 4.327963229490691, "grad_norm": 71.01778411865234, "learning_rate": 4.043920330563164e-08, "logits/chosen": -18.177448272705078, "logits/rejected": -18.4665470123291, "logps/chosen": -346.45159912109375, "logps/rejected": -402.69122314453125, "loss": 1.1431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.24552583694458, "rewards/margins": 0.10901232063770294, "rewards/rejected": 
3.1365134716033936, "step": 93220 }, { "epoch": 4.328427503598125, "grad_norm": 83.78926849365234, "learning_rate": 4.0411346859185655e-08, "logits/chosen": -18.925769805908203, "logits/rejected": -18.191295623779297, "logps/chosen": -399.92193603515625, "logps/rejected": -305.24371337890625, "loss": 0.6865, "rewards/accuracies": 0.5, "rewards/chosen": 2.7421927452087402, "rewards/margins": 0.8982046842575073, "rewards/rejected": 1.8439878225326538, "step": 93230 }, { "epoch": 4.328891777705557, "grad_norm": 0.17282100021839142, "learning_rate": 4.0383490412739676e-08, "logits/chosen": -19.25087547302246, "logits/rejected": -17.942096710205078, "logps/chosen": -421.0895080566406, "logps/rejected": -275.8832702636719, "loss": 0.5328, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.356201648712158, "rewards/margins": 1.7414424419403076, "rewards/rejected": 1.614759087562561, "step": 93240 }, { "epoch": 4.32935605181299, "grad_norm": 56.85453414916992, "learning_rate": 4.03556339662937e-08, "logits/chosen": -19.399578094482422, "logits/rejected": -18.337963104248047, "logps/chosen": -355.42864990234375, "logps/rejected": -272.94451904296875, "loss": 0.6961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9489123821258545, "rewards/margins": 1.5185649394989014, "rewards/rejected": 1.430347204208374, "step": 93250 }, { "epoch": 4.329820325920424, "grad_norm": 25.000802993774414, "learning_rate": 4.032777751984771e-08, "logits/chosen": -19.222726821899414, "logits/rejected": -19.213787078857422, "logps/chosen": -380.165283203125, "logps/rejected": -377.54193115234375, "loss": 1.049, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1995506286621094, "rewards/margins": 0.8598006963729858, "rewards/rejected": 2.339749813079834, "step": 93260 }, { "epoch": 4.3302846000278565, "grad_norm": 2.3147966861724854, "learning_rate": 4.029992107340173e-08, "logits/chosen": -19.449443817138672, "logits/rejected": -17.848865509033203, 
"logps/chosen": -339.26776123046875, "logps/rejected": -242.33761596679688, "loss": 0.2871, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2531661987304688, "rewards/margins": 2.485163927078247, "rewards/rejected": 0.7680023908615112, "step": 93270 }, { "epoch": 4.330748874135289, "grad_norm": 53.35883331298828, "learning_rate": 4.0272064626955754e-08, "logits/chosen": -19.1097469329834, "logits/rejected": -18.63486099243164, "logps/chosen": -368.35980224609375, "logps/rejected": -328.96783447265625, "loss": 0.8341, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5389771461486816, "rewards/margins": 1.2464988231658936, "rewards/rejected": 2.292478084564209, "step": 93280 }, { "epoch": 4.331213148242723, "grad_norm": 2.806157350540161, "learning_rate": 4.024420818050977e-08, "logits/chosen": -19.411949157714844, "logits/rejected": -18.474380493164062, "logps/chosen": -395.6929016113281, "logps/rejected": -305.96844482421875, "loss": 0.4992, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6918201446533203, "rewards/margins": 2.193969249725342, "rewards/rejected": 1.4978505373001099, "step": 93290 }, { "epoch": 4.331677422350156, "grad_norm": 40.496681213378906, "learning_rate": 4.021635173406379e-08, "logits/chosen": -20.295156478881836, "logits/rejected": -19.095783233642578, "logps/chosen": -402.791015625, "logps/rejected": -360.99658203125, "loss": 0.5811, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.374357223510742, "rewards/margins": 1.4849941730499268, "rewards/rejected": 1.8893629312515259, "step": 93300 }, { "epoch": 4.3321416964575885, "grad_norm": 53.59455490112305, "learning_rate": 4.018849528761781e-08, "logits/chosen": -19.693124771118164, "logits/rejected": -18.377864837646484, "logps/chosen": -425.8890686035156, "logps/rejected": -281.04254150390625, "loss": 0.3822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.284189224243164, "rewards/margins": 2.0673041343688965, 
"rewards/rejected": 2.2168853282928467, "step": 93310 }, { "epoch": 4.332605970565021, "grad_norm": 78.10050964355469, "learning_rate": 4.0160638841171825e-08, "logits/chosen": -18.888395309448242, "logits/rejected": -17.56869888305664, "logps/chosen": -403.67315673828125, "logps/rejected": -282.172119140625, "loss": 0.244, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.554476022720337, "rewards/margins": 2.22534441947937, "rewards/rejected": 0.3291321396827698, "step": 93320 }, { "epoch": 4.333070244672455, "grad_norm": 211.86965942382812, "learning_rate": 4.0132782394725846e-08, "logits/chosen": -19.02468490600586, "logits/rejected": -18.623056411743164, "logps/chosen": -334.70343017578125, "logps/rejected": -313.78265380859375, "loss": 0.9167, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.104287624359131, "rewards/margins": 0.5093159079551697, "rewards/rejected": 1.5949718952178955, "step": 93330 }, { "epoch": 4.333534518779888, "grad_norm": 0.38376471400260925, "learning_rate": 4.010492594827986e-08, "logits/chosen": -19.076345443725586, "logits/rejected": -17.891088485717773, "logps/chosen": -388.383544921875, "logps/rejected": -283.97235107421875, "loss": 0.3292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.102301120758057, "rewards/margins": 2.7075233459472656, "rewards/rejected": 1.394777774810791, "step": 93340 }, { "epoch": 4.3339987928873205, "grad_norm": 9.198454856872559, "learning_rate": 4.007706950183388e-08, "logits/chosen": -19.71401596069336, "logits/rejected": -19.540664672851562, "logps/chosen": -484.49774169921875, "logps/rejected": -550.6173095703125, "loss": 0.7197, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.55394172668457, "rewards/margins": 0.8572924733161926, "rewards/rejected": 3.6966490745544434, "step": 93350 }, { "epoch": 4.334463066994754, "grad_norm": 32.80842208862305, "learning_rate": 4.00492130553879e-08, "logits/chosen": -19.401832580566406, "logits/rejected": 
-19.01557731628418, "logps/chosen": -515.9366455078125, "logps/rejected": -410.837158203125, "loss": 0.805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.255603075027466, "rewards/margins": 0.5965811610221863, "rewards/rejected": 2.6590218544006348, "step": 93360 }, { "epoch": 4.334927341102187, "grad_norm": 229.4713592529297, "learning_rate": 4.002135660894192e-08, "logits/chosen": -18.775854110717773, "logits/rejected": -18.263172149658203, "logps/chosen": -414.857177734375, "logps/rejected": -377.80657958984375, "loss": 0.6609, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9217607975006104, "rewards/margins": 0.6388190984725952, "rewards/rejected": 2.282942056655884, "step": 93370 }, { "epoch": 4.33539161520962, "grad_norm": 1.6727920770645142, "learning_rate": 3.999350016249594e-08, "logits/chosen": -19.17620849609375, "logits/rejected": -17.799428939819336, "logps/chosen": -435.46234130859375, "logps/rejected": -292.04571533203125, "loss": 0.3139, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8194007873535156, "rewards/margins": 2.1108415126800537, "rewards/rejected": 1.7085596323013306, "step": 93380 }, { "epoch": 4.3358558893170525, "grad_norm": 1.541235327720642, "learning_rate": 3.996564371604996e-08, "logits/chosen": -18.40724754333496, "logits/rejected": -18.70748519897461, "logps/chosen": -334.8504638671875, "logps/rejected": -348.82861328125, "loss": 1.4522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.073227643966675, "rewards/margins": -0.324621319770813, "rewards/rejected": 2.3978488445281982, "step": 93390 }, { "epoch": 4.336320163424486, "grad_norm": 107.71926879882812, "learning_rate": 3.9937787269603975e-08, "logits/chosen": -18.902462005615234, "logits/rejected": -19.279855728149414, "logps/chosen": -365.059326171875, "logps/rejected": -435.9347229003906, "loss": 1.2531, "rewards/accuracies": 0.5, "rewards/chosen": 3.1931445598602295, "rewards/margins": 0.1498161256313324, 
"rewards/rejected": 3.043328285217285, "step": 93400 }, { "epoch": 4.336784437531919, "grad_norm": 119.3041763305664, "learning_rate": 3.9909930823157996e-08, "logits/chosen": -19.25516128540039, "logits/rejected": -18.43951988220215, "logps/chosen": -384.9034423828125, "logps/rejected": -298.72882080078125, "loss": 0.9731, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8711538314819336, "rewards/margins": 0.8977571725845337, "rewards/rejected": 1.9733966588974, "step": 93410 }, { "epoch": 4.337248711639352, "grad_norm": 144.88072204589844, "learning_rate": 3.9882074376712004e-08, "logits/chosen": -18.56362533569336, "logits/rejected": -18.633617401123047, "logps/chosen": -313.8965148925781, "logps/rejected": -308.3283996582031, "loss": 0.8152, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2577321529388428, "rewards/margins": 0.35789093375205994, "rewards/rejected": 1.899841547012329, "step": 93420 }, { "epoch": 4.337712985746785, "grad_norm": 3.4790866374969482, "learning_rate": 3.9854217930266025e-08, "logits/chosen": -18.726736068725586, "logits/rejected": -17.961837768554688, "logps/chosen": -421.8310546875, "logps/rejected": -347.25677490234375, "loss": 0.3896, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7166926860809326, "rewards/margins": 1.543309211730957, "rewards/rejected": 2.1733837127685547, "step": 93430 }, { "epoch": 4.338177259854218, "grad_norm": 105.10541534423828, "learning_rate": 3.982636148382004e-08, "logits/chosen": -19.610464096069336, "logits/rejected": -18.91510581970215, "logps/chosen": -409.5460510253906, "logps/rejected": -355.8763122558594, "loss": 0.9501, "rewards/accuracies": 0.5, "rewards/chosen": 3.0435662269592285, "rewards/margins": -0.08150659501552582, "rewards/rejected": 3.125072717666626, "step": 93440 }, { "epoch": 4.338641533961651, "grad_norm": 5.898642063140869, "learning_rate": 3.979850503737406e-08, "logits/chosen": -19.637435913085938, "logits/rejected": 
-18.993112564086914, "logps/chosen": -477.9020080566406, "logps/rejected": -341.7044982910156, "loss": 0.3515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.822388172149658, "rewards/margins": 2.2620081901550293, "rewards/rejected": 2.560380220413208, "step": 93450 }, { "epoch": 4.339105808069084, "grad_norm": 2.524806261062622, "learning_rate": 3.977064859092808e-08, "logits/chosen": -18.57944107055664, "logits/rejected": -17.998022079467773, "logps/chosen": -505.9600524902344, "logps/rejected": -334.4840393066406, "loss": 0.4969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5216078758239746, "rewards/margins": 1.7206449508666992, "rewards/rejected": 1.8009626865386963, "step": 93460 }, { "epoch": 4.339570082176517, "grad_norm": 112.8298110961914, "learning_rate": 3.9742792144482096e-08, "logits/chosen": -19.630367279052734, "logits/rejected": -18.613727569580078, "logps/chosen": -374.1849365234375, "logps/rejected": -337.94281005859375, "loss": 1.1401, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.8333346843719482, "rewards/margins": 0.8942364454269409, "rewards/rejected": 2.939098596572876, "step": 93470 }, { "epoch": 4.34003435628395, "grad_norm": 1.6895605325698853, "learning_rate": 3.971493569803612e-08, "logits/chosen": -18.545658111572266, "logits/rejected": -18.189090728759766, "logps/chosen": -278.2848815917969, "logps/rejected": -275.5774230957031, "loss": 0.6437, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0077965259552, "rewards/margins": 1.0225800275802612, "rewards/rejected": 0.9852163195610046, "step": 93480 }, { "epoch": 4.340498630391383, "grad_norm": 173.65008544921875, "learning_rate": 3.968707925159014e-08, "logits/chosen": -20.193504333496094, "logits/rejected": -19.439525604248047, "logps/chosen": -428.7021484375, "logps/rejected": -336.06414794921875, "loss": 0.5032, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.436237335205078, "rewards/margins": 
1.7814092636108398, "rewards/rejected": 2.654827833175659, "step": 93490 }, { "epoch": 4.3409629044988165, "grad_norm": 1.7970161437988281, "learning_rate": 3.965922280514415e-08, "logits/chosen": -19.392419815063477, "logits/rejected": -17.913005828857422, "logps/chosen": -396.7655334472656, "logps/rejected": -255.20852661132812, "loss": 0.3967, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.338024139404297, "rewards/margins": 1.9673376083374023, "rewards/rejected": 1.370686411857605, "step": 93500 }, { "epoch": 4.341427178606249, "grad_norm": 139.46987915039062, "learning_rate": 3.9631366358698174e-08, "logits/chosen": -18.56220054626465, "logits/rejected": -17.809722900390625, "logps/chosen": -347.6619567871094, "logps/rejected": -285.56890869140625, "loss": 0.4369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.345818042755127, "rewards/margins": 0.9897697567939758, "rewards/rejected": 1.356048345565796, "step": 93510 }, { "epoch": 4.341891452713682, "grad_norm": 166.9365692138672, "learning_rate": 3.9603509912252195e-08, "logits/chosen": -19.51423454284668, "logits/rejected": -18.534957885742188, "logps/chosen": -387.3406677246094, "logps/rejected": -364.84552001953125, "loss": 0.933, "rewards/accuracies": 0.5, "rewards/chosen": 3.7575008869171143, "rewards/margins": 0.40819233655929565, "rewards/rejected": 3.349308490753174, "step": 93520 }, { "epoch": 4.342355726821115, "grad_norm": 245.75900268554688, "learning_rate": 3.957565346580621e-08, "logits/chosen": -19.232080459594727, "logits/rejected": -17.755334854125977, "logps/chosen": -473.8619079589844, "logps/rejected": -367.6542053222656, "loss": 0.6117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4141037464141846, "rewards/margins": 1.0381337404251099, "rewards/rejected": 2.375969886779785, "step": 93530 }, { "epoch": 4.3428200009285485, "grad_norm": 199.76571655273438, "learning_rate": 3.954779701936023e-08, "logits/chosen": -18.65822982788086, 
"logits/rejected": -19.77959442138672, "logps/chosen": -410.58984375, "logps/rejected": -453.41851806640625, "loss": 1.7938, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0722782611846924, "rewards/margins": -0.25770601630210876, "rewards/rejected": 3.329984188079834, "step": 93540 }, { "epoch": 4.343284275035981, "grad_norm": 4.160250663757324, "learning_rate": 3.9519940572914246e-08, "logits/chosen": -20.857254028320312, "logits/rejected": -19.181150436401367, "logps/chosen": -383.2796325683594, "logps/rejected": -366.591064453125, "loss": 0.6824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.672554969787598, "rewards/margins": 1.5904898643493652, "rewards/rejected": 3.0820651054382324, "step": 93550 }, { "epoch": 4.343748549143414, "grad_norm": 72.81239318847656, "learning_rate": 3.949208412646827e-08, "logits/chosen": -19.18246078491211, "logits/rejected": -18.652217864990234, "logps/chosen": -369.50555419921875, "logps/rejected": -372.2956237792969, "loss": 0.5512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7885334491729736, "rewards/margins": 0.6720008850097656, "rewards/rejected": 2.116532564163208, "step": 93560 }, { "epoch": 4.344212823250848, "grad_norm": 10.046415328979492, "learning_rate": 3.946422768002229e-08, "logits/chosen": -19.3389835357666, "logits/rejected": -17.674915313720703, "logps/chosen": -445.262451171875, "logps/rejected": -257.36712646484375, "loss": 0.5182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.803397178649902, "rewards/margins": 2.831390380859375, "rewards/rejected": 1.9720064401626587, "step": 93570 }, { "epoch": 4.3446770973582804, "grad_norm": 67.98049926757812, "learning_rate": 3.94363712335763e-08, "logits/chosen": -19.717893600463867, "logits/rejected": -18.444721221923828, "logps/chosen": -427.64385986328125, "logps/rejected": -334.07574462890625, "loss": 0.2721, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.625563621520996, 
"rewards/margins": 1.9788639545440674, "rewards/rejected": 2.6466994285583496, "step": 93580 }, { "epoch": 4.345141371465713, "grad_norm": 0.3259718418121338, "learning_rate": 3.9408514787130324e-08, "logits/chosen": -19.482343673706055, "logits/rejected": -18.77909278869629, "logps/chosen": -365.73272705078125, "logps/rejected": -287.3846740722656, "loss": 0.5395, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.302318572998047, "rewards/margins": 1.9377491474151611, "rewards/rejected": 1.3645691871643066, "step": 93590 }, { "epoch": 4.345605645573146, "grad_norm": 1.4725066423416138, "learning_rate": 3.9380658340684345e-08, "logits/chosen": -19.5058536529541, "logits/rejected": -18.70687484741211, "logps/chosen": -449.082763671875, "logps/rejected": -348.75335693359375, "loss": 0.4334, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4846458435058594, "rewards/margins": 1.3665040731430054, "rewards/rejected": 2.1181416511535645, "step": 93600 }, { "epoch": 4.34606991968058, "grad_norm": 52.534423828125, "learning_rate": 3.935280189423836e-08, "logits/chosen": -18.258480072021484, "logits/rejected": -18.40228271484375, "logps/chosen": -327.0027160644531, "logps/rejected": -342.55804443359375, "loss": 0.8277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.567232370376587, "rewards/margins": 0.4105745255947113, "rewards/rejected": 1.1566579341888428, "step": 93610 }, { "epoch": 4.346534193788012, "grad_norm": 105.4270248413086, "learning_rate": 3.9324945447792374e-08, "logits/chosen": -19.16411590576172, "logits/rejected": -19.0861873626709, "logps/chosen": -374.56011962890625, "logps/rejected": -330.23162841796875, "loss": 1.0819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.473106861114502, "rewards/margins": 0.7388461828231812, "rewards/rejected": 2.7342610359191895, "step": 93620 }, { "epoch": 4.346998467895445, "grad_norm": 272.6622009277344, "learning_rate": 3.929708900134639e-08, "logits/chosen": 
-18.955202102661133, "logits/rejected": -17.827865600585938, "logps/chosen": -402.13360595703125, "logps/rejected": -342.1375732421875, "loss": 1.0288, "rewards/accuracies": 0.5, "rewards/chosen": 3.5421435832977295, "rewards/margins": 0.7963235974311829, "rewards/rejected": 2.7458202838897705, "step": 93630 }, { "epoch": 4.347462742002879, "grad_norm": 9.864501953125, "learning_rate": 3.926923255490041e-08, "logits/chosen": -18.268321990966797, "logits/rejected": -17.13492202758789, "logps/chosen": -363.2076721191406, "logps/rejected": -263.7862243652344, "loss": 0.5901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4469540119171143, "rewards/margins": 1.5291557312011719, "rewards/rejected": 1.917798399925232, "step": 93640 }, { "epoch": 4.347927016110312, "grad_norm": 22.70524787902832, "learning_rate": 3.9241376108454424e-08, "logits/chosen": -18.96186065673828, "logits/rejected": -17.619855880737305, "logps/chosen": -291.8979187011719, "logps/rejected": -180.1675262451172, "loss": 0.252, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4343628883361816, "rewards/margins": 2.184436321258545, "rewards/rejected": 0.2499266415834427, "step": 93650 }, { "epoch": 4.348391290217744, "grad_norm": 134.0026397705078, "learning_rate": 3.9213519662008445e-08, "logits/chosen": -18.714298248291016, "logits/rejected": -17.935298919677734, "logps/chosen": -348.6680908203125, "logps/rejected": -322.0126037597656, "loss": 0.5107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9935555458068848, "rewards/margins": 1.632163643836975, "rewards/rejected": 1.3613916635513306, "step": 93660 }, { "epoch": 4.348855564325177, "grad_norm": 0.35278552770614624, "learning_rate": 3.9185663215562466e-08, "logits/chosen": -19.01405906677246, "logits/rejected": -18.333133697509766, "logps/chosen": -427.28619384765625, "logps/rejected": -379.44476318359375, "loss": 0.6921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.665222644805908, 
"rewards/margins": 1.8112056255340576, "rewards/rejected": 1.8540165424346924, "step": 93670 }, { "epoch": 4.349319838432611, "grad_norm": 60.63682556152344, "learning_rate": 3.915780676911648e-08, "logits/chosen": -19.020544052124023, "logits/rejected": -17.72804832458496, "logps/chosen": -411.8506774902344, "logps/rejected": -268.66815185546875, "loss": 0.6114, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5107181072235107, "rewards/margins": 1.6713035106658936, "rewards/rejected": 1.8394148349761963, "step": 93680 }, { "epoch": 4.349784112540044, "grad_norm": 3.1496777534484863, "learning_rate": 3.91299503226705e-08, "logits/chosen": -18.83562469482422, "logits/rejected": -18.535802841186523, "logps/chosen": -361.18060302734375, "logps/rejected": -315.21026611328125, "loss": 0.9987, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9437172412872314, "rewards/margins": 0.9192142486572266, "rewards/rejected": 3.024503231048584, "step": 93690 }, { "epoch": 4.350248386647476, "grad_norm": 65.49259948730469, "learning_rate": 3.910209387622452e-08, "logits/chosen": -20.47683334350586, "logits/rejected": -19.245407104492188, "logps/chosen": -407.95819091796875, "logps/rejected": -428.17340087890625, "loss": 0.9358, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.597595691680908, "rewards/margins": 1.2686434984207153, "rewards/rejected": 3.328951597213745, "step": 93700 }, { "epoch": 4.35071266075491, "grad_norm": 177.92874145507812, "learning_rate": 3.907423742977854e-08, "logits/chosen": -18.753944396972656, "logits/rejected": -18.684728622436523, "logps/chosen": -396.2737121582031, "logps/rejected": -384.68267822265625, "loss": 1.1926, "rewards/accuracies": 0.5, "rewards/chosen": 3.4031014442443848, "rewards/margins": -0.007702446077018976, "rewards/rejected": 3.4108035564422607, "step": 93710 }, { "epoch": 4.351176934862343, "grad_norm": 21.13150405883789, "learning_rate": 3.904638098333256e-08, "logits/chosen": 
-19.148656845092773, "logits/rejected": -18.325937271118164, "logps/chosen": -329.7525329589844, "logps/rejected": -287.1070251464844, "loss": 0.8454, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6911821365356445, "rewards/margins": 1.3691928386688232, "rewards/rejected": 1.3219892978668213, "step": 93720 }, { "epoch": 4.351641208969776, "grad_norm": 265.4091491699219, "learning_rate": 3.901852453688658e-08, "logits/chosen": -19.17336082458496, "logits/rejected": -18.429248809814453, "logps/chosen": -313.6629943847656, "logps/rejected": -368.7101135253906, "loss": 1.4972, "rewards/accuracies": 0.5, "rewards/chosen": 2.8072450160980225, "rewards/margins": 0.20272473990917206, "rewards/rejected": 2.604520320892334, "step": 93730 }, { "epoch": 4.352105483077208, "grad_norm": 45.138427734375, "learning_rate": 3.8990668090440595e-08, "logits/chosen": -19.681577682495117, "logits/rejected": -19.578044891357422, "logps/chosen": -374.23944091796875, "logps/rejected": -379.0431823730469, "loss": 1.786, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.561063289642334, "rewards/margins": -0.36167189478874207, "rewards/rejected": 3.9227356910705566, "step": 93740 }, { "epoch": 4.352569757184642, "grad_norm": 0.00785308051854372, "learning_rate": 3.8962811643994616e-08, "logits/chosen": -18.913827896118164, "logits/rejected": -17.805490493774414, "logps/chosen": -357.16339111328125, "logps/rejected": -210.4229278564453, "loss": 0.405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.171722888946533, "rewards/margins": 2.3814444541931152, "rewards/rejected": 0.7902785539627075, "step": 93750 }, { "epoch": 4.353034031292075, "grad_norm": 191.82632446289062, "learning_rate": 3.893495519754863e-08, "logits/chosen": -18.909576416015625, "logits/rejected": -18.883899688720703, "logps/chosen": -314.0623779296875, "logps/rejected": -344.02862548828125, "loss": 1.0223, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 
2.2174229621887207, "rewards/margins": -0.06647076457738876, "rewards/rejected": 2.283893346786499, "step": 93760 }, { "epoch": 4.3534983053995076, "grad_norm": 20.067808151245117, "learning_rate": 3.890709875110265e-08, "logits/chosen": -17.992589950561523, "logits/rejected": -18.43193244934082, "logps/chosen": -259.478759765625, "logps/rejected": -271.24517822265625, "loss": 0.885, "rewards/accuracies": 0.5, "rewards/chosen": 2.53218674659729, "rewards/margins": 0.7155774831771851, "rewards/rejected": 1.8166090250015259, "step": 93770 }, { "epoch": 4.353962579506941, "grad_norm": 153.5782012939453, "learning_rate": 3.887924230465667e-08, "logits/chosen": -19.621055603027344, "logits/rejected": -19.435104370117188, "logps/chosen": -404.29180908203125, "logps/rejected": -434.03875732421875, "loss": 1.1232, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.10295033454895, "rewards/margins": -0.2453477680683136, "rewards/rejected": 3.3482978343963623, "step": 93780 }, { "epoch": 4.354426853614374, "grad_norm": 0.3798512816429138, "learning_rate": 3.885138585821069e-08, "logits/chosen": -19.789026260375977, "logits/rejected": -19.51633071899414, "logps/chosen": -441.2681579589844, "logps/rejected": -346.4417419433594, "loss": 0.843, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.74658203125, "rewards/margins": 1.873735785484314, "rewards/rejected": 2.8728461265563965, "step": 93790 }, { "epoch": 4.354891127721807, "grad_norm": 78.9933853149414, "learning_rate": 3.882352941176471e-08, "logits/chosen": -18.867084503173828, "logits/rejected": -18.72637939453125, "logps/chosen": -454.89306640625, "logps/rejected": -406.9109191894531, "loss": 1.49, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2571001052856445, "rewards/margins": 0.5205187201499939, "rewards/rejected": 3.736581802368164, "step": 93800 }, { "epoch": 4.3553554018292395, "grad_norm": 99.3546142578125, "learning_rate": 3.879567296531873e-08, "logits/chosen": 
-19.64690589904785, "logits/rejected": -19.63735008239746, "logps/chosen": -369.7012634277344, "logps/rejected": -340.5741882324219, "loss": 0.5175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.210463762283325, "rewards/margins": 1.0322679281234741, "rewards/rejected": 2.1781959533691406, "step": 93810 }, { "epoch": 4.355819675936673, "grad_norm": 22.798797607421875, "learning_rate": 3.876781651887274e-08, "logits/chosen": -19.23939323425293, "logits/rejected": -18.349658966064453, "logps/chosen": -329.795166015625, "logps/rejected": -286.53106689453125, "loss": 0.4509, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7927098274230957, "rewards/margins": 1.3134899139404297, "rewards/rejected": 1.4792201519012451, "step": 93820 }, { "epoch": 4.356283950044106, "grad_norm": 18.77941131591797, "learning_rate": 3.873996007242676e-08, "logits/chosen": -19.48496437072754, "logits/rejected": -17.674009323120117, "logps/chosen": -289.7630615234375, "logps/rejected": -222.490478515625, "loss": 0.2996, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.419757127761841, "rewards/margins": 2.5284385681152344, "rewards/rejected": 0.8913186192512512, "step": 93830 }, { "epoch": 4.356748224151539, "grad_norm": 0.6476232409477234, "learning_rate": 3.871210362598077e-08, "logits/chosen": -20.269485473632812, "logits/rejected": -19.004770278930664, "logps/chosen": -433.08642578125, "logps/rejected": -299.8406677246094, "loss": 0.6471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8674168586730957, "rewards/margins": 1.2879502773284912, "rewards/rejected": 2.5794665813446045, "step": 93840 }, { "epoch": 4.357212498258972, "grad_norm": 0.6248869895935059, "learning_rate": 3.8684247179534794e-08, "logits/chosen": -19.580810546875, "logits/rejected": -18.67746353149414, "logps/chosen": -407.4330749511719, "logps/rejected": -273.6405334472656, "loss": 0.2548, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
5.164114475250244, "rewards/margins": 2.641538381576538, "rewards/rejected": 2.5225765705108643, "step": 93850 }, { "epoch": 4.357676772366405, "grad_norm": 14.857714653015137, "learning_rate": 3.865639073308881e-08, "logits/chosen": -19.601581573486328, "logits/rejected": -19.035816192626953, "logps/chosen": -401.472412109375, "logps/rejected": -377.61383056640625, "loss": 1.2576, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9473743438720703, "rewards/margins": 1.0194077491760254, "rewards/rejected": 2.927966594696045, "step": 93860 }, { "epoch": 4.358141046473838, "grad_norm": 183.29849243164062, "learning_rate": 3.862853428664283e-08, "logits/chosen": -18.751832962036133, "logits/rejected": -18.093639373779297, "logps/chosen": -365.36285400390625, "logps/rejected": -335.5768737792969, "loss": 0.9602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.651390075683594, "rewards/margins": 1.7644132375717163, "rewards/rejected": 2.886976718902588, "step": 93870 }, { "epoch": 4.358605320581272, "grad_norm": 189.0413360595703, "learning_rate": 3.860067784019685e-08, "logits/chosen": -18.744510650634766, "logits/rejected": -18.282039642333984, "logps/chosen": -327.524658203125, "logps/rejected": -302.1238098144531, "loss": 0.9035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.639991044998169, "rewards/margins": 1.1156518459320068, "rewards/rejected": 2.524339199066162, "step": 93880 }, { "epoch": 4.359069594688704, "grad_norm": 216.09999084472656, "learning_rate": 3.8572821393750866e-08, "logits/chosen": -19.316757202148438, "logits/rejected": -18.576358795166016, "logps/chosen": -465.58270263671875, "logps/rejected": -370.4100646972656, "loss": 0.8455, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.246994733810425, "rewards/margins": 0.995139479637146, "rewards/rejected": 2.2518553733825684, "step": 93890 }, { "epoch": 4.359533868796137, "grad_norm": 220.78736877441406, "learning_rate": 3.854496494730489e-08, 
"logits/chosen": -19.727426528930664, "logits/rejected": -19.1416072845459, "logps/chosen": -405.5457458496094, "logps/rejected": -403.24041748046875, "loss": 0.593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.886038303375244, "rewards/margins": 0.9173652529716492, "rewards/rejected": 2.968672752380371, "step": 93900 }, { "epoch": 4.35999814290357, "grad_norm": 0.14074090123176575, "learning_rate": 3.851710850085891e-08, "logits/chosen": -18.762996673583984, "logits/rejected": -17.503849029541016, "logps/chosen": -353.4717712402344, "logps/rejected": -245.9013671875, "loss": 0.3522, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7526373863220215, "rewards/margins": 2.369753122329712, "rewards/rejected": 0.38288408517837524, "step": 93910 }, { "epoch": 4.360462417011004, "grad_norm": 0.5716540813446045, "learning_rate": 3.848925205441292e-08, "logits/chosen": -19.041236877441406, "logits/rejected": -19.021770477294922, "logps/chosen": -264.8153381347656, "logps/rejected": -284.8751525878906, "loss": 0.5488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.744123935699463, "rewards/margins": 1.477723479270935, "rewards/rejected": 1.2664004564285278, "step": 93920 }, { "epoch": 4.360926691118436, "grad_norm": 0.9687908887863159, "learning_rate": 3.8461395607966944e-08, "logits/chosen": -19.476045608520508, "logits/rejected": -18.736042022705078, "logps/chosen": -335.3962707519531, "logps/rejected": -273.1685485839844, "loss": 0.8211, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.975146770477295, "rewards/margins": 1.395082950592041, "rewards/rejected": 1.580063819885254, "step": 93930 }, { "epoch": 4.361390965225869, "grad_norm": 291.9532165527344, "learning_rate": 3.8433539161520965e-08, "logits/chosen": -18.476970672607422, "logits/rejected": -17.385639190673828, "logps/chosen": -360.92523193359375, "logps/rejected": -283.9349670410156, "loss": 0.7544, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 3.000809907913208, "rewards/margins": 1.4298036098480225, "rewards/rejected": 1.5710062980651855, "step": 93940 }, { "epoch": 4.361855239333303, "grad_norm": 14.394256591796875, "learning_rate": 3.840568271507498e-08, "logits/chosen": -18.31278419494629, "logits/rejected": -17.30398178100586, "logps/chosen": -386.4358215332031, "logps/rejected": -205.2532196044922, "loss": 0.4488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.634657144546509, "rewards/margins": 1.9428377151489258, "rewards/rejected": 0.6918190717697144, "step": 93950 }, { "epoch": 4.3623195134407355, "grad_norm": 0.2196153700351715, "learning_rate": 3.8377826268629e-08, "logits/chosen": -19.963911056518555, "logits/rejected": -18.552833557128906, "logps/chosen": -439.53497314453125, "logps/rejected": -268.6426086425781, "loss": 0.152, "rewards/accuracies": 1.0, "rewards/chosen": 4.602930068969727, "rewards/margins": 2.980100154876709, "rewards/rejected": 1.622829794883728, "step": 93960 }, { "epoch": 4.362783787548168, "grad_norm": 29.75092124938965, "learning_rate": 3.8349969822183015e-08, "logits/chosen": -18.78289031982422, "logits/rejected": -17.74417495727539, "logps/chosen": -463.4046325683594, "logps/rejected": -345.33221435546875, "loss": 0.1703, "rewards/accuracies": 1.0, "rewards/chosen": 3.9633612632751465, "rewards/margins": 2.1534476280212402, "rewards/rejected": 1.8099136352539062, "step": 93970 }, { "epoch": 4.363248061655601, "grad_norm": 4.224534034729004, "learning_rate": 3.8322113375737036e-08, "logits/chosen": -18.112699508666992, "logits/rejected": -16.765865325927734, "logps/chosen": -432.25018310546875, "logps/rejected": -286.17626953125, "loss": 0.3803, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7531609535217285, "rewards/margins": 2.1141343116760254, "rewards/rejected": 1.639026403427124, "step": 93980 }, { "epoch": 4.363712335763035, "grad_norm": 112.85112762451172, "learning_rate": 3.829425692929106e-08, "logits/chosen": 
-19.386457443237305, "logits/rejected": -18.71712303161621, "logps/chosen": -487.23193359375, "logps/rejected": -367.86102294921875, "loss": 0.4568, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.449563026428223, "rewards/margins": 1.3567695617675781, "rewards/rejected": 3.0927939414978027, "step": 93990 }, { "epoch": 4.3641766098704675, "grad_norm": 2.473518133163452, "learning_rate": 3.826640048284507e-08, "logits/chosen": -20.372196197509766, "logits/rejected": -18.926576614379883, "logps/chosen": -447.2611389160156, "logps/rejected": -345.2168273925781, "loss": 0.9119, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.240259647369385, "rewards/margins": 1.0455172061920166, "rewards/rejected": 3.1947426795959473, "step": 94000 }, { "epoch": 4.3646408839779, "grad_norm": 113.94739532470703, "learning_rate": 3.823854403639909e-08, "logits/chosen": -17.877473831176758, "logits/rejected": -17.880367279052734, "logps/chosen": -363.86468505859375, "logps/rejected": -305.25518798828125, "loss": 1.2922, "rewards/accuracies": 0.5, "rewards/chosen": 2.065295696258545, "rewards/margins": 0.12203431129455566, "rewards/rejected": 1.9432613849639893, "step": 94010 }, { "epoch": 4.365105158085334, "grad_norm": 3.030116558074951, "learning_rate": 3.82106875899531e-08, "logits/chosen": -18.460693359375, "logits/rejected": -17.74661636352539, "logps/chosen": -418.42041015625, "logps/rejected": -352.9703674316406, "loss": 0.7429, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.073810577392578, "rewards/margins": 1.4009376764297485, "rewards/rejected": 2.672873020172119, "step": 94020 }, { "epoch": 4.365569432192767, "grad_norm": 38.066162109375, "learning_rate": 3.818283114350712e-08, "logits/chosen": -18.063772201538086, "logits/rejected": -17.862916946411133, "logps/chosen": -312.65484619140625, "logps/rejected": -296.77154541015625, "loss": 0.8609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.551325559616089, 
"rewards/margins": 0.3961810767650604, "rewards/rejected": 2.155144453048706, "step": 94030 }, { "epoch": 4.3660337063001995, "grad_norm": 11.6554594039917, "learning_rate": 3.815497469706114e-08, "logits/chosen": -19.70492935180664, "logits/rejected": -20.000484466552734, "logps/chosen": -406.906005859375, "logps/rejected": -427.041748046875, "loss": 1.3011, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.513087749481201, "rewards/margins": -0.4999333322048187, "rewards/rejected": 4.013020992279053, "step": 94040 }, { "epoch": 4.366497980407632, "grad_norm": 49.21812438964844, "learning_rate": 3.812711825061516e-08, "logits/chosen": -19.283926010131836, "logits/rejected": -18.61847686767578, "logps/chosen": -403.3573303222656, "logps/rejected": -303.3965759277344, "loss": 0.3663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0054216384887695, "rewards/margins": 1.8152259588241577, "rewards/rejected": 2.1901955604553223, "step": 94050 }, { "epoch": 4.366962254515066, "grad_norm": 70.43175506591797, "learning_rate": 3.809926180416918e-08, "logits/chosen": -19.45162582397461, "logits/rejected": -19.207317352294922, "logps/chosen": -372.38665771484375, "logps/rejected": -377.151611328125, "loss": 0.8804, "rewards/accuracies": 0.5, "rewards/chosen": 3.2176406383514404, "rewards/margins": 0.3153480887413025, "rewards/rejected": 2.902292251586914, "step": 94060 }, { "epoch": 4.367426528622499, "grad_norm": 141.41908264160156, "learning_rate": 3.8071405357723193e-08, "logits/chosen": -18.720544815063477, "logits/rejected": -18.878917694091797, "logps/chosen": -405.63079833984375, "logps/rejected": -446.64630126953125, "loss": 0.9469, "rewards/accuracies": 0.5, "rewards/chosen": 3.7228775024414062, "rewards/margins": 0.4835367202758789, "rewards/rejected": 3.2393405437469482, "step": 94070 }, { "epoch": 4.3678908027299315, "grad_norm": 80.11763000488281, "learning_rate": 3.8043548911277215e-08, "logits/chosen": -19.020925521850586, 
"logits/rejected": -18.22700309753418, "logps/chosen": -349.3316345214844, "logps/rejected": -252.5121307373047, "loss": 0.3744, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5504677295684814, "rewards/margins": 2.0788159370422363, "rewards/rejected": 1.471651554107666, "step": 94080 }, { "epoch": 4.368355076837365, "grad_norm": 54.48307418823242, "learning_rate": 3.8015692464831236e-08, "logits/chosen": -18.730430603027344, "logits/rejected": -18.654848098754883, "logps/chosen": -338.3695068359375, "logps/rejected": -390.38226318359375, "loss": 1.0384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.151258945465088, "rewards/margins": 0.18995288014411926, "rewards/rejected": 2.961306095123291, "step": 94090 }, { "epoch": 4.368819350944798, "grad_norm": 102.5841293334961, "learning_rate": 3.798783601838525e-08, "logits/chosen": -19.233142852783203, "logits/rejected": -18.651586532592773, "logps/chosen": -374.79888916015625, "logps/rejected": -307.3159484863281, "loss": 0.3817, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7006351947784424, "rewards/margins": 1.3724948167800903, "rewards/rejected": 2.3281400203704834, "step": 94100 }, { "epoch": 4.369283625052231, "grad_norm": 69.49632263183594, "learning_rate": 3.795997957193927e-08, "logits/chosen": -19.697053909301758, "logits/rejected": -17.699735641479492, "logps/chosen": -465.89678955078125, "logps/rejected": -261.698974609375, "loss": 0.4358, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3091845512390137, "rewards/margins": 2.060103178024292, "rewards/rejected": 1.2490811347961426, "step": 94110 }, { "epoch": 4.369747899159664, "grad_norm": 1.8459457159042358, "learning_rate": 3.793212312549329e-08, "logits/chosen": -19.134658813476562, "logits/rejected": -17.300241470336914, "logps/chosen": -541.7447509765625, "logps/rejected": -347.0330810546875, "loss": 0.2947, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.019985675811768, 
"rewards/margins": 2.5506820678710938, "rewards/rejected": 1.469304084777832, "step": 94120 }, { "epoch": 4.370212173267097, "grad_norm": 50.992122650146484, "learning_rate": 3.790426667904731e-08, "logits/chosen": -18.797847747802734, "logits/rejected": -18.69145393371582, "logps/chosen": -313.459716796875, "logps/rejected": -303.1427917480469, "loss": 0.7311, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5728602409362793, "rewards/margins": 0.41787680983543396, "rewards/rejected": 2.1549830436706543, "step": 94130 }, { "epoch": 4.37067644737453, "grad_norm": 105.50023651123047, "learning_rate": 3.787641023260133e-08, "logits/chosen": -20.198171615600586, "logits/rejected": -18.80739974975586, "logps/chosen": -525.4472045898438, "logps/rejected": -379.17901611328125, "loss": 0.2373, "rewards/accuracies": 1.0, "rewards/chosen": 4.677533149719238, "rewards/margins": 2.1479992866516113, "rewards/rejected": 2.529533863067627, "step": 94140 }, { "epoch": 4.371140721481963, "grad_norm": 10.828503608703613, "learning_rate": 3.784855378615535e-08, "logits/chosen": -18.692611694335938, "logits/rejected": -17.839740753173828, "logps/chosen": -412.34661865234375, "logps/rejected": -313.5140686035156, "loss": 0.6138, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.153227806091309, "rewards/margins": 1.2606195211410522, "rewards/rejected": 2.8926074504852295, "step": 94150 }, { "epoch": 4.371604995589396, "grad_norm": 15.631832122802734, "learning_rate": 3.7820697339709364e-08, "logits/chosen": -18.857666015625, "logits/rejected": -18.60861587524414, "logps/chosen": -442.6962890625, "logps/rejected": -364.88055419921875, "loss": 0.7666, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.9228978157043457, "rewards/margins": 0.3925507962703705, "rewards/rejected": 3.530346632003784, "step": 94160 }, { "epoch": 4.372069269696829, "grad_norm": 67.79678344726562, "learning_rate": 3.7792840893263385e-08, "logits/chosen": -19.13471221923828, 
"logits/rejected": -18.746395111083984, "logps/chosen": -329.3741455078125, "logps/rejected": -339.6960754394531, "loss": 0.8568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.218012571334839, "rewards/margins": 0.6998152732849121, "rewards/rejected": 2.5181972980499268, "step": 94170 }, { "epoch": 4.372533543804262, "grad_norm": 57.57717514038086, "learning_rate": 3.77649844468174e-08, "logits/chosen": -18.657135009765625, "logits/rejected": -17.876855850219727, "logps/chosen": -407.3825988769531, "logps/rejected": -362.2134704589844, "loss": 0.4469, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.777388095855713, "rewards/margins": 2.111680507659912, "rewards/rejected": 1.6657072305679321, "step": 94180 }, { "epoch": 4.3729978179116955, "grad_norm": 14.574457168579102, "learning_rate": 3.773712800037142e-08, "logits/chosen": -18.653545379638672, "logits/rejected": -18.29129981994629, "logps/chosen": -448.0801696777344, "logps/rejected": -459.41168212890625, "loss": 1.1618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.063551902770996, "rewards/margins": 0.32214611768722534, "rewards/rejected": 3.741405487060547, "step": 94190 }, { "epoch": 4.373462092019128, "grad_norm": 69.15067291259766, "learning_rate": 3.770927155392544e-08, "logits/chosen": -18.186447143554688, "logits/rejected": -18.557750701904297, "logps/chosen": -450.55303955078125, "logps/rejected": -405.4232482910156, "loss": 0.7383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.117331027984619, "rewards/margins": 0.5737824440002441, "rewards/rejected": 2.543548583984375, "step": 94200 }, { "epoch": 4.373926366126561, "grad_norm": 44.492706298828125, "learning_rate": 3.768141510747945e-08, "logits/chosen": -20.090383529663086, "logits/rejected": -19.632137298583984, "logps/chosen": -464.54693603515625, "logps/rejected": -368.51727294921875, "loss": 0.7106, "rewards/accuracies": 0.5, "rewards/chosen": 4.5457024574279785, "rewards/margins": 
0.6901313662528992, "rewards/rejected": 3.8555705547332764, "step": 94210 }, { "epoch": 4.374390640233994, "grad_norm": 3.1864984035491943, "learning_rate": 3.765355866103347e-08, "logits/chosen": -19.15153694152832, "logits/rejected": -18.005077362060547, "logps/chosen": -470.1858825683594, "logps/rejected": -334.814208984375, "loss": 0.2084, "rewards/accuracies": 1.0, "rewards/chosen": 4.608916282653809, "rewards/margins": 2.242687702178955, "rewards/rejected": 2.3662283420562744, "step": 94220 }, { "epoch": 4.3748549143414275, "grad_norm": 28.896385192871094, "learning_rate": 3.7625702214587486e-08, "logits/chosen": -20.335712432861328, "logits/rejected": -20.281431198120117, "logps/chosen": -338.5697937011719, "logps/rejected": -354.15533447265625, "loss": 1.2059, "rewards/accuracies": 0.5, "rewards/chosen": 2.021470546722412, "rewards/margins": -0.332899808883667, "rewards/rejected": 2.354370355606079, "step": 94230 }, { "epoch": 4.37531918844886, "grad_norm": 279.7430114746094, "learning_rate": 3.759784576814151e-08, "logits/chosen": -18.710433959960938, "logits/rejected": -18.193233489990234, "logps/chosen": -474.2413635253906, "logps/rejected": -446.8536682128906, "loss": 1.5653, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2439932823181152, "rewards/margins": -0.3780879080295563, "rewards/rejected": 3.6220810413360596, "step": 94240 }, { "epoch": 4.375783462556293, "grad_norm": 50.22288513183594, "learning_rate": 3.756998932169553e-08, "logits/chosen": -19.18623924255371, "logits/rejected": -19.14003562927246, "logps/chosen": -381.35333251953125, "logps/rejected": -371.8587951660156, "loss": 0.8178, "rewards/accuracies": 0.5, "rewards/chosen": 3.4301669597625732, "rewards/margins": 0.39275139570236206, "rewards/rejected": 3.0374155044555664, "step": 94250 }, { "epoch": 4.376247736663727, "grad_norm": 78.35420989990234, "learning_rate": 3.754213287524954e-08, "logits/chosen": -18.954303741455078, "logits/rejected": -17.9962158203125, 
"logps/chosen": -502.654052734375, "logps/rejected": -377.6263732910156, "loss": 0.6051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.245880603790283, "rewards/margins": 1.777019739151001, "rewards/rejected": 2.468860626220703, "step": 94260 }, { "epoch": 4.3767120107711595, "grad_norm": 45.304542541503906, "learning_rate": 3.7514276428803564e-08, "logits/chosen": -19.08712387084961, "logits/rejected": -18.840435028076172, "logps/chosen": -390.28118896484375, "logps/rejected": -340.5783386230469, "loss": 0.6286, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.352919101715088, "rewards/margins": 0.825566291809082, "rewards/rejected": 2.5273525714874268, "step": 94270 }, { "epoch": 4.377176284878592, "grad_norm": 168.82901000976562, "learning_rate": 3.748641998235758e-08, "logits/chosen": -19.815656661987305, "logits/rejected": -19.557741165161133, "logps/chosen": -403.3443298339844, "logps/rejected": -454.4815368652344, "loss": 1.0351, "rewards/accuracies": 0.5, "rewards/chosen": 2.525477170944214, "rewards/margins": -0.1025906577706337, "rewards/rejected": 2.6280677318573, "step": 94280 }, { "epoch": 4.377640558986025, "grad_norm": 48.527915954589844, "learning_rate": 3.74585635359116e-08, "logits/chosen": -18.54837417602539, "logits/rejected": -18.12948989868164, "logps/chosen": -284.148193359375, "logps/rejected": -257.05133056640625, "loss": 0.6302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6546671390533447, "rewards/margins": 0.8393367528915405, "rewards/rejected": 0.8153305053710938, "step": 94290 }, { "epoch": 4.378104833093459, "grad_norm": 21.555259704589844, "learning_rate": 3.743070708946562e-08, "logits/chosen": -19.863880157470703, "logits/rejected": -19.179479598999023, "logps/chosen": -605.3931884765625, "logps/rejected": -396.1768493652344, "loss": 0.4753, "rewards/accuracies": 0.5, "rewards/chosen": 3.6126112937927246, "rewards/margins": 0.9608208537101746, "rewards/rejected": 2.6517910957336426, 
"step": 94300 }, { "epoch": 4.378569107200891, "grad_norm": 208.4444580078125, "learning_rate": 3.7402850643019635e-08, "logits/chosen": -17.957767486572266, "logits/rejected": -18.285991668701172, "logps/chosen": -353.72161865234375, "logps/rejected": -361.3636779785156, "loss": 0.8577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.577789783477783, "rewards/margins": 0.4480384290218353, "rewards/rejected": 3.1297507286071777, "step": 94310 }, { "epoch": 4.379033381308324, "grad_norm": 44.53742980957031, "learning_rate": 3.7374994196573656e-08, "logits/chosen": -19.497861862182617, "logits/rejected": -18.834793090820312, "logps/chosen": -398.95660400390625, "logps/rejected": -343.8623962402344, "loss": 0.6445, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9723541736602783, "rewards/margins": 0.9491381645202637, "rewards/rejected": 2.0232160091400146, "step": 94320 }, { "epoch": 4.379497655415758, "grad_norm": 55.488128662109375, "learning_rate": 3.734713775012768e-08, "logits/chosen": -19.365825653076172, "logits/rejected": -18.239011764526367, "logps/chosen": -428.03912353515625, "logps/rejected": -367.3392333984375, "loss": 0.7384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.10764741897583, "rewards/margins": 1.4345531463623047, "rewards/rejected": 2.6730945110321045, "step": 94330 }, { "epoch": 4.379961929523191, "grad_norm": 86.18536376953125, "learning_rate": 3.731928130368169e-08, "logits/chosen": -19.17433738708496, "logits/rejected": -18.09105110168457, "logps/chosen": -364.1671142578125, "logps/rejected": -287.61798095703125, "loss": 0.4557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7754154205322266, "rewards/margins": 1.6113684177398682, "rewards/rejected": 1.164047122001648, "step": 94340 }, { "epoch": 4.380426203630623, "grad_norm": 17.75589370727539, "learning_rate": 3.729142485723571e-08, "logits/chosen": -19.85486602783203, "logits/rejected": -19.21975326538086, "logps/chosen": 
-420.7103576660156, "logps/rejected": -283.5146179199219, "loss": 0.6716, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.756251096725464, "rewards/margins": 1.3225021362304688, "rewards/rejected": 2.433748722076416, "step": 94350 }, { "epoch": 4.380890477738056, "grad_norm": 71.80494689941406, "learning_rate": 3.726356841078973e-08, "logits/chosen": -19.483013153076172, "logits/rejected": -18.98415184020996, "logps/chosen": -315.30291748046875, "logps/rejected": -330.4580993652344, "loss": 1.0265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.797959804534912, "rewards/margins": 0.4153912663459778, "rewards/rejected": 2.382568597793579, "step": 94360 }, { "epoch": 4.38135475184549, "grad_norm": 45.696109771728516, "learning_rate": 3.723571196434375e-08, "logits/chosen": -19.240596771240234, "logits/rejected": -17.376253128051758, "logps/chosen": -439.5279235839844, "logps/rejected": -288.1811828613281, "loss": 0.5551, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.991328239440918, "rewards/margins": 1.640676736831665, "rewards/rejected": 1.350651502609253, "step": 94370 }, { "epoch": 4.381819025952923, "grad_norm": 72.66597747802734, "learning_rate": 3.720785551789776e-08, "logits/chosen": -19.624919891357422, "logits/rejected": -19.052413940429688, "logps/chosen": -403.04827880859375, "logps/rejected": -361.00689697265625, "loss": 0.6367, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.20016622543335, "rewards/margins": 1.9151012897491455, "rewards/rejected": 3.2850654125213623, "step": 94380 }, { "epoch": 4.382283300060355, "grad_norm": 33.36819076538086, "learning_rate": 3.7179999071451784e-08, "logits/chosen": -18.89066505432129, "logits/rejected": -19.157791137695312, "logps/chosen": -353.0538024902344, "logps/rejected": -349.0454406738281, "loss": 0.764, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4151203632354736, "rewards/margins": 0.2688702344894409, "rewards/rejected": 
2.146250009536743, "step": 94390 }, { "epoch": 4.382747574167789, "grad_norm": 148.5886688232422, "learning_rate": 3.71521426250058e-08, "logits/chosen": -19.172771453857422, "logits/rejected": -18.745277404785156, "logps/chosen": -421.884033203125, "logps/rejected": -352.211669921875, "loss": 0.5672, "rewards/accuracies": 0.5, "rewards/chosen": 4.176149368286133, "rewards/margins": 1.3433876037597656, "rewards/rejected": 2.832761764526367, "step": 94400 }, { "epoch": 4.383211848275222, "grad_norm": 153.6929168701172, "learning_rate": 3.712428617855982e-08, "logits/chosen": -18.86972427368164, "logits/rejected": -19.01679801940918, "logps/chosen": -422.2953186035156, "logps/rejected": -497.07244873046875, "loss": 1.276, "rewards/accuracies": 0.5, "rewards/chosen": 4.48037052154541, "rewards/margins": -0.4042055606842041, "rewards/rejected": 4.884575843811035, "step": 94410 }, { "epoch": 4.383676122382655, "grad_norm": 26.83354949951172, "learning_rate": 3.709642973211384e-08, "logits/chosen": -19.40564727783203, "logits/rejected": -18.200641632080078, "logps/chosen": -413.41156005859375, "logps/rejected": -294.224609375, "loss": 0.4566, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.332586765289307, "rewards/margins": 1.616829514503479, "rewards/rejected": 2.715757131576538, "step": 94420 }, { "epoch": 4.384140396490087, "grad_norm": 236.3677978515625, "learning_rate": 3.7068573285667856e-08, "logits/chosen": -20.07630729675293, "logits/rejected": -20.25106430053711, "logps/chosen": -338.4105529785156, "logps/rejected": -352.0157165527344, "loss": 0.82, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.270871639251709, "rewards/margins": 0.5127229690551758, "rewards/rejected": 1.7581485509872437, "step": 94430 }, { "epoch": 4.384604670597521, "grad_norm": 19.925336837768555, "learning_rate": 3.704071683922188e-08, "logits/chosen": -19.76080322265625, "logits/rejected": -19.56113052368164, "logps/chosen": -365.8497009277344, 
"logps/rejected": -310.8029479980469, "loss": 0.3203, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7869839668273926, "rewards/margins": 1.7081886529922485, "rewards/rejected": 2.0787954330444336, "step": 94440 }, { "epoch": 4.385068944704954, "grad_norm": 84.60625457763672, "learning_rate": 3.70128603927759e-08, "logits/chosen": -20.136398315429688, "logits/rejected": -19.818639755249023, "logps/chosen": -380.4176330566406, "logps/rejected": -360.37091064453125, "loss": 0.5836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.136606454849243, "rewards/margins": 0.8030107617378235, "rewards/rejected": 2.3335952758789062, "step": 94450 }, { "epoch": 4.385533218812387, "grad_norm": 17.47260284423828, "learning_rate": 3.698500394632991e-08, "logits/chosen": -19.76304054260254, "logits/rejected": -18.80026626586914, "logps/chosen": -404.0530090332031, "logps/rejected": -386.85333251953125, "loss": 0.3306, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6789939403533936, "rewards/margins": 2.2824645042419434, "rewards/rejected": 1.3965295553207397, "step": 94460 }, { "epoch": 4.38599749291982, "grad_norm": 5.758235931396484, "learning_rate": 3.695714749988393e-08, "logits/chosen": -18.739744186401367, "logits/rejected": -18.114238739013672, "logps/chosen": -402.31719970703125, "logps/rejected": -304.27557373046875, "loss": 0.4758, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2802672386169434, "rewards/margins": 1.5374971628189087, "rewards/rejected": 1.7427701950073242, "step": 94470 }, { "epoch": 4.386461767027253, "grad_norm": 66.1581039428711, "learning_rate": 3.692929105343795e-08, "logits/chosen": -20.280254364013672, "logits/rejected": -18.955501556396484, "logps/chosen": -412.6376953125, "logps/rejected": -291.8449401855469, "loss": 0.5228, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.386432647705078, "rewards/margins": 1.4525556564331055, "rewards/rejected": 1.9338769912719727, "step": 
94480 }, { "epoch": 4.386926041134686, "grad_norm": 0.7977664470672607, "learning_rate": 3.690143460699196e-08, "logits/chosen": -18.611610412597656, "logits/rejected": -18.279157638549805, "logps/chosen": -271.88360595703125, "logps/rejected": -270.10162353515625, "loss": 0.876, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8777706623077393, "rewards/margins": 0.8585359454154968, "rewards/rejected": 1.0192346572875977, "step": 94490 }, { "epoch": 4.3873903152421185, "grad_norm": 3.281012773513794, "learning_rate": 3.6873578160545984e-08, "logits/chosen": -19.28704071044922, "logits/rejected": -19.59601402282715, "logps/chosen": -335.2198181152344, "logps/rejected": -372.92291259765625, "loss": 1.1944, "rewards/accuracies": 0.5, "rewards/chosen": 3.2401375770568848, "rewards/margins": 0.18384496867656708, "rewards/rejected": 3.0562925338745117, "step": 94500 }, { "epoch": 4.387854589349552, "grad_norm": 0.009782828390598297, "learning_rate": 3.6845721714100005e-08, "logits/chosen": -19.515727996826172, "logits/rejected": -19.09036636352539, "logps/chosen": -304.2379455566406, "logps/rejected": -265.98724365234375, "loss": 0.6353, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.776066303253174, "rewards/margins": 1.9138208627700806, "rewards/rejected": 1.862244963645935, "step": 94510 }, { "epoch": 4.388318863456985, "grad_norm": 16.99466896057129, "learning_rate": 3.681786526765402e-08, "logits/chosen": -18.036401748657227, "logits/rejected": -17.508869171142578, "logps/chosen": -431.13787841796875, "logps/rejected": -328.2846984863281, "loss": 0.3907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.084059953689575, "rewards/margins": 1.6385494470596313, "rewards/rejected": 1.4455101490020752, "step": 94520 }, { "epoch": 4.388783137564418, "grad_norm": 18.890422821044922, "learning_rate": 3.679000882120804e-08, "logits/chosen": -18.310848236083984, "logits/rejected": -17.31966209411621, "logps/chosen": -352.5271301269531, 
"logps/rejected": -241.74545288085938, "loss": 0.33, "rewards/accuracies": 1.0, "rewards/chosen": 2.3963799476623535, "rewards/margins": 1.436143398284912, "rewards/rejected": 0.9602367281913757, "step": 94530 }, { "epoch": 4.389247411671851, "grad_norm": 63.83887481689453, "learning_rate": 3.676215237476206e-08, "logits/chosen": -19.999723434448242, "logits/rejected": -18.95218849182129, "logps/chosen": -528.5086669921875, "logps/rejected": -420.5926818847656, "loss": 0.4961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.397181510925293, "rewards/margins": 1.440962791442871, "rewards/rejected": 2.956218719482422, "step": 94540 }, { "epoch": 4.389711685779284, "grad_norm": 6.030178546905518, "learning_rate": 3.6734295928316076e-08, "logits/chosen": -19.041461944580078, "logits/rejected": -18.15960693359375, "logps/chosen": -444.09259033203125, "logps/rejected": -338.37322998046875, "loss": 0.4544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.4789934158325195, "rewards/margins": 2.3620502948760986, "rewards/rejected": 2.116943120956421, "step": 94550 }, { "epoch": 4.390175959886717, "grad_norm": 32.55667495727539, "learning_rate": 3.670643948187009e-08, "logits/chosen": -19.68804168701172, "logits/rejected": -19.27493667602539, "logps/chosen": -395.70220947265625, "logps/rejected": -357.7657165527344, "loss": 0.3337, "rewards/accuracies": 1.0, "rewards/chosen": 3.424368381500244, "rewards/margins": 1.1253795623779297, "rewards/rejected": 2.2989888191223145, "step": 94560 }, { "epoch": 4.39064023399415, "grad_norm": 40.96746063232422, "learning_rate": 3.667858303542411e-08, "logits/chosen": -19.531381607055664, "logits/rejected": -19.03415870666504, "logps/chosen": -370.6225891113281, "logps/rejected": -299.7049255371094, "loss": 0.6583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5150394439697266, "rewards/margins": 1.4730284214019775, "rewards/rejected": 2.0420117378234863, "step": 94570 }, { "epoch": 
4.391104508101583, "grad_norm": 43.22657012939453, "learning_rate": 3.665072658897813e-08, "logits/chosen": -20.432369232177734, "logits/rejected": -19.211591720581055, "logps/chosen": -337.9459533691406, "logps/rejected": -209.06613159179688, "loss": 0.2105, "rewards/accuracies": 1.0, "rewards/chosen": 3.4658331871032715, "rewards/margins": 3.368227005004883, "rewards/rejected": 0.09760607779026031, "step": 94580 }, { "epoch": 4.391568782209016, "grad_norm": 34.138118743896484, "learning_rate": 3.662287014253215e-08, "logits/chosen": -19.365253448486328, "logits/rejected": -18.759052276611328, "logps/chosen": -341.39337158203125, "logps/rejected": -288.2878723144531, "loss": 0.4659, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5473151206970215, "rewards/margins": 1.462260127067566, "rewards/rejected": 1.0850549936294556, "step": 94590 }, { "epoch": 4.392033056316449, "grad_norm": 41.86968231201172, "learning_rate": 3.659501369608617e-08, "logits/chosen": -19.21097183227539, "logits/rejected": -18.863494873046875, "logps/chosen": -431.39349365234375, "logps/rejected": -410.3902893066406, "loss": 0.9624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8663551807403564, "rewards/margins": 0.08697297424077988, "rewards/rejected": 2.7793822288513184, "step": 94600 }, { "epoch": 4.392497330423883, "grad_norm": 4.7219133377075195, "learning_rate": 3.6567157249640184e-08, "logits/chosen": -20.641857147216797, "logits/rejected": -19.443012237548828, "logps/chosen": -299.5135498046875, "logps/rejected": -262.2961120605469, "loss": 0.5518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4042141437530518, "rewards/margins": 1.2211074829101562, "rewards/rejected": 2.1831064224243164, "step": 94610 }, { "epoch": 4.392961604531315, "grad_norm": 17.398326873779297, "learning_rate": 3.6539300803194205e-08, "logits/chosen": -19.91984748840332, "logits/rejected": -19.079952239990234, "logps/chosen": -367.4381408691406, "logps/rejected": 
-294.9064636230469, "loss": 0.5243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6586029529571533, "rewards/margins": 1.2841778993606567, "rewards/rejected": 2.374425172805786, "step": 94620 }, { "epoch": 4.393425878638748, "grad_norm": 190.66180419921875, "learning_rate": 3.6511444356748226e-08, "logits/chosen": -19.058149337768555, "logits/rejected": -18.314498901367188, "logps/chosen": -393.65997314453125, "logps/rejected": -370.1238098144531, "loss": 0.8395, "rewards/accuracies": 0.5, "rewards/chosen": 3.2132999897003174, "rewards/margins": 0.2526678442955017, "rewards/rejected": 2.960631847381592, "step": 94630 }, { "epoch": 4.393890152746181, "grad_norm": 52.328636169433594, "learning_rate": 3.648358791030224e-08, "logits/chosen": -19.440433502197266, "logits/rejected": -19.443740844726562, "logps/chosen": -449.77166748046875, "logps/rejected": -414.37921142578125, "loss": 1.27, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4803600311279297, "rewards/margins": -0.016524648293852806, "rewards/rejected": 3.4968841075897217, "step": 94640 }, { "epoch": 4.3943544268536145, "grad_norm": 65.34466552734375, "learning_rate": 3.645573146385626e-08, "logits/chosen": -18.587528228759766, "logits/rejected": -17.953990936279297, "logps/chosen": -518.694580078125, "logps/rejected": -414.4932556152344, "loss": 0.5233, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1949868202209473, "rewards/margins": 1.8724472522735596, "rewards/rejected": 1.3225393295288086, "step": 94650 }, { "epoch": 4.394818700961047, "grad_norm": 20.180166244506836, "learning_rate": 3.6427875017410276e-08, "logits/chosen": -19.055566787719727, "logits/rejected": -17.867334365844727, "logps/chosen": -369.97357177734375, "logps/rejected": -249.9462127685547, "loss": 0.3296, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.650484561920166, "rewards/margins": 1.6264407634735107, "rewards/rejected": 1.0240434408187866, "step": 94660 }, { "epoch": 
4.39528297506848, "grad_norm": 190.01803588867188, "learning_rate": 3.64000185709643e-08, "logits/chosen": -18.62137794494629, "logits/rejected": -18.881954193115234, "logps/chosen": -345.0956115722656, "logps/rejected": -422.58203125, "loss": 1.2117, "rewards/accuracies": 0.5, "rewards/chosen": 3.656240463256836, "rewards/margins": 0.5561732649803162, "rewards/rejected": 3.100067615509033, "step": 94670 }, { "epoch": 4.395747249175914, "grad_norm": 2.4926202297210693, "learning_rate": 3.637216212451831e-08, "logits/chosen": -19.695390701293945, "logits/rejected": -19.018171310424805, "logps/chosen": -559.4385986328125, "logps/rejected": -400.09796142578125, "loss": 0.7687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.389222621917725, "rewards/margins": 1.0133442878723145, "rewards/rejected": 4.37587833404541, "step": 94680 }, { "epoch": 4.3962115232833465, "grad_norm": 10.546183586120605, "learning_rate": 3.634430567807233e-08, "logits/chosen": -19.775203704833984, "logits/rejected": -18.970144271850586, "logps/chosen": -386.26837158203125, "logps/rejected": -272.97723388671875, "loss": 0.3939, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.514998435974121, "rewards/margins": 2.078449249267578, "rewards/rejected": 2.436549663543701, "step": 94690 }, { "epoch": 4.396675797390779, "grad_norm": 170.01339721679688, "learning_rate": 3.631644923162635e-08, "logits/chosen": -19.625675201416016, "logits/rejected": -18.347658157348633, "logps/chosen": -369.6200866699219, "logps/rejected": -352.91436767578125, "loss": 0.7303, "rewards/accuracies": 0.5, "rewards/chosen": 3.457529067993164, "rewards/margins": 1.270939588546753, "rewards/rejected": 2.186589479446411, "step": 94700 }, { "epoch": 4.397140071498213, "grad_norm": 154.94485473632812, "learning_rate": 3.628859278518037e-08, "logits/chosen": -19.650712966918945, "logits/rejected": -18.017913818359375, "logps/chosen": -398.89483642578125, "logps/rejected": -283.269775390625, "loss": 
0.3217, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.171840190887451, "rewards/margins": 2.4932608604431152, "rewards/rejected": 1.678579330444336, "step": 94710 }, { "epoch": 4.397604345605646, "grad_norm": 0.903777003288269, "learning_rate": 3.626073633873439e-08, "logits/chosen": -18.81048011779785, "logits/rejected": -18.170276641845703, "logps/chosen": -412.43756103515625, "logps/rejected": -320.63043212890625, "loss": 0.5589, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8617584705352783, "rewards/margins": 2.3159079551696777, "rewards/rejected": 1.5458507537841797, "step": 94720 }, { "epoch": 4.3980686197130785, "grad_norm": 173.87062072753906, "learning_rate": 3.6232879892288404e-08, "logits/chosen": -19.720598220825195, "logits/rejected": -18.73689842224121, "logps/chosen": -436.2776794433594, "logps/rejected": -413.81396484375, "loss": 0.4767, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.05410099029541, "rewards/margins": 1.434319257736206, "rewards/rejected": 2.6197822093963623, "step": 94730 }, { "epoch": 4.398532893820511, "grad_norm": 51.258750915527344, "learning_rate": 3.6205023445842425e-08, "logits/chosen": -19.908771514892578, "logits/rejected": -17.92078399658203, "logps/chosen": -421.62091064453125, "logps/rejected": -340.52032470703125, "loss": 0.4484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2227864265441895, "rewards/margins": 2.765723466873169, "rewards/rejected": 1.4570629596710205, "step": 94740 }, { "epoch": 4.398997167927945, "grad_norm": 4.1742401123046875, "learning_rate": 3.6177166999396447e-08, "logits/chosen": -20.14558982849121, "logits/rejected": -18.79615020751953, "logps/chosen": -467.98077392578125, "logps/rejected": -273.77691650390625, "loss": 0.2079, "rewards/accuracies": 1.0, "rewards/chosen": 4.842334270477295, "rewards/margins": 2.0230424404144287, "rewards/rejected": 2.8192920684814453, "step": 94750 }, { "epoch": 4.399461442035378, "grad_norm": 
69.0228042602539, "learning_rate": 3.614931055295046e-08, "logits/chosen": -19.220125198364258, "logits/rejected": -18.296764373779297, "logps/chosen": -372.6016540527344, "logps/rejected": -311.2359313964844, "loss": 0.6796, "rewards/accuracies": 0.5, "rewards/chosen": 4.038972854614258, "rewards/margins": 1.3393523693084717, "rewards/rejected": 2.6996207237243652, "step": 94760 }, { "epoch": 4.3999257161428105, "grad_norm": 0.008705373853445053, "learning_rate": 3.6121454106504476e-08, "logits/chosen": -19.133329391479492, "logits/rejected": -18.67180633544922, "logps/chosen": -333.20343017578125, "logps/rejected": -251.52883911132812, "loss": 0.5838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4329771995544434, "rewards/margins": 1.7971723079681396, "rewards/rejected": 1.6358047723770142, "step": 94770 }, { "epoch": 4.400389990250244, "grad_norm": 3.157952070236206, "learning_rate": 3.60935976600585e-08, "logits/chosen": -19.46245765686035, "logits/rejected": -17.40505027770996, "logps/chosen": -407.0962829589844, "logps/rejected": -243.5541534423828, "loss": 0.3257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.910691738128662, "rewards/margins": 2.3374686241149902, "rewards/rejected": 1.57322359085083, "step": 94780 }, { "epoch": 4.400854264357677, "grad_norm": 128.6707000732422, "learning_rate": 3.606574121361252e-08, "logits/chosen": -19.93755531311035, "logits/rejected": -18.84547233581543, "logps/chosen": -415.45098876953125, "logps/rejected": -314.1947326660156, "loss": 0.8484, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.3982625007629395, "rewards/margins": 1.0170999765396118, "rewards/rejected": 3.381162643432617, "step": 94790 }, { "epoch": 4.40131853846511, "grad_norm": 18.125709533691406, "learning_rate": 3.603788476716653e-08, "logits/chosen": -18.928909301757812, "logits/rejected": -18.530797958374023, "logps/chosen": -371.45208740234375, "logps/rejected": -316.99652099609375, "loss": 1.2652, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.837019443511963, "rewards/margins": 1.3498202562332153, "rewards/rejected": 2.487199306488037, "step": 94800 }, { "epoch": 4.4017828125725424, "grad_norm": 131.98529052734375, "learning_rate": 3.6010028320720554e-08, "logits/chosen": -18.396831512451172, "logits/rejected": -16.535255432128906, "logps/chosen": -461.62158203125, "logps/rejected": -226.2132568359375, "loss": 0.3374, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5821385383605957, "rewards/margins": 2.7346432209014893, "rewards/rejected": 0.8474950790405273, "step": 94810 }, { "epoch": 4.402247086679976, "grad_norm": 20.733028411865234, "learning_rate": 3.598217187427457e-08, "logits/chosen": -20.099056243896484, "logits/rejected": -18.3963565826416, "logps/chosen": -352.68182373046875, "logps/rejected": -313.0559997558594, "loss": 0.548, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3710379600524902, "rewards/margins": 1.3993537425994873, "rewards/rejected": 1.9716840982437134, "step": 94820 }, { "epoch": 4.402711360787409, "grad_norm": 7.895672798156738, "learning_rate": 3.595431542782859e-08, "logits/chosen": -18.666908264160156, "logits/rejected": -18.959688186645508, "logps/chosen": -474.0098571777344, "logps/rejected": -502.144287109375, "loss": 0.8933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.318007946014404, "rewards/margins": 0.8368963003158569, "rewards/rejected": 3.481112003326416, "step": 94830 }, { "epoch": 4.403175634894842, "grad_norm": 11.099526405334473, "learning_rate": 3.592645898138261e-08, "logits/chosen": -19.020761489868164, "logits/rejected": -18.145278930664062, "logps/chosen": -429.75433349609375, "logps/rejected": -353.11907958984375, "loss": 0.5062, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7990214824676514, "rewards/margins": 0.9854069948196411, "rewards/rejected": 1.8136142492294312, "step": 94840 }, { "epoch": 4.403639909002275, "grad_norm": 
64.74107360839844, "learning_rate": 3.5898602534936625e-08, "logits/chosen": -19.377290725708008, "logits/rejected": -19.14249038696289, "logps/chosen": -282.23309326171875, "logps/rejected": -289.675048828125, "loss": 0.5485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3126182556152344, "rewards/margins": 0.8905350565910339, "rewards/rejected": 1.4220831394195557, "step": 94850 }, { "epoch": 4.404104183109708, "grad_norm": 25.87092399597168, "learning_rate": 3.587074608849064e-08, "logits/chosen": -18.941265106201172, "logits/rejected": -18.000381469726562, "logps/chosen": -458.62738037109375, "logps/rejected": -369.9689636230469, "loss": 0.3052, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8303062915802, "rewards/margins": 1.312053918838501, "rewards/rejected": 2.518252372741699, "step": 94860 }, { "epoch": 4.404568457217141, "grad_norm": 39.156890869140625, "learning_rate": 3.584288964204466e-08, "logits/chosen": -20.289642333984375, "logits/rejected": -19.36611557006836, "logps/chosen": -345.4403991699219, "logps/rejected": -292.90582275390625, "loss": 0.6402, "rewards/accuracies": 0.5, "rewards/chosen": 2.7330918312072754, "rewards/margins": 1.104999303817749, "rewards/rejected": 1.6280927658081055, "step": 94870 }, { "epoch": 4.405032731324574, "grad_norm": 17.122997283935547, "learning_rate": 3.581503319559868e-08, "logits/chosen": -19.107746124267578, "logits/rejected": -17.786617279052734, "logps/chosen": -390.902587890625, "logps/rejected": -256.2223205566406, "loss": 0.693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2596025466918945, "rewards/margins": 1.145977258682251, "rewards/rejected": 1.1136252880096436, "step": 94880 }, { "epoch": 4.405497005432007, "grad_norm": 0.1184472069144249, "learning_rate": 3.5787176749152696e-08, "logits/chosen": -19.408000946044922, "logits/rejected": -17.36923599243164, "logps/chosen": -498.7349548339844, "logps/rejected": -297.17132568359375, "loss": 0.3764, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.041935920715332, "rewards/margins": 2.2715210914611816, "rewards/rejected": 1.77041494846344, "step": 94890 }, { "epoch": 4.40596127953944, "grad_norm": 101.49317169189453, "learning_rate": 3.575932030270672e-08, "logits/chosen": -18.517120361328125, "logits/rejected": -17.922910690307617, "logps/chosen": -296.7379150390625, "logps/rejected": -336.14080810546875, "loss": 0.5633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.543605327606201, "rewards/margins": 0.9814078211784363, "rewards/rejected": 1.5621974468231201, "step": 94900 }, { "epoch": 4.406425553646873, "grad_norm": 109.9286880493164, "learning_rate": 3.573146385626073e-08, "logits/chosen": -18.765748977661133, "logits/rejected": -18.147001266479492, "logps/chosen": -430.4642639160156, "logps/rejected": -354.7528076171875, "loss": 0.6687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.103982448577881, "rewards/margins": 1.9185187816619873, "rewards/rejected": 2.1854634284973145, "step": 94910 }, { "epoch": 4.4068898277543065, "grad_norm": 0.20062564313411713, "learning_rate": 3.570360740981475e-08, "logits/chosen": -19.28243637084961, "logits/rejected": -18.737201690673828, "logps/chosen": -459.58331298828125, "logps/rejected": -297.61956787109375, "loss": 0.41, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.5126519203186035, "rewards/margins": 2.161921501159668, "rewards/rejected": 2.3507304191589355, "step": 94920 }, { "epoch": 4.407354101861739, "grad_norm": 272.449951171875, "learning_rate": 3.5675750963368774e-08, "logits/chosen": -18.148944854736328, "logits/rejected": -18.23758316040039, "logps/chosen": -364.3268127441406, "logps/rejected": -325.0130615234375, "loss": 1.3264, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4171223640441895, "rewards/margins": 0.08433042466640472, "rewards/rejected": 2.332791805267334, "step": 94930 }, { "epoch": 4.407818375969172, "grad_norm": 
12.842846870422363, "learning_rate": 3.564789451692279e-08, "logits/chosen": -18.553028106689453, "logits/rejected": -17.615131378173828, "logps/chosen": -415.2462463378906, "logps/rejected": -356.90447998046875, "loss": 0.713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.796809196472168, "rewards/margins": 1.4711734056472778, "rewards/rejected": 1.3256357908248901, "step": 94940 }, { "epoch": 4.408282650076606, "grad_norm": 0.17523431777954102, "learning_rate": 3.562003807047681e-08, "logits/chosen": -18.668066024780273, "logits/rejected": -17.780105590820312, "logps/chosen": -470.5335998535156, "logps/rejected": -307.87640380859375, "loss": 0.9779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.903186321258545, "rewards/margins": 2.68862247467041, "rewards/rejected": 2.214564085006714, "step": 94950 }, { "epoch": 4.4087469241840385, "grad_norm": 1.9533119201660156, "learning_rate": 3.5592181624030825e-08, "logits/chosen": -19.02588653564453, "logits/rejected": -17.603282928466797, "logps/chosen": -401.11700439453125, "logps/rejected": -347.648681640625, "loss": 0.3291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7825064659118652, "rewards/margins": 1.8325313329696655, "rewards/rejected": 1.9499746561050415, "step": 94960 }, { "epoch": 4.409211198291471, "grad_norm": 68.92621612548828, "learning_rate": 3.5564325177584846e-08, "logits/chosen": -18.881221771240234, "logits/rejected": -18.204561233520508, "logps/chosen": -369.836669921875, "logps/rejected": -276.88812255859375, "loss": 0.4229, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0570995807647705, "rewards/margins": 1.9493951797485352, "rewards/rejected": 1.1077046394348145, "step": 94970 }, { "epoch": 4.409675472398904, "grad_norm": 235.3957977294922, "learning_rate": 3.553646873113886e-08, "logits/chosen": -18.650157928466797, "logits/rejected": -17.779766082763672, "logps/chosen": -416.4242248535156, "logps/rejected": -373.1280822753906, 
"loss": 0.4988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5624184608459473, "rewards/margins": 1.7051910161972046, "rewards/rejected": 1.8572275638580322, "step": 94980 }, { "epoch": 4.410139746506338, "grad_norm": 120.93892669677734, "learning_rate": 3.550861228469288e-08, "logits/chosen": -18.6556339263916, "logits/rejected": -18.103511810302734, "logps/chosen": -450.6453552246094, "logps/rejected": -406.5556640625, "loss": 0.5184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.237212181091309, "rewards/margins": 1.0156662464141846, "rewards/rejected": 3.221546173095703, "step": 94990 }, { "epoch": 4.41060402061377, "grad_norm": 52.998287200927734, "learning_rate": 3.54807558382469e-08, "logits/chosen": -18.573902130126953, "logits/rejected": -17.41499137878418, "logps/chosen": -444.6090393066406, "logps/rejected": -269.2882385253906, "loss": 0.2522, "rewards/accuracies": 1.0, "rewards/chosen": 3.4125022888183594, "rewards/margins": 2.0884222984313965, "rewards/rejected": 1.3240797519683838, "step": 95000 }, { "epoch": 4.411068294721203, "grad_norm": 44.65687561035156, "learning_rate": 3.545289939180092e-08, "logits/chosen": -18.43372917175293, "logits/rejected": -18.761327743530273, "logps/chosen": -365.42486572265625, "logps/rejected": -341.2782897949219, "loss": 1.9633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8799304962158203, "rewards/margins": -0.4246189594268799, "rewards/rejected": 4.304549217224121, "step": 95010 }, { "epoch": 4.411532568828637, "grad_norm": 82.0828628540039, "learning_rate": 3.542504294535494e-08, "logits/chosen": -19.21776008605957, "logits/rejected": -18.33794403076172, "logps/chosen": -336.09478759765625, "logps/rejected": -339.90057373046875, "loss": 0.614, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0534164905548096, "rewards/margins": 0.9445081949234009, "rewards/rejected": 2.1089086532592773, "step": 95020 }, { "epoch": 4.41199684293607, "grad_norm": 
71.62572479248047, "learning_rate": 3.539718649890895e-08, "logits/chosen": -18.54596519470215, "logits/rejected": -17.493993759155273, "logps/chosen": -383.9247131347656, "logps/rejected": -294.084228515625, "loss": 1.0655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.82543683052063, "rewards/margins": 1.536171555519104, "rewards/rejected": 2.2892653942108154, "step": 95030 }, { "epoch": 4.412461117043502, "grad_norm": 94.42771911621094, "learning_rate": 3.5369330052462974e-08, "logits/chosen": -20.061193466186523, "logits/rejected": -18.83770751953125, "logps/chosen": -405.3810119628906, "logps/rejected": -305.81964111328125, "loss": 0.2789, "rewards/accuracies": 1.0, "rewards/chosen": 4.1388983726501465, "rewards/margins": 2.147651195526123, "rewards/rejected": 1.9912469387054443, "step": 95040 }, { "epoch": 4.412925391150935, "grad_norm": 0.11330844461917877, "learning_rate": 3.534147360601699e-08, "logits/chosen": -19.660558700561523, "logits/rejected": -18.542999267578125, "logps/chosen": -347.99395751953125, "logps/rejected": -325.7900390625, "loss": 0.5685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.994378089904785, "rewards/margins": 1.0277483463287354, "rewards/rejected": 1.966629981994629, "step": 95050 }, { "epoch": 4.413389665258369, "grad_norm": 10.970118522644043, "learning_rate": 3.531361715957101e-08, "logits/chosen": -19.127582550048828, "logits/rejected": -18.83156394958496, "logps/chosen": -406.92449951171875, "logps/rejected": -263.34271240234375, "loss": 1.0745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9316837787628174, "rewards/margins": 0.8129087686538696, "rewards/rejected": 2.1187751293182373, "step": 95060 }, { "epoch": 4.413853939365802, "grad_norm": 51.854679107666016, "learning_rate": 3.5285760713125024e-08, "logits/chosen": -19.140140533447266, "logits/rejected": -18.43463897705078, "logps/chosen": -392.34234619140625, "logps/rejected": -325.5555114746094, "loss": 0.354, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.232192277908325, "rewards/margins": 1.5363037586212158, "rewards/rejected": 1.695888876914978, "step": 95070 }, { "epoch": 4.414318213473234, "grad_norm": 2.789097547531128, "learning_rate": 3.5257904266679045e-08, "logits/chosen": -18.932004928588867, "logits/rejected": -19.198749542236328, "logps/chosen": -369.5400390625, "logps/rejected": -347.39459228515625, "loss": 1.7504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2041289806365967, "rewards/margins": 0.06444714218378067, "rewards/rejected": 3.1396820545196533, "step": 95080 }, { "epoch": 4.414782487580668, "grad_norm": 30.89788246154785, "learning_rate": 3.5230047820233067e-08, "logits/chosen": -19.53125, "logits/rejected": -19.05483627319336, "logps/chosen": -417.5625, "logps/rejected": -326.98907470703125, "loss": 0.8365, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3341095447540283, "rewards/margins": 0.559858500957489, "rewards/rejected": 2.7742509841918945, "step": 95090 }, { "epoch": 4.415246761688101, "grad_norm": 65.391845703125, "learning_rate": 3.520219137378708e-08, "logits/chosen": -18.711177825927734, "logits/rejected": -18.290149688720703, "logps/chosen": -382.4411315917969, "logps/rejected": -344.01605224609375, "loss": 1.0206, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6615004539489746, "rewards/margins": 0.5593810081481934, "rewards/rejected": 3.1021196842193604, "step": 95100 }, { "epoch": 4.415711035795534, "grad_norm": 123.13041687011719, "learning_rate": 3.51743349273411e-08, "logits/chosen": -18.91940689086914, "logits/rejected": -18.624069213867188, "logps/chosen": -411.59674072265625, "logps/rejected": -383.97772216796875, "loss": 0.9939, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.642357587814331, "rewards/margins": 0.32534080743789673, "rewards/rejected": 3.3170166015625, "step": 95110 }, { "epoch": 4.416175309902966, "grad_norm": 149.71803283691406, 
"learning_rate": 3.514647848089512e-08, "logits/chosen": -19.0336971282959, "logits/rejected": -18.401744842529297, "logps/chosen": -389.8546142578125, "logps/rejected": -334.54437255859375, "loss": 0.5752, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8097071647644043, "rewards/margins": 0.9925436973571777, "rewards/rejected": 2.8171637058258057, "step": 95120 }, { "epoch": 4.4166395840104, "grad_norm": 92.8851318359375, "learning_rate": 3.511862203444914e-08, "logits/chosen": -19.48337745666504, "logits/rejected": -18.860185623168945, "logps/chosen": -467.1976013183594, "logps/rejected": -419.1234436035156, "loss": 0.4621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5950264930725098, "rewards/margins": 1.3316707611083984, "rewards/rejected": 2.2633557319641113, "step": 95130 }, { "epoch": 4.417103858117833, "grad_norm": 168.9477081298828, "learning_rate": 3.509076558800316e-08, "logits/chosen": -19.239864349365234, "logits/rejected": -18.0439395904541, "logps/chosen": -472.46429443359375, "logps/rejected": -372.13604736328125, "loss": 0.6186, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.383518695831299, "rewards/margins": 0.7801424860954285, "rewards/rejected": 3.6033759117126465, "step": 95140 }, { "epoch": 4.417568132225266, "grad_norm": 126.63771057128906, "learning_rate": 3.5062909141557174e-08, "logits/chosen": -19.13068199157715, "logits/rejected": -19.108394622802734, "logps/chosen": -231.64901733398438, "logps/rejected": -237.0457305908203, "loss": 0.8528, "rewards/accuracies": 0.5, "rewards/chosen": 1.0398268699645996, "rewards/margins": 0.1451464295387268, "rewards/rejected": 0.894680380821228, "step": 95150 }, { "epoch": 4.418032406332699, "grad_norm": 52.457611083984375, "learning_rate": 3.503505269511119e-08, "logits/chosen": -19.193498611450195, "logits/rejected": -17.98798370361328, "logps/chosen": -421.37542724609375, "logps/rejected": -301.469970703125, "loss": 0.2565, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.7904067039489746, "rewards/margins": 2.5425426959991455, "rewards/rejected": 1.2478641271591187, "step": 95160 }, { "epoch": 4.418496680440132, "grad_norm": 81.87235260009766, "learning_rate": 3.500719624866521e-08, "logits/chosen": -18.729267120361328, "logits/rejected": -17.980300903320312, "logps/chosen": -356.14984130859375, "logps/rejected": -284.72430419921875, "loss": 0.4099, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1324381828308105, "rewards/margins": 1.5825884342193604, "rewards/rejected": 1.5498497486114502, "step": 95170 }, { "epoch": 4.418960954547565, "grad_norm": 72.3803482055664, "learning_rate": 3.497933980221923e-08, "logits/chosen": -18.97003173828125, "logits/rejected": -17.850265502929688, "logps/chosen": -286.3738098144531, "logps/rejected": -254.8327178955078, "loss": 0.5792, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.062239170074463, "rewards/margins": 1.7418149709701538, "rewards/rejected": 0.3204244077205658, "step": 95180 }, { "epoch": 4.4194252286549975, "grad_norm": 92.49871063232422, "learning_rate": 3.4951483355773245e-08, "logits/chosen": -18.69662857055664, "logits/rejected": -19.46040916442871, "logps/chosen": -441.2439880371094, "logps/rejected": -388.6443786621094, "loss": 0.9587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8842215538024902, "rewards/margins": 0.028033113107085228, "rewards/rejected": 2.8561885356903076, "step": 95190 }, { "epoch": 4.419889502762431, "grad_norm": 40.94520568847656, "learning_rate": 3.4923626909327266e-08, "logits/chosen": -18.948429107666016, "logits/rejected": -18.225317001342773, "logps/chosen": -379.5046691894531, "logps/rejected": -322.46612548828125, "loss": 0.6417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.097581386566162, "rewards/margins": 1.2448031902313232, "rewards/rejected": 1.852778434753418, "step": 95200 }, { "epoch": 4.420353776869864, "grad_norm": 
0.08916269987821579, "learning_rate": 3.489577046288129e-08, "logits/chosen": -19.118892669677734, "logits/rejected": -19.132877349853516, "logps/chosen": -239.5590362548828, "logps/rejected": -272.8663635253906, "loss": 1.4273, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.624651551246643, "rewards/margins": 0.01773412898182869, "rewards/rejected": 1.606917381286621, "step": 95210 }, { "epoch": 4.420818050977297, "grad_norm": 15.79145622253418, "learning_rate": 3.48679140164353e-08, "logits/chosen": -19.497737884521484, "logits/rejected": -17.376171112060547, "logps/chosen": -438.38720703125, "logps/rejected": -252.88406372070312, "loss": 0.9059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.691446304321289, "rewards/margins": 2.4879653453826904, "rewards/rejected": 2.2034807205200195, "step": 95220 }, { "epoch": 4.42128232508473, "grad_norm": 127.77835845947266, "learning_rate": 3.484005756998932e-08, "logits/chosen": -19.3441104888916, "logits/rejected": -18.736467361450195, "logps/chosen": -404.36187744140625, "logps/rejected": -396.40509033203125, "loss": 0.8268, "rewards/accuracies": 0.5, "rewards/chosen": 3.8698794841766357, "rewards/margins": 0.3302146792411804, "rewards/rejected": 3.5396647453308105, "step": 95230 }, { "epoch": 4.421746599192163, "grad_norm": 76.13194274902344, "learning_rate": 3.481220112354334e-08, "logits/chosen": -19.932687759399414, "logits/rejected": -18.772783279418945, "logps/chosen": -384.09613037109375, "logps/rejected": -311.96099853515625, "loss": 0.8955, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8385720252990723, "rewards/margins": 0.33189836144447327, "rewards/rejected": 2.506673574447632, "step": 95240 }, { "epoch": 4.422210873299596, "grad_norm": 277.20050048828125, "learning_rate": 3.478434467709735e-08, "logits/chosen": -18.949337005615234, "logits/rejected": -19.061702728271484, "logps/chosen": -463.5869140625, "logps/rejected": -391.61981201171875, "loss": 0.7163, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5549087524414062, "rewards/margins": 0.49044960737228394, "rewards/rejected": 3.0644593238830566, "step": 95250 }, { "epoch": 4.422675147407029, "grad_norm": 83.66602325439453, "learning_rate": 3.475648823065137e-08, "logits/chosen": -19.453079223632812, "logits/rejected": -18.736562728881836, "logps/chosen": -387.9070739746094, "logps/rejected": -310.8690185546875, "loss": 0.7202, "rewards/accuracies": 0.5, "rewards/chosen": 3.439244508743286, "rewards/margins": 1.1266367435455322, "rewards/rejected": 2.312607765197754, "step": 95260 }, { "epoch": 4.423139421514462, "grad_norm": 236.63243103027344, "learning_rate": 3.4728631784205394e-08, "logits/chosen": -19.43100357055664, "logits/rejected": -18.360977172851562, "logps/chosen": -457.37567138671875, "logps/rejected": -394.94921875, "loss": 0.5589, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8707175254821777, "rewards/margins": 1.4855716228485107, "rewards/rejected": 2.385145664215088, "step": 95270 }, { "epoch": 4.423603695621895, "grad_norm": 237.3043212890625, "learning_rate": 3.470077533775941e-08, "logits/chosen": -20.45154571533203, "logits/rejected": -19.61098861694336, "logps/chosen": -360.91436767578125, "logps/rejected": -350.8265686035156, "loss": 0.6675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.931133985519409, "rewards/margins": 1.0386431217193604, "rewards/rejected": 2.892490863800049, "step": 95280 }, { "epoch": 4.424067969729328, "grad_norm": 29.81304168701172, "learning_rate": 3.467291889131343e-08, "logits/chosen": -19.105731964111328, "logits/rejected": -18.91399383544922, "logps/chosen": -386.79388427734375, "logps/rejected": -334.7311706542969, "loss": 0.5397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2903289794921875, "rewards/margins": 1.9905074834823608, "rewards/rejected": 2.2998220920562744, "step": 95290 }, { "epoch": 4.424532243836762, "grad_norm": 14.98654556274414, 
"learning_rate": 3.464506244486745e-08, "logits/chosen": -19.478670120239258, "logits/rejected": -18.976604461669922, "logps/chosen": -523.1334228515625, "logps/rejected": -412.4580078125, "loss": 0.4148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.170300006866455, "rewards/margins": 1.5587607622146606, "rewards/rejected": 2.611539125442505, "step": 95300 }, { "epoch": 4.424996517944194, "grad_norm": 117.81233978271484, "learning_rate": 3.4617205998421466e-08, "logits/chosen": -19.237960815429688, "logits/rejected": -18.55045509338379, "logps/chosen": -366.9454650878906, "logps/rejected": -284.18243408203125, "loss": 0.7424, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.160665988922119, "rewards/margins": 1.5771554708480835, "rewards/rejected": 1.5835105180740356, "step": 95310 }, { "epoch": 4.425460792051627, "grad_norm": 20.247541427612305, "learning_rate": 3.458934955197549e-08, "logits/chosen": -18.5328369140625, "logits/rejected": -18.767871856689453, "logps/chosen": -316.6759033203125, "logps/rejected": -301.3414611816406, "loss": 1.221, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6247401237487793, "rewards/margins": -0.35529181361198425, "rewards/rejected": 2.980032205581665, "step": 95320 }, { "epoch": 4.42592506615906, "grad_norm": 1.3588663339614868, "learning_rate": 3.45614931055295e-08, "logits/chosen": -19.998506546020508, "logits/rejected": -19.03069305419922, "logps/chosen": -380.2285461425781, "logps/rejected": -306.0899658203125, "loss": 0.4175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.010590076446533, "rewards/margins": 2.0440003871917725, "rewards/rejected": 1.9665895700454712, "step": 95330 }, { "epoch": 4.4263893402664936, "grad_norm": 28.407041549682617, "learning_rate": 3.453363665908352e-08, "logits/chosen": -20.018856048583984, "logits/rejected": -18.766069412231445, "logps/chosen": -476.2254943847656, "logps/rejected": -452.16131591796875, "loss": 0.5964, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8077964782714844, "rewards/margins": 0.6453595161437988, "rewards/rejected": 3.1624369621276855, "step": 95340 }, { "epoch": 4.426853614373926, "grad_norm": 20.178234100341797, "learning_rate": 3.450578021263754e-08, "logits/chosen": -19.178735733032227, "logits/rejected": -18.71065330505371, "logps/chosen": -402.4004821777344, "logps/rejected": -351.02178955078125, "loss": 0.4449, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9171042442321777, "rewards/margins": 1.5041662454605103, "rewards/rejected": 2.412938356399536, "step": 95350 }, { "epoch": 4.427317888481359, "grad_norm": 8.187969207763672, "learning_rate": 3.447792376619156e-08, "logits/chosen": -18.818096160888672, "logits/rejected": -18.682846069335938, "logps/chosen": -446.6463928222656, "logps/rejected": -392.2322692871094, "loss": 0.7151, "rewards/accuracies": 0.5, "rewards/chosen": 3.9500527381896973, "rewards/margins": 0.9838587045669556, "rewards/rejected": 2.966193675994873, "step": 95360 }, { "epoch": 4.427782162588793, "grad_norm": 57.914283752441406, "learning_rate": 3.445006731974557e-08, "logits/chosen": -18.979015350341797, "logits/rejected": -17.944438934326172, "logps/chosen": -414.67535400390625, "logps/rejected": -349.0198669433594, "loss": 0.7162, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9999547004699707, "rewards/margins": 1.348405361175537, "rewards/rejected": 1.6515495777130127, "step": 95370 }, { "epoch": 4.4282464366962255, "grad_norm": 0.35078537464141846, "learning_rate": 3.4422210873299594e-08, "logits/chosen": -18.277862548828125, "logits/rejected": -18.166194915771484, "logps/chosen": -407.623046875, "logps/rejected": -384.56866455078125, "loss": 1.1087, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4128875732421875, "rewards/margins": 0.18140241503715515, "rewards/rejected": 3.231485366821289, "step": 95380 }, { "epoch": 4.428710710803658, "grad_norm": 
0.9754546284675598, "learning_rate": 3.4394354426853615e-08, "logits/chosen": -18.482860565185547, "logits/rejected": -17.947216033935547, "logps/chosen": -330.8316650390625, "logps/rejected": -276.3977966308594, "loss": 0.5732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.093857288360596, "rewards/margins": 1.998963713645935, "rewards/rejected": 2.094893217086792, "step": 95390 }, { "epoch": 4.429174984911091, "grad_norm": 119.07025146484375, "learning_rate": 3.436649798040763e-08, "logits/chosen": -19.486034393310547, "logits/rejected": -18.839139938354492, "logps/chosen": -395.2478332519531, "logps/rejected": -277.51531982421875, "loss": 0.8971, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.060857057571411, "rewards/margins": 0.5835205912590027, "rewards/rejected": 2.4773364067077637, "step": 95400 }, { "epoch": 4.429639259018525, "grad_norm": 9.933796882629395, "learning_rate": 3.433864153396165e-08, "logits/chosen": -18.53537940979004, "logits/rejected": -18.102294921875, "logps/chosen": -445.20928955078125, "logps/rejected": -320.65228271484375, "loss": 0.7625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.856865406036377, "rewards/margins": 1.5117237567901611, "rewards/rejected": 2.3451409339904785, "step": 95410 }, { "epoch": 4.4301035331259575, "grad_norm": 4.5932183265686035, "learning_rate": 3.431078508751567e-08, "logits/chosen": -19.347856521606445, "logits/rejected": -18.536678314208984, "logps/chosen": -414.244873046875, "logps/rejected": -357.17657470703125, "loss": 0.763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.058493614196777, "rewards/margins": 1.0437095165252686, "rewards/rejected": 3.014784336090088, "step": 95420 }, { "epoch": 4.43056780723339, "grad_norm": 4.66226863861084, "learning_rate": 3.4282928641069687e-08, "logits/chosen": -19.803741455078125, "logits/rejected": -18.459278106689453, "logps/chosen": -370.35186767578125, "logps/rejected": -268.73748779296875, "loss": 
0.3971, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.321711778640747, "rewards/margins": 1.6412696838378906, "rewards/rejected": 0.6804423332214355, "step": 95430 }, { "epoch": 4.431032081340824, "grad_norm": 2.167900562286377, "learning_rate": 3.425507219462371e-08, "logits/chosen": -19.341278076171875, "logits/rejected": -18.57697868347168, "logps/chosen": -412.72076416015625, "logps/rejected": -276.91534423828125, "loss": 0.2578, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.780088424682617, "rewards/margins": 2.5709116458892822, "rewards/rejected": 1.2091764211654663, "step": 95440 }, { "epoch": 4.431496355448257, "grad_norm": 23.643342971801758, "learning_rate": 3.422721574817772e-08, "logits/chosen": -19.467449188232422, "logits/rejected": -18.69285011291504, "logps/chosen": -429.8174743652344, "logps/rejected": -354.97100830078125, "loss": 0.7936, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.219897747039795, "rewards/margins": 0.30547386407852173, "rewards/rejected": 2.914423704147339, "step": 95450 }, { "epoch": 4.4319606295556895, "grad_norm": 120.40447235107422, "learning_rate": 3.419935930173174e-08, "logits/chosen": -19.053817749023438, "logits/rejected": -19.67544937133789, "logps/chosen": -342.0572204589844, "logps/rejected": -439.33099365234375, "loss": 1.157, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.320171356201172, "rewards/margins": 0.17860452830791473, "rewards/rejected": 3.141566753387451, "step": 95460 }, { "epoch": 4.432424903663122, "grad_norm": 19.11492156982422, "learning_rate": 3.417150285528576e-08, "logits/chosen": -19.569318771362305, "logits/rejected": -17.270923614501953, "logps/chosen": -382.84100341796875, "logps/rejected": -214.3514862060547, "loss": 0.3778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2107067108154297, "rewards/margins": 3.1198410987854004, "rewards/rejected": 0.09086586534976959, "step": 95470 }, { "epoch": 4.432889177770556, 
"grad_norm": 0.018715176731348038, "learning_rate": 3.414364640883978e-08, "logits/chosen": -19.705347061157227, "logits/rejected": -18.68062400817871, "logps/chosen": -377.89886474609375, "logps/rejected": -277.0212707519531, "loss": 0.1776, "rewards/accuracies": 1.0, "rewards/chosen": 4.595547199249268, "rewards/margins": 3.2047266960144043, "rewards/rejected": 1.3908207416534424, "step": 95480 }, { "epoch": 4.433353451877989, "grad_norm": 12.04736042022705, "learning_rate": 3.4115789962393794e-08, "logits/chosen": -18.50222396850586, "logits/rejected": -17.480335235595703, "logps/chosen": -516.1854858398438, "logps/rejected": -370.20587158203125, "loss": 0.5563, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.214598655700684, "rewards/margins": 1.9452111721038818, "rewards/rejected": 2.2693874835968018, "step": 95490 }, { "epoch": 4.4338177259854215, "grad_norm": 219.41139221191406, "learning_rate": 3.4087933515947815e-08, "logits/chosen": -19.03498077392578, "logits/rejected": -18.712404251098633, "logps/chosen": -385.9295654296875, "logps/rejected": -336.65093994140625, "loss": 1.0364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.080198764801025, "rewards/margins": 1.7889293432235718, "rewards/rejected": 2.2912697792053223, "step": 95500 }, { "epoch": 4.434282000092855, "grad_norm": 98.40067291259766, "learning_rate": 3.4060077069501836e-08, "logits/chosen": -20.033588409423828, "logits/rejected": -19.638456344604492, "logps/chosen": -338.2137451171875, "logps/rejected": -329.370361328125, "loss": 1.147, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3667831420898438, "rewards/margins": 0.6554746627807617, "rewards/rejected": 2.711308479309082, "step": 95510 }, { "epoch": 4.434746274200288, "grad_norm": 149.7635498046875, "learning_rate": 3.403222062305585e-08, "logits/chosen": -18.566299438476562, "logits/rejected": -18.996402740478516, "logps/chosen": -334.5617370605469, "logps/rejected": -346.3386535644531, 
"loss": 0.583, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6294033527374268, "rewards/margins": 0.7570304274559021, "rewards/rejected": 2.872373104095459, "step": 95520 }, { "epoch": 4.435210548307721, "grad_norm": 54.89863586425781, "learning_rate": 3.400436417660987e-08, "logits/chosen": -18.82596206665039, "logits/rejected": -18.488157272338867, "logps/chosen": -413.75146484375, "logps/rejected": -380.9302978515625, "loss": 0.9145, "rewards/accuracies": 0.5, "rewards/chosen": 2.807335615158081, "rewards/margins": 0.13156957924365997, "rewards/rejected": 2.6757659912109375, "step": 95530 }, { "epoch": 4.435674822415154, "grad_norm": 25.44382095336914, "learning_rate": 3.3976507730163886e-08, "logits/chosen": -19.16139793395996, "logits/rejected": -18.43935203552246, "logps/chosen": -351.6976623535156, "logps/rejected": -294.47125244140625, "loss": 0.604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.348275661468506, "rewards/margins": 0.7852655053138733, "rewards/rejected": 2.5630104541778564, "step": 95540 }, { "epoch": 4.436139096522587, "grad_norm": 4.796718120574951, "learning_rate": 3.39486512837179e-08, "logits/chosen": -18.95496368408203, "logits/rejected": -18.683916091918945, "logps/chosen": -334.18780517578125, "logps/rejected": -308.3898620605469, "loss": 1.4163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.32908296585083, "rewards/margins": 0.6895340085029602, "rewards/rejected": 2.6395487785339355, "step": 95550 }, { "epoch": 4.43660337063002, "grad_norm": 18.35993194580078, "learning_rate": 3.392079483727192e-08, "logits/chosen": -18.385292053222656, "logits/rejected": -17.497886657714844, "logps/chosen": -376.5789489746094, "logps/rejected": -293.0716552734375, "loss": 0.3615, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0354578495025635, "rewards/margins": 1.4975578784942627, "rewards/rejected": 1.5378999710083008, "step": 95560 }, { "epoch": 4.437067644737453, "grad_norm": 
123.53993225097656, "learning_rate": 3.389293839082594e-08, "logits/chosen": -19.82996368408203, "logits/rejected": -18.700403213500977, "logps/chosen": -412.12554931640625, "logps/rejected": -347.75018310546875, "loss": 0.4561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.156766414642334, "rewards/margins": 2.0604710578918457, "rewards/rejected": 2.0962953567504883, "step": 95570 }, { "epoch": 4.437531918844886, "grad_norm": 4.605188846588135, "learning_rate": 3.386508194437996e-08, "logits/chosen": -18.833110809326172, "logits/rejected": -17.89658546447754, "logps/chosen": -223.7273406982422, "logps/rejected": -188.34739685058594, "loss": 0.4567, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.471586227416992, "rewards/margins": 1.6903259754180908, "rewards/rejected": 0.7812603116035461, "step": 95580 }, { "epoch": 4.437996192952319, "grad_norm": 174.14190673828125, "learning_rate": 3.383722549793398e-08, "logits/chosen": -18.280588150024414, "logits/rejected": -18.289447784423828, "logps/chosen": -389.18280029296875, "logps/rejected": -350.99481201171875, "loss": 0.864, "rewards/accuracies": 0.5, "rewards/chosen": 2.715121030807495, "rewards/margins": 0.33084121346473694, "rewards/rejected": 2.38427996635437, "step": 95590 }, { "epoch": 4.438460467059752, "grad_norm": 2.454920768737793, "learning_rate": 3.3809369051488e-08, "logits/chosen": -19.675209045410156, "logits/rejected": -18.571138381958008, "logps/chosen": -464.2742614746094, "logps/rejected": -429.085205078125, "loss": 0.687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.479091644287109, "rewards/margins": 1.7148170471191406, "rewards/rejected": 2.7642745971679688, "step": 95600 }, { "epoch": 4.4389247411671855, "grad_norm": 27.43414306640625, "learning_rate": 3.3781512605042014e-08, "logits/chosen": -19.336320877075195, "logits/rejected": -18.206806182861328, "logps/chosen": -426.34979248046875, "logps/rejected": -225.6448974609375, "loss": 0.4024, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7944741249084473, "rewards/margins": 1.5644967555999756, "rewards/rejected": 1.2299773693084717, "step": 95610 }, { "epoch": 4.439389015274618, "grad_norm": 6.990747451782227, "learning_rate": 3.3753656158596036e-08, "logits/chosen": -18.669353485107422, "logits/rejected": -17.51706314086914, "logps/chosen": -404.2047119140625, "logps/rejected": -293.2475891113281, "loss": 0.171, "rewards/accuracies": 1.0, "rewards/chosen": 4.282342433929443, "rewards/margins": 2.3636999130249023, "rewards/rejected": 1.9186424016952515, "step": 95620 }, { "epoch": 4.439853289382051, "grad_norm": 27.495473861694336, "learning_rate": 3.372579971215006e-08, "logits/chosen": -20.025854110717773, "logits/rejected": -18.8887939453125, "logps/chosen": -313.0288391113281, "logps/rejected": -286.42236328125, "loss": 0.2412, "rewards/accuracies": 1.0, "rewards/chosen": 3.7171273231506348, "rewards/margins": 1.455966591835022, "rewards/rejected": 2.2611606121063232, "step": 95630 }, { "epoch": 4.440317563489484, "grad_norm": 259.27886962890625, "learning_rate": 3.369794326570407e-08, "logits/chosen": -18.546527862548828, "logits/rejected": -17.82185935974121, "logps/chosen": -381.90423583984375, "logps/rejected": -349.9701232910156, "loss": 0.5371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.296262741088867, "rewards/margins": 0.9659668803215027, "rewards/rejected": 2.330296039581299, "step": 95640 }, { "epoch": 4.4407818375969175, "grad_norm": 142.46932983398438, "learning_rate": 3.3670086819258086e-08, "logits/chosen": -19.342031478881836, "logits/rejected": -18.577110290527344, "logps/chosen": -322.0499267578125, "logps/rejected": -259.22186279296875, "loss": 0.4892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3954286575317383, "rewards/margins": 1.4832080602645874, "rewards/rejected": 0.9122206568717957, "step": 95650 }, { "epoch": 4.44124611170435, "grad_norm": 10.803940773010254, 
"learning_rate": 3.364223037281211e-08, "logits/chosen": -18.899486541748047, "logits/rejected": -18.62508201599121, "logps/chosen": -325.760009765625, "logps/rejected": -223.34866333007812, "loss": 0.7646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3275070190429688, "rewards/margins": 1.514289140701294, "rewards/rejected": 1.8132178783416748, "step": 95660 }, { "epoch": 4.441710385811783, "grad_norm": 25.253787994384766, "learning_rate": 3.361437392636612e-08, "logits/chosen": -18.45032501220703, "logits/rejected": -18.559064865112305, "logps/chosen": -445.5604553222656, "logps/rejected": -323.26611328125, "loss": 0.8315, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.410299301147461, "rewards/margins": 2.0248942375183105, "rewards/rejected": 2.3854050636291504, "step": 95670 }, { "epoch": 4.442174659919217, "grad_norm": 115.23148345947266, "learning_rate": 3.358651747992014e-08, "logits/chosen": -18.72224998474121, "logits/rejected": -17.68581771850586, "logps/chosen": -430.1200256347656, "logps/rejected": -312.5433044433594, "loss": 0.5628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.943819046020508, "rewards/margins": 1.4888436794281006, "rewards/rejected": 1.4549751281738281, "step": 95680 }, { "epoch": 4.4426389340266494, "grad_norm": 155.6549072265625, "learning_rate": 3.3558661033474164e-08, "logits/chosen": -19.469303131103516, "logits/rejected": -18.498289108276367, "logps/chosen": -466.4736328125, "logps/rejected": -353.372314453125, "loss": 0.5463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.02425479888916, "rewards/margins": 0.5150924921035767, "rewards/rejected": 3.509162187576294, "step": 95690 }, { "epoch": 4.443103208134082, "grad_norm": 15.845606803894043, "learning_rate": 3.353080458702818e-08, "logits/chosen": -18.857999801635742, "logits/rejected": -18.77980613708496, "logps/chosen": -353.1932678222656, "logps/rejected": -297.88629150390625, "loss": 0.8689, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1889872550964355, "rewards/margins": 0.4566716253757477, "rewards/rejected": 2.7323155403137207, "step": 95700 }, { "epoch": 4.443567482241515, "grad_norm": 9.531781196594238, "learning_rate": 3.35029481405822e-08, "logits/chosen": -18.900156021118164, "logits/rejected": -17.953500747680664, "logps/chosen": -415.7276916503906, "logps/rejected": -389.02862548828125, "loss": 0.5984, "rewards/accuracies": 0.5, "rewards/chosen": 4.587612152099609, "rewards/margins": 1.9543250799179077, "rewards/rejected": 2.6332874298095703, "step": 95710 }, { "epoch": 4.444031756348949, "grad_norm": 126.26100158691406, "learning_rate": 3.347509169413622e-08, "logits/chosen": -18.413461685180664, "logits/rejected": -18.026357650756836, "logps/chosen": -467.6484375, "logps/rejected": -404.77239990234375, "loss": 1.1318, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8368759155273438, "rewards/margins": 1.0923669338226318, "rewards/rejected": 2.744508981704712, "step": 95720 }, { "epoch": 4.444496030456381, "grad_norm": 168.7482452392578, "learning_rate": 3.3447235247690235e-08, "logits/chosen": -18.34119415283203, "logits/rejected": -17.645771026611328, "logps/chosen": -398.1383056640625, "logps/rejected": -336.28240966796875, "loss": 0.6226, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.479322910308838, "rewards/margins": 1.6461451053619385, "rewards/rejected": 1.833177924156189, "step": 95730 }, { "epoch": 4.444960304563814, "grad_norm": 0.4170450270175934, "learning_rate": 3.3419378801244256e-08, "logits/chosen": -19.39418601989746, "logits/rejected": -18.637510299682617, "logps/chosen": -394.77691650390625, "logps/rejected": -260.7659606933594, "loss": 0.4854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3450324535369873, "rewards/margins": 1.6894162893295288, "rewards/rejected": 1.655616044998169, "step": 95740 }, { "epoch": 4.445424578671248, "grad_norm": 15.230878829956055, 
"learning_rate": 3.339152235479827e-08, "logits/chosen": -18.30868911743164, "logits/rejected": -18.185115814208984, "logps/chosen": -382.7652282714844, "logps/rejected": -362.00823974609375, "loss": 0.6446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1300106048583984, "rewards/margins": 0.5020087361335754, "rewards/rejected": 1.6280019283294678, "step": 95750 }, { "epoch": 4.445888852778681, "grad_norm": 1.9134869575500488, "learning_rate": 3.3363665908352285e-08, "logits/chosen": -19.355735778808594, "logits/rejected": -18.556135177612305, "logps/chosen": -454.68408203125, "logps/rejected": -371.8661804199219, "loss": 0.6137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.228671073913574, "rewards/margins": 1.412229061126709, "rewards/rejected": 2.816441535949707, "step": 95760 }, { "epoch": 4.446353126886113, "grad_norm": 73.57189178466797, "learning_rate": 3.3335809461906307e-08, "logits/chosen": -19.15756607055664, "logits/rejected": -19.424022674560547, "logps/chosen": -338.09765625, "logps/rejected": -398.9211730957031, "loss": 1.2446, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3576343059539795, "rewards/margins": -0.3582856059074402, "rewards/rejected": 2.7159199714660645, "step": 95770 }, { "epoch": 4.446817400993547, "grad_norm": 17.163105010986328, "learning_rate": 3.330795301546033e-08, "logits/chosen": -19.167102813720703, "logits/rejected": -18.143627166748047, "logps/chosen": -356.4073181152344, "logps/rejected": -252.1333770751953, "loss": 0.4324, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1211185455322266, "rewards/margins": 1.903595209121704, "rewards/rejected": 1.217523455619812, "step": 95780 }, { "epoch": 4.44728167510098, "grad_norm": 44.39375305175781, "learning_rate": 3.328009656901434e-08, "logits/chosen": -19.360050201416016, "logits/rejected": -18.7301082611084, "logps/chosen": -374.7692565917969, "logps/rejected": -376.0710754394531, "loss": 0.7505, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.411278486251831, "rewards/margins": 0.41994810104370117, "rewards/rejected": 2.99133038520813, "step": 95790 }, { "epoch": 4.447745949208413, "grad_norm": 116.56407165527344, "learning_rate": 3.3252240122568363e-08, "logits/chosen": -18.639232635498047, "logits/rejected": -18.665632247924805, "logps/chosen": -487.41241455078125, "logps/rejected": -382.30255126953125, "loss": 0.7629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.69575834274292, "rewards/margins": 1.2186137437820435, "rewards/rejected": 2.477145195007324, "step": 95800 }, { "epoch": 4.448210223315845, "grad_norm": 9.80475902557373, "learning_rate": 3.3224383676122385e-08, "logits/chosen": -18.606975555419922, "logits/rejected": -18.30416488647461, "logps/chosen": -406.89886474609375, "logps/rejected": -324.8313903808594, "loss": 0.6193, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.423008680343628, "rewards/margins": 0.8845470547676086, "rewards/rejected": 2.538461685180664, "step": 95810 }, { "epoch": 4.448674497423279, "grad_norm": 0.7464114427566528, "learning_rate": 3.31965272296764e-08, "logits/chosen": -19.549484252929688, "logits/rejected": -17.969207763671875, "logps/chosen": -303.2642517089844, "logps/rejected": -213.689697265625, "loss": 0.2956, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.418578863143921, "rewards/margins": 2.135549306869507, "rewards/rejected": 0.2830291986465454, "step": 95820 }, { "epoch": 4.449138771530712, "grad_norm": 136.63278198242188, "learning_rate": 3.316867078323042e-08, "logits/chosen": -19.054180145263672, "logits/rejected": -17.90538787841797, "logps/chosen": -302.2893981933594, "logps/rejected": -251.457275390625, "loss": 0.2752, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.04007625579834, "rewards/margins": 2.3967082500457764, "rewards/rejected": 0.6433683633804321, "step": 95830 }, { "epoch": 4.449603045638145, "grad_norm": 9.985634803771973, 
"learning_rate": 3.314081433678444e-08, "logits/chosen": -19.102197647094727, "logits/rejected": -17.80405044555664, "logps/chosen": -398.65582275390625, "logps/rejected": -316.6722412109375, "loss": 0.5661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8582165241241455, "rewards/margins": 2.1113038063049316, "rewards/rejected": 1.7469127178192139, "step": 95840 }, { "epoch": 4.450067319745578, "grad_norm": 275.2033386230469, "learning_rate": 3.3112957890338456e-08, "logits/chosen": -19.571308135986328, "logits/rejected": -18.670085906982422, "logps/chosen": -377.1700134277344, "logps/rejected": -279.4326477050781, "loss": 0.5852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.085758209228516, "rewards/margins": 1.1791455745697021, "rewards/rejected": 2.9066126346588135, "step": 95850 }, { "epoch": 4.450531593853011, "grad_norm": 94.42912292480469, "learning_rate": 3.308510144389247e-08, "logits/chosen": -19.619972229003906, "logits/rejected": -18.95395851135254, "logps/chosen": -453.45233154296875, "logps/rejected": -445.893798828125, "loss": 1.1823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.957415819168091, "rewards/margins": 0.7994450330734253, "rewards/rejected": 3.157970666885376, "step": 95860 }, { "epoch": 4.450995867960444, "grad_norm": 3.6987524032592773, "learning_rate": 3.305724499744649e-08, "logits/chosen": -18.797283172607422, "logits/rejected": -18.22991371154785, "logps/chosen": -366.7498779296875, "logps/rejected": -336.22467041015625, "loss": 0.5406, "rewards/accuracies": 0.5, "rewards/chosen": 4.122054100036621, "rewards/margins": 1.5425300598144531, "rewards/rejected": 2.579524278640747, "step": 95870 }, { "epoch": 4.4514601420678765, "grad_norm": 6.516318321228027, "learning_rate": 3.3029388551000506e-08, "logits/chosen": -18.24704933166504, "logits/rejected": -17.49774742126465, "logps/chosen": -390.53497314453125, "logps/rejected": -243.6273651123047, "loss": 0.658, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.909353733062744, "rewards/margins": 1.4029996395111084, "rewards/rejected": 1.5063542127609253, "step": 95880 }, { "epoch": 4.45192441617531, "grad_norm": 30.104026794433594, "learning_rate": 3.300153210455453e-08, "logits/chosen": -19.05158805847168, "logits/rejected": -18.073131561279297, "logps/chosen": -342.2362365722656, "logps/rejected": -273.2636413574219, "loss": 0.6809, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.048673152923584, "rewards/margins": 2.3411502838134766, "rewards/rejected": 1.7075226306915283, "step": 95890 }, { "epoch": 4.452388690282743, "grad_norm": 64.42591094970703, "learning_rate": 3.297367565810855e-08, "logits/chosen": -18.58383560180664, "logits/rejected": -17.716411590576172, "logps/chosen": -432.7757263183594, "logps/rejected": -328.46331787109375, "loss": 0.7778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.548382520675659, "rewards/margins": 1.2824878692626953, "rewards/rejected": 2.2658944129943848, "step": 95900 }, { "epoch": 4.452852964390176, "grad_norm": 83.10028839111328, "learning_rate": 3.294581921166256e-08, "logits/chosen": -19.727230072021484, "logits/rejected": -19.65680503845215, "logps/chosen": -407.8191833496094, "logps/rejected": -330.232421875, "loss": 0.7716, "rewards/accuracies": 0.5, "rewards/chosen": 3.446502685546875, "rewards/margins": 0.6915351152420044, "rewards/rejected": 2.7549679279327393, "step": 95910 }, { "epoch": 4.453317238497609, "grad_norm": 20.89700698852539, "learning_rate": 3.2917962765216584e-08, "logits/chosen": -19.472169876098633, "logits/rejected": -18.735225677490234, "logps/chosen": -412.9864196777344, "logps/rejected": -348.0545349121094, "loss": 0.2992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.330502510070801, "rewards/margins": 2.2852413654327393, "rewards/rejected": 2.0452613830566406, "step": 95920 }, { "epoch": 4.453781512605042, "grad_norm": 195.69259643554688, "learning_rate": 
3.2890106318770605e-08, "logits/chosen": -19.146804809570312, "logits/rejected": -19.328948974609375, "logps/chosen": -298.23846435546875, "logps/rejected": -344.5440979003906, "loss": 0.9978, "rewards/accuracies": 0.5, "rewards/chosen": 2.7356836795806885, "rewards/margins": 0.2880484163761139, "rewards/rejected": 2.4476351737976074, "step": 95930 }, { "epoch": 4.454245786712475, "grad_norm": 122.7421875, "learning_rate": 3.286224987232462e-08, "logits/chosen": -19.256362915039062, "logits/rejected": -17.4152889251709, "logps/chosen": -409.08087158203125, "logps/rejected": -273.5337829589844, "loss": 0.1417, "rewards/accuracies": 1.0, "rewards/chosen": 4.585984706878662, "rewards/margins": 3.1771481037139893, "rewards/rejected": 1.4088375568389893, "step": 95940 }, { "epoch": 4.454710060819908, "grad_norm": 0.26776909828186035, "learning_rate": 3.2834393425878634e-08, "logits/chosen": -18.303478240966797, "logits/rejected": -17.085338592529297, "logps/chosen": -546.9549560546875, "logps/rejected": -355.0885009765625, "loss": 0.3264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.857192039489746, "rewards/margins": 3.131077289581299, "rewards/rejected": 1.7261152267456055, "step": 95950 }, { "epoch": 4.455174334927341, "grad_norm": 25.699817657470703, "learning_rate": 3.2806536979432656e-08, "logits/chosen": -19.844552993774414, "logits/rejected": -17.850444793701172, "logps/chosen": -426.1507263183594, "logps/rejected": -268.61932373046875, "loss": 0.3725, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.758281707763672, "rewards/margins": 2.7391531467437744, "rewards/rejected": 2.0191285610198975, "step": 95960 }, { "epoch": 4.455638609034774, "grad_norm": 3.3867950439453125, "learning_rate": 3.277868053298667e-08, "logits/chosen": -19.267780303955078, "logits/rejected": -18.278940200805664, "logps/chosen": -400.32537841796875, "logps/rejected": -370.2442321777344, "loss": 0.5558, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 3.1000101566314697, "rewards/margins": 0.9501675367355347, "rewards/rejected": 2.1498425006866455, "step": 95970 }, { "epoch": 4.456102883142207, "grad_norm": 78.39669799804688, "learning_rate": 3.275082408654069e-08, "logits/chosen": -19.18061637878418, "logits/rejected": -19.531742095947266, "logps/chosen": -362.0551452636719, "logps/rejected": -417.7383728027344, "loss": 1.1125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6708874702453613, "rewards/margins": -0.10391576588153839, "rewards/rejected": 3.774803638458252, "step": 95980 }, { "epoch": 4.456567157249641, "grad_norm": 185.47921752929688, "learning_rate": 3.272296764009471e-08, "logits/chosen": -18.478261947631836, "logits/rejected": -18.400503158569336, "logps/chosen": -352.33282470703125, "logps/rejected": -297.8467102050781, "loss": 0.6776, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9132933616638184, "rewards/margins": 0.5827730894088745, "rewards/rejected": 2.3305203914642334, "step": 95990 }, { "epoch": 4.457031431357073, "grad_norm": 17.83748435974121, "learning_rate": 3.269511119364873e-08, "logits/chosen": -18.380657196044922, "logits/rejected": -17.273157119750977, "logps/chosen": -358.52655029296875, "logps/rejected": -219.44839477539062, "loss": 0.431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0637784004211426, "rewards/margins": 1.6989977359771729, "rewards/rejected": 0.3647807240486145, "step": 96000 }, { "epoch": 4.457495705464506, "grad_norm": 2.1506640911102295, "learning_rate": 3.266725474720275e-08, "logits/chosen": -18.66379165649414, "logits/rejected": -17.394987106323242, "logps/chosen": -376.4839782714844, "logps/rejected": -256.2743225097656, "loss": 0.5831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1909451484680176, "rewards/margins": 1.5252573490142822, "rewards/rejected": 1.6656877994537354, "step": 96010 }, { "epoch": 4.457959979571939, "grad_norm": 21.663440704345703, "learning_rate": 
3.263939830075677e-08, "logits/chosen": -18.902164459228516, "logits/rejected": -17.9649715423584, "logps/chosen": -470.20318603515625, "logps/rejected": -383.087890625, "loss": 0.5725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5546631813049316, "rewards/margins": 1.5392112731933594, "rewards/rejected": 2.015451669692993, "step": 96020 }, { "epoch": 4.458424253679373, "grad_norm": 5.35239315032959, "learning_rate": 3.2611541854310784e-08, "logits/chosen": -19.62861442565918, "logits/rejected": -18.226139068603516, "logps/chosen": -417.95477294921875, "logps/rejected": -342.73358154296875, "loss": 0.9537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2517595291137695, "rewards/margins": 1.946488618850708, "rewards/rejected": 2.3052709102630615, "step": 96030 }, { "epoch": 4.458888527786805, "grad_norm": 119.25259399414062, "learning_rate": 3.25836854078648e-08, "logits/chosen": -18.455209732055664, "logits/rejected": -17.90087127685547, "logps/chosen": -383.9930114746094, "logps/rejected": -278.11346435546875, "loss": 0.3954, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6153671741485596, "rewards/margins": 1.3064922094345093, "rewards/rejected": 1.3088749647140503, "step": 96040 }, { "epoch": 4.459352801894238, "grad_norm": 167.156982421875, "learning_rate": 3.255582896141882e-08, "logits/chosen": -18.517688751220703, "logits/rejected": -18.157428741455078, "logps/chosen": -381.6533508300781, "logps/rejected": -348.4153747558594, "loss": 1.1553, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5581116676330566, "rewards/margins": 0.5095760226249695, "rewards/rejected": 3.0485358238220215, "step": 96050 }, { "epoch": 4.459817076001672, "grad_norm": 17.71601104736328, "learning_rate": 3.252797251497284e-08, "logits/chosen": -18.65914535522461, "logits/rejected": -18.210847854614258, "logps/chosen": -374.66644287109375, "logps/rejected": -297.70855712890625, "loss": 0.7489, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.375084638595581, "rewards/margins": 1.2886180877685547, "rewards/rejected": 2.0864667892456055, "step": 96060 }, { "epoch": 4.4602813501091045, "grad_norm": 4.609218120574951, "learning_rate": 3.2500116068526855e-08, "logits/chosen": -19.83773422241211, "logits/rejected": -19.565162658691406, "logps/chosen": -383.3969421386719, "logps/rejected": -316.4718017578125, "loss": 0.2588, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.3382086753845215, "rewards/margins": 1.9998584985733032, "rewards/rejected": 2.338350534439087, "step": 96070 }, { "epoch": 4.460745624216537, "grad_norm": 9.764716148376465, "learning_rate": 3.2472259622080876e-08, "logits/chosen": -18.75626564025879, "logits/rejected": -18.424877166748047, "logps/chosen": -390.8609313964844, "logps/rejected": -300.17889404296875, "loss": 1.0395, "rewards/accuracies": 0.5, "rewards/chosen": 2.839996576309204, "rewards/margins": 1.153912901878357, "rewards/rejected": 1.6860835552215576, "step": 96080 }, { "epoch": 4.46120989832397, "grad_norm": 9.814024925231934, "learning_rate": 3.244440317563489e-08, "logits/chosen": -19.855802536010742, "logits/rejected": -20.010740280151367, "logps/chosen": -410.1314392089844, "logps/rejected": -374.3804016113281, "loss": 0.3005, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.859494686126709, "rewards/margins": 1.6058645248413086, "rewards/rejected": 2.2536301612854004, "step": 96090 }, { "epoch": 4.461674172431404, "grad_norm": 144.3162078857422, "learning_rate": 3.241654672918891e-08, "logits/chosen": -19.1246395111084, "logits/rejected": -18.593421936035156, "logps/chosen": -339.6616516113281, "logps/rejected": -313.7955017089844, "loss": 0.4886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.711730480194092, "rewards/margins": 0.8113783597946167, "rewards/rejected": 1.900352120399475, "step": 96100 }, { "epoch": 4.4621384465388365, "grad_norm": 136.0010986328125, "learning_rate": 
3.238869028274293e-08, "logits/chosen": -18.524036407470703, "logits/rejected": -18.371288299560547, "logps/chosen": -347.125, "logps/rejected": -276.49713134765625, "loss": 0.5485, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5761091709136963, "rewards/margins": 0.8439946174621582, "rewards/rejected": 1.732114553451538, "step": 96110 }, { "epoch": 4.462602720646269, "grad_norm": 80.80750274658203, "learning_rate": 3.236083383629695e-08, "logits/chosen": -20.329601287841797, "logits/rejected": -19.881534576416016, "logps/chosen": -406.69647216796875, "logps/rejected": -380.39752197265625, "loss": 0.2887, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.946847438812256, "rewards/margins": 1.964277982711792, "rewards/rejected": 1.9825694561004639, "step": 96120 }, { "epoch": 4.463066994753703, "grad_norm": 81.72018432617188, "learning_rate": 3.233297738985097e-08, "logits/chosen": -19.42226219177246, "logits/rejected": -19.012548446655273, "logps/chosen": -345.69781494140625, "logps/rejected": -333.1711120605469, "loss": 0.868, "rewards/accuracies": 0.5, "rewards/chosen": 2.0567502975463867, "rewards/margins": 0.05213572829961777, "rewards/rejected": 2.0046143531799316, "step": 96130 }, { "epoch": 4.463531268861136, "grad_norm": 25.672136306762695, "learning_rate": 3.2305120943404983e-08, "logits/chosen": -18.517831802368164, "logits/rejected": -17.536331176757812, "logps/chosen": -268.68438720703125, "logps/rejected": -195.41436767578125, "loss": 0.3315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.740544080734253, "rewards/margins": 1.811505913734436, "rewards/rejected": -0.07096154987812042, "step": 96140 }, { "epoch": 4.4639955429685685, "grad_norm": 40.93906784057617, "learning_rate": 3.2277264496959005e-08, "logits/chosen": -18.599790573120117, "logits/rejected": -18.620990753173828, "logps/chosen": -390.4104919433594, "logps/rejected": -375.78912353515625, "loss": 1.0607, "rewards/accuracies": 0.5, 
"rewards/chosen": 2.955631732940674, "rewards/margins": 0.21314501762390137, "rewards/rejected": 2.7424864768981934, "step": 96150 }, { "epoch": 4.464459817076001, "grad_norm": 1.1207780838012695, "learning_rate": 3.224940805051302e-08, "logits/chosen": -18.12204360961914, "logits/rejected": -17.80034828186035, "logps/chosen": -280.6324768066406, "logps/rejected": -249.11758422851562, "loss": 0.9886, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.550614356994629, "rewards/margins": 0.5783596038818359, "rewards/rejected": 0.972254753112793, "step": 96160 }, { "epoch": 4.464924091183435, "grad_norm": 7.973782539367676, "learning_rate": 3.222155160406704e-08, "logits/chosen": -19.051340103149414, "logits/rejected": -17.66680145263672, "logps/chosen": -434.41937255859375, "logps/rejected": -283.1610107421875, "loss": 0.3742, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.055399417877197, "rewards/margins": 2.3175671100616455, "rewards/rejected": 1.7378320693969727, "step": 96170 }, { "epoch": 4.465388365290868, "grad_norm": 0.5929165482521057, "learning_rate": 3.2193695157621055e-08, "logits/chosen": -19.340038299560547, "logits/rejected": -17.610469818115234, "logps/chosen": -377.71331787109375, "logps/rejected": -242.84982299804688, "loss": 0.6096, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.440716743469238, "rewards/margins": 2.2408244609832764, "rewards/rejected": 2.19989275932312, "step": 96180 }, { "epoch": 4.4658526393983005, "grad_norm": 34.49563217163086, "learning_rate": 3.2165838711175076e-08, "logits/chosen": -18.697128295898438, "logits/rejected": -17.872211456298828, "logps/chosen": -328.68035888671875, "logps/rejected": -230.83316040039062, "loss": 0.5281, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2521660327911377, "rewards/margins": 1.727569341659546, "rewards/rejected": 0.5245965719223022, "step": 96190 }, { "epoch": 4.466316913505734, "grad_norm": 47.87236785888672, "learning_rate": 
3.21379822647291e-08, "logits/chosen": -18.968223571777344, "logits/rejected": -19.257930755615234, "logps/chosen": -328.7803649902344, "logps/rejected": -336.66473388671875, "loss": 1.0297, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.139552593231201, "rewards/margins": 0.855053722858429, "rewards/rejected": 2.284499406814575, "step": 96200 }, { "epoch": 4.466781187613167, "grad_norm": 79.6543960571289, "learning_rate": 3.211012581828311e-08, "logits/chosen": -18.432785034179688, "logits/rejected": -17.955707550048828, "logps/chosen": -318.96514892578125, "logps/rejected": -297.4548645019531, "loss": 0.8163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3885669708251953, "rewards/margins": 0.6750284433364868, "rewards/rejected": 1.7135385274887085, "step": 96210 }, { "epoch": 4.4672454617206, "grad_norm": 289.57708740234375, "learning_rate": 3.208226937183713e-08, "logits/chosen": -18.42531967163086, "logits/rejected": -18.547401428222656, "logps/chosen": -235.3230743408203, "logps/rejected": -261.2666320800781, "loss": 1.2227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.267330527305603, "rewards/margins": 0.293314129114151, "rewards/rejected": 0.9740163683891296, "step": 96220 }, { "epoch": 4.467709735828032, "grad_norm": 17.660696029663086, "learning_rate": 3.2054412925391154e-08, "logits/chosen": -19.93886947631836, "logits/rejected": -18.433406829833984, "logps/chosen": -450.25604248046875, "logps/rejected": -316.56048583984375, "loss": 0.284, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.004178047180176, "rewards/margins": 2.3142237663269043, "rewards/rejected": 2.6899542808532715, "step": 96230 }, { "epoch": 4.468174009935466, "grad_norm": 85.2146224975586, "learning_rate": 3.202655647894517e-08, "logits/chosen": -19.791954040527344, "logits/rejected": -18.768184661865234, "logps/chosen": -432.73419189453125, "logps/rejected": -301.51605224609375, "loss": 0.3685, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 4.378396987915039, "rewards/margins": 1.884368658065796, "rewards/rejected": 2.494028329849243, "step": 96240 }, { "epoch": 4.468638284042899, "grad_norm": 88.7799072265625, "learning_rate": 3.199870003249918e-08, "logits/chosen": -18.806846618652344, "logits/rejected": -18.60881233215332, "logps/chosen": -417.4447326660156, "logps/rejected": -263.4990234375, "loss": 0.7102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.657266139984131, "rewards/margins": 0.7980908155441284, "rewards/rejected": 1.859175443649292, "step": 96250 }, { "epoch": 4.469102558150332, "grad_norm": 10.646954536437988, "learning_rate": 3.1970843586053204e-08, "logits/chosen": -20.173778533935547, "logits/rejected": -19.010778427124023, "logps/chosen": -386.35333251953125, "logps/rejected": -287.8878479003906, "loss": 0.609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.555957078933716, "rewards/margins": 1.3607081174850464, "rewards/rejected": 2.195248603820801, "step": 96260 }, { "epoch": 4.469566832257765, "grad_norm": 24.303869247436523, "learning_rate": 3.1942987139607225e-08, "logits/chosen": -19.544862747192383, "logits/rejected": -19.195756912231445, "logps/chosen": -322.82537841796875, "logps/rejected": -266.8738098144531, "loss": 0.4817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.426164150238037, "rewards/margins": 1.3267860412597656, "rewards/rejected": 1.0993781089782715, "step": 96270 }, { "epoch": 4.470031106365198, "grad_norm": 192.81248474121094, "learning_rate": 3.191513069316124e-08, "logits/chosen": -18.7342472076416, "logits/rejected": -18.979921340942383, "logps/chosen": -482.14801025390625, "logps/rejected": -390.9756164550781, "loss": 0.8312, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0454046726226807, "rewards/margins": 0.6264413595199585, "rewards/rejected": 2.4189631938934326, "step": 96280 }, { "epoch": 4.470495380472631, "grad_norm": 1.597400426864624, 
"learning_rate": 3.188727424671526e-08, "logits/chosen": -19.433063507080078, "logits/rejected": -18.959192276000977, "logps/chosen": -356.8709411621094, "logps/rejected": -370.2447204589844, "loss": 0.6269, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0420026779174805, "rewards/margins": 1.2293977737426758, "rewards/rejected": 2.8126046657562256, "step": 96290 }, { "epoch": 4.470959654580064, "grad_norm": 8.514338493347168, "learning_rate": 3.1859417800269276e-08, "logits/chosen": -18.664134979248047, "logits/rejected": -17.686384201049805, "logps/chosen": -434.10498046875, "logps/rejected": -353.13385009765625, "loss": 0.884, "rewards/accuracies": 0.5, "rewards/chosen": 3.5704455375671387, "rewards/margins": 0.8790786862373352, "rewards/rejected": 2.6913671493530273, "step": 96300 }, { "epoch": 4.471423928687497, "grad_norm": 8.47651481628418, "learning_rate": 3.183434699846789e-08, "logits/chosen": -19.44382095336914, "logits/rejected": -18.98837661743164, "logps/chosen": -458.34161376953125, "logps/rejected": -372.18206787109375, "loss": 0.585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.01536226272583, "rewards/margins": 1.0813947916030884, "rewards/rejected": 2.9339675903320312, "step": 96310 }, { "epoch": 4.47188820279493, "grad_norm": 86.70118713378906, "learning_rate": 3.180649055202191e-08, "logits/chosen": -19.5703067779541, "logits/rejected": -17.86252212524414, "logps/chosen": -378.09539794921875, "logps/rejected": -237.18862915039062, "loss": 0.3281, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9255454540252686, "rewards/margins": 2.395803689956665, "rewards/rejected": 1.5297415256500244, "step": 96320 }, { "epoch": 4.472352476902363, "grad_norm": 10.945174217224121, "learning_rate": 3.177863410557593e-08, "logits/chosen": -19.160938262939453, "logits/rejected": -17.84805679321289, "logps/chosen": -335.4590759277344, "logps/rejected": -260.90545654296875, "loss": 0.3915, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 3.131680727005005, "rewards/margins": 1.7333341836929321, "rewards/rejected": 1.3983466625213623, "step": 96330 }, { "epoch": 4.4728167510097965, "grad_norm": 0.3678884506225586, "learning_rate": 3.175077765912995e-08, "logits/chosen": -19.312732696533203, "logits/rejected": -18.69196128845215, "logps/chosen": -353.2586364746094, "logps/rejected": -343.24163818359375, "loss": 0.3531, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.530603408813477, "rewards/margins": 2.627202272415161, "rewards/rejected": 1.9034007787704468, "step": 96340 }, { "epoch": 4.473281025117229, "grad_norm": 0.5083808302879333, "learning_rate": 3.172292121268397e-08, "logits/chosen": -19.191102981567383, "logits/rejected": -19.02980613708496, "logps/chosen": -462.8330993652344, "logps/rejected": -362.509765625, "loss": 0.5207, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.626784086227417, "rewards/margins": 1.1080862283706665, "rewards/rejected": 2.518698215484619, "step": 96350 }, { "epoch": 4.473745299224662, "grad_norm": 217.42523193359375, "learning_rate": 3.169506476623799e-08, "logits/chosen": -19.286319732666016, "logits/rejected": -19.267667770385742, "logps/chosen": -383.4866027832031, "logps/rejected": -353.50750732421875, "loss": 1.1984, "rewards/accuracies": 0.5, "rewards/chosen": 2.4628028869628906, "rewards/margins": 0.1076514944434166, "rewards/rejected": 2.355151653289795, "step": 96360 }, { "epoch": 4.474209573332096, "grad_norm": 0.3687494695186615, "learning_rate": 3.1667208319792004e-08, "logits/chosen": -18.3494815826416, "logits/rejected": -17.33128547668457, "logps/chosen": -276.3697204589844, "logps/rejected": -183.43829345703125, "loss": 0.4051, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.249180316925049, "rewards/margins": 1.9769136905670166, "rewards/rejected": 0.2722666561603546, "step": 96370 }, { "epoch": 4.4746738474395285, "grad_norm": 29.99972152709961, "learning_rate": 
3.1639351873346025e-08, "logits/chosen": -18.386516571044922, "logits/rejected": -17.22481918334961, "logps/chosen": -410.7215270996094, "logps/rejected": -287.61285400390625, "loss": 0.5262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.71913480758667, "rewards/margins": 2.1180503368377686, "rewards/rejected": 1.6010844707489014, "step": 96380 }, { "epoch": 4.475138121546961, "grad_norm": 17.216135025024414, "learning_rate": 3.161149542690004e-08, "logits/chosen": -18.88252830505371, "logits/rejected": -17.897525787353516, "logps/chosen": -481.0835876464844, "logps/rejected": -384.7727966308594, "loss": 0.2234, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9253604412078857, "rewards/margins": 2.174323558807373, "rewards/rejected": 1.7510360479354858, "step": 96390 }, { "epoch": 4.475602395654394, "grad_norm": 0.1999358981847763, "learning_rate": 3.1583638980454054e-08, "logits/chosen": -19.02681541442871, "logits/rejected": -18.546855926513672, "logps/chosen": -437.16351318359375, "logps/rejected": -285.27593994140625, "loss": 0.696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1018686294555664, "rewards/margins": 1.3954131603240967, "rewards/rejected": 1.7064555883407593, "step": 96400 }, { "epoch": 4.476066669761828, "grad_norm": 54.706844329833984, "learning_rate": 3.1555782534008075e-08, "logits/chosen": -18.96969223022461, "logits/rejected": -18.20026397705078, "logps/chosen": -389.2225646972656, "logps/rejected": -290.9993591308594, "loss": 0.3899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8551032543182373, "rewards/margins": 1.8121821880340576, "rewards/rejected": 2.0429210662841797, "step": 96410 }, { "epoch": 4.47653094386926, "grad_norm": 23.223920822143555, "learning_rate": 3.1527926087562096e-08, "logits/chosen": -19.321456909179688, "logits/rejected": -18.19042205810547, "logps/chosen": -418.1397399902344, "logps/rejected": -319.20050048828125, "loss": 0.8391, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.0387916564941406, "rewards/margins": 1.3436706066131592, "rewards/rejected": 1.6951210498809814, "step": 96420 }, { "epoch": 4.476995217976693, "grad_norm": 191.16461181640625, "learning_rate": 3.150006964111611e-08, "logits/chosen": -19.049617767333984, "logits/rejected": -18.495622634887695, "logps/chosen": -360.025390625, "logps/rejected": -320.46453857421875, "loss": 1.0295, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5763537883758545, "rewards/margins": 0.9747588038444519, "rewards/rejected": 2.6015944480895996, "step": 96430 }, { "epoch": 4.477459492084127, "grad_norm": 16.745269775390625, "learning_rate": 3.147221319467013e-08, "logits/chosen": -20.47373390197754, "logits/rejected": -19.284765243530273, "logps/chosen": -500.5409240722656, "logps/rejected": -380.2020263671875, "loss": 0.2394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.930636405944824, "rewards/margins": 2.3584556579589844, "rewards/rejected": 2.572181463241577, "step": 96440 }, { "epoch": 4.47792376619156, "grad_norm": 90.80781555175781, "learning_rate": 3.144435674822415e-08, "logits/chosen": -20.557397842407227, "logits/rejected": -18.942222595214844, "logps/chosen": -474.9571838378906, "logps/rejected": -393.12689208984375, "loss": 0.46, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.053890228271484, "rewards/margins": 1.2435266971588135, "rewards/rejected": 2.8103630542755127, "step": 96450 }, { "epoch": 4.478388040298992, "grad_norm": 1.0481898784637451, "learning_rate": 3.141650030177817e-08, "logits/chosen": -19.478256225585938, "logits/rejected": -18.2784366607666, "logps/chosen": -433.22149658203125, "logps/rejected": -345.0267639160156, "loss": 0.3573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.732541561126709, "rewards/margins": 2.5637900829315186, "rewards/rejected": 2.1687514781951904, "step": 96460 }, { "epoch": 4.478852314406425, "grad_norm": 221.15509033203125, 
"learning_rate": 3.138864385533219e-08, "logits/chosen": -18.94676971435547, "logits/rejected": -18.405261993408203, "logps/chosen": -429.48590087890625, "logps/rejected": -399.6593933105469, "loss": 1.1893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.536216735839844, "rewards/margins": 0.8054541349411011, "rewards/rejected": 3.730762004852295, "step": 96470 }, { "epoch": 4.479316588513859, "grad_norm": 0.04884415119886398, "learning_rate": 3.13607874088862e-08, "logits/chosen": -18.43592071533203, "logits/rejected": -17.817672729492188, "logps/chosen": -414.62213134765625, "logps/rejected": -400.9008483886719, "loss": 0.4248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7314810752868652, "rewards/margins": 1.936693787574768, "rewards/rejected": 1.7947877645492554, "step": 96480 }, { "epoch": 4.479780862621292, "grad_norm": 14.128072738647461, "learning_rate": 3.1332930962440224e-08, "logits/chosen": -19.878158569335938, "logits/rejected": -18.742910385131836, "logps/chosen": -484.43548583984375, "logps/rejected": -341.5446472167969, "loss": 0.3983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.184161186218262, "rewards/margins": 2.100215196609497, "rewards/rejected": 2.0839457511901855, "step": 96490 }, { "epoch": 4.480245136728724, "grad_norm": 42.409664154052734, "learning_rate": 3.130507451599424e-08, "logits/chosen": -19.332475662231445, "logits/rejected": -18.39810562133789, "logps/chosen": -404.8173522949219, "logps/rejected": -345.8384094238281, "loss": 0.3877, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3580870628356934, "rewards/margins": 1.5025066137313843, "rewards/rejected": 1.8555809259414673, "step": 96500 }, { "epoch": 4.480709410836158, "grad_norm": 135.28271484375, "learning_rate": 3.127721806954826e-08, "logits/chosen": -18.052705764770508, "logits/rejected": -18.656457901000977, "logps/chosen": -317.5360412597656, "logps/rejected": -269.87176513671875, "loss": 1.1782, 
"rewards/accuracies": 0.5, "rewards/chosen": 2.4032514095306396, "rewards/margins": 0.9570502042770386, "rewards/rejected": 1.4462010860443115, "step": 96510 }, { "epoch": 4.481173684943591, "grad_norm": 59.11973571777344, "learning_rate": 3.1249361623102275e-08, "logits/chosen": -18.85397720336914, "logits/rejected": -17.801815032958984, "logps/chosen": -319.15936279296875, "logps/rejected": -210.5610809326172, "loss": 0.4909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.824129343032837, "rewards/margins": 0.6497329473495483, "rewards/rejected": 1.174396276473999, "step": 96520 }, { "epoch": 4.481637959051024, "grad_norm": 1.3187562227249146, "learning_rate": 3.1221505176656296e-08, "logits/chosen": -19.398073196411133, "logits/rejected": -18.992380142211914, "logps/chosen": -458.824951171875, "logps/rejected": -370.03985595703125, "loss": 1.2479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.930420398712158, "rewards/margins": 0.4678735136985779, "rewards/rejected": 3.4625465869903564, "step": 96530 }, { "epoch": 4.482102233158456, "grad_norm": 64.1837387084961, "learning_rate": 3.119364873021032e-08, "logits/chosen": -17.960439682006836, "logits/rejected": -17.610671997070312, "logps/chosen": -375.1528015136719, "logps/rejected": -364.60833740234375, "loss": 1.3001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.852860689163208, "rewards/margins": 1.2554419040679932, "rewards/rejected": 1.5974185466766357, "step": 96540 }, { "epoch": 4.48256650726589, "grad_norm": 214.5835723876953, "learning_rate": 3.116579228376433e-08, "logits/chosen": -19.43739891052246, "logits/rejected": -18.284313201904297, "logps/chosen": -414.99853515625, "logps/rejected": -416.7972717285156, "loss": 0.6813, "rewards/accuracies": 0.5, "rewards/chosen": 4.233331680297852, "rewards/margins": 0.9034382700920105, "rewards/rejected": 3.3298935890197754, "step": 96550 }, { "epoch": 4.483030781373323, "grad_norm": 77.06616973876953, 
"learning_rate": 3.113793583731835e-08, "logits/chosen": -18.988140106201172, "logits/rejected": -17.69028663635254, "logps/chosen": -383.0924072265625, "logps/rejected": -244.1703643798828, "loss": 0.2784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5088305473327637, "rewards/margins": 2.13824725151062, "rewards/rejected": 1.370583415031433, "step": 96560 }, { "epoch": 4.4834950554807556, "grad_norm": 8.190610885620117, "learning_rate": 3.1110079390872374e-08, "logits/chosen": -18.898210525512695, "logits/rejected": -17.07767105102539, "logps/chosen": -360.36358642578125, "logps/rejected": -207.5986785888672, "loss": 0.2166, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3453707695007324, "rewards/margins": 3.2402443885803223, "rewards/rejected": 0.1051262766122818, "step": 96570 }, { "epoch": 4.483959329588189, "grad_norm": 8.022046089172363, "learning_rate": 3.108222294442639e-08, "logits/chosen": -18.951005935668945, "logits/rejected": -18.85947608947754, "logps/chosen": -320.74249267578125, "logps/rejected": -311.8602600097656, "loss": 1.0499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5869853496551514, "rewards/margins": 0.5580962300300598, "rewards/rejected": 3.0288891792297363, "step": 96580 }, { "epoch": 4.484423603695622, "grad_norm": 2.3402442932128906, "learning_rate": 3.105436649798041e-08, "logits/chosen": -19.02971649169922, "logits/rejected": -18.902809143066406, "logps/chosen": -443.410888671875, "logps/rejected": -395.15130615234375, "loss": 0.9785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7610976696014404, "rewards/margins": 0.5921494960784912, "rewards/rejected": 3.1689484119415283, "step": 96590 }, { "epoch": 4.484887877803055, "grad_norm": 15.958969116210938, "learning_rate": 3.1026510051534424e-08, "logits/chosen": -19.296297073364258, "logits/rejected": -17.990474700927734, "logps/chosen": -498.5032653808594, "logps/rejected": -326.41168212890625, "loss": 0.5003, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.662696838378906, "rewards/margins": 2.2413158416748047, "rewards/rejected": 2.4213805198669434, "step": 96600 }, { "epoch": 4.485352151910488, "grad_norm": 11.61629867553711, "learning_rate": 3.099865360508844e-08, "logits/chosen": -18.37885284423828, "logits/rejected": -18.61261558532715, "logps/chosen": -287.93218994140625, "logps/rejected": -407.3351745605469, "loss": 1.2387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4838814735412598, "rewards/margins": 0.3346736431121826, "rewards/rejected": 2.149207592010498, "step": 96610 }, { "epoch": 4.485816426017921, "grad_norm": 37.313228607177734, "learning_rate": 3.097079715864246e-08, "logits/chosen": -18.898963928222656, "logits/rejected": -17.926733016967773, "logps/chosen": -541.6369018554688, "logps/rejected": -373.54083251953125, "loss": 0.7709, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3359174728393555, "rewards/margins": 1.6497008800506592, "rewards/rejected": 2.6862165927886963, "step": 96620 }, { "epoch": 4.486280700125354, "grad_norm": 35.28059387207031, "learning_rate": 3.094294071219648e-08, "logits/chosen": -18.248241424560547, "logits/rejected": -17.903079986572266, "logps/chosen": -354.01812744140625, "logps/rejected": -350.73004150390625, "loss": 0.9848, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.962245464324951, "rewards/margins": 0.6122053861618042, "rewards/rejected": 2.3500404357910156, "step": 96630 }, { "epoch": 4.486744974232787, "grad_norm": 47.30995178222656, "learning_rate": 3.0915084265750495e-08, "logits/chosen": -19.810592651367188, "logits/rejected": -19.212411880493164, "logps/chosen": -390.58782958984375, "logps/rejected": -427.30059814453125, "loss": 0.4238, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.5791521072387695, "rewards/margins": 1.3233622312545776, "rewards/rejected": 3.2557899951934814, "step": 96640 }, { "epoch": 4.48720924834022, "grad_norm": 
113.3856430053711, "learning_rate": 3.0887227819304517e-08, "logits/chosen": -19.675701141357422, "logits/rejected": -18.638341903686523, "logps/chosen": -467.880615234375, "logps/rejected": -365.6585998535156, "loss": 0.8397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.292206287384033, "rewards/margins": 0.8081432580947876, "rewards/rejected": 3.4840633869171143, "step": 96650 }, { "epoch": 4.487673522447653, "grad_norm": 119.41576385498047, "learning_rate": 3.085937137285854e-08, "logits/chosen": -18.103443145751953, "logits/rejected": -17.892473220825195, "logps/chosen": -470.99755859375, "logps/rejected": -392.9308776855469, "loss": 0.7569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7118351459503174, "rewards/margins": 0.9895746111869812, "rewards/rejected": 2.7222607135772705, "step": 96660 }, { "epoch": 4.488137796555086, "grad_norm": 94.37467193603516, "learning_rate": 3.083151492641255e-08, "logits/chosen": -20.280668258666992, "logits/rejected": -19.205371856689453, "logps/chosen": -405.8509216308594, "logps/rejected": -315.828857421875, "loss": 0.6074, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.047374248504639, "rewards/margins": 0.9410182237625122, "rewards/rejected": 3.106356143951416, "step": 96670 }, { "epoch": 4.48860207066252, "grad_norm": 14.68758773803711, "learning_rate": 3.0803658479966573e-08, "logits/chosen": -18.867321014404297, "logits/rejected": -17.87449073791504, "logps/chosen": -508.97894287109375, "logps/rejected": -393.50537109375, "loss": 0.4479, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.000046253204346, "rewards/margins": 1.279048204421997, "rewards/rejected": 2.7209975719451904, "step": 96680 }, { "epoch": 4.489066344769952, "grad_norm": 10.33227825164795, "learning_rate": 3.077580203352059e-08, "logits/chosen": -18.313037872314453, "logits/rejected": -17.73757553100586, "logps/chosen": -431.0237731933594, "logps/rejected": -400.0831604003906, "loss": 0.6117, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6127638816833496, "rewards/margins": 0.959452748298645, "rewards/rejected": 1.6533113718032837, "step": 96690 }, { "epoch": 4.489530618877385, "grad_norm": 28.016183853149414, "learning_rate": 3.07479455870746e-08, "logits/chosen": -18.689952850341797, "logits/rejected": -17.836809158325195, "logps/chosen": -285.3563537597656, "logps/rejected": -207.84262084960938, "loss": 0.4193, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.780608654022217, "rewards/margins": 2.236651659011841, "rewards/rejected": 0.5439571142196655, "step": 96700 }, { "epoch": 4.489994892984818, "grad_norm": 247.47158813476562, "learning_rate": 3.0720089140628624e-08, "logits/chosen": -18.577594757080078, "logits/rejected": -18.634685516357422, "logps/chosen": -409.984130859375, "logps/rejected": -366.9228515625, "loss": 1.2436, "rewards/accuracies": 0.5, "rewards/chosen": 3.3232951164245605, "rewards/margins": 0.16645650565624237, "rewards/rejected": 3.156838893890381, "step": 96710 }, { "epoch": 4.490459167092252, "grad_norm": 38.175048828125, "learning_rate": 3.0692232694182645e-08, "logits/chosen": -19.5711669921875, "logits/rejected": -19.9395694732666, "logps/chosen": -458.49627685546875, "logps/rejected": -486.974365234375, "loss": 0.9589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.164559841156006, "rewards/margins": 0.09496436268091202, "rewards/rejected": 4.069596290588379, "step": 96720 }, { "epoch": 4.490923441199684, "grad_norm": 207.482421875, "learning_rate": 3.066437624773666e-08, "logits/chosen": -18.759990692138672, "logits/rejected": -18.629907608032227, "logps/chosen": -379.72064208984375, "logps/rejected": -303.3125, "loss": 1.7409, "rewards/accuracies": 0.5, "rewards/chosen": 3.16489839553833, "rewards/margins": 0.4463161826133728, "rewards/rejected": 2.7185821533203125, "step": 96730 }, { "epoch": 4.491387715307117, "grad_norm": 257.48529052734375, "learning_rate": 
3.063651980129068e-08, "logits/chosen": -18.858680725097656, "logits/rejected": -19.11567497253418, "logps/chosen": -341.61163330078125, "logps/rejected": -341.13592529296875, "loss": 0.9569, "rewards/accuracies": 0.5, "rewards/chosen": 2.671865940093994, "rewards/margins": 0.10839400440454483, "rewards/rejected": 2.563472270965576, "step": 96740 }, { "epoch": 4.491851989414551, "grad_norm": 3.7625572681427, "learning_rate": 3.06086633548447e-08, "logits/chosen": -18.656641006469727, "logits/rejected": -17.126567840576172, "logps/chosen": -409.74334716796875, "logps/rejected": -244.541259765625, "loss": 0.3452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5196919441223145, "rewards/margins": 2.3094286918640137, "rewards/rejected": 1.210262417793274, "step": 96750 }, { "epoch": 4.4923162635219835, "grad_norm": 0.9949008822441101, "learning_rate": 3.0580806908398716e-08, "logits/chosen": -19.052621841430664, "logits/rejected": -17.668182373046875, "logps/chosen": -510.577880859375, "logps/rejected": -298.0733642578125, "loss": 0.9277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.323152542114258, "rewards/margins": 2.5036888122558594, "rewards/rejected": 1.8194639682769775, "step": 96760 }, { "epoch": 4.492780537629416, "grad_norm": 75.6142578125, "learning_rate": 3.055295046195274e-08, "logits/chosen": -18.717729568481445, "logits/rejected": -18.33418846130371, "logps/chosen": -348.1717834472656, "logps/rejected": -297.83245849609375, "loss": 0.7214, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.017282009124756, "rewards/margins": 0.8598654866218567, "rewards/rejected": 1.157416582107544, "step": 96770 }, { "epoch": 4.493244811736849, "grad_norm": 38.107059478759766, "learning_rate": 3.052509401550676e-08, "logits/chosen": -18.78671646118164, "logits/rejected": -18.8499813079834, "logps/chosen": -380.15350341796875, "logps/rejected": -367.3238830566406, "loss": 0.8499, "rewards/accuracies": 0.5, "rewards/chosen": 
4.569129467010498, "rewards/margins": 0.9515434503555298, "rewards/rejected": 3.617586135864258, "step": 96780 }, { "epoch": 4.493709085844283, "grad_norm": 62.44498062133789, "learning_rate": 3.049723756906077e-08, "logits/chosen": -20.30426788330078, "logits/rejected": -19.00377082824707, "logps/chosen": -373.90625, "logps/rejected": -294.4194641113281, "loss": 0.393, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9269137382507324, "rewards/margins": 1.7656457424163818, "rewards/rejected": 2.1612679958343506, "step": 96790 }, { "epoch": 4.4941733599517155, "grad_norm": 36.66542053222656, "learning_rate": 3.046938112261479e-08, "logits/chosen": -19.287059783935547, "logits/rejected": -18.385791778564453, "logps/chosen": -393.1922912597656, "logps/rejected": -338.00579833984375, "loss": 0.5509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8896644115448, "rewards/margins": 1.0993459224700928, "rewards/rejected": 2.790318012237549, "step": 96800 }, { "epoch": 4.494637634059148, "grad_norm": 128.57884216308594, "learning_rate": 3.044152467616881e-08, "logits/chosen": -18.920501708984375, "logits/rejected": -18.797441482543945, "logps/chosen": -367.626953125, "logps/rejected": -343.6565856933594, "loss": 0.8251, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6670167446136475, "rewards/margins": 0.44650277495384216, "rewards/rejected": 2.2205140590667725, "step": 96810 }, { "epoch": 4.495101908166582, "grad_norm": 64.48157501220703, "learning_rate": 3.041366822972282e-08, "logits/chosen": -19.233043670654297, "logits/rejected": -17.942119598388672, "logps/chosen": -369.52325439453125, "logps/rejected": -235.2703857421875, "loss": 0.7222, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5298984050750732, "rewards/margins": 2.8911027908325195, "rewards/rejected": 0.6387956142425537, "step": 96820 }, { "epoch": 4.495566182274015, "grad_norm": 2.148663282394409, "learning_rate": 3.0385811783276844e-08, 
"logits/chosen": -19.133663177490234, "logits/rejected": -18.449148178100586, "logps/chosen": -389.1549072265625, "logps/rejected": -332.78173828125, "loss": 0.3434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9731178283691406, "rewards/margins": 1.9809796810150146, "rewards/rejected": 1.9921382665634155, "step": 96830 }, { "epoch": 4.4960304563814475, "grad_norm": 1.469356894493103, "learning_rate": 3.0357955336830866e-08, "logits/chosen": -19.0933780670166, "logits/rejected": -17.512134552001953, "logps/chosen": -461.7969665527344, "logps/rejected": -270.46124267578125, "loss": 0.2164, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.576127052307129, "rewards/margins": 2.650096893310547, "rewards/rejected": 1.9260305166244507, "step": 96840 }, { "epoch": 4.49649473048888, "grad_norm": 29.121232986450195, "learning_rate": 3.033009889038488e-08, "logits/chosen": -20.029781341552734, "logits/rejected": -19.48306655883789, "logps/chosen": -407.6043395996094, "logps/rejected": -347.09356689453125, "loss": 0.295, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.052149772644043, "rewards/margins": 1.5946663618087769, "rewards/rejected": 2.4574830532073975, "step": 96850 }, { "epoch": 4.496959004596314, "grad_norm": 160.22816467285156, "learning_rate": 3.03022424439389e-08, "logits/chosen": -18.995521545410156, "logits/rejected": -19.420673370361328, "logps/chosen": -378.7063903808594, "logps/rejected": -441.50543212890625, "loss": 1.4106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7795114517211914, "rewards/margins": -0.42450839281082153, "rewards/rejected": 3.2040200233459473, "step": 96860 }, { "epoch": 4.497423278703747, "grad_norm": 197.6532745361328, "learning_rate": 3.027438599749292e-08, "logits/chosen": -18.1212158203125, "logits/rejected": -17.57607650756836, "logps/chosen": -302.9732971191406, "logps/rejected": -319.3825378417969, "loss": 1.5634, "rewards/accuracies": 0.5, "rewards/chosen": 
2.2255232334136963, "rewards/margins": 0.5946768522262573, "rewards/rejected": 1.6308462619781494, "step": 96870 }, { "epoch": 4.4978875528111795, "grad_norm": 0.7697225213050842, "learning_rate": 3.024652955104694e-08, "logits/chosen": -19.666412353515625, "logits/rejected": -19.216020584106445, "logps/chosen": -498.38458251953125, "logps/rejected": -381.0932312011719, "loss": 0.3125, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.25770378112793, "rewards/margins": 2.365110397338867, "rewards/rejected": 2.8925933837890625, "step": 96880 }, { "epoch": 4.498351826918613, "grad_norm": 13.77833080291748, "learning_rate": 3.021867310460095e-08, "logits/chosen": -20.043115615844727, "logits/rejected": -18.041698455810547, "logps/chosen": -511.87933349609375, "logps/rejected": -274.06427001953125, "loss": 0.1551, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.7070746421813965, "rewards/margins": 3.3784496784210205, "rewards/rejected": 1.328624963760376, "step": 96890 }, { "epoch": 4.498816101026046, "grad_norm": 19.867431640625, "learning_rate": 3.019081665815497e-08, "logits/chosen": -19.677337646484375, "logits/rejected": -17.923175811767578, "logps/chosen": -406.50103759765625, "logps/rejected": -294.17791748046875, "loss": 0.566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.35313081741333, "rewards/margins": 0.9661922454833984, "rewards/rejected": 1.3869386911392212, "step": 96900 }, { "epoch": 4.499280375133479, "grad_norm": 138.39511108398438, "learning_rate": 3.016296021170899e-08, "logits/chosen": -19.32828140258789, "logits/rejected": -18.2254581451416, "logps/chosen": -393.1079406738281, "logps/rejected": -212.79525756835938, "loss": 0.2966, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.07757568359375, "rewards/margins": 2.526451587677002, "rewards/rejected": 1.5511245727539062, "step": 96910 }, { "epoch": 4.499744649240911, "grad_norm": 12.293354988098145, "learning_rate": 3.013510376526301e-08, 
"logits/chosen": -18.863245010375977, "logits/rejected": -19.27655029296875, "logps/chosen": -491.43743896484375, "logps/rejected": -397.2401428222656, "loss": 0.8065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.900723934173584, "rewards/margins": 0.8178808093070984, "rewards/rejected": 3.08284330368042, "step": 96920 }, { "epoch": 4.500208923348345, "grad_norm": 40.29788589477539, "learning_rate": 3.010724731881703e-08, "logits/chosen": -18.5594539642334, "logits/rejected": -17.64988899230957, "logps/chosen": -360.162109375, "logps/rejected": -249.70187377929688, "loss": 0.7663, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7022299766540527, "rewards/margins": 1.3368269205093384, "rewards/rejected": 1.3654028177261353, "step": 96930 }, { "epoch": 4.500673197455778, "grad_norm": 93.95050811767578, "learning_rate": 3.0079390872371044e-08, "logits/chosen": -19.00067901611328, "logits/rejected": -18.795726776123047, "logps/chosen": -302.6178894042969, "logps/rejected": -259.70196533203125, "loss": 1.47, "rewards/accuracies": 0.5, "rewards/chosen": 3.2502388954162598, "rewards/margins": 0.47372111678123474, "rewards/rejected": 2.776517391204834, "step": 96940 }, { "epoch": 4.501137471563211, "grad_norm": 3.5500128269195557, "learning_rate": 3.0051534425925065e-08, "logits/chosen": -18.972137451171875, "logits/rejected": -18.774646759033203, "logps/chosen": -325.8126220703125, "logps/rejected": -256.9331970214844, "loss": 0.5246, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8227174282073975, "rewards/margins": 1.7132623195648193, "rewards/rejected": 1.1094552278518677, "step": 96950 }, { "epoch": 4.501601745670644, "grad_norm": 7.876657962799072, "learning_rate": 3.0023677979479086e-08, "logits/chosen": -19.107715606689453, "logits/rejected": -18.04226303100586, "logps/chosen": -373.942138671875, "logps/rejected": -260.0247802734375, "loss": 0.3923, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
3.9053521156311035, "rewards/margins": 1.831479787826538, "rewards/rejected": 2.0738720893859863, "step": 96960 }, { "epoch": 4.502066019778077, "grad_norm": 201.86488342285156, "learning_rate": 2.99958215330331e-08, "logits/chosen": -19.658199310302734, "logits/rejected": -19.417728424072266, "logps/chosen": -379.4686584472656, "logps/rejected": -334.07354736328125, "loss": 0.635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2952606678009033, "rewards/margins": 0.821480929851532, "rewards/rejected": 2.4737799167633057, "step": 96970 }, { "epoch": 4.50253029388551, "grad_norm": 41.95185470581055, "learning_rate": 2.996796508658712e-08, "logits/chosen": -19.678882598876953, "logits/rejected": -19.530147552490234, "logps/chosen": -388.5533142089844, "logps/rejected": -398.8345947265625, "loss": 0.9006, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.663425922393799, "rewards/margins": 0.595670759677887, "rewards/rejected": 2.0677552223205566, "step": 96980 }, { "epoch": 4.502994567992943, "grad_norm": 4.008118629455566, "learning_rate": 2.9940108640141137e-08, "logits/chosen": -19.993091583251953, "logits/rejected": -18.910722732543945, "logps/chosen": -359.69476318359375, "logps/rejected": -307.1504821777344, "loss": 0.5314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1836624145507812, "rewards/margins": 1.5218867063522339, "rewards/rejected": 1.6617753505706787, "step": 96990 }, { "epoch": 4.503458842100376, "grad_norm": 2.745838165283203, "learning_rate": 2.991225219369516e-08, "logits/chosen": -20.21512222290039, "logits/rejected": -18.534944534301758, "logps/chosen": -496.0904235839844, "logps/rejected": -301.94476318359375, "loss": 0.4348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.9809136390686035, "rewards/margins": 2.449857711791992, "rewards/rejected": 2.531055450439453, "step": 97000 }, { "epoch": 4.503923116207809, "grad_norm": 0.14571534097194672, "learning_rate": 2.988439574724917e-08, 
"logits/chosen": -18.955028533935547, "logits/rejected": -18.227375030517578, "logps/chosen": -354.69189453125, "logps/rejected": -335.8168029785156, "loss": 0.3064, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.101945400238037, "rewards/margins": 2.490955114364624, "rewards/rejected": 1.610990285873413, "step": 97010 }, { "epoch": 4.504387390315242, "grad_norm": 10.32986831665039, "learning_rate": 2.9856539300803193e-08, "logits/chosen": -18.50459861755371, "logits/rejected": -17.958513259887695, "logps/chosen": -405.9141845703125, "logps/rejected": -361.7884826660156, "loss": 0.5683, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.000152111053467, "rewards/margins": 2.0629889965057373, "rewards/rejected": 1.9371631145477295, "step": 97020 }, { "epoch": 4.5048516644226755, "grad_norm": 202.93458557128906, "learning_rate": 2.982868285435721e-08, "logits/chosen": -18.97663116455078, "logits/rejected": -18.64618492126465, "logps/chosen": -305.52325439453125, "logps/rejected": -337.8109436035156, "loss": 0.6259, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.812443494796753, "rewards/margins": 0.9778926968574524, "rewards/rejected": 1.8345508575439453, "step": 97030 }, { "epoch": 4.505315938530108, "grad_norm": 2.8517537117004395, "learning_rate": 2.980082640791123e-08, "logits/chosen": -20.118986129760742, "logits/rejected": -18.545352935791016, "logps/chosen": -452.35772705078125, "logps/rejected": -322.4424743652344, "loss": 0.6426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.986607313156128, "rewards/margins": 1.02262282371521, "rewards/rejected": 1.963984489440918, "step": 97040 }, { "epoch": 4.505780212637541, "grad_norm": 233.33596801757812, "learning_rate": 2.9772969961465247e-08, "logits/chosen": -19.22507095336914, "logits/rejected": -18.95875358581543, "logps/chosen": -327.273681640625, "logps/rejected": -351.68060302734375, "loss": 0.6513, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.601895809173584, "rewards/margins": 0.2699284553527832, "rewards/rejected": 2.33196759223938, "step": 97050 }, { "epoch": 4.506244486744974, "grad_norm": 2.3135433197021484, "learning_rate": 2.9745113515019268e-08, "logits/chosen": -20.15700340270996, "logits/rejected": -19.133075714111328, "logps/chosen": -500.1499938964844, "logps/rejected": -367.9322509765625, "loss": 0.5013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.208966255187988, "rewards/margins": 1.7195584774017334, "rewards/rejected": 3.489407777786255, "step": 97060 }, { "epoch": 4.5067087608524075, "grad_norm": 11.353780746459961, "learning_rate": 2.9717257068573286e-08, "logits/chosen": -19.72677230834961, "logits/rejected": -19.61583709716797, "logps/chosen": -406.7494201660156, "logps/rejected": -403.2217102050781, "loss": 1.1392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.473543882369995, "rewards/margins": 0.3605729639530182, "rewards/rejected": 3.1129708290100098, "step": 97070 }, { "epoch": 4.50717303495984, "grad_norm": 2.056652545928955, "learning_rate": 2.9689400622127304e-08, "logits/chosen": -18.875022888183594, "logits/rejected": -18.744976043701172, "logps/chosen": -344.86639404296875, "logps/rejected": -311.7961120605469, "loss": 1.0271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.687117099761963, "rewards/margins": 0.8712466955184937, "rewards/rejected": 1.8158702850341797, "step": 97080 }, { "epoch": 4.507637309067273, "grad_norm": 206.3984375, "learning_rate": 2.9661544175681318e-08, "logits/chosen": -19.08831214904785, "logits/rejected": -18.697872161865234, "logps/chosen": -361.865234375, "logps/rejected": -325.09686279296875, "loss": 0.5067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1209423542022705, "rewards/margins": 1.8323485851287842, "rewards/rejected": 1.2885935306549072, "step": 97090 }, { "epoch": 4.508101583174707, "grad_norm": 249.07054138183594, "learning_rate": 
2.9633687729235336e-08, "logits/chosen": -18.219219207763672, "logits/rejected": -18.140277862548828, "logps/chosen": -314.82659912109375, "logps/rejected": -366.89349365234375, "loss": 1.0109, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3048512935638428, "rewards/margins": 0.028517937287688255, "rewards/rejected": 2.2763330936431885, "step": 97100 }, { "epoch": 4.508565857282139, "grad_norm": 86.50740814208984, "learning_rate": 2.9605831282789357e-08, "logits/chosen": -18.837411880493164, "logits/rejected": -18.459754943847656, "logps/chosen": -393.9085998535156, "logps/rejected": -350.8118591308594, "loss": 1.063, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8336448669433594, "rewards/margins": 1.3975623846054077, "rewards/rejected": 2.436082363128662, "step": 97110 }, { "epoch": 4.509030131389572, "grad_norm": 30.835498809814453, "learning_rate": 2.9577974836343375e-08, "logits/chosen": -19.615020751953125, "logits/rejected": -18.569042205810547, "logps/chosen": -463.67999267578125, "logps/rejected": -378.0827941894531, "loss": 1.0168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.999476909637451, "rewards/margins": 1.0953924655914307, "rewards/rejected": 2.9040842056274414, "step": 97120 }, { "epoch": 4.509494405497005, "grad_norm": 18.620697021484375, "learning_rate": 2.9550118389897393e-08, "logits/chosen": -19.14539337158203, "logits/rejected": -17.51998519897461, "logps/chosen": -463.245849609375, "logps/rejected": -303.33062744140625, "loss": 0.5996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.376349925994873, "rewards/margins": 2.6662259101867676, "rewards/rejected": 1.7101237773895264, "step": 97130 }, { "epoch": 4.509958679604439, "grad_norm": 82.84352111816406, "learning_rate": 2.9522261943451414e-08, "logits/chosen": -19.185293197631836, "logits/rejected": -17.965988159179688, "logps/chosen": -363.8186950683594, "logps/rejected": -248.59933471679688, "loss": 0.5909, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1291892528533936, "rewards/margins": 1.018904447555542, "rewards/rejected": 2.1102850437164307, "step": 97140 }, { "epoch": 4.510422953711871, "grad_norm": 79.13590240478516, "learning_rate": 2.9494405497005432e-08, "logits/chosen": -18.45439338684082, "logits/rejected": -17.868640899658203, "logps/chosen": -360.71417236328125, "logps/rejected": -339.70025634765625, "loss": 1.082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1280999183654785, "rewards/margins": 0.08091425895690918, "rewards/rejected": 3.0471854209899902, "step": 97150 }, { "epoch": 4.510887227819304, "grad_norm": 122.15037536621094, "learning_rate": 2.946654905055945e-08, "logits/chosen": -18.47418975830078, "logits/rejected": -18.15237808227539, "logps/chosen": -293.67333984375, "logps/rejected": -291.85223388671875, "loss": 0.6612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4456915855407715, "rewards/margins": 0.6412932276725769, "rewards/rejected": 1.8043981790542603, "step": 97160 }, { "epoch": 4.511351501926738, "grad_norm": 128.6134796142578, "learning_rate": 2.9438692604113468e-08, "logits/chosen": -18.902606964111328, "logits/rejected": -18.05660057067871, "logps/chosen": -418.70849609375, "logps/rejected": -345.701416015625, "loss": 0.8085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.380234956741333, "rewards/margins": 1.010576605796814, "rewards/rejected": 2.3696582317352295, "step": 97170 }, { "epoch": 4.511815776034171, "grad_norm": 207.1110382080078, "learning_rate": 2.941083615766749e-08, "logits/chosen": -18.72176742553711, "logits/rejected": -18.429821014404297, "logps/chosen": -289.8385925292969, "logps/rejected": -262.5317077636719, "loss": 1.1515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8363691568374634, "rewards/margins": 0.6787394881248474, "rewards/rejected": 1.1576296091079712, "step": 97180 }, { "epoch": 4.512280050141603, "grad_norm": 
105.17008209228516, "learning_rate": 2.9382979711221503e-08, "logits/chosen": -19.026103973388672, "logits/rejected": -18.142892837524414, "logps/chosen": -376.1662902832031, "logps/rejected": -292.976318359375, "loss": 0.3489, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.935554027557373, "rewards/margins": 1.6752424240112305, "rewards/rejected": 1.2603113651275635, "step": 97190 }, { "epoch": 4.512744324249036, "grad_norm": 0.15023639798164368, "learning_rate": 2.935512326477552e-08, "logits/chosen": -19.228572845458984, "logits/rejected": -17.954437255859375, "logps/chosen": -494.32525634765625, "logps/rejected": -307.2493591308594, "loss": 0.2581, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1984333992004395, "rewards/margins": 3.219820499420166, "rewards/rejected": 0.9786123037338257, "step": 97200 }, { "epoch": 4.51320859835647, "grad_norm": 24.64549446105957, "learning_rate": 2.932726681832954e-08, "logits/chosen": -18.97085952758789, "logits/rejected": -18.546367645263672, "logps/chosen": -327.97052001953125, "logps/rejected": -325.203125, "loss": 1.3033, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.86447811126709, "rewards/margins": -0.17315402626991272, "rewards/rejected": 3.0376319885253906, "step": 97210 }, { "epoch": 4.513672872463903, "grad_norm": 1.2367812395095825, "learning_rate": 2.9299410371883557e-08, "logits/chosen": -19.053852081298828, "logits/rejected": -17.42491340637207, "logps/chosen": -467.3037109375, "logps/rejected": -288.8123779296875, "loss": 0.506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6596896648406982, "rewards/margins": 2.5393624305725098, "rewards/rejected": 1.1203269958496094, "step": 97220 }, { "epoch": 4.514137146571335, "grad_norm": 36.92721939086914, "learning_rate": 2.9271553925437578e-08, "logits/chosen": -19.52599334716797, "logits/rejected": -19.01409912109375, "logps/chosen": -429.5731506347656, "logps/rejected": -371.5527648925781, "loss": 0.6547, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7715706825256348, "rewards/margins": 1.0156781673431396, "rewards/rejected": 2.755892753601074, "step": 97230 }, { "epoch": 4.514601420678769, "grad_norm": 0.3800904452800751, "learning_rate": 2.9243697478991596e-08, "logits/chosen": -18.998750686645508, "logits/rejected": -18.704830169677734, "logps/chosen": -284.63897705078125, "logps/rejected": -240.0218048095703, "loss": 1.4779, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.14070725440979, "rewards/margins": 0.7063068151473999, "rewards/rejected": 1.4344006776809692, "step": 97240 }, { "epoch": 4.515065694786202, "grad_norm": 267.9881286621094, "learning_rate": 2.9215841032545614e-08, "logits/chosen": -18.12187385559082, "logits/rejected": -17.59139633178711, "logps/chosen": -312.723876953125, "logps/rejected": -316.9261169433594, "loss": 0.5271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.682870388031006, "rewards/margins": 1.9019187688827515, "rewards/rejected": 0.7809513807296753, "step": 97250 }, { "epoch": 4.515529968893635, "grad_norm": 27.722570419311523, "learning_rate": 2.918798458609963e-08, "logits/chosen": -18.886470794677734, "logits/rejected": -17.366098403930664, "logps/chosen": -478.5906677246094, "logps/rejected": -290.5111389160156, "loss": 0.2647, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.520005702972412, "rewards/margins": 2.576653480529785, "rewards/rejected": 0.9433521032333374, "step": 97260 }, { "epoch": 4.515994243001067, "grad_norm": 76.89613342285156, "learning_rate": 2.9160128139653653e-08, "logits/chosen": -19.600419998168945, "logits/rejected": -18.722028732299805, "logps/chosen": -457.55145263671875, "logps/rejected": -373.81048583984375, "loss": 0.3414, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.671365261077881, "rewards/margins": 1.6082096099853516, "rewards/rejected": 3.0631556510925293, "step": 97270 }, { "epoch": 4.516458517108501, "grad_norm": 
26.42640495300293, "learning_rate": 2.913227169320767e-08, "logits/chosen": -19.180082321166992, "logits/rejected": -19.233322143554688, "logps/chosen": -455.3790588378906, "logps/rejected": -417.9178161621094, "loss": 1.2681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7382798194885254, "rewards/margins": 0.1928844004869461, "rewards/rejected": 2.5453953742980957, "step": 97280 }, { "epoch": 4.516922791215934, "grad_norm": 5.453152656555176, "learning_rate": 2.9104415246761685e-08, "logits/chosen": -19.185317993164062, "logits/rejected": -18.385557174682617, "logps/chosen": -435.71636962890625, "logps/rejected": -314.4934387207031, "loss": 0.2975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.849503993988037, "rewards/margins": 2.5865447521209717, "rewards/rejected": 1.2629592418670654, "step": 97290 }, { "epoch": 4.5173870653233665, "grad_norm": 296.7666015625, "learning_rate": 2.9076558800315703e-08, "logits/chosen": -20.191999435424805, "logits/rejected": -18.565135955810547, "logps/chosen": -461.577392578125, "logps/rejected": -381.4953918457031, "loss": 0.5633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.092965602874756, "rewards/margins": 0.9899373054504395, "rewards/rejected": 3.1030285358428955, "step": 97300 }, { "epoch": 4.5178513394308, "grad_norm": 103.61864471435547, "learning_rate": 2.904870235386972e-08, "logits/chosen": -19.314363479614258, "logits/rejected": -19.39011573791504, "logps/chosen": -414.87628173828125, "logps/rejected": -393.64825439453125, "loss": 1.5693, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.634105920791626, "rewards/margins": -1.092926263809204, "rewards/rejected": 3.72703218460083, "step": 97310 }, { "epoch": 4.518315613538233, "grad_norm": 8.499120712280273, "learning_rate": 2.9020845907423742e-08, "logits/chosen": -19.45390510559082, "logits/rejected": -19.659534454345703, "logps/chosen": -284.3479309082031, "logps/rejected": -326.72528076171875, "loss": 
1.3944, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.822129726409912, "rewards/margins": -0.3469565510749817, "rewards/rejected": 3.169086456298828, "step": 97320 }, { "epoch": 4.518779887645666, "grad_norm": 74.676513671875, "learning_rate": 2.899298946097776e-08, "logits/chosen": -19.159210205078125, "logits/rejected": -17.779266357421875, "logps/chosen": -476.9730529785156, "logps/rejected": -325.868408203125, "loss": 0.4589, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.366715908050537, "rewards/margins": 2.637218952178955, "rewards/rejected": 1.729496955871582, "step": 97330 }, { "epoch": 4.519244161753099, "grad_norm": 51.52458953857422, "learning_rate": 2.8965133014531778e-08, "logits/chosen": -18.781484603881836, "logits/rejected": -17.316442489624023, "logps/chosen": -449.53863525390625, "logps/rejected": -270.30255126953125, "loss": 0.2852, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9082324504852295, "rewards/margins": 2.452056884765625, "rewards/rejected": 1.4561755657196045, "step": 97340 }, { "epoch": 4.519708435860532, "grad_norm": 0.08509188890457153, "learning_rate": 2.89372765680858e-08, "logits/chosen": -19.26679039001465, "logits/rejected": -18.555110931396484, "logps/chosen": -470.9619140625, "logps/rejected": -381.9678039550781, "loss": 0.9186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.926868915557861, "rewards/margins": 1.7146202325820923, "rewards/rejected": 3.2122490406036377, "step": 97350 }, { "epoch": 4.520172709967965, "grad_norm": 98.37552642822266, "learning_rate": 2.8909420121639817e-08, "logits/chosen": -20.0470027923584, "logits/rejected": -19.379573822021484, "logps/chosen": -373.01031494140625, "logps/rejected": -290.2176818847656, "loss": 0.5304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.476609706878662, "rewards/margins": 1.4638161659240723, "rewards/rejected": 2.0127930641174316, "step": 97360 }, { "epoch": 4.520636984075399, "grad_norm": 
61.562255859375, "learning_rate": 2.8881563675193834e-08, "logits/chosen": -18.6898136138916, "logits/rejected": -18.665231704711914, "logps/chosen": -346.48748779296875, "logps/rejected": -324.1114501953125, "loss": 1.4487, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.1617650985717773, "rewards/margins": -0.5156368613243103, "rewards/rejected": 2.6774020195007324, "step": 97370 }, { "epoch": 4.521101258182831, "grad_norm": 8.546470642089844, "learning_rate": 2.8853707228747852e-08, "logits/chosen": -19.256528854370117, "logits/rejected": -18.029417037963867, "logps/chosen": -497.56317138671875, "logps/rejected": -398.78729248046875, "loss": 0.3728, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.599029541015625, "rewards/margins": 1.8286702632904053, "rewards/rejected": 1.7703592777252197, "step": 97380 }, { "epoch": 4.521565532290264, "grad_norm": 48.990814208984375, "learning_rate": 2.8825850782301867e-08, "logits/chosen": -19.533849716186523, "logits/rejected": -19.034820556640625, "logps/chosen": -465.8155822753906, "logps/rejected": -341.99481201171875, "loss": 0.5183, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8291423320770264, "rewards/margins": 1.809302568435669, "rewards/rejected": 2.0198397636413574, "step": 97390 }, { "epoch": 4.522029806397697, "grad_norm": 249.83836364746094, "learning_rate": 2.8797994335855888e-08, "logits/chosen": -19.102249145507812, "logits/rejected": -18.376148223876953, "logps/chosen": -366.80413818359375, "logps/rejected": -369.9690856933594, "loss": 1.0022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.932791233062744, "rewards/margins": 0.33023911714553833, "rewards/rejected": 2.6025521755218506, "step": 97400 }, { "epoch": 4.522494080505131, "grad_norm": 0.1370546519756317, "learning_rate": 2.8770137889409906e-08, "logits/chosen": -18.331912994384766, "logits/rejected": -18.351367950439453, "logps/chosen": -293.11077880859375, "logps/rejected": 
-284.6874694824219, "loss": 1.3571, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7456328868865967, "rewards/margins": 1.0587718486785889, "rewards/rejected": 1.6868610382080078, "step": 97410 }, { "epoch": 4.522958354612563, "grad_norm": 48.509849548339844, "learning_rate": 2.8742281442963924e-08, "logits/chosen": -18.663604736328125, "logits/rejected": -18.119800567626953, "logps/chosen": -343.97882080078125, "logps/rejected": -296.1309814453125, "loss": 0.3615, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6195104122161865, "rewards/margins": 1.0640780925750732, "rewards/rejected": 0.5554323196411133, "step": 97420 }, { "epoch": 4.523422628719996, "grad_norm": 0.018479719758033752, "learning_rate": 2.871442499651794e-08, "logits/chosen": -19.270404815673828, "logits/rejected": -18.785364151000977, "logps/chosen": -426.0445861816406, "logps/rejected": -326.23193359375, "loss": 0.487, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.095916271209717, "rewards/margins": 2.3780953884124756, "rewards/rejected": 2.7178213596343994, "step": 97430 }, { "epoch": 4.52388690282743, "grad_norm": 60.95444869995117, "learning_rate": 2.8686568550071963e-08, "logits/chosen": -19.733455657958984, "logits/rejected": -19.2222900390625, "logps/chosen": -436.48284912109375, "logps/rejected": -367.4817810058594, "loss": 0.4339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8653724193573, "rewards/margins": 1.3962891101837158, "rewards/rejected": 2.469083309173584, "step": 97440 }, { "epoch": 4.5243511769348626, "grad_norm": 168.53721618652344, "learning_rate": 2.865871210362598e-08, "logits/chosen": -19.826753616333008, "logits/rejected": -19.263912200927734, "logps/chosen": -318.1526184082031, "logps/rejected": -369.6378173828125, "loss": 1.1813, "rewards/accuracies": 0.5, "rewards/chosen": 2.6752138137817383, "rewards/margins": 0.007994865998625755, "rewards/rejected": 2.6672189235687256, "step": 97450 }, { "epoch": 
4.524815451042295, "grad_norm": 2.121934413909912, "learning_rate": 2.863085565718e-08, "logits/chosen": -19.24690818786621, "logits/rejected": -18.10519790649414, "logps/chosen": -383.0975646972656, "logps/rejected": -400.29119873046875, "loss": 0.8777, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.006926536560059, "rewards/margins": 1.5842206478118896, "rewards/rejected": 3.4227054119110107, "step": 97460 }, { "epoch": 4.525279725149728, "grad_norm": 107.93911743164062, "learning_rate": 2.8602999210734016e-08, "logits/chosen": -18.98748779296875, "logits/rejected": -19.14853286743164, "logps/chosen": -427.67950439453125, "logps/rejected": -378.2364196777344, "loss": 0.8388, "rewards/accuracies": 0.5, "rewards/chosen": 2.947335958480835, "rewards/margins": 0.3849857747554779, "rewards/rejected": 2.562350273132324, "step": 97470 }, { "epoch": 4.525743999257162, "grad_norm": 13.039617538452148, "learning_rate": 2.8575142764288037e-08, "logits/chosen": -18.831790924072266, "logits/rejected": -19.1475830078125, "logps/chosen": -302.3588562011719, "logps/rejected": -306.8577880859375, "loss": 0.8737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9005762338638306, "rewards/margins": 0.3360995054244995, "rewards/rejected": 1.5644766092300415, "step": 97480 }, { "epoch": 4.5262082733645945, "grad_norm": 168.62615966796875, "learning_rate": 2.8547286317842052e-08, "logits/chosen": -19.146808624267578, "logits/rejected": -18.621618270874023, "logps/chosen": -349.7686462402344, "logps/rejected": -337.83056640625, "loss": 1.3981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3165009021759033, "rewards/margins": 0.01908116415143013, "rewards/rejected": 2.297420024871826, "step": 97490 }, { "epoch": 4.526672547472027, "grad_norm": 260.588134765625, "learning_rate": 2.851942987139607e-08, "logits/chosen": -19.551233291625977, "logits/rejected": -18.279857635498047, "logps/chosen": -402.60150146484375, "logps/rejected": 
-352.04815673828125, "loss": 0.8317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0936970710754395, "rewards/margins": 1.528267502784729, "rewards/rejected": 2.565429449081421, "step": 97500 }, { "epoch": 4.527136821579461, "grad_norm": 3.8085508346557617, "learning_rate": 2.8491573424950088e-08, "logits/chosen": -18.920307159423828, "logits/rejected": -16.848957061767578, "logps/chosen": -401.4214782714844, "logps/rejected": -228.81015014648438, "loss": 0.2397, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.565784454345703, "rewards/margins": 2.836238384246826, "rewards/rejected": 0.7295461893081665, "step": 97510 }, { "epoch": 4.527601095686894, "grad_norm": 13.837515830993652, "learning_rate": 2.8463716978504105e-08, "logits/chosen": -19.491641998291016, "logits/rejected": -18.368602752685547, "logps/chosen": -406.7585754394531, "logps/rejected": -311.73187255859375, "loss": 0.3319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.801406145095825, "rewards/margins": 2.1563820838928223, "rewards/rejected": 1.6450239419937134, "step": 97520 }, { "epoch": 4.5280653697943265, "grad_norm": 63.701541900634766, "learning_rate": 2.8435860532058127e-08, "logits/chosen": -18.617137908935547, "logits/rejected": -18.566661834716797, "logps/chosen": -393.38568115234375, "logps/rejected": -340.80267333984375, "loss": 0.4009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.568622589111328, "rewards/margins": 1.4979121685028076, "rewards/rejected": 2.0707099437713623, "step": 97530 }, { "epoch": 4.528529643901759, "grad_norm": 22.00452995300293, "learning_rate": 2.8408004085612144e-08, "logits/chosen": -19.933671951293945, "logits/rejected": -19.142242431640625, "logps/chosen": -391.66156005859375, "logps/rejected": -318.018310546875, "loss": 0.5651, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2999515533447266, "rewards/margins": 1.4757230281829834, "rewards/rejected": 1.8242286443710327, "step": 97540 }, 
{ "epoch": 4.528993918009193, "grad_norm": 14.670075416564941, "learning_rate": 2.8380147639166162e-08, "logits/chosen": -19.11151123046875, "logits/rejected": -17.906063079833984, "logps/chosen": -330.9996337890625, "logps/rejected": -237.97055053710938, "loss": 0.6108, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.215395450592041, "rewards/margins": 2.107837677001953, "rewards/rejected": 1.1075576543807983, "step": 97550 }, { "epoch": 4.529458192116626, "grad_norm": 1.5316816568374634, "learning_rate": 2.8352291192720183e-08, "logits/chosen": -19.544445037841797, "logits/rejected": -18.31858253479004, "logps/chosen": -340.3046875, "logps/rejected": -236.68692016601562, "loss": 0.6455, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8895092010498047, "rewards/margins": 2.0733065605163574, "rewards/rejected": 0.8162027597427368, "step": 97560 }, { "epoch": 4.5299224662240585, "grad_norm": 9.663269996643066, "learning_rate": 2.83244347462742e-08, "logits/chosen": -19.592164993286133, "logits/rejected": -19.59901237487793, "logps/chosen": -450.4642639160156, "logps/rejected": -420.2217712402344, "loss": 0.6836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.907027006149292, "rewards/margins": 1.2231148481369019, "rewards/rejected": 2.6839118003845215, "step": 97570 }, { "epoch": 4.530386740331492, "grad_norm": 85.7212905883789, "learning_rate": 2.829657829982822e-08, "logits/chosen": -20.066072463989258, "logits/rejected": -19.959888458251953, "logps/chosen": -345.43634033203125, "logps/rejected": -318.3908386230469, "loss": 1.2245, "rewards/accuracies": 0.5, "rewards/chosen": 3.6376450061798096, "rewards/margins": 0.49694937467575073, "rewards/rejected": 3.140695810317993, "step": 97580 }, { "epoch": 4.530851014438925, "grad_norm": 136.2955322265625, "learning_rate": 2.8268721853382234e-08, "logits/chosen": -18.924577713012695, "logits/rejected": -18.517719268798828, "logps/chosen": -510.93548583984375, "logps/rejected": 
-438.981201171875, "loss": 0.7629, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.0498857498168945, "rewards/margins": 1.5608876943588257, "rewards/rejected": 2.4889979362487793, "step": 97590 }, { "epoch": 4.531315288546358, "grad_norm": 188.51133728027344, "learning_rate": 2.824086540693625e-08, "logits/chosen": -19.12772560119629, "logits/rejected": -18.573522567749023, "logps/chosen": -384.6192932128906, "logps/rejected": -355.97259521484375, "loss": 0.671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4061007499694824, "rewards/margins": 1.1034629344940186, "rewards/rejected": 2.3026375770568848, "step": 97600 }, { "epoch": 4.5317795626537904, "grad_norm": 3.225578784942627, "learning_rate": 2.8213008960490273e-08, "logits/chosen": -19.542999267578125, "logits/rejected": -18.03424072265625, "logps/chosen": -520.4740600585938, "logps/rejected": -352.54364013671875, "loss": 0.256, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.553285598754883, "rewards/margins": 2.1661899089813232, "rewards/rejected": 2.3870954513549805, "step": 97610 }, { "epoch": 4.532243836761224, "grad_norm": 39.06935119628906, "learning_rate": 2.818515251404429e-08, "logits/chosen": -19.85287857055664, "logits/rejected": -18.644886016845703, "logps/chosen": -468.67462158203125, "logps/rejected": -347.572998046875, "loss": 0.6447, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.153286933898926, "rewards/margins": 1.4539353847503662, "rewards/rejected": 2.6993513107299805, "step": 97620 }, { "epoch": 4.532708110868657, "grad_norm": 278.0680847167969, "learning_rate": 2.815729606759831e-08, "logits/chosen": -18.33220100402832, "logits/rejected": -17.968402862548828, "logps/chosen": -433.98583984375, "logps/rejected": -364.71807861328125, "loss": 0.7471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6202938556671143, "rewards/margins": 0.5772473216056824, "rewards/rejected": 3.043046712875366, "step": 97630 }, { "epoch": 
4.53317238497609, "grad_norm": 0.5454840660095215, "learning_rate": 2.8129439621152326e-08, "logits/chosen": -19.520496368408203, "logits/rejected": -18.740331649780273, "logps/chosen": -403.09100341796875, "logps/rejected": -332.1002197265625, "loss": 0.8607, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9115428924560547, "rewards/margins": 1.0182979106903076, "rewards/rejected": 2.893244981765747, "step": 97640 }, { "epoch": 4.533636659083523, "grad_norm": 103.30152130126953, "learning_rate": 2.8101583174706347e-08, "logits/chosen": -20.713794708251953, "logits/rejected": -18.963207244873047, "logps/chosen": -463.3709411621094, "logps/rejected": -355.73175048828125, "loss": 0.3606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9263057708740234, "rewards/margins": 2.0439372062683105, "rewards/rejected": 1.882368803024292, "step": 97650 }, { "epoch": 4.534100933190956, "grad_norm": 63.02589416503906, "learning_rate": 2.8073726728260365e-08, "logits/chosen": -19.94637107849121, "logits/rejected": -18.882261276245117, "logps/chosen": -486.2411193847656, "logps/rejected": -388.8586120605469, "loss": 0.8527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9695839881896973, "rewards/margins": 0.9168726205825806, "rewards/rejected": 3.0527117252349854, "step": 97660 }, { "epoch": 4.534565207298389, "grad_norm": 64.5820083618164, "learning_rate": 2.8045870281814383e-08, "logits/chosen": -19.049644470214844, "logits/rejected": -18.776668548583984, "logps/chosen": -397.35369873046875, "logps/rejected": -364.5935363769531, "loss": 0.452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6096065044403076, "rewards/margins": 1.5812180042266846, "rewards/rejected": 2.0283889770507812, "step": 97670 }, { "epoch": 4.535029481405822, "grad_norm": 1.1225099563598633, "learning_rate": 2.80180138353684e-08, "logits/chosen": -18.82998275756836, "logits/rejected": -17.112653732299805, "logps/chosen": -517.1349487304688, 
"logps/rejected": -353.3844909667969, "loss": 0.4472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.254240989685059, "rewards/margins": 1.983891248703003, "rewards/rejected": 2.2703497409820557, "step": 97680 }, { "epoch": 4.535493755513255, "grad_norm": 28.719301223754883, "learning_rate": 2.7990157388922415e-08, "logits/chosen": -19.602815628051758, "logits/rejected": -18.231956481933594, "logps/chosen": -420.1752014160156, "logps/rejected": -274.6708984375, "loss": 0.2237, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5770773887634277, "rewards/margins": 2.2708635330200195, "rewards/rejected": 1.3062140941619873, "step": 97690 }, { "epoch": 4.535958029620688, "grad_norm": 1.6566038131713867, "learning_rate": 2.7962300942476437e-08, "logits/chosen": -19.449928283691406, "logits/rejected": -18.76752281188965, "logps/chosen": -429.8114318847656, "logps/rejected": -353.34033203125, "loss": 0.5281, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5880966186523438, "rewards/margins": 1.445183515548706, "rewards/rejected": 2.142913341522217, "step": 97700 }, { "epoch": 4.536422303728121, "grad_norm": 209.64784240722656, "learning_rate": 2.7934444496030454e-08, "logits/chosen": -18.839075088500977, "logits/rejected": -17.929243087768555, "logps/chosen": -415.7770080566406, "logps/rejected": -325.18756103515625, "loss": 0.4885, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.120669364929199, "rewards/margins": 1.7861478328704834, "rewards/rejected": 2.334521532058716, "step": 97710 }, { "epoch": 4.5368865778355545, "grad_norm": 251.05398559570312, "learning_rate": 2.7906588049584472e-08, "logits/chosen": -19.567180633544922, "logits/rejected": -18.417463302612305, "logps/chosen": -367.9477844238281, "logps/rejected": -237.01785278320312, "loss": 1.17, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.592024326324463, "rewards/margins": 1.9100263118743896, "rewards/rejected": 1.6819982528686523, "step": 
97720 }, { "epoch": 4.537350851942987, "grad_norm": 23.15788459777832, "learning_rate": 2.787873160313849e-08, "logits/chosen": -19.275623321533203, "logits/rejected": -18.33300018310547, "logps/chosen": -432.8529357910156, "logps/rejected": -360.06707763671875, "loss": 0.8484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.175248146057129, "rewards/margins": 0.9507344961166382, "rewards/rejected": 2.224513530731201, "step": 97730 }, { "epoch": 4.53781512605042, "grad_norm": 3.64469313621521, "learning_rate": 2.785087515669251e-08, "logits/chosen": -19.489803314208984, "logits/rejected": -19.048511505126953, "logps/chosen": -429.429931640625, "logps/rejected": -310.27081298828125, "loss": 0.3092, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.568394660949707, "rewards/margins": 2.477367877960205, "rewards/rejected": 2.091026782989502, "step": 97740 }, { "epoch": 4.538279400157853, "grad_norm": 85.29208374023438, "learning_rate": 2.782301871024653e-08, "logits/chosen": -19.37752914428711, "logits/rejected": -19.094722747802734, "logps/chosen": -321.5355529785156, "logps/rejected": -260.2427673339844, "loss": 1.7736, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.338707447052002, "rewards/margins": 0.06591419875621796, "rewards/rejected": 2.2727932929992676, "step": 97750 }, { "epoch": 4.5387436742652865, "grad_norm": 5.546432971954346, "learning_rate": 2.7795162263800547e-08, "logits/chosen": -19.980880737304688, "logits/rejected": -18.56740379333496, "logps/chosen": -419.69439697265625, "logps/rejected": -254.1465301513672, "loss": 0.536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.341434478759766, "rewards/margins": 3.0096967220306396, "rewards/rejected": 1.331737756729126, "step": 97760 }, { "epoch": 4.539207948372719, "grad_norm": 14.371956825256348, "learning_rate": 2.7767305817354568e-08, "logits/chosen": -19.185733795166016, "logits/rejected": -18.44609260559082, "logps/chosen": -467.53497314453125, 
"logps/rejected": -410.12103271484375, "loss": 0.6837, "rewards/accuracies": 0.5, "rewards/chosen": 3.5558667182922363, "rewards/margins": 0.7327675223350525, "rewards/rejected": 2.82309889793396, "step": 97770 }, { "epoch": 4.539672222480152, "grad_norm": 103.19424438476562, "learning_rate": 2.7739449370908583e-08, "logits/chosen": -20.19972038269043, "logits/rejected": -18.70069122314453, "logps/chosen": -381.27459716796875, "logps/rejected": -235.81222534179688, "loss": 0.1722, "rewards/accuracies": 1.0, "rewards/chosen": 3.749171733856201, "rewards/margins": 3.2086148262023926, "rewards/rejected": 0.5405570268630981, "step": 97780 }, { "epoch": 4.540136496587586, "grad_norm": 95.92215728759766, "learning_rate": 2.77115929244626e-08, "logits/chosen": -19.049211502075195, "logits/rejected": -18.601215362548828, "logps/chosen": -363.4411926269531, "logps/rejected": -359.7248840332031, "loss": 0.7756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.636587142944336, "rewards/margins": 0.47861605882644653, "rewards/rejected": 2.157971143722534, "step": 97790 }, { "epoch": 4.540600770695018, "grad_norm": 80.85787200927734, "learning_rate": 2.768373647801662e-08, "logits/chosen": -18.761547088623047, "logits/rejected": -18.970027923583984, "logps/chosen": -437.4039001464844, "logps/rejected": -377.14300537109375, "loss": 0.8481, "rewards/accuracies": 0.5, "rewards/chosen": 3.0109946727752686, "rewards/margins": 0.32223600149154663, "rewards/rejected": 2.6887588500976562, "step": 97800 }, { "epoch": 4.541065044802451, "grad_norm": 1.1965758800506592, "learning_rate": 2.7655880031570636e-08, "logits/chosen": -19.090911865234375, "logits/rejected": -18.363418579101562, "logps/chosen": -388.94049072265625, "logps/rejected": -307.7701416015625, "loss": 1.0029, "rewards/accuracies": 0.5, "rewards/chosen": 2.7126383781433105, "rewards/margins": 0.7621105909347534, "rewards/rejected": 1.9505277872085571, "step": 97810 }, { "epoch": 4.541529318909884, "grad_norm": 
149.0620574951172, "learning_rate": 2.7628023585124657e-08, "logits/chosen": -19.909212112426758, "logits/rejected": -18.851512908935547, "logps/chosen": -342.7421875, "logps/rejected": -280.6572265625, "loss": 1.3097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8922414779663086, "rewards/margins": 0.17255298793315887, "rewards/rejected": 2.7196884155273438, "step": 97820 }, { "epoch": 4.541993593017318, "grad_norm": 7.034167289733887, "learning_rate": 2.7600167138678675e-08, "logits/chosen": -19.119903564453125, "logits/rejected": -18.006824493408203, "logps/chosen": -408.21514892578125, "logps/rejected": -298.97216796875, "loss": 0.6694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2627696990966797, "rewards/margins": 1.2267186641693115, "rewards/rejected": 2.0360512733459473, "step": 97830 }, { "epoch": 4.54245786712475, "grad_norm": 41.87709426879883, "learning_rate": 2.7572310692232693e-08, "logits/chosen": -18.996517181396484, "logits/rejected": -17.188318252563477, "logps/chosen": -404.6848449707031, "logps/rejected": -290.70635986328125, "loss": 0.3835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.602846622467041, "rewards/margins": 2.1555752754211426, "rewards/rejected": 1.4472713470458984, "step": 97840 }, { "epoch": 4.542922141232183, "grad_norm": 102.11029815673828, "learning_rate": 2.754445424578671e-08, "logits/chosen": -19.50668716430664, "logits/rejected": -19.066997528076172, "logps/chosen": -401.80950927734375, "logps/rejected": -429.9170837402344, "loss": 0.7858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.091292381286621, "rewards/margins": 0.6196869015693665, "rewards/rejected": 3.4716053009033203, "step": 97850 }, { "epoch": 4.543386415339617, "grad_norm": 1.4390969276428223, "learning_rate": 2.7516597799340732e-08, "logits/chosen": -19.376575469970703, "logits/rejected": -18.63750648498535, "logps/chosen": -418.6904296875, "logps/rejected": -339.61822509765625, "loss": 
0.4194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.406078338623047, "rewards/margins": 1.6128826141357422, "rewards/rejected": 2.7931957244873047, "step": 97860 }, { "epoch": 4.54385068944705, "grad_norm": 30.860811233520508, "learning_rate": 2.748874135289475e-08, "logits/chosen": -19.779399871826172, "logits/rejected": -18.79037857055664, "logps/chosen": -440.97564697265625, "logps/rejected": -333.0479431152344, "loss": 0.8274, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3736331462860107, "rewards/margins": 1.1086814403533936, "rewards/rejected": 2.2649521827697754, "step": 97870 }, { "epoch": 4.544314963554482, "grad_norm": 31.936426162719727, "learning_rate": 2.7460884906448764e-08, "logits/chosen": -18.671764373779297, "logits/rejected": -18.451435089111328, "logps/chosen": -347.78369140625, "logps/rejected": -266.2801208496094, "loss": 1.1672, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.663548707962036, "rewards/margins": 0.7064493894577026, "rewards/rejected": 1.9570995569229126, "step": 97880 }, { "epoch": 4.544779237661915, "grad_norm": 41.40879821777344, "learning_rate": 2.7433028460002782e-08, "logits/chosen": -19.86172103881836, "logits/rejected": -18.328800201416016, "logps/chosen": -288.39190673828125, "logps/rejected": -203.82177734375, "loss": 0.4174, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0067248344421387, "rewards/margins": 2.2509257793426514, "rewards/rejected": 0.7557988166809082, "step": 97890 }, { "epoch": 4.545243511769349, "grad_norm": 10.125504493713379, "learning_rate": 2.74051720135568e-08, "logits/chosen": -19.14649200439453, "logits/rejected": -17.450679779052734, "logps/chosen": -381.53741455078125, "logps/rejected": -211.98971557617188, "loss": 0.4133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.465672016143799, "rewards/margins": 2.501312494277954, "rewards/rejected": 0.9643591642379761, "step": 97900 }, { "epoch": 4.545707785876782, 
"grad_norm": 234.19168090820312, "learning_rate": 2.737731556711082e-08, "logits/chosen": -19.740392684936523, "logits/rejected": -18.511581420898438, "logps/chosen": -397.01806640625, "logps/rejected": -343.63836669921875, "loss": 0.6546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.563045501708984, "rewards/margins": 2.240833044052124, "rewards/rejected": 2.3222126960754395, "step": 97910 }, { "epoch": 4.546172059984214, "grad_norm": 33.48667526245117, "learning_rate": 2.734945912066484e-08, "logits/chosen": -19.721237182617188, "logits/rejected": -18.587360382080078, "logps/chosen": -431.9810485839844, "logps/rejected": -358.65484619140625, "loss": 0.4269, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.106362342834473, "rewards/margins": 1.2798031568527222, "rewards/rejected": 2.826558828353882, "step": 97920 }, { "epoch": 4.546636334091648, "grad_norm": 31.886205673217773, "learning_rate": 2.7321602674218857e-08, "logits/chosen": -18.780546188354492, "logits/rejected": -17.854907989501953, "logps/chosen": -405.6694030761719, "logps/rejected": -326.9270324707031, "loss": 0.4766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4306297302246094, "rewards/margins": 1.5540869235992432, "rewards/rejected": 1.876543402671814, "step": 97930 }, { "epoch": 4.547100608199081, "grad_norm": 0.07700683176517487, "learning_rate": 2.7293746227772875e-08, "logits/chosen": -19.978130340576172, "logits/rejected": -19.238399505615234, "logps/chosen": -428.8778381347656, "logps/rejected": -358.9617004394531, "loss": 0.607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6976635456085205, "rewards/margins": 1.6664152145385742, "rewards/rejected": 2.0312483310699463, "step": 97940 }, { "epoch": 4.547564882306514, "grad_norm": 66.6897201538086, "learning_rate": 2.7265889781326896e-08, "logits/chosen": -18.883792877197266, "logits/rejected": -18.79401397705078, "logps/chosen": -267.06488037109375, "logps/rejected": 
-279.51153564453125, "loss": 0.9262, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.582930088043213, "rewards/margins": 0.2969803512096405, "rewards/rejected": 2.28594970703125, "step": 97950 }, { "epoch": 4.548029156413946, "grad_norm": 35.03367233276367, "learning_rate": 2.7238033334880914e-08, "logits/chosen": -18.474262237548828, "logits/rejected": -17.708486557006836, "logps/chosen": -346.9947509765625, "logps/rejected": -320.87420654296875, "loss": 0.8891, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8325722217559814, "rewards/margins": 0.5749436020851135, "rewards/rejected": 2.2576284408569336, "step": 97960 }, { "epoch": 4.54849343052138, "grad_norm": 16.869140625, "learning_rate": 2.7210176888434932e-08, "logits/chosen": -19.69675064086914, "logits/rejected": -18.811546325683594, "logps/chosen": -401.2724914550781, "logps/rejected": -348.5337219238281, "loss": 0.9506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.973242998123169, "rewards/margins": 0.9033640027046204, "rewards/rejected": 3.0698795318603516, "step": 97970 }, { "epoch": 4.548957704628813, "grad_norm": 54.523773193359375, "learning_rate": 2.7182320441988946e-08, "logits/chosen": -19.375198364257812, "logits/rejected": -18.716575622558594, "logps/chosen": -436.6913146972656, "logps/rejected": -269.51739501953125, "loss": 0.3625, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4610953330993652, "rewards/margins": 1.973398208618164, "rewards/rejected": 1.4876971244812012, "step": 97980 }, { "epoch": 4.5494219787362455, "grad_norm": 3.9719347953796387, "learning_rate": 2.7154463995542967e-08, "logits/chosen": -18.53647232055664, "logits/rejected": -17.699880599975586, "logps/chosen": -467.61346435546875, "logps/rejected": -403.56817626953125, "loss": 0.7604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.983865261077881, "rewards/margins": 1.0203297138214111, "rewards/rejected": 1.9635359048843384, "step": 97990 }, { 
"epoch": 4.549886252843679, "grad_norm": 100.6948013305664, "learning_rate": 2.7126607549096985e-08, "logits/chosen": -18.991186141967773, "logits/rejected": -18.23341178894043, "logps/chosen": -404.72406005859375, "logps/rejected": -337.3684997558594, "loss": 0.4302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.683666944503784, "rewards/margins": 2.1737046241760254, "rewards/rejected": 1.5099620819091797, "step": 98000 }, { "epoch": 4.550350526951112, "grad_norm": 122.76990509033203, "learning_rate": 2.7098751102651003e-08, "logits/chosen": -18.819427490234375, "logits/rejected": -18.7442569732666, "logps/chosen": -475.33270263671875, "logps/rejected": -394.3626403808594, "loss": 0.6906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.211341857910156, "rewards/margins": 0.8861327171325684, "rewards/rejected": 3.325209379196167, "step": 98010 }, { "epoch": 4.550814801058545, "grad_norm": 7.098396301269531, "learning_rate": 2.707089465620502e-08, "logits/chosen": -18.971874237060547, "logits/rejected": -18.5122127532959, "logps/chosen": -421.29510498046875, "logps/rejected": -321.37091064453125, "loss": 0.5148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.805917263031006, "rewards/margins": 1.3935667276382446, "rewards/rejected": 1.4123504161834717, "step": 98020 }, { "epoch": 4.5512790751659775, "grad_norm": 18.697229385375977, "learning_rate": 2.7043038209759042e-08, "logits/chosen": -19.0003604888916, "logits/rejected": -18.86916160583496, "logps/chosen": -325.3089599609375, "logps/rejected": -345.26043701171875, "loss": 1.0369, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.764774799346924, "rewards/margins": -0.230555459856987, "rewards/rejected": 2.9953300952911377, "step": 98030 }, { "epoch": 4.551743349273411, "grad_norm": 56.508907318115234, "learning_rate": 2.701518176331306e-08, "logits/chosen": -18.676136016845703, "logits/rejected": -18.615079879760742, "logps/chosen": -346.35894775390625, 
"logps/rejected": -374.7742614746094, "loss": 1.3649, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.415663480758667, "rewards/margins": 0.17496132850646973, "rewards/rejected": 3.2407023906707764, "step": 98040 }, { "epoch": 4.552207623380844, "grad_norm": 64.27149200439453, "learning_rate": 2.6987325316867078e-08, "logits/chosen": -18.911426544189453, "logits/rejected": -18.505905151367188, "logps/chosen": -276.23040771484375, "logps/rejected": -239.27468872070312, "loss": 0.8536, "rewards/accuracies": 0.5, "rewards/chosen": 2.032278537750244, "rewards/margins": 0.6969154477119446, "rewards/rejected": 1.3353631496429443, "step": 98050 }, { "epoch": 4.552671897488277, "grad_norm": 40.253868103027344, "learning_rate": 2.6959468870421096e-08, "logits/chosen": -19.186433792114258, "logits/rejected": -17.7609920501709, "logps/chosen": -382.1148986816406, "logps/rejected": -227.0008544921875, "loss": 0.2326, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5232577323913574, "rewards/margins": 3.287480115890503, "rewards/rejected": 0.23577766120433807, "step": 98060 }, { "epoch": 4.55313617159571, "grad_norm": 0.082560233771801, "learning_rate": 2.6931612423975117e-08, "logits/chosen": -19.052701950073242, "logits/rejected": -17.480838775634766, "logps/chosen": -558.9989013671875, "logps/rejected": -349.043212890625, "loss": 0.2097, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.438408851623535, "rewards/margins": 3.4323127269744873, "rewards/rejected": 2.006096839904785, "step": 98070 }, { "epoch": 4.553600445703143, "grad_norm": 1.807787299156189, "learning_rate": 2.690375597752913e-08, "logits/chosen": -18.345426559448242, "logits/rejected": -18.001283645629883, "logps/chosen": -380.35491943359375, "logps/rejected": -303.87677001953125, "loss": 0.5867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.610612392425537, "rewards/margins": 1.3244884014129639, "rewards/rejected": 2.286123752593994, "step": 98080 }, { 
"epoch": 4.554064719810576, "grad_norm": 49.02371597290039, "learning_rate": 2.687589953108315e-08, "logits/chosen": -18.993228912353516, "logits/rejected": -17.837162017822266, "logps/chosen": -350.62176513671875, "logps/rejected": -242.24331665039062, "loss": 0.9469, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.085919141769409, "rewards/margins": 2.118786573410034, "rewards/rejected": 0.9671324491500854, "step": 98090 }, { "epoch": 4.554528993918009, "grad_norm": 14.500853538513184, "learning_rate": 2.6848043084637167e-08, "logits/chosen": -19.505611419677734, "logits/rejected": -18.54140853881836, "logps/chosen": -421.801513671875, "logps/rejected": -367.84002685546875, "loss": 0.4821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.339848279953003, "rewards/margins": 0.8845523595809937, "rewards/rejected": 2.455296039581299, "step": 98100 }, { "epoch": 4.554993268025442, "grad_norm": 183.6520538330078, "learning_rate": 2.6820186638191185e-08, "logits/chosen": -19.203460693359375, "logits/rejected": -18.578163146972656, "logps/chosen": -432.680908203125, "logps/rejected": -326.04913330078125, "loss": 0.3307, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.902937650680542, "rewards/margins": 2.1165719032287598, "rewards/rejected": 1.7863657474517822, "step": 98110 }, { "epoch": 4.555457542132875, "grad_norm": 41.60740280151367, "learning_rate": 2.6792330191745206e-08, "logits/chosen": -18.750276565551758, "logits/rejected": -18.718833923339844, "logps/chosen": -363.95208740234375, "logps/rejected": -443.7378845214844, "loss": 0.7983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6420340538024902, "rewards/margins": 0.6193429827690125, "rewards/rejected": 3.022691249847412, "step": 98120 }, { "epoch": 4.555921816240308, "grad_norm": 106.81077575683594, "learning_rate": 2.6764473745299224e-08, "logits/chosen": -18.346294403076172, "logits/rejected": -17.788366317749023, "logps/chosen": -447.35687255859375, 
"logps/rejected": -400.0316467285156, "loss": 0.9131, "rewards/accuracies": 0.5, "rewards/chosen": 3.183363676071167, "rewards/margins": 0.35226911306381226, "rewards/rejected": 2.831094741821289, "step": 98130 }, { "epoch": 4.556386090347742, "grad_norm": 6.5023603439331055, "learning_rate": 2.6736617298853242e-08, "logits/chosen": -19.195913314819336, "logits/rejected": -18.73583221435547, "logps/chosen": -332.8590393066406, "logps/rejected": -224.6728515625, "loss": 1.6623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1432929039001465, "rewards/margins": 0.06377825886011124, "rewards/rejected": 2.079514503479004, "step": 98140 }, { "epoch": 4.556850364455174, "grad_norm": 76.40701293945312, "learning_rate": 2.670876085240726e-08, "logits/chosen": -19.22528648376465, "logits/rejected": -18.792917251586914, "logps/chosen": -409.579345703125, "logps/rejected": -365.1820068359375, "loss": 0.9861, "rewards/accuracies": 0.5, "rewards/chosen": 4.026413440704346, "rewards/margins": 1.0057332515716553, "rewards/rejected": 3.0206799507141113, "step": 98150 }, { "epoch": 4.557314638562607, "grad_norm": 2.472749710083008, "learning_rate": 2.668090440596128e-08, "logits/chosen": -18.668899536132812, "logits/rejected": -18.144357681274414, "logps/chosen": -450.6964416503906, "logps/rejected": -445.3147888183594, "loss": 0.6958, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.029968738555908, "rewards/margins": 1.2766311168670654, "rewards/rejected": 2.7533373832702637, "step": 98160 }, { "epoch": 4.557778912670041, "grad_norm": 94.77426147460938, "learning_rate": 2.66530479595153e-08, "logits/chosen": -18.796308517456055, "logits/rejected": -19.015127182006836, "logps/chosen": -384.8506164550781, "logps/rejected": -399.26434326171875, "loss": 0.7985, "rewards/accuracies": 0.5, "rewards/chosen": 2.8875937461853027, "rewards/margins": 0.2060113400220871, "rewards/rejected": 2.68158221244812, "step": 98170 }, { "epoch": 4.5582431867774735, 
"grad_norm": 0.8178996443748474, "learning_rate": 2.6625191513069313e-08, "logits/chosen": -18.440696716308594, "logits/rejected": -16.863964080810547, "logps/chosen": -399.85833740234375, "logps/rejected": -219.96875, "loss": 0.2719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.054420232772827, "rewards/margins": 3.0623836517333984, "rewards/rejected": -0.00796356238424778, "step": 98180 }, { "epoch": 4.558707460884906, "grad_norm": 77.49618530273438, "learning_rate": 2.659733506662333e-08, "logits/chosen": -19.28030776977539, "logits/rejected": -18.89652442932129, "logps/chosen": -413.66162109375, "logps/rejected": -341.98065185546875, "loss": 0.2548, "rewards/accuracies": 1.0, "rewards/chosen": 4.413901329040527, "rewards/margins": 1.9781010150909424, "rewards/rejected": 2.435800075531006, "step": 98190 }, { "epoch": 4.55917173499234, "grad_norm": 10.144049644470215, "learning_rate": 2.6569478620177352e-08, "logits/chosen": -19.56591033935547, "logits/rejected": -19.053375244140625, "logps/chosen": -472.7430725097656, "logps/rejected": -388.19537353515625, "loss": 0.7512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5132203102111816, "rewards/margins": 1.2026419639587402, "rewards/rejected": 2.3105783462524414, "step": 98200 }, { "epoch": 4.559636009099773, "grad_norm": 4.119133949279785, "learning_rate": 2.654162217373137e-08, "logits/chosen": -19.621295928955078, "logits/rejected": -19.381771087646484, "logps/chosen": -527.0401000976562, "logps/rejected": -395.907958984375, "loss": 0.4608, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.819424629211426, "rewards/margins": 1.53390634059906, "rewards/rejected": 3.2855186462402344, "step": 98210 }, { "epoch": 4.5601002832072055, "grad_norm": 25.809919357299805, "learning_rate": 2.6513765727285388e-08, "logits/chosen": -19.67032241821289, "logits/rejected": -17.8544979095459, "logps/chosen": -428.6058044433594, "logps/rejected": -266.6941833496094, "loss": 0.968, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5976786613464355, "rewards/margins": 1.6978470087051392, "rewards/rejected": 1.8998321294784546, "step": 98220 }, { "epoch": 4.560564557314638, "grad_norm": 2.4111580848693848, "learning_rate": 2.6485909280839406e-08, "logits/chosen": -19.710386276245117, "logits/rejected": -18.942520141601562, "logps/chosen": -289.3462829589844, "logps/rejected": -230.01882934570312, "loss": 0.4936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.880384683609009, "rewards/margins": 1.7556209564208984, "rewards/rejected": 1.1247632503509521, "step": 98230 }, { "epoch": 4.561028831422072, "grad_norm": 0.24441654980182648, "learning_rate": 2.6458052834393427e-08, "logits/chosen": -18.309276580810547, "logits/rejected": -18.399635314941406, "logps/chosen": -401.534912109375, "logps/rejected": -366.28375244140625, "loss": 0.8537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4449703693389893, "rewards/margins": 0.7651748657226562, "rewards/rejected": 2.679795503616333, "step": 98240 }, { "epoch": 4.561493105529505, "grad_norm": 38.869590759277344, "learning_rate": 2.6430196387947445e-08, "logits/chosen": -19.651050567626953, "logits/rejected": -18.360326766967773, "logps/chosen": -382.7568664550781, "logps/rejected": -301.00543212890625, "loss": 0.4613, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.639228343963623, "rewards/margins": 1.8935174942016602, "rewards/rejected": 1.7457107305526733, "step": 98250 }, { "epoch": 4.5619573796369375, "grad_norm": 123.91252899169922, "learning_rate": 2.6402339941501462e-08, "logits/chosen": -19.17610740661621, "logits/rejected": -18.33121681213379, "logps/chosen": -427.09124755859375, "logps/rejected": -347.68756103515625, "loss": 0.4467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.423117637634277, "rewards/margins": 2.1811842918395996, "rewards/rejected": 2.2419333457946777, "step": 98260 }, { "epoch": 4.562421653744371, 
"grad_norm": 100.94041442871094, "learning_rate": 2.637448349505548e-08, "logits/chosen": -19.143543243408203, "logits/rejected": -18.198184967041016, "logps/chosen": -503.79864501953125, "logps/rejected": -412.10003662109375, "loss": 1.3166, "rewards/accuracies": 0.5, "rewards/chosen": 4.26343297958374, "rewards/margins": 0.40366482734680176, "rewards/rejected": 3.8597683906555176, "step": 98270 }, { "epoch": 4.562885927851804, "grad_norm": 0.4182029962539673, "learning_rate": 2.6346627048609495e-08, "logits/chosen": -18.67446517944336, "logits/rejected": -18.025501251220703, "logps/chosen": -315.84625244140625, "logps/rejected": -257.1240539550781, "loss": 0.4477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0085129737854004, "rewards/margins": 1.5003511905670166, "rewards/rejected": 0.5081619620323181, "step": 98280 }, { "epoch": 4.563350201959237, "grad_norm": 86.59906768798828, "learning_rate": 2.6318770602163516e-08, "logits/chosen": -19.04817008972168, "logits/rejected": -19.251108169555664, "logps/chosen": -342.84320068359375, "logps/rejected": -345.3345947265625, "loss": 1.5311, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.116337299346924, "rewards/margins": -0.2970099449157715, "rewards/rejected": 3.413346767425537, "step": 98290 }, { "epoch": 4.5638144760666695, "grad_norm": 0.013562400825321674, "learning_rate": 2.6290914155717534e-08, "logits/chosen": -19.649494171142578, "logits/rejected": -18.00057029724121, "logps/chosen": -318.0181884765625, "logps/rejected": -211.87081909179688, "loss": 0.3772, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7192604541778564, "rewards/margins": 3.123922109603882, "rewards/rejected": 0.5953382253646851, "step": 98300 }, { "epoch": 4.564278750174103, "grad_norm": 43.381004333496094, "learning_rate": 2.6263057709271552e-08, "logits/chosen": -19.905534744262695, "logits/rejected": -19.077030181884766, "logps/chosen": -399.1294860839844, "logps/rejected": 
-362.50701904296875, "loss": 0.541, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.427110195159912, "rewards/margins": 1.5433287620544434, "rewards/rejected": 2.883781909942627, "step": 98310 }, { "epoch": 4.564743024281536, "grad_norm": 159.68382263183594, "learning_rate": 2.623520126282557e-08, "logits/chosen": -19.190074920654297, "logits/rejected": -17.79653549194336, "logps/chosen": -315.60198974609375, "logps/rejected": -218.50076293945312, "loss": 0.7298, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1883304119110107, "rewards/margins": 1.8880802392959595, "rewards/rejected": 1.3002502918243408, "step": 98320 }, { "epoch": 4.565207298388969, "grad_norm": 129.7300567626953, "learning_rate": 2.620734481637959e-08, "logits/chosen": -18.689685821533203, "logits/rejected": -18.593544006347656, "logps/chosen": -396.6178894042969, "logps/rejected": -398.8534851074219, "loss": 1.2539, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.870574951171875, "rewards/margins": -0.42361488938331604, "rewards/rejected": 3.294189929962158, "step": 98330 }, { "epoch": 4.565671572496402, "grad_norm": 237.50320434570312, "learning_rate": 2.617948836993361e-08, "logits/chosen": -18.883230209350586, "logits/rejected": -17.940919876098633, "logps/chosen": -401.03271484375, "logps/rejected": -381.2261657714844, "loss": 0.8662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7127463817596436, "rewards/margins": 0.9521709680557251, "rewards/rejected": 2.760575294494629, "step": 98340 }, { "epoch": 4.566135846603835, "grad_norm": 63.64388656616211, "learning_rate": 2.6151631923487626e-08, "logits/chosen": -19.474590301513672, "logits/rejected": -18.475847244262695, "logps/chosen": -391.25897216796875, "logps/rejected": -340.90802001953125, "loss": 1.484, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.553391695022583, "rewards/margins": 0.6223732233047485, "rewards/rejected": 2.931018829345703, "step": 98350 }, { 
"epoch": 4.566600120711268, "grad_norm": 1.5813307762145996, "learning_rate": 2.6123775477041644e-08, "logits/chosen": -19.293609619140625, "logits/rejected": -17.111164093017578, "logps/chosen": -425.83203125, "logps/rejected": -146.20468139648438, "loss": 0.0887, "rewards/accuracies": 1.0, "rewards/chosen": 3.893085479736328, "rewards/margins": 3.864041805267334, "rewards/rejected": 0.029043514281511307, "step": 98360 }, { "epoch": 4.567064394818701, "grad_norm": 90.869384765625, "learning_rate": 2.6095919030595665e-08, "logits/chosen": -18.351276397705078, "logits/rejected": -18.553180694580078, "logps/chosen": -487.2936096191406, "logps/rejected": -475.34173583984375, "loss": 1.1755, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.447460174560547, "rewards/margins": 0.7145542502403259, "rewards/rejected": 3.732905864715576, "step": 98370 }, { "epoch": 4.567528668926134, "grad_norm": 134.3535614013672, "learning_rate": 2.606806258414968e-08, "logits/chosen": -18.484622955322266, "logits/rejected": -18.72109603881836, "logps/chosen": -250.25100708007812, "logps/rejected": -272.8802490234375, "loss": 1.0897, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0714378356933594, "rewards/margins": 0.057488299906253815, "rewards/rejected": 2.0139496326446533, "step": 98380 }, { "epoch": 4.567992943033567, "grad_norm": 207.5844268798828, "learning_rate": 2.6040206137703698e-08, "logits/chosen": -18.400272369384766, "logits/rejected": -17.28877830505371, "logps/chosen": -441.45196533203125, "logps/rejected": -353.9435729980469, "loss": 0.4442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.67297625541687, "rewards/margins": 1.484449028968811, "rewards/rejected": 2.1885273456573486, "step": 98390 }, { "epoch": 4.568457217141, "grad_norm": 160.7442169189453, "learning_rate": 2.6012349691257716e-08, "logits/chosen": -18.177032470703125, "logits/rejected": -18.66298484802246, "logps/chosen": -451.5130310058594, "logps/rejected": 
-493.28369140625, "loss": 1.6841, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1361770629882812, "rewards/margins": -1.0081428289413452, "rewards/rejected": 4.144320487976074, "step": 98400 }, { "epoch": 4.5689214912484335, "grad_norm": 4.363706111907959, "learning_rate": 2.5984493244811737e-08, "logits/chosen": -19.20083999633789, "logits/rejected": -18.25321388244629, "logps/chosen": -600.1636352539062, "logps/rejected": -518.3465576171875, "loss": 0.5347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.86757755279541, "rewards/margins": 1.275510549545288, "rewards/rejected": 3.592067003250122, "step": 98410 }, { "epoch": 4.569385765355866, "grad_norm": 0.00272002792917192, "learning_rate": 2.5956636798365755e-08, "logits/chosen": -19.458385467529297, "logits/rejected": -18.581634521484375, "logps/chosen": -458.2862854003906, "logps/rejected": -318.1430969238281, "loss": 0.7026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.118739128112793, "rewards/margins": 1.8655812740325928, "rewards/rejected": 3.253157377243042, "step": 98420 }, { "epoch": 4.569850039463299, "grad_norm": 1.012676477432251, "learning_rate": 2.5928780351919772e-08, "logits/chosen": -19.617311477661133, "logits/rejected": -18.976436614990234, "logps/chosen": -383.7702331542969, "logps/rejected": -374.9683532714844, "loss": 0.4372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.775488376617432, "rewards/margins": 1.8123600482940674, "rewards/rejected": 2.9631285667419434, "step": 98430 }, { "epoch": 4.570314313570732, "grad_norm": 27.044843673706055, "learning_rate": 2.590092390547379e-08, "logits/chosen": -19.2891788482666, "logits/rejected": -17.691240310668945, "logps/chosen": -329.56622314453125, "logps/rejected": -202.0535125732422, "loss": 0.3212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.587258815765381, "rewards/margins": 4.10054349899292, "rewards/rejected": 0.486715167760849, "step": 98440 }, { "epoch": 
4.5707785876781655, "grad_norm": 57.86620330810547, "learning_rate": 2.587306745902781e-08, "logits/chosen": -19.040584564208984, "logits/rejected": -18.915142059326172, "logps/chosen": -336.26678466796875, "logps/rejected": -351.89129638671875, "loss": 0.8847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.676037311553955, "rewards/margins": 0.14115755259990692, "rewards/rejected": 2.534879684448242, "step": 98450 }, { "epoch": 4.571242861785598, "grad_norm": 52.097957611083984, "learning_rate": 2.584521101258183e-08, "logits/chosen": -18.625873565673828, "logits/rejected": -18.338071823120117, "logps/chosen": -351.80096435546875, "logps/rejected": -279.8040466308594, "loss": 0.4581, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5616774559020996, "rewards/margins": 1.5300166606903076, "rewards/rejected": 2.031660795211792, "step": 98460 }, { "epoch": 4.571707135893031, "grad_norm": 0.5277336239814758, "learning_rate": 2.5817354566135847e-08, "logits/chosen": -18.61235809326172, "logits/rejected": -18.528743743896484, "logps/chosen": -413.4244079589844, "logps/rejected": -319.28216552734375, "loss": 1.2678, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2495434284210205, "rewards/margins": 0.7499558329582214, "rewards/rejected": 2.4995875358581543, "step": 98470 }, { "epoch": 4.572171410000465, "grad_norm": 340.6275329589844, "learning_rate": 2.5789498119689862e-08, "logits/chosen": -19.093860626220703, "logits/rejected": -18.15160369873047, "logps/chosen": -388.50079345703125, "logps/rejected": -343.5820007324219, "loss": 1.0855, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.3992252349853516, "rewards/margins": 0.3955576717853546, "rewards/rejected": 3.0036673545837402, "step": 98480 }, { "epoch": 4.5726356841078974, "grad_norm": 5.798739433288574, "learning_rate": 2.576164167324388e-08, "logits/chosen": -18.97747230529785, "logits/rejected": -18.67632484436035, "logps/chosen": -379.42144775390625, 
"logps/rejected": -359.3502502441406, "loss": 0.7284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.030022144317627, "rewards/margins": 1.3688592910766602, "rewards/rejected": 2.661162853240967, "step": 98490 }, { "epoch": 4.57309995821533, "grad_norm": 24.467918395996094, "learning_rate": 2.57337852267979e-08, "logits/chosen": -19.648250579833984, "logits/rejected": -19.554141998291016, "logps/chosen": -331.00128173828125, "logps/rejected": -284.0999450683594, "loss": 0.762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.908247470855713, "rewards/margins": 0.9224205017089844, "rewards/rejected": 1.9858267307281494, "step": 98500 }, { "epoch": 4.573564232322763, "grad_norm": 0.042282432317733765, "learning_rate": 2.570592878035192e-08, "logits/chosen": -19.509098052978516, "logits/rejected": -18.03156089782715, "logps/chosen": -430.70947265625, "logps/rejected": -273.1504821777344, "loss": 0.465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.206604480743408, "rewards/margins": 1.9840061664581299, "rewards/rejected": 2.2225985527038574, "step": 98510 }, { "epoch": 4.574028506430197, "grad_norm": 43.24309158325195, "learning_rate": 2.5678072333905936e-08, "logits/chosen": -19.56930160522461, "logits/rejected": -18.638484954833984, "logps/chosen": -371.69244384765625, "logps/rejected": -316.3008117675781, "loss": 0.7552, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3758976459503174, "rewards/margins": 0.8929153680801392, "rewards/rejected": 2.482982635498047, "step": 98520 }, { "epoch": 4.574492780537629, "grad_norm": 0.5493108630180359, "learning_rate": 2.5650215887459954e-08, "logits/chosen": -19.220340728759766, "logits/rejected": -17.819164276123047, "logps/chosen": -434.6865234375, "logps/rejected": -303.1509094238281, "loss": 0.8835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.824756622314453, "rewards/margins": 1.7729008197784424, "rewards/rejected": 2.0518553256988525, "step": 98530 
}, { "epoch": 4.574957054645062, "grad_norm": 273.73321533203125, "learning_rate": 2.5622359441013975e-08, "logits/chosen": -18.853425979614258, "logits/rejected": -18.146177291870117, "logps/chosen": -381.64605712890625, "logps/rejected": -332.9900207519531, "loss": 1.0061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.405930042266846, "rewards/margins": 1.515602469444275, "rewards/rejected": 2.890326976776123, "step": 98540 }, { "epoch": 4.575421328752496, "grad_norm": 299.51593017578125, "learning_rate": 2.5594502994567993e-08, "logits/chosen": -18.35177993774414, "logits/rejected": -18.019105911254883, "logps/chosen": -403.5938415527344, "logps/rejected": -331.72674560546875, "loss": 1.4631, "rewards/accuracies": 0.5, "rewards/chosen": 2.7366998195648193, "rewards/margins": 0.8179620504379272, "rewards/rejected": 1.918737769126892, "step": 98550 }, { "epoch": 4.575885602859929, "grad_norm": 14.225238800048828, "learning_rate": 2.556664654812201e-08, "logits/chosen": -18.079753875732422, "logits/rejected": -18.767255783081055, "logps/chosen": -266.6131896972656, "logps/rejected": -335.81646728515625, "loss": 1.1429, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.687370777130127, "rewards/margins": 0.11541976779699326, "rewards/rejected": 2.571950912475586, "step": 98560 }, { "epoch": 4.576349876967361, "grad_norm": 225.7623291015625, "learning_rate": 2.553879010167603e-08, "logits/chosen": -18.499740600585938, "logits/rejected": -18.370471954345703, "logps/chosen": -303.13525390625, "logps/rejected": -324.2501525878906, "loss": 1.3036, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5842084884643555, "rewards/margins": -0.3049449324607849, "rewards/rejected": 1.889153242111206, "step": 98570 }, { "epoch": 4.576814151074794, "grad_norm": 52.825191497802734, "learning_rate": 2.5510933655230043e-08, "logits/chosen": -18.58401870727539, "logits/rejected": -18.227645874023438, "logps/chosen": -311.05462646484375, 
"logps/rejected": -253.49508666992188, "loss": 0.9665, "rewards/accuracies": 0.5, "rewards/chosen": 2.7655928134918213, "rewards/margins": 0.9735286831855774, "rewards/rejected": 1.7920639514923096, "step": 98580 }, { "epoch": 4.577278425182228, "grad_norm": 4.998990058898926, "learning_rate": 2.5483077208784065e-08, "logits/chosen": -19.61993408203125, "logits/rejected": -18.409616470336914, "logps/chosen": -394.33599853515625, "logps/rejected": -277.03436279296875, "loss": 0.3668, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8255226612091064, "rewards/margins": 2.5795109272003174, "rewards/rejected": 1.2460118532180786, "step": 98590 }, { "epoch": 4.577742699289661, "grad_norm": 0.022751733660697937, "learning_rate": 2.5455220762338082e-08, "logits/chosen": -18.79237174987793, "logits/rejected": -17.67720603942871, "logps/chosen": -611.5582885742188, "logps/rejected": -450.68450927734375, "loss": 0.8987, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.396514415740967, "rewards/margins": 2.216387987136841, "rewards/rejected": 3.180126667022705, "step": 98600 }, { "epoch": 4.578206973397093, "grad_norm": 129.45997619628906, "learning_rate": 2.54273643158921e-08, "logits/chosen": -18.310232162475586, "logits/rejected": -17.78047752380371, "logps/chosen": -438.6764221191406, "logps/rejected": -365.48876953125, "loss": 0.4344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0489606857299805, "rewards/margins": 2.2228000164031982, "rewards/rejected": 1.8261604309082031, "step": 98610 }, { "epoch": 4.578671247504527, "grad_norm": 134.3046417236328, "learning_rate": 2.539950786944612e-08, "logits/chosen": -19.41134262084961, "logits/rejected": -18.997905731201172, "logps/chosen": -430.80242919921875, "logps/rejected": -388.0317687988281, "loss": 1.0076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.589878559112549, "rewards/margins": 0.5283839106559753, "rewards/rejected": 3.0614943504333496, "step": 98620 }, { 
"epoch": 4.57913552161196, "grad_norm": 157.2562255859375, "learning_rate": 2.537165142300014e-08, "logits/chosen": -20.285329818725586, "logits/rejected": -19.14054298400879, "logps/chosen": -408.0657043457031, "logps/rejected": -297.09356689453125, "loss": 0.2825, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.292502403259277, "rewards/margins": 2.568251609802246, "rewards/rejected": 1.724250078201294, "step": 98630 }, { "epoch": 4.579599795719393, "grad_norm": 9.672836303710938, "learning_rate": 2.5343794976554157e-08, "logits/chosen": -19.86252784729004, "logits/rejected": -19.130916595458984, "logps/chosen": -379.4437561035156, "logps/rejected": -376.269287109375, "loss": 0.1771, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.492146015167236, "rewards/margins": 2.786637783050537, "rewards/rejected": 1.7055082321166992, "step": 98640 }, { "epoch": 4.580064069826825, "grad_norm": 1.0065861940383911, "learning_rate": 2.5315938530108175e-08, "logits/chosen": -18.05247688293457, "logits/rejected": -17.924823760986328, "logps/chosen": -270.2347412109375, "logps/rejected": -209.91030883789062, "loss": 0.6063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4920778274536133, "rewards/margins": 0.9745110273361206, "rewards/rejected": 0.5175668597221375, "step": 98650 }, { "epoch": 4.580528343934259, "grad_norm": NaN, "learning_rate": 2.529086772830679e-08, "logits/chosen": -19.672264099121094, "logits/rejected": -18.914182662963867, "logps/chosen": -435.613037109375, "logps/rejected": -366.8993835449219, "loss": 0.9676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4635097980499268, "rewards/margins": 0.6230340003967285, "rewards/rejected": 2.8404765129089355, "step": 98660 }, { "epoch": 4.580992618041692, "grad_norm": 144.1663055419922, "learning_rate": 2.526301128186081e-08, "logits/chosen": -18.601945877075195, "logits/rejected": -18.242382049560547, "logps/chosen": -371.4856262207031, "logps/rejected": 
-300.7193298339844, "loss": 0.8175, "rewards/accuracies": 0.5, "rewards/chosen": 3.150574207305908, "rewards/margins": 0.4257654547691345, "rewards/rejected": 2.72480845451355, "step": 98670 }, { "epoch": 4.5814568921491245, "grad_norm": 172.6455078125, "learning_rate": 2.523515483541483e-08, "logits/chosen": -19.062641143798828, "logits/rejected": -18.545642852783203, "logps/chosen": -301.3674011230469, "logps/rejected": -411.8155822753906, "loss": 1.391, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9924728870391846, "rewards/margins": 0.024903178215026855, "rewards/rejected": 2.967569351196289, "step": 98680 }, { "epoch": 4.581921166256558, "grad_norm": 173.2485809326172, "learning_rate": 2.5207298388968846e-08, "logits/chosen": -18.658315658569336, "logits/rejected": -17.567306518554688, "logps/chosen": -434.6859436035156, "logps/rejected": -284.90740966796875, "loss": 0.8475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8891799449920654, "rewards/margins": 1.9447494745254517, "rewards/rejected": 1.9444305896759033, "step": 98690 }, { "epoch": 4.582385440363991, "grad_norm": 48.179927825927734, "learning_rate": 2.5179441942522864e-08, "logits/chosen": -20.223731994628906, "logits/rejected": -19.472272872924805, "logps/chosen": -394.9244689941406, "logps/rejected": -354.69500732421875, "loss": 0.5003, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.776033878326416, "rewards/margins": 0.8653388023376465, "rewards/rejected": 2.9106945991516113, "step": 98700 }, { "epoch": 4.582849714471424, "grad_norm": 10.281394004821777, "learning_rate": 2.5151585496076885e-08, "logits/chosen": -19.249584197998047, "logits/rejected": -18.405439376831055, "logps/chosen": -410.0513610839844, "logps/rejected": -278.57012939453125, "loss": 0.2657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.98734712600708, "rewards/margins": 2.3363237380981445, "rewards/rejected": 1.651023268699646, "step": 98710 }, { "epoch": 
4.5833139885788565, "grad_norm": 148.01052856445312, "learning_rate": 2.5123729049630903e-08, "logits/chosen": -19.373367309570312, "logits/rejected": -18.756563186645508, "logps/chosen": -401.3583679199219, "logps/rejected": -355.08447265625, "loss": 1.0954, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3715782165527344, "rewards/margins": 1.127922773361206, "rewards/rejected": 2.2436554431915283, "step": 98720 }, { "epoch": 4.58377826268629, "grad_norm": 31.806995391845703, "learning_rate": 2.5095872603184918e-08, "logits/chosen": -19.116779327392578, "logits/rejected": -17.549161911010742, "logps/chosen": -412.07501220703125, "logps/rejected": -238.2396240234375, "loss": 0.2668, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.183016300201416, "rewards/margins": 3.0891339778900146, "rewards/rejected": 0.09388218075037003, "step": 98730 }, { "epoch": 4.584242536793723, "grad_norm": 5.247154712677002, "learning_rate": 2.5068016156738935e-08, "logits/chosen": -19.769582748413086, "logits/rejected": -19.497182846069336, "logps/chosen": -541.6134033203125, "logps/rejected": -461.4562072753906, "loss": 0.4588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.974038600921631, "rewards/margins": 1.6316564083099365, "rewards/rejected": 3.3423819541931152, "step": 98740 }, { "epoch": 4.584706810901156, "grad_norm": 3.6373133659362793, "learning_rate": 2.5040159710292953e-08, "logits/chosen": -18.909807205200195, "logits/rejected": -18.559871673583984, "logps/chosen": -361.73187255859375, "logps/rejected": -330.7172546386719, "loss": 0.606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8895740509033203, "rewards/margins": 1.364061951637268, "rewards/rejected": 2.525512218475342, "step": 98750 }, { "epoch": 4.585171085008589, "grad_norm": 55.20087432861328, "learning_rate": 2.5012303263846974e-08, "logits/chosen": -19.02085304260254, "logits/rejected": -18.570968627929688, "logps/chosen": -363.9054260253906, 
"logps/rejected": -356.68939208984375, "loss": 0.8532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7537035942077637, "rewards/margins": 0.9154520034790039, "rewards/rejected": 1.8382513523101807, "step": 98760 }, { "epoch": 4.585635359116022, "grad_norm": 117.07117462158203, "learning_rate": 2.4984446817400992e-08, "logits/chosen": -17.958335876464844, "logits/rejected": -18.750736236572266, "logps/chosen": -323.041748046875, "logps/rejected": -396.143310546875, "loss": 1.3683, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.149930477142334, "rewards/margins": 0.4409714639186859, "rewards/rejected": 2.7089593410491943, "step": 98770 }, { "epoch": 4.586099633223455, "grad_norm": 164.8032989501953, "learning_rate": 2.495659037095501e-08, "logits/chosen": -19.236408233642578, "logits/rejected": -18.408660888671875, "logps/chosen": -351.20269775390625, "logps/rejected": -360.54620361328125, "loss": 0.9785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9917054176330566, "rewards/margins": 0.6263060569763184, "rewards/rejected": 2.365399122238159, "step": 98780 }, { "epoch": 4.586563907330888, "grad_norm": 44.77653503417969, "learning_rate": 2.4928733924509028e-08, "logits/chosen": -18.397491455078125, "logits/rejected": -19.088708877563477, "logps/chosen": -298.0898132324219, "logps/rejected": -310.518310546875, "loss": 0.9014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4536938667297363, "rewards/margins": 0.18060967326164246, "rewards/rejected": 2.2730846405029297, "step": 98790 }, { "epoch": 4.587028181438321, "grad_norm": 140.40184020996094, "learning_rate": 2.490087747806305e-08, "logits/chosen": -18.505342483520508, "logits/rejected": -17.709962844848633, "logps/chosen": -363.21148681640625, "logps/rejected": -265.87060546875, "loss": 0.4947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.926750421524048, "rewards/margins": 1.571381688117981, "rewards/rejected": 1.3553686141967773, 
"step": 98800 }, { "epoch": 4.587492455545754, "grad_norm": 1.5298329591751099, "learning_rate": 2.4873021031617067e-08, "logits/chosen": -20.41232681274414, "logits/rejected": -19.115585327148438, "logps/chosen": -377.03753662109375, "logps/rejected": -349.3095703125, "loss": 0.5805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4284682273864746, "rewards/margins": 1.7968933582305908, "rewards/rejected": 1.6315752267837524, "step": 98810 }, { "epoch": 4.587956729653187, "grad_norm": 25.34510040283203, "learning_rate": 2.4845164585171085e-08, "logits/chosen": -19.172269821166992, "logits/rejected": -18.571483612060547, "logps/chosen": -350.8287658691406, "logps/rejected": -274.06268310546875, "loss": 0.6287, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1716628074645996, "rewards/margins": 1.0191876888275146, "rewards/rejected": 2.152475118637085, "step": 98820 }, { "epoch": 4.588421003760621, "grad_norm": 267.8529357910156, "learning_rate": 2.48173081387251e-08, "logits/chosen": -18.349987030029297, "logits/rejected": -18.63211441040039, "logps/chosen": -438.43194580078125, "logps/rejected": -417.8075256347656, "loss": 1.5821, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.389085054397583, "rewards/margins": -0.3545684218406677, "rewards/rejected": 3.7436537742614746, "step": 98830 }, { "epoch": 4.588885277868053, "grad_norm": 73.97370147705078, "learning_rate": 2.4789451692279117e-08, "logits/chosen": -18.575138092041016, "logits/rejected": -18.278532028198242, "logps/chosen": -397.7163391113281, "logps/rejected": -289.48895263671875, "loss": 0.6994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.457507610321045, "rewards/margins": 0.8627594113349915, "rewards/rejected": 1.5947482585906982, "step": 98840 }, { "epoch": 4.589349551975486, "grad_norm": 112.43964385986328, "learning_rate": 2.476159524583314e-08, "logits/chosen": -18.642454147338867, "logits/rejected": -17.292959213256836, "logps/chosen": 
-452.5618591308594, "logps/rejected": -293.2599792480469, "loss": 0.3441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.539651870727539, "rewards/margins": 2.6170172691345215, "rewards/rejected": 1.9226350784301758, "step": 98850 }, { "epoch": 4.589813826082919, "grad_norm": 117.22399139404297, "learning_rate": 2.4733738799387156e-08, "logits/chosen": -19.146987915039062, "logits/rejected": -19.11501693725586, "logps/chosen": -427.3154296875, "logps/rejected": -653.5009155273438, "loss": 0.8145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2369391918182373, "rewards/margins": 2.2006144523620605, "rewards/rejected": 1.0363248586654663, "step": 98860 }, { "epoch": 4.5902781001903525, "grad_norm": 73.64876556396484, "learning_rate": 2.4705882352941174e-08, "logits/chosen": -19.86748695373535, "logits/rejected": -18.934375762939453, "logps/chosen": -435.7945861816406, "logps/rejected": -409.3052673339844, "loss": 0.5886, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.378151893615723, "rewards/margins": 1.0034319162368774, "rewards/rejected": 3.3747200965881348, "step": 98870 }, { "epoch": 4.590742374297785, "grad_norm": 91.54808044433594, "learning_rate": 2.4678025906495195e-08, "logits/chosen": -18.90924644470215, "logits/rejected": -19.7836856842041, "logps/chosen": -354.9561462402344, "logps/rejected": -365.4425354003906, "loss": 1.2104, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.807664632797241, "rewards/margins": 0.26322388648986816, "rewards/rejected": 2.544440984725952, "step": 98880 }, { "epoch": 4.591206648405218, "grad_norm": 90.97909545898438, "learning_rate": 2.4650169460049213e-08, "logits/chosen": -20.06814193725586, "logits/rejected": -18.807125091552734, "logps/chosen": -503.7071228027344, "logps/rejected": -439.497802734375, "loss": 0.6376, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.435676574707031, "rewards/margins": 1.2635064125061035, "rewards/rejected": 
3.1721701622009277, "step": 98890 }, { "epoch": 4.591670922512652, "grad_norm": 128.36204528808594, "learning_rate": 2.462231301360323e-08, "logits/chosen": -18.019075393676758, "logits/rejected": -17.99730682373047, "logps/chosen": -279.2696838378906, "logps/rejected": -305.3953857421875, "loss": 1.3312, "rewards/accuracies": 0.5, "rewards/chosen": 2.5590457916259766, "rewards/margins": -0.13093867897987366, "rewards/rejected": 2.6899847984313965, "step": 98900 }, { "epoch": 4.5921351966200845, "grad_norm": 61.040035247802734, "learning_rate": 2.459445656715725e-08, "logits/chosen": -19.698627471923828, "logits/rejected": -18.940502166748047, "logps/chosen": -358.3282165527344, "logps/rejected": -340.2089538574219, "loss": 0.4279, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.662830114364624, "rewards/margins": 1.059148907661438, "rewards/rejected": 1.603681206703186, "step": 98910 }, { "epoch": 4.592599470727517, "grad_norm": 12.676488876342773, "learning_rate": 2.456660012071127e-08, "logits/chosen": -18.33898162841797, "logits/rejected": -18.234996795654297, "logps/chosen": -403.6888122558594, "logps/rejected": -346.5406188964844, "loss": 0.8207, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.221090316772461, "rewards/margins": 0.7820407748222351, "rewards/rejected": 3.439049243927002, "step": 98920 }, { "epoch": 4.59306374483495, "grad_norm": 3.93532133102417, "learning_rate": 2.4538743674265284e-08, "logits/chosen": -20.24648666381836, "logits/rejected": -19.88275909423828, "logps/chosen": -415.69610595703125, "logps/rejected": -383.2884826660156, "loss": 0.5784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.746525764465332, "rewards/margins": 1.9580333232879639, "rewards/rejected": 2.7884926795959473, "step": 98930 }, { "epoch": 4.593528018942384, "grad_norm": 239.04725646972656, "learning_rate": 2.4510887227819302e-08, "logits/chosen": -18.92508316040039, "logits/rejected": -18.611465454101562, "logps/chosen": 
-351.7267150878906, "logps/rejected": -279.5123596191406, "loss": 0.7454, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.548701047897339, "rewards/margins": 1.5523121356964111, "rewards/rejected": 1.9963886737823486, "step": 98940 }, { "epoch": 4.5939922930498165, "grad_norm": 0.8162503838539124, "learning_rate": 2.448303078137332e-08, "logits/chosen": -18.91022491455078, "logits/rejected": -18.085668563842773, "logps/chosen": -390.08660888671875, "logps/rejected": -335.9210510253906, "loss": 0.6692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6833794116973877, "rewards/margins": 1.5915937423706055, "rewards/rejected": 2.0917859077453613, "step": 98950 }, { "epoch": 4.594456567157249, "grad_norm": 152.60720825195312, "learning_rate": 2.4455174334927338e-08, "logits/chosen": -19.14643669128418, "logits/rejected": -19.362789154052734, "logps/chosen": -319.5837707519531, "logps/rejected": -306.4180908203125, "loss": 1.3998, "rewards/accuracies": 0.5, "rewards/chosen": 2.9580492973327637, "rewards/margins": -0.1538987159729004, "rewards/rejected": 3.111947774887085, "step": 98960 }, { "epoch": 4.594920841264683, "grad_norm": 38.40946578979492, "learning_rate": 2.442731788848136e-08, "logits/chosen": -18.45174789428711, "logits/rejected": -17.515193939208984, "logps/chosen": -309.1680603027344, "logps/rejected": -235.6670684814453, "loss": 0.3243, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1750731468200684, "rewards/margins": 1.9557712078094482, "rewards/rejected": 0.21930205821990967, "step": 98970 }, { "epoch": 4.595385115372116, "grad_norm": 113.4792251586914, "learning_rate": 2.4399461442035377e-08, "logits/chosen": -20.016611099243164, "logits/rejected": -18.885181427001953, "logps/chosen": -390.59991455078125, "logps/rejected": -200.68106079101562, "loss": 0.2969, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3199496269226074, "rewards/margins": 2.196075916290283, "rewards/rejected": 
1.1238728761672974, "step": 98980 }, { "epoch": 4.5958493894795485, "grad_norm": 60.00958251953125, "learning_rate": 2.4371604995589395e-08, "logits/chosen": -19.60544204711914, "logits/rejected": -19.06867218017578, "logps/chosen": -310.2018737792969, "logps/rejected": -248.8392333984375, "loss": 0.676, "rewards/accuracies": 0.5, "rewards/chosen": 3.1654820442199707, "rewards/margins": 1.5813699960708618, "rewards/rejected": 1.5841120481491089, "step": 98990 }, { "epoch": 4.596313663586982, "grad_norm": 146.31007385253906, "learning_rate": 2.4343748549143413e-08, "logits/chosen": -19.341943740844727, "logits/rejected": -19.310558319091797, "logps/chosen": -338.3692626953125, "logps/rejected": -338.30126953125, "loss": 0.9056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1027069091796875, "rewards/margins": 0.17335811257362366, "rewards/rejected": 1.9293487071990967, "step": 99000 }, { "epoch": 4.596777937694415, "grad_norm": 104.70417022705078, "learning_rate": 2.4315892102697434e-08, "logits/chosen": -18.359643936157227, "logits/rejected": -18.132299423217773, "logps/chosen": -427.8677673339844, "logps/rejected": -392.8997802734375, "loss": 0.6287, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9382071495056152, "rewards/margins": 0.6986303329467773, "rewards/rejected": 2.239577293395996, "step": 99010 }, { "epoch": 4.597242211801848, "grad_norm": 35.58302688598633, "learning_rate": 2.4288035656251452e-08, "logits/chosen": -19.199556350708008, "logits/rejected": -17.914684295654297, "logps/chosen": -379.2261657714844, "logps/rejected": -293.9762878417969, "loss": 0.5055, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.598923444747925, "rewards/margins": 1.2442104816436768, "rewards/rejected": 1.3547130823135376, "step": 99020 }, { "epoch": 4.597706485909281, "grad_norm": 155.3734588623047, "learning_rate": 2.4260179209805466e-08, "logits/chosen": -19.140146255493164, "logits/rejected": -18.676929473876953, 
"logps/chosen": -289.54736328125, "logps/rejected": -271.46124267578125, "loss": 1.0324, "rewards/accuracies": 0.5, "rewards/chosen": 2.6533379554748535, "rewards/margins": 0.7151376008987427, "rewards/rejected": 1.9381999969482422, "step": 99030 }, { "epoch": 4.598170760016714, "grad_norm": 129.1954345703125, "learning_rate": 2.4232322763359484e-08, "logits/chosen": -18.079740524291992, "logits/rejected": -19.240631103515625, "logps/chosen": -274.20050048828125, "logps/rejected": -331.9027404785156, "loss": 1.5584, "rewards/accuracies": 0.5, "rewards/chosen": 1.5921804904937744, "rewards/margins": -0.6373753547668457, "rewards/rejected": 2.229555606842041, "step": 99040 }, { "epoch": 4.598635034124147, "grad_norm": 23.279367446899414, "learning_rate": 2.4204466316913502e-08, "logits/chosen": -20.44914436340332, "logits/rejected": -19.269289016723633, "logps/chosen": -518.2713623046875, "logps/rejected": -422.36077880859375, "loss": 0.4556, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1772918701171875, "rewards/margins": 1.8693857192993164, "rewards/rejected": 2.307906150817871, "step": 99050 }, { "epoch": 4.59909930823158, "grad_norm": 18.22722053527832, "learning_rate": 2.4176609870467523e-08, "logits/chosen": -20.425546646118164, "logits/rejected": -20.261001586914062, "logps/chosen": -463.33062744140625, "logps/rejected": -471.54644775390625, "loss": 0.8322, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8308804035186768, "rewards/margins": 0.4492618143558502, "rewards/rejected": 3.3816184997558594, "step": 99060 }, { "epoch": 4.599563582339013, "grad_norm": 5.539674758911133, "learning_rate": 2.414875342402154e-08, "logits/chosen": -19.416494369506836, "logits/rejected": -17.911643981933594, "logps/chosen": -412.1951599121094, "logps/rejected": -264.0562438964844, "loss": 0.2589, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.651402473449707, "rewards/margins": 3.192300319671631, "rewards/rejected": 
1.459101676940918, "step": 99070 }, { "epoch": 4.600027856446446, "grad_norm": 261.9225769042969, "learning_rate": 2.412089697757556e-08, "logits/chosen": -19.65231704711914, "logits/rejected": -18.792089462280273, "logps/chosen": -336.9241027832031, "logps/rejected": -359.4423522949219, "loss": 0.7676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.516261577606201, "rewards/margins": 1.1246280670166016, "rewards/rejected": 2.3916335105895996, "step": 99080 }, { "epoch": 4.600492130553879, "grad_norm": 2.32879376411438, "learning_rate": 2.409304053112958e-08, "logits/chosen": -19.000534057617188, "logits/rejected": -18.100276947021484, "logps/chosen": -407.33428955078125, "logps/rejected": -325.25677490234375, "loss": 0.421, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.568098545074463, "rewards/margins": 2.9071261882781982, "rewards/rejected": 1.6609723567962646, "step": 99090 }, { "epoch": 4.6009564046613125, "grad_norm": 65.38603973388672, "learning_rate": 2.4065184084683598e-08, "logits/chosen": -19.77800178527832, "logits/rejected": -18.431007385253906, "logps/chosen": -431.83489990234375, "logps/rejected": -364.9188232421875, "loss": 0.2103, "rewards/accuracies": 1.0, "rewards/chosen": 4.097559452056885, "rewards/margins": 2.320467948913574, "rewards/rejected": 1.777091383934021, "step": 99100 }, { "epoch": 4.601420678768745, "grad_norm": 102.6268081665039, "learning_rate": 2.4037327638237616e-08, "logits/chosen": -19.41482162475586, "logits/rejected": -18.531217575073242, "logps/chosen": -468.4239196777344, "logps/rejected": -367.9486999511719, "loss": 0.429, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.068245887756348, "rewards/margins": 1.828287124633789, "rewards/rejected": 2.2399587631225586, "step": 99110 }, { "epoch": 4.601884952876178, "grad_norm": 142.84071350097656, "learning_rate": 2.4009471191791633e-08, "logits/chosen": -19.452083587646484, "logits/rejected": -18.139009475708008, "logps/chosen": 
-443.22113037109375, "logps/rejected": -286.3187561035156, "loss": 0.5294, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1614651679992676, "rewards/margins": 1.5113952159881592, "rewards/rejected": 1.6500695943832397, "step": 99120 }, { "epoch": 4.602349226983611, "grad_norm": 95.5003890991211, "learning_rate": 2.3981614745345648e-08, "logits/chosen": -19.725618362426758, "logits/rejected": -19.512697219848633, "logps/chosen": -482.9593811035156, "logps/rejected": -405.76177978515625, "loss": 0.567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.448616981506348, "rewards/margins": 1.9871009588241577, "rewards/rejected": 2.4615159034729004, "step": 99130 }, { "epoch": 4.6028135010910445, "grad_norm": 67.94087982177734, "learning_rate": 2.395375829889967e-08, "logits/chosen": -18.696487426757812, "logits/rejected": -18.67784309387207, "logps/chosen": -246.180908203125, "logps/rejected": -338.7411193847656, "loss": 1.6167, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5467356443405151, "rewards/margins": -0.9825218319892883, "rewards/rejected": 2.5292575359344482, "step": 99140 }, { "epoch": 4.603277775198477, "grad_norm": 152.98385620117188, "learning_rate": 2.3925901852453687e-08, "logits/chosen": -20.279842376708984, "logits/rejected": -18.165645599365234, "logps/chosen": -456.483154296875, "logps/rejected": -297.1772155761719, "loss": 0.1786, "rewards/accuracies": 1.0, "rewards/chosen": 4.220950603485107, "rewards/margins": 2.710946559906006, "rewards/rejected": 1.5100042819976807, "step": 99150 }, { "epoch": 4.60374204930591, "grad_norm": 113.83055114746094, "learning_rate": 2.3898045406007705e-08, "logits/chosen": -18.799907684326172, "logits/rejected": -18.359045028686523, "logps/chosen": -442.57537841796875, "logps/rejected": -395.982666015625, "loss": 0.3125, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.301608085632324, "rewards/margins": 2.0814805030822754, "rewards/rejected": 2.2201268672943115, 
"step": 99160 }, { "epoch": 4.604206323413344, "grad_norm": 4.830586910247803, "learning_rate": 2.3870188959561723e-08, "logits/chosen": -19.473508834838867, "logits/rejected": -19.080806732177734, "logps/chosen": -383.96099853515625, "logps/rejected": -379.8036193847656, "loss": 0.3852, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.876469373703003, "rewards/margins": 1.3333736658096313, "rewards/rejected": 2.543095827102661, "step": 99170 }, { "epoch": 4.6046705975207765, "grad_norm": 253.58334350585938, "learning_rate": 2.3842332513115744e-08, "logits/chosen": -20.047683715820312, "logits/rejected": -20.62765884399414, "logps/chosen": -301.4323425292969, "logps/rejected": -422.057373046875, "loss": 1.5332, "rewards/accuracies": 0.5, "rewards/chosen": 3.5468056201934814, "rewards/margins": -0.49122029542922974, "rewards/rejected": 4.038025379180908, "step": 99180 }, { "epoch": 4.605134871628209, "grad_norm": 2.9839038848876953, "learning_rate": 2.3814476066669762e-08, "logits/chosen": -19.007373809814453, "logits/rejected": -18.00299072265625, "logps/chosen": -562.8677978515625, "logps/rejected": -437.2975158691406, "loss": 0.1956, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.726590156555176, "rewards/margins": 2.8006203174591064, "rewards/rejected": 2.9259698390960693, "step": 99190 }, { "epoch": 4.605599145735642, "grad_norm": 6.919375419616699, "learning_rate": 2.378661962022378e-08, "logits/chosen": -19.393369674682617, "logits/rejected": -19.026065826416016, "logps/chosen": -509.1685485839844, "logps/rejected": -416.77142333984375, "loss": 0.751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.1583685874938965, "rewards/margins": 1.0093284845352173, "rewards/rejected": 4.1490397453308105, "step": 99200 }, { "epoch": 4.606063419843076, "grad_norm": 0.27017104625701904, "learning_rate": 2.3758763173777797e-08, "logits/chosen": -19.407201766967773, "logits/rejected": -17.847211837768555, "logps/chosen": 
-385.9035339355469, "logps/rejected": -267.25250244140625, "loss": 0.6514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.386565685272217, "rewards/margins": 2.244126081466675, "rewards/rejected": 1.1424397230148315, "step": 99210 }, { "epoch": 4.606527693950508, "grad_norm": 0.2566637396812439, "learning_rate": 2.373090672733182e-08, "logits/chosen": -19.030941009521484, "logits/rejected": -18.690990447998047, "logps/chosen": -484.21038818359375, "logps/rejected": -383.3115539550781, "loss": 0.6919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.009008407592773, "rewards/margins": 1.2120006084442139, "rewards/rejected": 2.7970080375671387, "step": 99220 }, { "epoch": 4.606991968057941, "grad_norm": 7.936439037322998, "learning_rate": 2.3703050280885833e-08, "logits/chosen": -19.794727325439453, "logits/rejected": -19.895732879638672, "logps/chosen": -410.79132080078125, "logps/rejected": -444.0928649902344, "loss": 0.7282, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.283024787902832, "rewards/margins": 0.7227210402488708, "rewards/rejected": 3.5603041648864746, "step": 99230 }, { "epoch": 4.607456242165375, "grad_norm": 0.7229048609733582, "learning_rate": 2.367519383443985e-08, "logits/chosen": -19.509723663330078, "logits/rejected": -18.236753463745117, "logps/chosen": -409.6212463378906, "logps/rejected": -335.0597229003906, "loss": 0.8089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2168145179748535, "rewards/margins": 1.301102876663208, "rewards/rejected": 1.9157114028930664, "step": 99240 }, { "epoch": 4.607920516272808, "grad_norm": 19.403457641601562, "learning_rate": 2.364733738799387e-08, "logits/chosen": -19.520347595214844, "logits/rejected": -18.74846076965332, "logps/chosen": -437.7493591308594, "logps/rejected": -350.2894287109375, "loss": 0.3032, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.662865161895752, "rewards/margins": 2.2737104892730713, "rewards/rejected": 
2.3891549110412598, "step": 99250 }, { "epoch": 4.60838479038024, "grad_norm": 0.12554754316806793, "learning_rate": 2.3619480941547887e-08, "logits/chosen": -18.532575607299805, "logits/rejected": -17.81357192993164, "logps/chosen": -311.7486572265625, "logps/rejected": -288.1066589355469, "loss": 0.592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.185685873031616, "rewards/margins": 2.115946054458618, "rewards/rejected": 1.0697399377822876, "step": 99260 }, { "epoch": 4.608849064487673, "grad_norm": 1.6811050176620483, "learning_rate": 2.3591624495101908e-08, "logits/chosen": -18.533267974853516, "logits/rejected": -16.709842681884766, "logps/chosen": -365.00396728515625, "logps/rejected": -202.10009765625, "loss": 0.2409, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.08132266998291, "rewards/margins": 3.5199344158172607, "rewards/rejected": 0.5613876581192017, "step": 99270 }, { "epoch": 4.609313338595107, "grad_norm": 151.12892150878906, "learning_rate": 2.3563768048655926e-08, "logits/chosen": -19.41668128967285, "logits/rejected": -18.826139450073242, "logps/chosen": -316.1385803222656, "logps/rejected": -260.6593322753906, "loss": 0.5608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.944451093673706, "rewards/margins": 1.749355673789978, "rewards/rejected": 2.1950955390930176, "step": 99280 }, { "epoch": 4.60977761270254, "grad_norm": 10.595823287963867, "learning_rate": 2.3535911602209943e-08, "logits/chosen": -18.99862289428711, "logits/rejected": -18.596513748168945, "logps/chosen": -411.5059509277344, "logps/rejected": -334.7220153808594, "loss": 0.6263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8319485187530518, "rewards/margins": 1.899593710899353, "rewards/rejected": 1.9323546886444092, "step": 99290 }, { "epoch": 4.610241886809972, "grad_norm": 123.18144989013672, "learning_rate": 2.3508055155763965e-08, "logits/chosen": -18.19388771057129, "logits/rejected": -18.205896377563477, 
"logps/chosen": -351.1875, "logps/rejected": -366.9295654296875, "loss": 1.1951, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8181238174438477, "rewards/margins": 0.20808248221874237, "rewards/rejected": 2.610041379928589, "step": 99300 }, { "epoch": 4.610706160917406, "grad_norm": 13.998746871948242, "learning_rate": 2.3480198709317982e-08, "logits/chosen": -19.401844024658203, "logits/rejected": -18.911067962646484, "logps/chosen": -368.4892883300781, "logps/rejected": -340.08233642578125, "loss": 0.4214, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2227940559387207, "rewards/margins": 1.8205745220184326, "rewards/rejected": 1.4022194147109985, "step": 99310 }, { "epoch": 4.611170435024839, "grad_norm": 34.43833923339844, "learning_rate": 2.3452342262872e-08, "logits/chosen": -19.40694236755371, "logits/rejected": -17.953365325927734, "logps/chosen": -480.79095458984375, "logps/rejected": -341.54022216796875, "loss": 0.5911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.056000709533691, "rewards/margins": 1.386048674583435, "rewards/rejected": 2.669952392578125, "step": 99320 }, { "epoch": 4.611634709132272, "grad_norm": 236.39743041992188, "learning_rate": 2.3424485816426015e-08, "logits/chosen": -17.72544288635254, "logits/rejected": -17.501726150512695, "logps/chosen": -391.16046142578125, "logps/rejected": -395.1900329589844, "loss": 0.7044, "rewards/accuracies": 0.5, "rewards/chosen": 3.355792999267578, "rewards/margins": 1.5802364349365234, "rewards/rejected": 1.7755565643310547, "step": 99330 }, { "epoch": 4.612098983239704, "grad_norm": 149.76490783691406, "learning_rate": 2.3396629369980033e-08, "logits/chosen": -19.625585556030273, "logits/rejected": -18.69135284423828, "logps/chosen": -455.7162170410156, "logps/rejected": -400.0585632324219, "loss": 0.4772, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.689133644104004, "rewards/margins": 1.7496578693389893, "rewards/rejected": 
2.9394752979278564, "step": 99340 }, { "epoch": 4.612563257347138, "grad_norm": 1.159769058227539, "learning_rate": 2.3368772923534054e-08, "logits/chosen": -18.34381866455078, "logits/rejected": -17.66251564025879, "logps/chosen": -305.18011474609375, "logps/rejected": -195.5502166748047, "loss": 0.5194, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9500970840454102, "rewards/margins": 2.20564341545105, "rewards/rejected": -0.25554654002189636, "step": 99350 }, { "epoch": 4.613027531454571, "grad_norm": 182.9435577392578, "learning_rate": 2.3340916477088072e-08, "logits/chosen": -19.08095932006836, "logits/rejected": -19.611318588256836, "logps/chosen": -388.4892578125, "logps/rejected": -366.961181640625, "loss": 0.9576, "rewards/accuracies": 0.5, "rewards/chosen": 2.8290348052978516, "rewards/margins": 0.36755186319351196, "rewards/rejected": 2.4614830017089844, "step": 99360 }, { "epoch": 4.613491805562004, "grad_norm": 0.12487338483333588, "learning_rate": 2.331306003064209e-08, "logits/chosen": -19.223295211791992, "logits/rejected": -17.95734977722168, "logps/chosen": -540.3902587890625, "logps/rejected": -319.1109313964844, "loss": 0.5407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.6264567375183105, "rewards/margins": 3.720676898956299, "rewards/rejected": 1.9057796001434326, "step": 99370 }, { "epoch": 4.613956079669437, "grad_norm": 2.3204426765441895, "learning_rate": 2.3285203584196107e-08, "logits/chosen": -19.043445587158203, "logits/rejected": -18.008316040039062, "logps/chosen": -364.1197814941406, "logps/rejected": -346.5272521972656, "loss": 0.6342, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4591121673583984, "rewards/margins": 1.5793625116348267, "rewards/rejected": 1.8797495365142822, "step": 99380 }, { "epoch": 4.61442035377687, "grad_norm": 1.1918861865997314, "learning_rate": 2.325734713775013e-08, "logits/chosen": -18.237228393554688, "logits/rejected": -17.04714584350586, "logps/chosen": 
-374.30621337890625, "logps/rejected": -242.6738739013672, "loss": 0.6375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9738757610321045, "rewards/margins": 2.1775615215301514, "rewards/rejected": 0.7963141202926636, "step": 99390 }, { "epoch": 4.614884627884303, "grad_norm": 213.37814331054688, "learning_rate": 2.3229490691304146e-08, "logits/chosen": -19.555908203125, "logits/rejected": -18.990747451782227, "logps/chosen": -527.1595458984375, "logps/rejected": -439.71771240234375, "loss": 0.7238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.0259857177734375, "rewards/margins": 1.4110305309295654, "rewards/rejected": 3.614955186843872, "step": 99400 }, { "epoch": 4.6153489019917355, "grad_norm": 94.14093017578125, "learning_rate": 2.3201634244858164e-08, "logits/chosen": -19.975088119506836, "logits/rejected": -18.698623657226562, "logps/chosen": -410.9903869628906, "logps/rejected": -321.37445068359375, "loss": 0.3373, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.285974979400635, "rewards/margins": 1.701746940612793, "rewards/rejected": 2.584228038787842, "step": 99410 }, { "epoch": 4.615813176099169, "grad_norm": 10.943997383117676, "learning_rate": 2.3173777798412182e-08, "logits/chosen": -19.110410690307617, "logits/rejected": -17.835994720458984, "logps/chosen": -520.1495361328125, "logps/rejected": -389.82037353515625, "loss": 0.1, "rewards/accuracies": 1.0, "rewards/chosen": 5.1827497482299805, "rewards/margins": 3.48394513130188, "rewards/rejected": 1.6988050937652588, "step": 99420 }, { "epoch": 4.616277450206602, "grad_norm": 185.12596130371094, "learning_rate": 2.3145921351966197e-08, "logits/chosen": -19.70657730102539, "logits/rejected": -19.038869857788086, "logps/chosen": -289.91796875, "logps/rejected": -325.4993591308594, "loss": 1.208, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.053053617477417, "rewards/margins": 0.4272739291191101, "rewards/rejected": 2.625779628753662, "step": 
99430 }, { "epoch": 4.616741724314035, "grad_norm": 136.44398498535156, "learning_rate": 2.3118064905520218e-08, "logits/chosen": -18.847246170043945, "logits/rejected": -18.309289932250977, "logps/chosen": -408.18218994140625, "logps/rejected": -378.8560485839844, "loss": 0.9459, "rewards/accuracies": 0.5, "rewards/chosen": 3.508563280105591, "rewards/margins": 1.0988714694976807, "rewards/rejected": 2.40969181060791, "step": 99440 }, { "epoch": 4.617205998421468, "grad_norm": 1.0176539421081543, "learning_rate": 2.3090208459074236e-08, "logits/chosen": -18.72956657409668, "logits/rejected": -18.41763687133789, "logps/chosen": -311.226806640625, "logps/rejected": -309.98779296875, "loss": 0.7526, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.196709632873535, "rewards/margins": 0.4533967971801758, "rewards/rejected": 1.7433128356933594, "step": 99450 }, { "epoch": 4.617670272528901, "grad_norm": 7.535536289215088, "learning_rate": 2.3062352012628253e-08, "logits/chosen": -19.737539291381836, "logits/rejected": -18.70207977294922, "logps/chosen": -366.7002868652344, "logps/rejected": -299.03399658203125, "loss": 0.4529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.009259223937988, "rewards/margins": 0.8706560134887695, "rewards/rejected": 3.1386029720306396, "step": 99460 }, { "epoch": 4.618134546636334, "grad_norm": 67.97579956054688, "learning_rate": 2.303449556618227e-08, "logits/chosen": -19.59024429321289, "logits/rejected": -18.929759979248047, "logps/chosen": -476.0908203125, "logps/rejected": -401.2003479003906, "loss": 0.7317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7471790313720703, "rewards/margins": 0.6431077122688293, "rewards/rejected": 3.1040711402893066, "step": 99470 }, { "epoch": 4.618598820743767, "grad_norm": 43.25349044799805, "learning_rate": 2.3006639119736292e-08, "logits/chosen": -19.271465301513672, "logits/rejected": -19.234901428222656, "logps/chosen": -415.69287109375, 
"logps/rejected": -414.12286376953125, "loss": 0.4141, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4894134998321533, "rewards/margins": 1.201401948928833, "rewards/rejected": 2.288011074066162, "step": 99480 }, { "epoch": 4.6190630948512, "grad_norm": 107.62771606445312, "learning_rate": 2.297878267329031e-08, "logits/chosen": -19.932209014892578, "logits/rejected": -19.937314987182617, "logps/chosen": -410.68035888671875, "logps/rejected": -364.1360168457031, "loss": 1.0601, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2936389446258545, "rewards/margins": 0.689288318157196, "rewards/rejected": 2.6043508052825928, "step": 99490 }, { "epoch": 4.619527368958633, "grad_norm": 9.53071403503418, "learning_rate": 2.2950926226844328e-08, "logits/chosen": -20.404870986938477, "logits/rejected": -19.47877311706543, "logps/chosen": -347.387939453125, "logps/rejected": -284.18524169921875, "loss": 0.6087, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6138529777526855, "rewards/margins": 1.7932751178741455, "rewards/rejected": 0.8205782771110535, "step": 99500 }, { "epoch": 4.619991643066066, "grad_norm": 94.45165252685547, "learning_rate": 2.292306978039835e-08, "logits/chosen": -19.2489070892334, "logits/rejected": -19.31622886657715, "logps/chosen": -363.7187194824219, "logps/rejected": -365.46795654296875, "loss": 0.8064, "rewards/accuracies": 0.5, "rewards/chosen": 2.7638516426086426, "rewards/margins": 0.5776710510253906, "rewards/rejected": 2.186180591583252, "step": 99510 }, { "epoch": 4.6204559171735, "grad_norm": 214.1158447265625, "learning_rate": 2.2895213333952367e-08, "logits/chosen": -18.54129981994629, "logits/rejected": -17.59081268310547, "logps/chosen": -434.657958984375, "logps/rejected": -339.0596618652344, "loss": 0.5734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6954731941223145, "rewards/margins": 2.1702301502227783, "rewards/rejected": 1.5252430438995361, "step": 99520 }, { "epoch": 
4.620920191280932, "grad_norm": 1.5365384817123413, "learning_rate": 2.286735688750638e-08, "logits/chosen": -19.460790634155273, "logits/rejected": -17.524391174316406, "logps/chosen": -431.58746337890625, "logps/rejected": -250.50399780273438, "loss": 0.704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.278369426727295, "rewards/margins": 1.9061710834503174, "rewards/rejected": 1.3721988201141357, "step": 99530 }, { "epoch": 4.621384465388365, "grad_norm": 38.33507537841797, "learning_rate": 2.28395004410604e-08, "logits/chosen": -19.98918342590332, "logits/rejected": -19.837448120117188, "logps/chosen": -475.5665588378906, "logps/rejected": -413.6356506347656, "loss": 1.1785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.567626953125, "rewards/margins": -0.18025442957878113, "rewards/rejected": 3.7478816509246826, "step": 99540 }, { "epoch": 4.621848739495798, "grad_norm": 25.68034553527832, "learning_rate": 2.2811643994614417e-08, "logits/chosen": -18.993417739868164, "logits/rejected": -18.420961380004883, "logps/chosen": -408.3379821777344, "logps/rejected": -343.61431884765625, "loss": 0.4727, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.555427312850952, "rewards/margins": 0.8975235819816589, "rewards/rejected": 1.6579039096832275, "step": 99550 }, { "epoch": 4.6223130136032315, "grad_norm": 457.8119201660156, "learning_rate": 2.278378754816844e-08, "logits/chosen": -19.274009704589844, "logits/rejected": -18.1136531829834, "logps/chosen": -408.6029968261719, "logps/rejected": -312.9721374511719, "loss": 0.5546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.740039110183716, "rewards/margins": 1.8919763565063477, "rewards/rejected": 1.8480628728866577, "step": 99560 }, { "epoch": 4.622777287710664, "grad_norm": 97.72007751464844, "learning_rate": 2.2755931101722456e-08, "logits/chosen": -20.115428924560547, "logits/rejected": -19.81398582458496, "logps/chosen": -409.94573974609375, 
"logps/rejected": -371.3177795410156, "loss": 0.7833, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6247551441192627, "rewards/margins": 0.052577387541532516, "rewards/rejected": 3.5721778869628906, "step": 99570 }, { "epoch": 4.623241561818097, "grad_norm": 4.574824810028076, "learning_rate": 2.2728074655276474e-08, "logits/chosen": -19.76692771911621, "logits/rejected": -19.306108474731445, "logps/chosen": -445.3582458496094, "logps/rejected": -388.5129089355469, "loss": 0.6953, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6732017993927, "rewards/margins": 0.8414286375045776, "rewards/rejected": 2.831773281097412, "step": 99580 }, { "epoch": 4.623705835925531, "grad_norm": 54.13166809082031, "learning_rate": 2.2700218208830492e-08, "logits/chosen": -19.07784080505371, "logits/rejected": -17.337129592895508, "logps/chosen": -489.05877685546875, "logps/rejected": -337.62396240234375, "loss": 0.1525, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.799190521240234, "rewards/margins": 3.384686231613159, "rewards/rejected": 1.414504051208496, "step": 99590 }, { "epoch": 4.6241701100329635, "grad_norm": 32.50321578979492, "learning_rate": 2.2672361762384513e-08, "logits/chosen": -18.72011947631836, "logits/rejected": -17.84686851501465, "logps/chosen": -332.9149169921875, "logps/rejected": -248.95352172851562, "loss": 0.6328, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1491446495056152, "rewards/margins": 0.7394178509712219, "rewards/rejected": 1.409726619720459, "step": 99600 }, { "epoch": 4.624634384140396, "grad_norm": 203.8704376220703, "learning_rate": 2.264450531593853e-08, "logits/chosen": -19.51315689086914, "logits/rejected": -19.322063446044922, "logps/chosen": -482.5875549316406, "logps/rejected": -412.3985900878906, "loss": 0.5989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5674190521240234, "rewards/margins": 0.7993255853652954, "rewards/rejected": 2.7680933475494385, 
"step": 99610 }, { "epoch": 4.625098658247829, "grad_norm": 34.71236801147461, "learning_rate": 2.2616648869492546e-08, "logits/chosen": -18.28093910217285, "logits/rejected": -17.768007278442383, "logps/chosen": -334.18426513671875, "logps/rejected": -301.1309509277344, "loss": 0.4851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.431652069091797, "rewards/margins": 0.9422346353530884, "rewards/rejected": 1.4894174337387085, "step": 99620 }, { "epoch": 4.625562932355263, "grad_norm": 106.70616149902344, "learning_rate": 2.2588792423046563e-08, "logits/chosen": -18.663827896118164, "logits/rejected": -19.095571517944336, "logps/chosen": -448.39691162109375, "logps/rejected": -448.2861328125, "loss": 1.2168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.1185736656188965, "rewards/margins": 0.22856292128562927, "rewards/rejected": 3.8900113105773926, "step": 99630 }, { "epoch": 4.6260272064626955, "grad_norm": 132.25917053222656, "learning_rate": 2.256093597660058e-08, "logits/chosen": -19.08925437927246, "logits/rejected": -19.31977081298828, "logps/chosen": -316.7513427734375, "logps/rejected": -314.94366455078125, "loss": 0.7354, "rewards/accuracies": 0.5, "rewards/chosen": 3.0234885215759277, "rewards/margins": 0.868986964225769, "rewards/rejected": 2.1545016765594482, "step": 99640 }, { "epoch": 4.626491480570128, "grad_norm": 3.6362953186035156, "learning_rate": 2.2533079530154602e-08, "logits/chosen": -19.2025203704834, "logits/rejected": -17.142780303955078, "logps/chosen": -416.6548767089844, "logps/rejected": -234.9523468017578, "loss": 0.1083, "rewards/accuracies": 1.0, "rewards/chosen": 3.855247974395752, "rewards/margins": 3.659144878387451, "rewards/rejected": 0.1961028277873993, "step": 99650 }, { "epoch": 4.626955754677562, "grad_norm": 93.53575897216797, "learning_rate": 2.250522308370862e-08, "logits/chosen": -18.475994110107422, "logits/rejected": -17.971210479736328, "logps/chosen": -449.4190979003906, 
"logps/rejected": -321.5495300292969, "loss": 0.8925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.939918279647827, "rewards/margins": 1.021257758140564, "rewards/rejected": 1.9186605215072632, "step": 99660 }, { "epoch": 4.627420028784995, "grad_norm": 173.33242797851562, "learning_rate": 2.2477366637262638e-08, "logits/chosen": -18.564891815185547, "logits/rejected": -18.000568389892578, "logps/chosen": -381.80718994140625, "logps/rejected": -284.60845947265625, "loss": 0.5232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.717275857925415, "rewards/margins": 1.0606940984725952, "rewards/rejected": 1.6565815210342407, "step": 99670 }, { "epoch": 4.6278843028924275, "grad_norm": 13.8236722946167, "learning_rate": 2.2449510190816656e-08, "logits/chosen": -19.139938354492188, "logits/rejected": -18.73334503173828, "logps/chosen": -361.7845153808594, "logps/rejected": -359.44110107421875, "loss": 0.8174, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.970203399658203, "rewards/margins": 0.4732302725315094, "rewards/rejected": 2.4969730377197266, "step": 99680 }, { "epoch": 4.62834857699986, "grad_norm": 1.1598830223083496, "learning_rate": 2.2421653744370677e-08, "logits/chosen": -19.659685134887695, "logits/rejected": -17.910175323486328, "logps/chosen": -485.6499938964844, "logps/rejected": -306.4729309082031, "loss": 0.4935, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6042704582214355, "rewards/margins": 2.2057087421417236, "rewards/rejected": 1.398561954498291, "step": 99690 }, { "epoch": 4.628812851107294, "grad_norm": 290.9273681640625, "learning_rate": 2.2393797297924695e-08, "logits/chosen": -19.749530792236328, "logits/rejected": -18.366683959960938, "logps/chosen": -465.62237548828125, "logps/rejected": -317.2381591796875, "loss": 0.7682, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.068901062011719, "rewards/margins": 2.2083497047424316, "rewards/rejected": 1.8605514764785767, 
"step": 99700 }, { "epoch": 4.629277125214727, "grad_norm": 161.02862548828125, "learning_rate": 2.2365940851478713e-08, "logits/chosen": -18.773784637451172, "logits/rejected": -18.865358352661133, "logps/chosen": -426.1492614746094, "logps/rejected": -444.27154541015625, "loss": 1.4552, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.004347324371338, "rewards/margins": -0.21447142958641052, "rewards/rejected": 4.218818187713623, "step": 99710 }, { "epoch": 4.6297413993221594, "grad_norm": 121.62699890136719, "learning_rate": 2.2338084405032727e-08, "logits/chosen": -19.90322494506836, "logits/rejected": -18.493209838867188, "logps/chosen": -403.1903381347656, "logps/rejected": -322.3644104003906, "loss": 0.8445, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.9809653759002686, "rewards/margins": 1.3779407739639282, "rewards/rejected": 2.60302472114563, "step": 99720 }, { "epoch": 4.630205673429593, "grad_norm": 4.4051899909973145, "learning_rate": 2.231022795858675e-08, "logits/chosen": -18.898418426513672, "logits/rejected": -18.066640853881836, "logps/chosen": -412.9463806152344, "logps/rejected": -347.88360595703125, "loss": 1.2298, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.411806583404541, "rewards/margins": 0.4080948829650879, "rewards/rejected": 2.003711462020874, "step": 99730 }, { "epoch": 4.630669947537026, "grad_norm": 37.840667724609375, "learning_rate": 2.2282371512140766e-08, "logits/chosen": -19.096248626708984, "logits/rejected": -18.883817672729492, "logps/chosen": -387.49395751953125, "logps/rejected": -360.6819152832031, "loss": 0.5316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1582143306732178, "rewards/margins": 0.8582744598388672, "rewards/rejected": 2.2999396324157715, "step": 99740 }, { "epoch": 4.631134221644459, "grad_norm": 4.305328369140625, "learning_rate": 2.2254515065694784e-08, "logits/chosen": -19.001506805419922, "logits/rejected": -18.159706115722656, 
"logps/chosen": -430.1812438964844, "logps/rejected": -307.921630859375, "loss": 0.3664, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.341042518615723, "rewards/margins": 2.145158290863037, "rewards/rejected": 2.1958839893341064, "step": 99750 }, { "epoch": 4.631598495751891, "grad_norm": 111.60044860839844, "learning_rate": 2.2226658619248802e-08, "logits/chosen": -18.103240966796875, "logits/rejected": -18.175710678100586, "logps/chosen": -410.2554626464844, "logps/rejected": -374.3739013671875, "loss": 0.6223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2485415935516357, "rewards/margins": 0.8822010159492493, "rewards/rejected": 2.3663406372070312, "step": 99760 }, { "epoch": 4.632062769859325, "grad_norm": 211.3529510498047, "learning_rate": 2.2198802172802823e-08, "logits/chosen": -19.276214599609375, "logits/rejected": -18.053457260131836, "logps/chosen": -402.7826843261719, "logps/rejected": -303.26007080078125, "loss": 0.528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1829843521118164, "rewards/margins": 1.7372009754180908, "rewards/rejected": 1.4457833766937256, "step": 99770 }, { "epoch": 4.632527043966758, "grad_norm": 1.184918761253357, "learning_rate": 2.217094572635684e-08, "logits/chosen": -19.587547302246094, "logits/rejected": -19.701894760131836, "logps/chosen": -494.01904296875, "logps/rejected": -393.1895446777344, "loss": 0.7161, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.58648157119751, "rewards/margins": 1.6920509338378906, "rewards/rejected": 3.894430637359619, "step": 99780 }, { "epoch": 4.632991318074191, "grad_norm": 108.71934509277344, "learning_rate": 2.214308927991086e-08, "logits/chosen": -19.526931762695312, "logits/rejected": -19.12095832824707, "logps/chosen": -358.8796691894531, "logps/rejected": -281.7130126953125, "loss": 0.7467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9552738666534424, "rewards/margins": 0.5930148363113403, 
"rewards/rejected": 2.3622589111328125, "step": 99790 }, { "epoch": 4.633455592181624, "grad_norm": 4.336381912231445, "learning_rate": 2.2115232833464877e-08, "logits/chosen": -20.518993377685547, "logits/rejected": -18.889741897583008, "logps/chosen": -573.5938720703125, "logps/rejected": -364.09832763671875, "loss": 0.2241, "rewards/accuracies": 1.0, "rewards/chosen": 5.1436848640441895, "rewards/margins": 2.368072032928467, "rewards/rejected": 2.7756125926971436, "step": 99800 }, { "epoch": 4.633919866289057, "grad_norm": 5.693633556365967, "learning_rate": 2.2087376387018898e-08, "logits/chosen": -19.502117156982422, "logits/rejected": -19.07312774658203, "logps/chosen": -445.6104431152344, "logps/rejected": -370.4649658203125, "loss": 0.7398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3571858406066895, "rewards/margins": 0.7493900656700134, "rewards/rejected": 2.607795476913452, "step": 99810 }, { "epoch": 4.63438414039649, "grad_norm": 55.136070251464844, "learning_rate": 2.2059519940572912e-08, "logits/chosen": -19.765098571777344, "logits/rejected": -19.376995086669922, "logps/chosen": -298.10736083984375, "logps/rejected": -266.878173828125, "loss": 0.7578, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.279916763305664, "rewards/margins": 0.26139527559280396, "rewards/rejected": 2.018521547317505, "step": 99820 }, { "epoch": 4.6348484145039235, "grad_norm": 130.08111572265625, "learning_rate": 2.203166349412693e-08, "logits/chosen": -18.794158935546875, "logits/rejected": -17.994726181030273, "logps/chosen": -467.73577880859375, "logps/rejected": -347.5993957519531, "loss": 0.7111, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4501125812530518, "rewards/margins": 1.5182193517684937, "rewards/rejected": 1.931892991065979, "step": 99830 }, { "epoch": 4.635312688611356, "grad_norm": 219.59503173828125, "learning_rate": 2.2003807047680948e-08, "logits/chosen": -19.444103240966797, "logits/rejected": 
-18.496299743652344, "logps/chosen": -349.14788818359375, "logps/rejected": -283.3177185058594, "loss": 0.8687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.872265577316284, "rewards/margins": 1.2272613048553467, "rewards/rejected": 1.6450042724609375, "step": 99840 }, { "epoch": 4.635776962718789, "grad_norm": 226.40521240234375, "learning_rate": 2.1975950601234966e-08, "logits/chosen": -18.900676727294922, "logits/rejected": -19.286766052246094, "logps/chosen": -480.4833068847656, "logps/rejected": -491.75018310546875, "loss": 1.2876, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7499804496765137, "rewards/margins": -0.23958206176757812, "rewards/rejected": 3.989562511444092, "step": 99850 }, { "epoch": 4.636241236826223, "grad_norm": 15.421483993530273, "learning_rate": 2.1948094154788987e-08, "logits/chosen": -19.874895095825195, "logits/rejected": -18.87640953063965, "logps/chosen": -474.48834228515625, "logps/rejected": -316.47967529296875, "loss": 0.5924, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.07985782623291, "rewards/margins": 1.7391420602798462, "rewards/rejected": 2.3407158851623535, "step": 99860 }, { "epoch": 4.6367055109336555, "grad_norm": 48.59461975097656, "learning_rate": 2.1920237708343005e-08, "logits/chosen": -20.311281204223633, "logits/rejected": -18.061267852783203, "logps/chosen": -354.2904968261719, "logps/rejected": -247.77835083007812, "loss": 0.4623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.769166946411133, "rewards/margins": 1.2738536596298218, "rewards/rejected": 1.495313286781311, "step": 99870 }, { "epoch": 4.637169785041088, "grad_norm": 55.23863983154297, "learning_rate": 2.1892381261897023e-08, "logits/chosen": -20.423824310302734, "logits/rejected": -19.563175201416016, "logps/chosen": -423.6712951660156, "logps/rejected": -381.1128845214844, "loss": 0.6784, "rewards/accuracies": 0.5, "rewards/chosen": 3.8913357257843018, "rewards/margins": 
0.8749167323112488, "rewards/rejected": 3.0164194107055664, "step": 99880 }, { "epoch": 4.637634059148521, "grad_norm": 107.95682525634766, "learning_rate": 2.186452481545104e-08, "logits/chosen": -19.495851516723633, "logits/rejected": -18.91815185546875, "logps/chosen": -426.9273376464844, "logps/rejected": -464.71856689453125, "loss": 0.4755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.916404724121094, "rewards/margins": 2.076336622238159, "rewards/rejected": 3.8400681018829346, "step": 99890 }, { "epoch": 4.638098333255955, "grad_norm": 44.165653228759766, "learning_rate": 2.1836668369005062e-08, "logits/chosen": -18.907930374145508, "logits/rejected": -18.13237953186035, "logps/chosen": -480.6439514160156, "logps/rejected": -310.39862060546875, "loss": 0.2264, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.069641590118408, "rewards/margins": 3.3512892723083496, "rewards/rejected": 1.7183520793914795, "step": 99900 }, { "epoch": 4.638562607363387, "grad_norm": 175.8701171875, "learning_rate": 2.180881192255908e-08, "logits/chosen": -19.772781372070312, "logits/rejected": -19.29534339904785, "logps/chosen": -347.0973205566406, "logps/rejected": -376.74957275390625, "loss": 0.7658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.069214820861816, "rewards/margins": 0.6386992335319519, "rewards/rejected": 3.4305152893066406, "step": 99910 }, { "epoch": 4.63902688147082, "grad_norm": 64.50179290771484, "learning_rate": 2.1780955476113094e-08, "logits/chosen": -19.878103256225586, "logits/rejected": -18.577285766601562, "logps/chosen": -331.6427001953125, "logps/rejected": -211.45425415039062, "loss": 0.6784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0178720951080322, "rewards/margins": 1.8248258829116821, "rewards/rejected": 1.19304621219635, "step": 99920 }, { "epoch": 4.639491155578254, "grad_norm": 30.770483016967773, "learning_rate": 2.1753099029667112e-08, "logits/chosen": -19.044017791748047, 
"logits/rejected": -18.312549591064453, "logps/chosen": -411.38128662109375, "logps/rejected": -312.5755615234375, "loss": 0.4094, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.013434410095215, "rewards/margins": 1.0501768589019775, "rewards/rejected": 1.9632575511932373, "step": 99930 }, { "epoch": 4.639955429685687, "grad_norm": 0.15950162708759308, "learning_rate": 2.1725242583221133e-08, "logits/chosen": -19.397789001464844, "logits/rejected": -18.181432723999023, "logps/chosen": -407.1075744628906, "logps/rejected": -314.478759765625, "loss": 0.3497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.973665714263916, "rewards/margins": 2.500166416168213, "rewards/rejected": 0.47349920868873596, "step": 99940 }, { "epoch": 4.640419703793119, "grad_norm": 6.639317989349365, "learning_rate": 2.169738613677515e-08, "logits/chosen": -18.888744354248047, "logits/rejected": -19.767887115478516, "logps/chosen": -382.6304626464844, "logps/rejected": -399.5351257324219, "loss": 1.3252, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.9963088035583496, "rewards/margins": -0.593846321105957, "rewards/rejected": 3.5901551246643066, "step": 99950 }, { "epoch": 4.640883977900552, "grad_norm": 0.5790950059890747, "learning_rate": 2.166952969032917e-08, "logits/chosen": -19.94967269897461, "logits/rejected": -18.19847297668457, "logps/chosen": -504.958984375, "logps/rejected": -333.8136901855469, "loss": 0.2292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.765620708465576, "rewards/margins": 3.266244888305664, "rewards/rejected": 1.4993760585784912, "step": 99960 }, { "epoch": 4.641348252007986, "grad_norm": 16.812070846557617, "learning_rate": 2.1641673243883187e-08, "logits/chosen": -19.090110778808594, "logits/rejected": -17.521339416503906, "logps/chosen": -405.41302490234375, "logps/rejected": -233.7152557373047, "loss": 0.53, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.257474422454834, 
"rewards/margins": 2.0211198329925537, "rewards/rejected": 1.2363545894622803, "step": 99970 }, { "epoch": 4.641812526115419, "grad_norm": 14.3233642578125, "learning_rate": 2.1613816797437208e-08, "logits/chosen": -19.935840606689453, "logits/rejected": -20.241703033447266, "logps/chosen": -448.0613708496094, "logps/rejected": -405.1259460449219, "loss": 1.3043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.149857521057129, "rewards/margins": 0.4056980609893799, "rewards/rejected": 3.744159698486328, "step": 99980 }, { "epoch": 4.642276800222851, "grad_norm": 172.73529052734375, "learning_rate": 2.1585960350991226e-08, "logits/chosen": -19.61211395263672, "logits/rejected": -19.003938674926758, "logps/chosen": -428.12359619140625, "logps/rejected": -428.8262634277344, "loss": 0.6168, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0413594245910645, "rewards/margins": 1.0214412212371826, "rewards/rejected": 2.0199179649353027, "step": 99990 }, { "epoch": 4.642741074330285, "grad_norm": 107.20759582519531, "learning_rate": 2.1558103904545244e-08, "logits/chosen": -19.601858139038086, "logits/rejected": -19.152889251708984, "logps/chosen": -388.33380126953125, "logps/rejected": -363.12603759765625, "loss": 0.374, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.470818519592285, "rewards/margins": 1.3933775424957275, "rewards/rejected": 3.0774409770965576, "step": 100000 }, { "epoch": 4.643205348437718, "grad_norm": 0.9047223925590515, "learning_rate": 2.153024745809926e-08, "logits/chosen": -19.98309898376465, "logits/rejected": -19.681926727294922, "logps/chosen": -361.04974365234375, "logps/rejected": -286.60968017578125, "loss": 0.4114, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.600032091140747, "rewards/margins": 1.9999160766601562, "rewards/rejected": 1.6001155376434326, "step": 100010 }, { "epoch": 4.643669622545151, "grad_norm": 29.74701690673828, "learning_rate": 2.1502391011653276e-08, 
"logits/chosen": -20.352285385131836, "logits/rejected": -18.926626205444336, "logps/chosen": -478.3831481933594, "logps/rejected": -367.9764099121094, "loss": 0.6231, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.515812873840332, "rewards/margins": 1.2876836061477661, "rewards/rejected": 3.2281291484832764, "step": 100020 }, { "epoch": 4.644133896652583, "grad_norm": 0.0050827618688344955, "learning_rate": 2.1474534565207297e-08, "logits/chosen": -18.69413185119629, "logits/rejected": -17.710262298583984, "logps/chosen": -381.18243408203125, "logps/rejected": -355.9840087890625, "loss": 1.2483, "rewards/accuracies": 0.5, "rewards/chosen": 2.70265531539917, "rewards/margins": 1.1502418518066406, "rewards/rejected": 1.5524133443832397, "step": 100030 }, { "epoch": 4.644598170760017, "grad_norm": 1.1266812086105347, "learning_rate": 2.1446678118761315e-08, "logits/chosen": -19.211780548095703, "logits/rejected": -18.559650421142578, "logps/chosen": -301.0788879394531, "logps/rejected": -245.64834594726562, "loss": 0.6703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.658751964569092, "rewards/margins": 0.923924446105957, "rewards/rejected": 1.7348277568817139, "step": 100040 }, { "epoch": 4.64506244486745, "grad_norm": 102.96089172363281, "learning_rate": 2.1418821672315333e-08, "logits/chosen": -18.70259666442871, "logits/rejected": -18.715322494506836, "logps/chosen": -411.20269775390625, "logps/rejected": -420.75909423828125, "loss": 0.6253, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.089860916137695, "rewards/margins": 0.5591899752616882, "rewards/rejected": 3.5306713581085205, "step": 100050 }, { "epoch": 4.645526718974883, "grad_norm": 85.65664672851562, "learning_rate": 2.139096522586935e-08, "logits/chosen": -19.659719467163086, "logits/rejected": -19.108320236206055, "logps/chosen": -352.1569519042969, "logps/rejected": -332.20623779296875, "loss": 1.0263, "rewards/accuracies": 0.5, "rewards/chosen": 
3.2981343269348145, "rewards/margins": 0.7196592092514038, "rewards/rejected": 2.578474760055542, "step": 100060 }, { "epoch": 4.645990993082316, "grad_norm": 182.69398498535156, "learning_rate": 2.1363108779423372e-08, "logits/chosen": -19.508424758911133, "logits/rejected": -18.93875503540039, "logps/chosen": -520.3854370117188, "logps/rejected": -391.4483337402344, "loss": 0.7092, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8720927238464355, "rewards/margins": 1.1232364177703857, "rewards/rejected": 2.74885630607605, "step": 100070 }, { "epoch": 4.646455267189749, "grad_norm": 158.50572204589844, "learning_rate": 2.133525233297739e-08, "logits/chosen": -18.086536407470703, "logits/rejected": -18.382051467895508, "logps/chosen": -307.1148376464844, "logps/rejected": -331.747802734375, "loss": 1.0238, "rewards/accuracies": 0.5, "rewards/chosen": 2.7390666007995605, "rewards/margins": 0.01523977518081665, "rewards/rejected": 2.7238268852233887, "step": 100080 }, { "epoch": 4.646919541297182, "grad_norm": 61.952178955078125, "learning_rate": 2.1307395886531407e-08, "logits/chosen": -18.277713775634766, "logits/rejected": -18.291723251342773, "logps/chosen": -354.84185791015625, "logps/rejected": -375.7782897949219, "loss": 1.0535, "rewards/accuracies": 0.5, "rewards/chosen": 2.2045934200286865, "rewards/margins": 0.1350182592868805, "rewards/rejected": 2.069575309753418, "step": 100090 }, { "epoch": 4.6473838154046145, "grad_norm": 37.5067138671875, "learning_rate": 2.1279539440085425e-08, "logits/chosen": -19.1625919342041, "logits/rejected": -19.962791442871094, "logps/chosen": -281.1312255859375, "logps/rejected": -365.24609375, "loss": 0.8302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5998706817626953, "rewards/margins": 0.5542621612548828, "rewards/rejected": 2.0456087589263916, "step": 100100 }, { "epoch": 4.647848089512048, "grad_norm": 151.16375732421875, "learning_rate": 2.1251682993639446e-08, "logits/chosen": 
-20.154804229736328, "logits/rejected": -18.331266403198242, "logps/chosen": -313.28851318359375, "logps/rejected": -239.54067993164062, "loss": 0.606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2628014087677, "rewards/margins": 1.670493483543396, "rewards/rejected": 1.592307686805725, "step": 100110 }, { "epoch": 4.648312363619481, "grad_norm": 198.7168731689453, "learning_rate": 2.122382654719346e-08, "logits/chosen": -19.482276916503906, "logits/rejected": -17.958358764648438, "logps/chosen": -578.5635375976562, "logps/rejected": -490.5210876464844, "loss": 0.8658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.023192882537842, "rewards/margins": 1.3980106115341187, "rewards/rejected": 3.6251819133758545, "step": 100120 }, { "epoch": 4.648776637726914, "grad_norm": 147.9778594970703, "learning_rate": 2.119597010074748e-08, "logits/chosen": -18.690921783447266, "logits/rejected": -17.938167572021484, "logps/chosen": -521.0849609375, "logps/rejected": -446.79571533203125, "loss": 0.4668, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.788926601409912, "rewards/margins": 1.58039391040802, "rewards/rejected": 3.2085328102111816, "step": 100130 }, { "epoch": 4.649240911834347, "grad_norm": 3.0254201889038086, "learning_rate": 2.1168113654301497e-08, "logits/chosen": -19.002201080322266, "logits/rejected": -17.463464736938477, "logps/chosen": -471.12689208984375, "logps/rejected": -255.99526977539062, "loss": 0.225, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4480366706848145, "rewards/margins": 2.2626700401306152, "rewards/rejected": 1.185366153717041, "step": 100140 }, { "epoch": 4.64970518594178, "grad_norm": 33.92940902709961, "learning_rate": 2.1140257207855518e-08, "logits/chosen": -19.780271530151367, "logits/rejected": -18.512849807739258, "logps/chosen": -458.59527587890625, "logps/rejected": -328.0158386230469, "loss": 0.526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
4.149581432342529, "rewards/margins": 1.9688804149627686, "rewards/rejected": 2.180701732635498, "step": 100150 }, { "epoch": 4.650169460049213, "grad_norm": 111.40394592285156, "learning_rate": 2.1112400761409536e-08, "logits/chosen": -19.143939971923828, "logits/rejected": -19.429630279541016, "logps/chosen": -280.7062072753906, "logps/rejected": -309.1732177734375, "loss": 1.2733, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0265841484069824, "rewards/margins": -0.4769827723503113, "rewards/rejected": 2.5035672187805176, "step": 100160 }, { "epoch": 4.650633734156646, "grad_norm": 31.43073081970215, "learning_rate": 2.1084544314963554e-08, "logits/chosen": -19.20405387878418, "logits/rejected": -19.155973434448242, "logps/chosen": -323.83404541015625, "logps/rejected": -374.2973327636719, "loss": 1.293, "rewards/accuracies": 0.5, "rewards/chosen": 3.740319013595581, "rewards/margins": 0.3363496661186218, "rewards/rejected": 3.403970241546631, "step": 100170 }, { "epoch": 4.651098008264079, "grad_norm": 141.29212951660156, "learning_rate": 2.105668786851757e-08, "logits/chosen": -17.91487693786621, "logits/rejected": -19.6981258392334, "logps/chosen": -273.1507568359375, "logps/rejected": -384.3382263183594, "loss": 2.1718, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3861842155456543, "rewards/margins": -1.482961893081665, "rewards/rejected": 3.8691468238830566, "step": 100180 }, { "epoch": 4.651562282371512, "grad_norm": 3.1704163551330566, "learning_rate": 2.1028831422071593e-08, "logits/chosen": -19.033784866333008, "logits/rejected": -18.6217041015625, "logps/chosen": -412.9024353027344, "logps/rejected": -380.5436096191406, "loss": 1.0892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.657477855682373, "rewards/margins": 0.9251216650009155, "rewards/rejected": 2.732356548309326, "step": 100190 }, { "epoch": 4.652026556478945, "grad_norm": 21.31275749206543, "learning_rate": 2.100097497562561e-08, 
"logits/chosen": -19.38804817199707, "logits/rejected": -18.794923782348633, "logps/chosen": -294.5287170410156, "logps/rejected": -270.84454345703125, "loss": 0.3753, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5665416717529297, "rewards/margins": 1.0433268547058105, "rewards/rejected": 0.5232148170471191, "step": 100200 }, { "epoch": 4.652490830586379, "grad_norm": 87.72606658935547, "learning_rate": 2.0973118529179628e-08, "logits/chosen": -18.24382781982422, "logits/rejected": -18.391164779663086, "logps/chosen": -350.1090087890625, "logps/rejected": -278.42987060546875, "loss": 0.9591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.081821918487549, "rewards/margins": 0.5072008371353149, "rewards/rejected": 2.574620485305786, "step": 100210 }, { "epoch": 4.652955104693811, "grad_norm": 16.861785888671875, "learning_rate": 2.0945262082733643e-08, "logits/chosen": -18.920650482177734, "logits/rejected": -18.5797061920166, "logps/chosen": -424.49090576171875, "logps/rejected": -387.874267578125, "loss": 0.6735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.130530595779419, "rewards/margins": 1.1790748834609985, "rewards/rejected": 1.9514554738998413, "step": 100220 }, { "epoch": 4.653419378801244, "grad_norm": null, "learning_rate": 2.092019128093226e-08, "logits/chosen": -18.922428131103516, "logits/rejected": -18.28219223022461, "logps/chosen": -430.6040954589844, "logps/rejected": -385.07861328125, "loss": 0.8676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8684356212615967, "rewards/margins": 0.4437442421913147, "rewards/rejected": 2.424691677093506, "step": 100230 }, { "epoch": 4.653883652908677, "grad_norm": 0.23228861391544342, "learning_rate": 2.089233483448628e-08, "logits/chosen": -18.870014190673828, "logits/rejected": -17.7066593170166, "logps/chosen": -483.2145080566406, "logps/rejected": -336.9330749511719, "loss": 0.5782, "rewards/accuracies": 0.5, "rewards/chosen": 3.39448618888855, 
"rewards/margins": 1.4829487800598145, "rewards/rejected": 1.911537766456604, "step": 100240 }, { "epoch": 4.6543479270161106, "grad_norm": 4.630156993865967, "learning_rate": 2.08644783880403e-08, "logits/chosen": -19.505332946777344, "logits/rejected": -19.02508544921875, "logps/chosen": -487.6221618652344, "logps/rejected": -357.907470703125, "loss": 0.3281, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.286266803741455, "rewards/margins": 1.607480764389038, "rewards/rejected": 2.678785800933838, "step": 100250 }, { "epoch": 4.654812201123543, "grad_norm": 91.75190734863281, "learning_rate": 2.0836621941594317e-08, "logits/chosen": -18.81673812866211, "logits/rejected": -18.502613067626953, "logps/chosen": -271.4074401855469, "logps/rejected": -276.5982360839844, "loss": 0.6949, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7025387287139893, "rewards/margins": 0.5751263499259949, "rewards/rejected": 2.1274123191833496, "step": 100260 }, { "epoch": 4.655276475230976, "grad_norm": 152.27505493164062, "learning_rate": 2.0808765495148335e-08, "logits/chosen": -19.312320709228516, "logits/rejected": -19.115474700927734, "logps/chosen": -408.09393310546875, "logps/rejected": -333.9162292480469, "loss": 0.8569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.690607070922852, "rewards/margins": 1.3409181833267212, "rewards/rejected": 3.34968900680542, "step": 100270 }, { "epoch": 4.65574074933841, "grad_norm": 73.46197509765625, "learning_rate": 2.078090904870235e-08, "logits/chosen": -19.381919860839844, "logits/rejected": -18.6435546875, "logps/chosen": -291.5467834472656, "logps/rejected": -249.87405395507812, "loss": 0.3409, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9289424419403076, "rewards/margins": 1.7716395854949951, "rewards/rejected": 1.1573030948638916, "step": 100280 }, { "epoch": 4.6562050234458425, "grad_norm": 212.85304260253906, "learning_rate": 2.075305260225637e-08, "logits/chosen": 
-18.75827407836914, "logits/rejected": -19.10378646850586, "logps/chosen": -407.66387939453125, "logps/rejected": -413.92242431640625, "loss": 0.7358, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.809189558029175, "rewards/margins": 1.0889537334442139, "rewards/rejected": 2.720235586166382, "step": 100290 }, { "epoch": 4.656669297553275, "grad_norm": 0.6129735112190247, "learning_rate": 2.072519615581039e-08, "logits/chosen": -18.40484046936035, "logits/rejected": -16.98397445678711, "logps/chosen": -428.41961669921875, "logps/rejected": -265.6827087402344, "loss": 0.3286, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1265156269073486, "rewards/margins": 1.8026317358016968, "rewards/rejected": 1.3238837718963623, "step": 100300 }, { "epoch": 4.657133571660708, "grad_norm": 182.87814331054688, "learning_rate": 2.0697339709364407e-08, "logits/chosen": -18.928720474243164, "logits/rejected": -18.524904251098633, "logps/chosen": -452.0269470214844, "logps/rejected": -453.8407287597656, "loss": 0.8858, "rewards/accuracies": 0.5, "rewards/chosen": 4.045267581939697, "rewards/margins": 0.8707813024520874, "rewards/rejected": 3.1744866371154785, "step": 100310 }, { "epoch": 4.657597845768142, "grad_norm": 5.559894561767578, "learning_rate": 2.0669483262918424e-08, "logits/chosen": -19.428098678588867, "logits/rejected": -18.527721405029297, "logps/chosen": -431.00360107421875, "logps/rejected": -310.5473937988281, "loss": 0.1576, "rewards/accuracies": 1.0, "rewards/chosen": 4.781149864196777, "rewards/margins": 2.816519260406494, "rewards/rejected": 1.964630365371704, "step": 100320 }, { "epoch": 4.6580621198755745, "grad_norm": 2.8974032402038574, "learning_rate": 2.0641626816472446e-08, "logits/chosen": -19.20564079284668, "logits/rejected": -18.14297866821289, "logps/chosen": -498.004638671875, "logps/rejected": -414.6600646972656, "loss": 0.4005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.338548183441162, 
"rewards/margins": 2.487046241760254, "rewards/rejected": 1.8515020608901978, "step": 100330 }, { "epoch": 4.658526393983007, "grad_norm": 101.69697570800781, "learning_rate": 2.0613770370026463e-08, "logits/chosen": -19.302127838134766, "logits/rejected": -17.95244026184082, "logps/chosen": -441.5341796875, "logps/rejected": -351.5844421386719, "loss": 0.5835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 6.006128787994385, "rewards/margins": 2.5587613582611084, "rewards/rejected": 3.4473679065704346, "step": 100340 }, { "epoch": 4.658990668090441, "grad_norm": 0.4657966196537018, "learning_rate": 2.058591392358048e-08, "logits/chosen": -19.42874526977539, "logits/rejected": -18.53774070739746, "logps/chosen": -414.60205078125, "logps/rejected": -270.92376708984375, "loss": 0.4299, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7333383560180664, "rewards/margins": 1.3519426584243774, "rewards/rejected": 1.381395697593689, "step": 100350 }, { "epoch": 4.659454942197874, "grad_norm": 35.623897552490234, "learning_rate": 2.0558057477134502e-08, "logits/chosen": -18.806255340576172, "logits/rejected": -18.237529754638672, "logps/chosen": -383.0843200683594, "logps/rejected": -321.7161560058594, "loss": 0.8487, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.746737480163574, "rewards/margins": 0.45058107376098633, "rewards/rejected": 2.296156406402588, "step": 100360 }, { "epoch": 4.6599192163053065, "grad_norm": 0.9615223407745361, "learning_rate": 2.053020103068852e-08, "logits/chosen": -18.828800201416016, "logits/rejected": -18.300737380981445, "logps/chosen": -461.0270080566406, "logps/rejected": -393.69073486328125, "loss": 0.682, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0942490100860596, "rewards/margins": 1.093773603439331, "rewards/rejected": 2.0004754066467285, "step": 100370 }, { "epoch": 4.660383490412739, "grad_norm": 194.8694610595703, "learning_rate": 2.0502344584242535e-08, "logits/chosen": 
-18.89950180053711, "logits/rejected": -17.616931915283203, "logps/chosen": -360.8717346191406, "logps/rejected": -252.9234619140625, "loss": 0.5782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5063042640686035, "rewards/margins": 2.085554838180542, "rewards/rejected": 1.4207487106323242, "step": 100380 }, { "epoch": 4.660847764520173, "grad_norm": 77.15711975097656, "learning_rate": 2.0474488137796553e-08, "logits/chosen": -18.17657470703125, "logits/rejected": -17.80109405517578, "logps/chosen": -339.1405944824219, "logps/rejected": -209.57339477539062, "loss": 0.4796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.605727434158325, "rewards/margins": 1.8814685344696045, "rewards/rejected": 0.7242590188980103, "step": 100390 }, { "epoch": 4.661312038627606, "grad_norm": 11.845026969909668, "learning_rate": 2.044663169135057e-08, "logits/chosen": -19.00977897644043, "logits/rejected": -17.97045135498047, "logps/chosen": -430.59521484375, "logps/rejected": -344.8150329589844, "loss": 0.2866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3881964683532715, "rewards/margins": 2.1326699256896973, "rewards/rejected": 2.2555270195007324, "step": 100400 }, { "epoch": 4.6617763127350385, "grad_norm": 43.119441986083984, "learning_rate": 2.041877524490459e-08, "logits/chosen": -19.263330459594727, "logits/rejected": -18.94308090209961, "logps/chosen": -362.5699768066406, "logps/rejected": -365.2840881347656, "loss": 0.7101, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8287672996520996, "rewards/margins": 0.7357959747314453, "rewards/rejected": 3.0929713249206543, "step": 100410 }, { "epoch": 4.662240586842472, "grad_norm": 17.015422821044922, "learning_rate": 2.039091879845861e-08, "logits/chosen": -20.261672973632812, "logits/rejected": -18.881019592285156, "logps/chosen": -438.65673828125, "logps/rejected": -318.77996826171875, "loss": 0.4571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
4.521201133728027, "rewards/margins": 1.6439762115478516, "rewards/rejected": 2.877224922180176, "step": 100420 }, { "epoch": 4.662704860949905, "grad_norm": 53.21490478515625, "learning_rate": 2.0363062352012627e-08, "logits/chosen": -18.942672729492188, "logits/rejected": -18.849048614501953, "logps/chosen": -435.5048828125, "logps/rejected": -311.50421142578125, "loss": 0.6643, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.508312940597534, "rewards/margins": 1.539646029472351, "rewards/rejected": 1.968666672706604, "step": 100430 }, { "epoch": 4.663169135057338, "grad_norm": 132.63418579101562, "learning_rate": 2.0335205905566645e-08, "logits/chosen": -19.100139617919922, "logits/rejected": -19.299922943115234, "logps/chosen": -326.03680419921875, "logps/rejected": -330.0571594238281, "loss": 0.7284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.442230701446533, "rewards/margins": 0.7686074376106262, "rewards/rejected": 2.6736226081848145, "step": 100440 }, { "epoch": 4.66363340916477, "grad_norm": 108.38965606689453, "learning_rate": 2.0307349459120666e-08, "logits/chosen": -18.900333404541016, "logits/rejected": -18.982379913330078, "logps/chosen": -302.80499267578125, "logps/rejected": -320.6666564941406, "loss": 0.7834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.063384771347046, "rewards/margins": 0.42534899711608887, "rewards/rejected": 2.638035774230957, "step": 100450 }, { "epoch": 4.664097683272204, "grad_norm": 15.86835765838623, "learning_rate": 2.0279493012674684e-08, "logits/chosen": -18.587358474731445, "logits/rejected": -18.470600128173828, "logps/chosen": -399.39288330078125, "logps/rejected": -325.2000732421875, "loss": 0.9932, "rewards/accuracies": 0.5, "rewards/chosen": 2.783301830291748, "rewards/margins": -0.08369214832782745, "rewards/rejected": 2.8669941425323486, "step": 100460 }, { "epoch": 4.664561957379637, "grad_norm": 151.8694305419922, "learning_rate": 2.0251636566228702e-08, 
"logits/chosen": -19.349668502807617, "logits/rejected": -19.24906349182129, "logps/chosen": -463.28912353515625, "logps/rejected": -384.50146484375, "loss": 0.9443, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.45670747756958, "rewards/margins": 0.5035861134529114, "rewards/rejected": 2.9531216621398926, "step": 100470 }, { "epoch": 4.66502623148707, "grad_norm": 5.313129901885986, "learning_rate": 2.0223780119782717e-08, "logits/chosen": -18.499406814575195, "logits/rejected": -16.94119644165039, "logps/chosen": -373.6591491699219, "logps/rejected": -247.7150421142578, "loss": 0.7529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.708777666091919, "rewards/margins": 1.6781549453735352, "rewards/rejected": 1.0306227207183838, "step": 100480 }, { "epoch": 4.665490505594503, "grad_norm": 33.86296463012695, "learning_rate": 2.0195923673336734e-08, "logits/chosen": -19.371383666992188, "logits/rejected": -19.027990341186523, "logps/chosen": -383.5375061035156, "logps/rejected": -356.93994140625, "loss": 0.8213, "rewards/accuracies": 0.5, "rewards/chosen": 3.777036190032959, "rewards/margins": 0.5931234955787659, "rewards/rejected": 3.1839122772216797, "step": 100490 }, { "epoch": 4.665954779701936, "grad_norm": 0.9577621817588806, "learning_rate": 2.0168067226890756e-08, "logits/chosen": -19.99166488647461, "logits/rejected": -19.118288040161133, "logps/chosen": -298.33990478515625, "logps/rejected": -307.5014343261719, "loss": 0.8655, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.818124294281006, "rewards/margins": 0.4166826605796814, "rewards/rejected": 2.401441812515259, "step": 100500 }, { "epoch": 4.666419053809369, "grad_norm": 4.876431941986084, "learning_rate": 2.0140210780444773e-08, "logits/chosen": -19.81753158569336, "logits/rejected": -19.245460510253906, "logps/chosen": -565.7992553710938, "logps/rejected": -440.48077392578125, "loss": 0.6022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
5.218708515167236, "rewards/margins": 1.546725869178772, "rewards/rejected": 3.671982526779175, "step": 100510 }, { "epoch": 4.666883327916802, "grad_norm": 2.2096521854400635, "learning_rate": 2.011235433399879e-08, "logits/chosen": -18.935604095458984, "logits/rejected": -18.086843490600586, "logps/chosen": -453.9910583496094, "logps/rejected": -330.4283142089844, "loss": 0.6153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.377062797546387, "rewards/margins": 1.7710834741592407, "rewards/rejected": 2.6059796810150146, "step": 100520 }, { "epoch": 4.667347602024235, "grad_norm": 329.94036865234375, "learning_rate": 2.008449788755281e-08, "logits/chosen": -19.380233764648438, "logits/rejected": -18.473487854003906, "logps/chosen": -356.8267517089844, "logps/rejected": -292.7717590332031, "loss": 1.3973, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3160452842712402, "rewards/margins": 1.296696424484253, "rewards/rejected": 2.0193488597869873, "step": 100530 }, { "epoch": 4.667811876131668, "grad_norm": 49.39539337158203, "learning_rate": 2.005664144110683e-08, "logits/chosen": -19.21872329711914, "logits/rejected": -19.517534255981445, "logps/chosen": -299.0290832519531, "logps/rejected": -319.5381164550781, "loss": 1.0741, "rewards/accuracies": 0.5, "rewards/chosen": 3.3488097190856934, "rewards/margins": 0.6698187589645386, "rewards/rejected": 2.6789908409118652, "step": 100540 }, { "epoch": 4.668276150239101, "grad_norm": 43.763160705566406, "learning_rate": 2.0028784994660848e-08, "logits/chosen": -20.14773178100586, "logits/rejected": -19.464494705200195, "logps/chosen": -439.0576171875, "logps/rejected": -345.3665771484375, "loss": 0.9714, "rewards/accuracies": 0.5, "rewards/chosen": 3.7698936462402344, "rewards/margins": 0.5552064776420593, "rewards/rejected": 3.214686870574951, "step": 100550 }, { "epoch": 4.6687404243465345, "grad_norm": 58.9600715637207, "learning_rate": 2.0000928548214866e-08, "logits/chosen": 
-18.530200958251953, "logits/rejected": -18.92123794555664, "logps/chosen": -254.98593139648438, "logps/rejected": -276.812744140625, "loss": 0.9521, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8304073810577393, "rewards/margins": 0.42367058992385864, "rewards/rejected": 1.4067367315292358, "step": 100560 }, { "epoch": 4.669204698453967, "grad_norm": 28.88747215270996, "learning_rate": 1.9973072101768887e-08, "logits/chosen": -18.898113250732422, "logits/rejected": -17.269268035888672, "logps/chosen": -340.5777282714844, "logps/rejected": -247.78964233398438, "loss": 0.1179, "rewards/accuracies": 1.0, "rewards/chosen": 4.039692401885986, "rewards/margins": 3.911046266555786, "rewards/rejected": 0.1286463439464569, "step": 100570 }, { "epoch": 4.6696689725614, "grad_norm": 4.257271766662598, "learning_rate": 1.99452156553229e-08, "logits/chosen": -18.95751190185547, "logits/rejected": -17.902198791503906, "logps/chosen": -341.696533203125, "logps/rejected": -255.0379638671875, "loss": 0.4533, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.851485013961792, "rewards/margins": 1.7754697799682617, "rewards/rejected": 1.0760152339935303, "step": 100580 }, { "epoch": 4.670133246668833, "grad_norm": 57.37041473388672, "learning_rate": 1.991735920887692e-08, "logits/chosen": -19.062335968017578, "logits/rejected": -18.866086959838867, "logps/chosen": -340.79852294921875, "logps/rejected": -333.1075134277344, "loss": 1.1161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8803210258483887, "rewards/margins": 0.5095497369766235, "rewards/rejected": 2.3707709312438965, "step": 100590 }, { "epoch": 4.670597520776266, "grad_norm": 261.32733154296875, "learning_rate": 1.9889502762430937e-08, "logits/chosen": -19.026506423950195, "logits/rejected": -18.49051284790039, "logps/chosen": -423.73516845703125, "logps/rejected": -396.76531982421875, "loss": 0.9785, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3225021362304688, 
"rewards/margins": 0.72602379322052, "rewards/rejected": 2.5964784622192383, "step": 100600 }, { "epoch": 4.671061794883699, "grad_norm": 82.40200805664062, "learning_rate": 1.9861646315984955e-08, "logits/chosen": -20.404293060302734, "logits/rejected": -17.627416610717773, "logps/chosen": -474.1966247558594, "logps/rejected": -237.91696166992188, "loss": 0.448, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.632680892944336, "rewards/margins": 2.9834117889404297, "rewards/rejected": 1.6492693424224854, "step": 100610 }, { "epoch": 4.671526068991132, "grad_norm": 68.20576477050781, "learning_rate": 1.9833789869538976e-08, "logits/chosen": -19.348995208740234, "logits/rejected": -18.880626678466797, "logps/chosen": -429.84039306640625, "logps/rejected": -361.20709228515625, "loss": 0.4638, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.748723030090332, "rewards/margins": 1.4119077920913696, "rewards/rejected": 3.3368148803710938, "step": 100620 }, { "epoch": 4.671990343098566, "grad_norm": 143.2918243408203, "learning_rate": 1.9805933423092994e-08, "logits/chosen": -19.174558639526367, "logits/rejected": -18.23098373413086, "logps/chosen": -385.52801513671875, "logps/rejected": -285.38677978515625, "loss": 0.5703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4184584617614746, "rewards/margins": 1.7354707717895508, "rewards/rejected": 0.6829878687858582, "step": 100630 }, { "epoch": 4.672454617205998, "grad_norm": 52.4433708190918, "learning_rate": 1.9778076976647012e-08, "logits/chosen": -18.889833450317383, "logits/rejected": -17.768348693847656, "logps/chosen": -373.62933349609375, "logps/rejected": -342.1659851074219, "loss": 0.8503, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4201889038085938, "rewards/margins": 1.2345945835113525, "rewards/rejected": 2.185594320297241, "step": 100640 }, { "epoch": 4.672918891313431, "grad_norm": 116.9031982421875, "learning_rate": 1.975022053020103e-08, 
"logits/chosen": -19.463863372802734, "logits/rejected": -19.200719833374023, "logps/chosen": -406.47393798828125, "logps/rejected": -342.2762756347656, "loss": 0.9372, "rewards/accuracies": 0.5, "rewards/chosen": 3.6625137329101562, "rewards/margins": 0.29155272245407104, "rewards/rejected": 3.3709609508514404, "step": 100650 }, { "epoch": 4.673383165420865, "grad_norm": 17.60152816772461, "learning_rate": 1.972236408375505e-08, "logits/chosen": -19.131389617919922, "logits/rejected": -18.162336349487305, "logps/chosen": -392.555419921875, "logps/rejected": -277.41845703125, "loss": 0.4228, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5795836448669434, "rewards/margins": 2.234520435333252, "rewards/rejected": 1.3450628519058228, "step": 100660 }, { "epoch": 4.673847439528298, "grad_norm": 1.7333958148956299, "learning_rate": 1.9694507637309066e-08, "logits/chosen": -18.967920303344727, "logits/rejected": -18.644529342651367, "logps/chosen": -245.1055145263672, "logps/rejected": -326.90740966796875, "loss": 1.2079, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.015536069869995, "rewards/margins": 0.6807720065116882, "rewards/rejected": 1.3347642421722412, "step": 100670 }, { "epoch": 4.67431171363573, "grad_norm": 75.7347183227539, "learning_rate": 1.9666651190863083e-08, "logits/chosen": -18.56670570373535, "logits/rejected": -18.180187225341797, "logps/chosen": -387.7923583984375, "logps/rejected": -341.07470703125, "loss": 0.6402, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5833308696746826, "rewards/margins": 1.3697030544281006, "rewards/rejected": 2.213627815246582, "step": 100680 }, { "epoch": 4.674775987743164, "grad_norm": 69.2002182006836, "learning_rate": 1.96387947444171e-08, "logits/chosen": -19.35223960876465, "logits/rejected": -17.884479522705078, "logps/chosen": -450.00225830078125, "logps/rejected": -344.0255126953125, "loss": 0.7093, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.8578686714172363, "rewards/margins": 1.4856208562850952, "rewards/rejected": 2.3722476959228516, "step": 100690 }, { "epoch": 4.675240261850597, "grad_norm": 22.338294982910156, "learning_rate": 1.961093829797112e-08, "logits/chosen": -19.505924224853516, "logits/rejected": -17.896739959716797, "logps/chosen": -367.25836181640625, "logps/rejected": -284.235595703125, "loss": 0.4117, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5726406574249268, "rewards/margins": 3.1168782711029053, "rewards/rejected": 0.45576247572898865, "step": 100700 }, { "epoch": 4.67570453595803, "grad_norm": 51.46350860595703, "learning_rate": 1.958308185152514e-08, "logits/chosen": -18.299108505249023, "logits/rejected": -18.604822158813477, "logps/chosen": -399.8549499511719, "logps/rejected": -364.2225036621094, "loss": 1.5334, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6132652759552, "rewards/margins": 0.17308378219604492, "rewards/rejected": 3.4401817321777344, "step": 100710 }, { "epoch": 4.676168810065462, "grad_norm": 210.198486328125, "learning_rate": 1.9555225405079158e-08, "logits/chosen": -19.114561080932617, "logits/rejected": -18.77455711364746, "logps/chosen": -319.4185485839844, "logps/rejected": -315.3151550292969, "loss": 0.9362, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5918936729431152, "rewards/margins": 0.41292619705200195, "rewards/rejected": 2.1789677143096924, "step": 100720 }, { "epoch": 4.676633084172896, "grad_norm": 0.3419181704521179, "learning_rate": 1.9527368958633176e-08, "logits/chosen": -18.01112174987793, "logits/rejected": -17.440940856933594, "logps/chosen": -327.568359375, "logps/rejected": -209.88760375976562, "loss": 0.6317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.397803783416748, "rewards/margins": 1.2726414203643799, "rewards/rejected": 1.1251623630523682, "step": 100730 }, { "epoch": 4.677097358280329, "grad_norm": 33.578216552734375, "learning_rate": 
1.9499512512187194e-08, "logits/chosen": -19.382123947143555, "logits/rejected": -18.30820083618164, "logps/chosen": -417.96697998046875, "logps/rejected": -284.22186279296875, "loss": 0.6094, "rewards/accuracies": 0.5, "rewards/chosen": 3.199626922607422, "rewards/margins": 1.9171075820922852, "rewards/rejected": 1.2825192213058472, "step": 100740 }, { "epoch": 4.677561632387762, "grad_norm": 57.84186935424805, "learning_rate": 1.9471656065741215e-08, "logits/chosen": -19.829906463623047, "logits/rejected": -18.63023567199707, "logps/chosen": -360.5396728515625, "logps/rejected": -273.5672302246094, "loss": 0.7869, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5133328437805176, "rewards/margins": 1.471985936164856, "rewards/rejected": 2.041346549987793, "step": 100750 }, { "epoch": 4.678025906495195, "grad_norm": 53.20175552368164, "learning_rate": 1.9443799619295233e-08, "logits/chosen": -18.728010177612305, "logits/rejected": -19.316299438476562, "logps/chosen": -424.36474609375, "logps/rejected": -438.9534606933594, "loss": 0.5456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.162309646606445, "rewards/margins": 1.2506370544433594, "rewards/rejected": 2.911673069000244, "step": 100760 }, { "epoch": 4.678490180602628, "grad_norm": 2.5778934955596924, "learning_rate": 1.9415943172849247e-08, "logits/chosen": -19.810291290283203, "logits/rejected": -18.741329193115234, "logps/chosen": -409.72344970703125, "logps/rejected": -399.9187316894531, "loss": 0.8058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6406197547912598, "rewards/margins": 0.7665607333183289, "rewards/rejected": 2.874058961868286, "step": 100770 }, { "epoch": 4.678954454710061, "grad_norm": 33.25328826904297, "learning_rate": 1.9388086726403265e-08, "logits/chosen": -19.762229919433594, "logits/rejected": -19.79631233215332, "logps/chosen": -446.51226806640625, "logps/rejected": -455.0980529785156, "loss": 1.368, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 3.8021597862243652, "rewards/margins": -0.4377676546573639, "rewards/rejected": 4.239927291870117, "step": 100780 }, { "epoch": 4.6794187288174935, "grad_norm": 30.137359619140625, "learning_rate": 1.9360230279957286e-08, "logits/chosen": -19.952674865722656, "logits/rejected": -19.51491355895996, "logps/chosen": -415.1758728027344, "logps/rejected": -288.0862731933594, "loss": 0.8222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1670775413513184, "rewards/margins": 0.8701282739639282, "rewards/rejected": 2.2969493865966797, "step": 100790 }, { "epoch": 4.679883002924927, "grad_norm": 4.798954486846924, "learning_rate": 1.9332373833511304e-08, "logits/chosen": -20.14552879333496, "logits/rejected": -19.424060821533203, "logps/chosen": -386.0505676269531, "logps/rejected": -302.89105224609375, "loss": 0.9289, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9968149662017822, "rewards/margins": 0.8762784004211426, "rewards/rejected": 2.1205360889434814, "step": 100800 }, { "epoch": 4.68034727703236, "grad_norm": 9.3357515335083, "learning_rate": 1.9304517387065322e-08, "logits/chosen": -18.757131576538086, "logits/rejected": -17.534095764160156, "logps/chosen": -381.1692810058594, "logps/rejected": -224.95596313476562, "loss": 0.288, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.346160888671875, "rewards/margins": 2.0435516834259033, "rewards/rejected": 1.3026092052459717, "step": 100810 }, { "epoch": 4.680811551139793, "grad_norm": 78.24603271484375, "learning_rate": 1.927666094061934e-08, "logits/chosen": -19.851802825927734, "logits/rejected": -18.637168884277344, "logps/chosen": -482.9798278808594, "logps/rejected": -358.70709228515625, "loss": 0.4302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3495378494262695, "rewards/margins": 1.864495873451233, "rewards/rejected": 2.485042095184326, "step": 100820 }, { "epoch": 4.681275825247226, "grad_norm": 
16.06121063232422, "learning_rate": 1.924880449417336e-08, "logits/chosen": -18.821659088134766, "logits/rejected": -18.09688949584961, "logps/chosen": -385.31402587890625, "logps/rejected": -307.6629638671875, "loss": 0.484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.57078218460083, "rewards/margins": 0.8318823575973511, "rewards/rejected": 1.7388999462127686, "step": 100830 }, { "epoch": 4.681740099354659, "grad_norm": 35.62554931640625, "learning_rate": 1.922094804772738e-08, "logits/chosen": -19.162090301513672, "logits/rejected": -18.472631454467773, "logps/chosen": -361.8819885253906, "logps/rejected": -309.1920471191406, "loss": 0.7486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.944061756134033, "rewards/margins": 0.9557002186775208, "rewards/rejected": 1.988361120223999, "step": 100840 }, { "epoch": 4.682204373462092, "grad_norm": 149.22125244140625, "learning_rate": 1.9193091601281397e-08, "logits/chosen": -18.992998123168945, "logits/rejected": -18.730243682861328, "logps/chosen": -312.08001708984375, "logps/rejected": -304.00628662109375, "loss": 1.464, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.057939052581787, "rewards/margins": 0.09087192267179489, "rewards/rejected": 2.967067241668701, "step": 100850 }, { "epoch": 4.682668647569525, "grad_norm": 0.17680853605270386, "learning_rate": 1.9165235154835415e-08, "logits/chosen": -19.703561782836914, "logits/rejected": -18.78879165649414, "logps/chosen": -392.9925537109375, "logps/rejected": -281.2486572265625, "loss": 0.9353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.181891441345215, "rewards/margins": 1.854529619216919, "rewards/rejected": 1.3273615837097168, "step": 100860 }, { "epoch": 4.683132921676958, "grad_norm": 149.98655700683594, "learning_rate": 1.913737870838943e-08, "logits/chosen": -18.059452056884766, "logits/rejected": -18.25275993347168, "logps/chosen": -267.4288635253906, "logps/rejected": -326.1013488769531, 
"loss": 1.3837, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.525911569595337, "rewards/margins": -0.9200603365898132, "rewards/rejected": 2.445971727371216, "step": 100870 }, { "epoch": 4.683597195784391, "grad_norm": 118.95976257324219, "learning_rate": 1.910952226194345e-08, "logits/chosen": -20.67544174194336, "logits/rejected": -19.441791534423828, "logps/chosen": -270.1250915527344, "logps/rejected": -228.9133758544922, "loss": 0.4222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.542513370513916, "rewards/margins": 1.123209834098816, "rewards/rejected": 1.4193031787872314, "step": 100880 }, { "epoch": 4.684061469891824, "grad_norm": 147.9203338623047, "learning_rate": 1.9081665815497468e-08, "logits/chosen": -18.912456512451172, "logits/rejected": -18.644275665283203, "logps/chosen": -292.3640441894531, "logps/rejected": -273.8282165527344, "loss": 1.2421, "rewards/accuracies": 0.5, "rewards/chosen": 2.7872982025146484, "rewards/margins": 0.7986558079719543, "rewards/rejected": 1.9886423349380493, "step": 100890 }, { "epoch": 4.684525743999258, "grad_norm": 2.082460641860962, "learning_rate": 1.9053809369051486e-08, "logits/chosen": -17.645063400268555, "logits/rejected": -18.90676498413086, "logps/chosen": -273.8861389160156, "logps/rejected": -387.00469970703125, "loss": 2.0847, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0263853073120117, "rewards/margins": -1.1208384037017822, "rewards/rejected": 3.147223472595215, "step": 100900 }, { "epoch": 4.68499001810669, "grad_norm": 51.18352127075195, "learning_rate": 1.9025952922605504e-08, "logits/chosen": -19.440645217895508, "logits/rejected": -18.8515625, "logps/chosen": -304.23870849609375, "logps/rejected": -253.7683563232422, "loss": 0.5742, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7235474586486816, "rewards/margins": 0.8757867813110352, "rewards/rejected": 1.8477604389190674, "step": 100910 }, { "epoch": 4.685454292214123, "grad_norm": 
1.533907413482666, "learning_rate": 1.8998096476159525e-08, "logits/chosen": -20.70701026916504, "logits/rejected": -19.354137420654297, "logps/chosen": -354.2781066894531, "logps/rejected": -315.3680725097656, "loss": 0.7189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.480297565460205, "rewards/margins": 1.3659321069717407, "rewards/rejected": 3.114365816116333, "step": 100920 }, { "epoch": 4.685918566321556, "grad_norm": 39.26847839355469, "learning_rate": 1.8970240029713543e-08, "logits/chosen": -18.740646362304688, "logits/rejected": -18.028724670410156, "logps/chosen": -378.2252502441406, "logps/rejected": -324.79901123046875, "loss": 0.4791, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6085124015808105, "rewards/margins": 0.8142936825752258, "rewards/rejected": 1.79421865940094, "step": 100930 }, { "epoch": 4.68638284042899, "grad_norm": 143.18797302246094, "learning_rate": 1.894238358326756e-08, "logits/chosen": -18.47422218322754, "logits/rejected": -17.381580352783203, "logps/chosen": -368.17547607421875, "logps/rejected": -303.67388916015625, "loss": 0.3134, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3598999977111816, "rewards/margins": 2.284468412399292, "rewards/rejected": 1.0754315853118896, "step": 100940 }, { "epoch": 4.686847114536422, "grad_norm": 9.397358894348145, "learning_rate": 1.891452713682158e-08, "logits/chosen": -18.333072662353516, "logits/rejected": -16.704851150512695, "logps/chosen": -440.04852294921875, "logps/rejected": -344.8197326660156, "loss": 0.7168, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.273895740509033, "rewards/margins": 2.244814872741699, "rewards/rejected": 2.029080390930176, "step": 100950 }, { "epoch": 4.687311388643855, "grad_norm": 132.7595672607422, "learning_rate": 1.88866706903756e-08, "logits/chosen": -18.873794555664062, "logits/rejected": -19.101533889770508, "logps/chosen": -335.0797119140625, "logps/rejected": -357.7959289550781, "loss": 
1.1166, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.282248020172119, "rewards/margins": -0.10843918472528458, "rewards/rejected": 2.3906872272491455, "step": 100960 }, { "epoch": 4.687775662751289, "grad_norm": 1.8995307683944702, "learning_rate": 1.8858814243929614e-08, "logits/chosen": -19.337114334106445, "logits/rejected": -18.322513580322266, "logps/chosen": -408.74456787109375, "logps/rejected": -276.37274169921875, "loss": 0.3637, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.724966049194336, "rewards/margins": 1.9760754108428955, "rewards/rejected": 1.7488906383514404, "step": 100970 }, { "epoch": 4.6882399368587215, "grad_norm": 2.2519731521606445, "learning_rate": 1.8830957797483632e-08, "logits/chosen": -19.2384033203125, "logits/rejected": -17.7629337310791, "logps/chosen": -444.9637145996094, "logps/rejected": -256.9828796386719, "loss": 0.606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.643306255340576, "rewards/margins": 2.9363179206848145, "rewards/rejected": 1.7069885730743408, "step": 100980 }, { "epoch": 4.688704210966154, "grad_norm": 44.21561813354492, "learning_rate": 1.880310135103765e-08, "logits/chosen": -18.342174530029297, "logits/rejected": -18.357925415039062, "logps/chosen": -363.4541931152344, "logps/rejected": -358.94976806640625, "loss": 1.1254, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2285351753234863, "rewards/margins": -0.08887894451618195, "rewards/rejected": 2.317413806915283, "step": 100990 }, { "epoch": 4.689168485073587, "grad_norm": 47.7812614440918, "learning_rate": 1.877524490459167e-08, "logits/chosen": -19.465435028076172, "logits/rejected": -19.27619743347168, "logps/chosen": -388.25738525390625, "logps/rejected": -390.6622314453125, "loss": 0.7721, "rewards/accuracies": 0.5, "rewards/chosen": 4.593890190124512, "rewards/margins": 0.5744320154190063, "rewards/rejected": 4.019458770751953, "step": 101000 }, { "epoch": 4.689632759181021, 
"grad_norm": 108.38880157470703, "learning_rate": 1.874738845814569e-08, "logits/chosen": -19.18372344970703, "logits/rejected": -18.37769317626953, "logps/chosen": -498.84100341796875, "logps/rejected": -357.37945556640625, "loss": 0.8091, "rewards/accuracies": 0.5, "rewards/chosen": 3.517244815826416, "rewards/margins": 1.0319328308105469, "rewards/rejected": 2.48531174659729, "step": 101010 }, { "epoch": 4.6900970332884535, "grad_norm": 156.5991973876953, "learning_rate": 1.8719532011699707e-08, "logits/chosen": -19.269615173339844, "logits/rejected": -19.108394622802734, "logps/chosen": -391.33319091796875, "logps/rejected": -395.8219299316406, "loss": 0.9233, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.8108034133911133, "rewards/margins": -0.09039248526096344, "rewards/rejected": 2.901196002960205, "step": 101020 }, { "epoch": 4.690561307395886, "grad_norm": 25.467388153076172, "learning_rate": 1.8691675565253725e-08, "logits/chosen": -19.67258071899414, "logits/rejected": -19.246456146240234, "logps/chosen": -553.0619506835938, "logps/rejected": -498.04803466796875, "loss": 0.406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.9853363037109375, "rewards/margins": 1.0628377199172974, "rewards/rejected": 3.9224982261657715, "step": 101030 }, { "epoch": 4.69102558150332, "grad_norm": 99.32030487060547, "learning_rate": 1.8663819118807742e-08, "logits/chosen": -19.127315521240234, "logits/rejected": -18.2513484954834, "logps/chosen": -327.57269287109375, "logps/rejected": -287.3391418457031, "loss": 0.6367, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.200142502784729, "rewards/margins": 0.678641676902771, "rewards/rejected": 0.5215007662773132, "step": 101040 }, { "epoch": 4.691489855610753, "grad_norm": 6.518388271331787, "learning_rate": 1.863596267236176e-08, "logits/chosen": -19.364864349365234, "logits/rejected": -19.175992965698242, "logps/chosen": -307.0086669921875, "logps/rejected": -324.23797607421875, 
"loss": 0.8601, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.764568328857422, "rewards/margins": 0.6506894826889038, "rewards/rejected": 2.1138787269592285, "step": 101050 }, { "epoch": 4.6919541297181855, "grad_norm": 212.00025939941406, "learning_rate": 1.8608106225915778e-08, "logits/chosen": -19.220056533813477, "logits/rejected": -18.589458465576172, "logps/chosen": -400.22381591796875, "logps/rejected": -362.660400390625, "loss": 0.4886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.269120693206787, "rewards/margins": 1.524754285812378, "rewards/rejected": 1.7443664073944092, "step": 101060 }, { "epoch": 4.692418403825618, "grad_norm": 35.82005310058594, "learning_rate": 1.85802497794698e-08, "logits/chosen": -18.815650939941406, "logits/rejected": -18.162555694580078, "logps/chosen": -293.7822265625, "logps/rejected": -263.59527587890625, "loss": 0.8044, "rewards/accuracies": 0.5, "rewards/chosen": 3.220651149749756, "rewards/margins": 1.200885534286499, "rewards/rejected": 2.019765853881836, "step": 101070 }, { "epoch": 4.692882677933052, "grad_norm": 1.0646113157272339, "learning_rate": 1.8552393333023817e-08, "logits/chosen": -18.963001251220703, "logits/rejected": -18.371618270874023, "logps/chosen": -484.831298828125, "logps/rejected": -382.3419494628906, "loss": 0.5162, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.902846574783325, "rewards/margins": 1.1198594570159912, "rewards/rejected": 2.782986879348755, "step": 101080 }, { "epoch": 4.693346952040485, "grad_norm": 4.510791301727295, "learning_rate": 1.8524536886577835e-08, "logits/chosen": -19.0345516204834, "logits/rejected": -18.635875701904297, "logps/chosen": -426.5077209472656, "logps/rejected": -266.51318359375, "loss": 0.4242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7131943702697754, "rewards/margins": 1.5015361309051514, "rewards/rejected": 1.211658239364624, "step": 101090 }, { "epoch": 4.6938112261479175, "grad_norm": 
76.94609069824219, "learning_rate": 1.8496680440131853e-08, "logits/chosen": -19.704238891601562, "logits/rejected": -19.00197982788086, "logps/chosen": -420.687744140625, "logps/rejected": -330.7024841308594, "loss": 0.682, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2938072681427, "rewards/margins": 0.6720072627067566, "rewards/rejected": 2.6217994689941406, "step": 101100 }, { "epoch": 4.694275500255351, "grad_norm": 37.15196228027344, "learning_rate": 1.846882399368587e-08, "logits/chosen": -18.890748977661133, "logits/rejected": -18.355382919311523, "logps/chosen": -354.9796447753906, "logps/rejected": -312.0805358886719, "loss": 0.5265, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1438052654266357, "rewards/margins": 1.4016859531402588, "rewards/rejected": 1.7421194314956665, "step": 101110 }, { "epoch": 4.694739774362784, "grad_norm": 127.17906188964844, "learning_rate": 1.844096754723989e-08, "logits/chosen": -19.542728424072266, "logits/rejected": -19.002214431762695, "logps/chosen": -380.27801513671875, "logps/rejected": -314.5228576660156, "loss": 0.359, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.006028175354004, "rewards/margins": 1.2403641939163208, "rewards/rejected": 1.765663743019104, "step": 101120 }, { "epoch": 4.695204048470217, "grad_norm": 158.1686248779297, "learning_rate": 1.841311110079391e-08, "logits/chosen": -19.683895111083984, "logits/rejected": -18.902963638305664, "logps/chosen": -438.4717712402344, "logps/rejected": -401.68310546875, "loss": 0.8102, "rewards/accuracies": 0.5, "rewards/chosen": 2.893360137939453, "rewards/margins": 0.19535109400749207, "rewards/rejected": 2.6980090141296387, "step": 101130 }, { "epoch": 4.695668322577649, "grad_norm": 5.498860836029053, "learning_rate": 1.8385254654347924e-08, "logits/chosen": -19.534976959228516, "logits/rejected": -18.335256576538086, "logps/chosen": -354.224853515625, "logps/rejected": -213.3289031982422, "loss": 0.7632, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8659884929656982, "rewards/margins": 2.2392630577087402, "rewards/rejected": 1.626725435256958, "step": 101140 }, { "epoch": 4.696132596685083, "grad_norm": 16.27560806274414, "learning_rate": 1.8357398207901945e-08, "logits/chosen": -20.006610870361328, "logits/rejected": -18.0985164642334, "logps/chosen": -485.6626892089844, "logps/rejected": -317.66668701171875, "loss": 0.3015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.126248359680176, "rewards/margins": 2.1779658794403076, "rewards/rejected": 1.9482828378677368, "step": 101150 }, { "epoch": 4.696596870792516, "grad_norm": 29.785425186157227, "learning_rate": 1.8329541761455963e-08, "logits/chosen": -20.394535064697266, "logits/rejected": -19.08770179748535, "logps/chosen": -474.6837463378906, "logps/rejected": -359.80889892578125, "loss": 0.3232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.5079474449157715, "rewards/margins": 2.2731878757476807, "rewards/rejected": 2.23475980758667, "step": 101160 }, { "epoch": 4.697061144899949, "grad_norm": 55.18231964111328, "learning_rate": 1.830168531500998e-08, "logits/chosen": -19.53216552734375, "logits/rejected": -18.730398178100586, "logps/chosen": -476.0384216308594, "logps/rejected": -385.2508544921875, "loss": 0.7651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.219437122344971, "rewards/margins": 1.3528203964233398, "rewards/rejected": 2.866616725921631, "step": 101170 }, { "epoch": 4.697525419007382, "grad_norm": 206.52210998535156, "learning_rate": 1.8273828868564e-08, "logits/chosen": -19.515066146850586, "logits/rejected": -18.815689086914062, "logps/chosen": -491.5770568847656, "logps/rejected": -348.26873779296875, "loss": 0.6208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8538811206817627, "rewards/margins": 1.4542815685272217, "rewards/rejected": 2.399599552154541, "step": 101180 }, { "epoch": 4.697989693114815, "grad_norm": 
46.86025619506836, "learning_rate": 1.8245972422118017e-08, "logits/chosen": -19.11461067199707, "logits/rejected": -18.026264190673828, "logps/chosen": -400.5199890136719, "logps/rejected": -333.2505187988281, "loss": 0.5496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.056927442550659, "rewards/margins": 1.150635004043579, "rewards/rejected": 1.9062926769256592, "step": 101190 }, { "epoch": 4.698453967222248, "grad_norm": 205.36875915527344, "learning_rate": 1.8218115975672035e-08, "logits/chosen": -18.962617874145508, "logits/rejected": -17.606929779052734, "logps/chosen": -359.10198974609375, "logps/rejected": -303.04718017578125, "loss": 0.9544, "rewards/accuracies": 0.5, "rewards/chosen": 3.587023973464966, "rewards/margins": 1.3781073093414307, "rewards/rejected": 2.208916664123535, "step": 101200 }, { "epoch": 4.698918241329681, "grad_norm": 65.68385314941406, "learning_rate": 1.8190259529226056e-08, "logits/chosen": -18.43821144104004, "logits/rejected": -18.7371883392334, "logps/chosen": -317.4582824707031, "logps/rejected": -345.2079162597656, "loss": 1.5479, "rewards/accuracies": 0.5, "rewards/chosen": 2.9976649284362793, "rewards/margins": -0.00829226989299059, "rewards/rejected": 3.0059573650360107, "step": 101210 }, { "epoch": 4.699382515437114, "grad_norm": 156.16148376464844, "learning_rate": 1.8162403082780074e-08, "logits/chosen": -18.463428497314453, "logits/rejected": -17.807838439941406, "logps/chosen": -418.875244140625, "logps/rejected": -341.02105712890625, "loss": 0.6216, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5534024238586426, "rewards/margins": 1.1045280694961548, "rewards/rejected": 1.4488742351531982, "step": 101220 }, { "epoch": 4.699846789544547, "grad_norm": 76.6469497680664, "learning_rate": 1.813454663633409e-08, "logits/chosen": -18.54652214050293, "logits/rejected": -18.16439437866211, "logps/chosen": -314.9566955566406, "logps/rejected": -301.6604309082031, "loss": 0.5809, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2137417793273926, "rewards/margins": 1.1387531757354736, "rewards/rejected": 1.0749883651733398, "step": 101230 }, { "epoch": 4.70031106365198, "grad_norm": 60.2820930480957, "learning_rate": 1.810669018988811e-08, "logits/chosen": -20.00857925415039, "logits/rejected": -19.099252700805664, "logps/chosen": -400.76416015625, "logps/rejected": -323.9466552734375, "loss": 0.4486, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9210364818573, "rewards/margins": 1.6741119623184204, "rewards/rejected": 2.24692440032959, "step": 101240 }, { "epoch": 4.7007753377594135, "grad_norm": 132.0396728515625, "learning_rate": 1.8078833743442127e-08, "logits/chosen": -19.541046142578125, "logits/rejected": -18.263301849365234, "logps/chosen": -494.8832092285156, "logps/rejected": -423.17449951171875, "loss": 0.4363, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.928786277770996, "rewards/margins": 2.259199619293213, "rewards/rejected": 2.669586658477783, "step": 101250 }, { "epoch": 4.701239611866846, "grad_norm": 0.6750038862228394, "learning_rate": 1.8050977296996145e-08, "logits/chosen": -18.641056060791016, "logits/rejected": -18.6883487701416, "logps/chosen": -307.4957580566406, "logps/rejected": -295.5377197265625, "loss": 0.4878, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7126357555389404, "rewards/margins": 1.0078061819076538, "rewards/rejected": 1.7048298120498657, "step": 101260 }, { "epoch": 4.701703885974279, "grad_norm": 51.71929168701172, "learning_rate": 1.8023120850550163e-08, "logits/chosen": -18.21603775024414, "logits/rejected": -17.912460327148438, "logps/chosen": -270.80340576171875, "logps/rejected": -248.5734100341797, "loss": 1.1793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5783419609069824, "rewards/margins": 0.7061252593994141, "rewards/rejected": 1.8722164630889893, "step": 101270 }, { "epoch": 4.702168160081712, "grad_norm": 
5.559615612030029, "learning_rate": 1.7995264404104184e-08, "logits/chosen": -19.915267944335938, "logits/rejected": -18.733627319335938, "logps/chosen": -421.7237243652344, "logps/rejected": -282.9813232421875, "loss": 0.7228, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6741576194763184, "rewards/margins": 1.188247799873352, "rewards/rejected": 2.485909938812256, "step": 101280 }, { "epoch": 4.7026324341891454, "grad_norm": 146.83255004882812, "learning_rate": 1.79674079576582e-08, "logits/chosen": -18.239612579345703, "logits/rejected": -18.668914794921875, "logps/chosen": -379.70806884765625, "logps/rejected": -402.0133972167969, "loss": 0.8013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5053651332855225, "rewards/margins": 0.28176599740982056, "rewards/rejected": 2.2235991954803467, "step": 101290 }, { "epoch": 4.703096708296578, "grad_norm": 2.8793044090270996, "learning_rate": 1.793955151121222e-08, "logits/chosen": -18.77924156188965, "logits/rejected": -18.921390533447266, "logps/chosen": -425.80828857421875, "logps/rejected": -366.0291748046875, "loss": 0.491, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.010256290435791, "rewards/margins": 1.1704308986663818, "rewards/rejected": 2.83982515335083, "step": 101300 }, { "epoch": 4.703560982404011, "grad_norm": 133.51161193847656, "learning_rate": 1.7911695064766237e-08, "logits/chosen": -19.395723342895508, "logits/rejected": -18.10570526123047, "logps/chosen": -465.8128356933594, "logps/rejected": -333.074462890625, "loss": 0.5267, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.962285280227661, "rewards/margins": 1.896969199180603, "rewards/rejected": 2.0653164386749268, "step": 101310 }, { "epoch": 4.704025256511445, "grad_norm": 31.266357421875, "learning_rate": 1.7883838618320255e-08, "logits/chosen": -19.12981414794922, "logits/rejected": -19.03296661376953, "logps/chosen": -371.8414611816406, "logps/rejected": -312.6036376953125, "loss": 
0.3185, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.203329086303711, "rewards/margins": 1.386306643486023, "rewards/rejected": 1.8170225620269775, "step": 101320 }, { "epoch": 4.704489530618877, "grad_norm": 30.52691650390625, "learning_rate": 1.7855982171874273e-08, "logits/chosen": -18.389142990112305, "logits/rejected": -18.540346145629883, "logps/chosen": -221.4257354736328, "logps/rejected": -309.8033752441406, "loss": 1.7602, "rewards/accuracies": 0.5, "rewards/chosen": 1.8628721237182617, "rewards/margins": -0.3710278272628784, "rewards/rejected": 2.2338998317718506, "step": 101330 }, { "epoch": 4.70495380472631, "grad_norm": 78.10204315185547, "learning_rate": 1.782812572542829e-08, "logits/chosen": -20.193370819091797, "logits/rejected": -20.10063934326172, "logps/chosen": -439.005126953125, "logps/rejected": -362.5207214355469, "loss": 0.5821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6044650077819824, "rewards/margins": 0.8869702219963074, "rewards/rejected": 2.7174949645996094, "step": 101340 }, { "epoch": 4.705418078833743, "grad_norm": 25.638795852661133, "learning_rate": 1.780026927898231e-08, "logits/chosen": -18.302236557006836, "logits/rejected": -18.412187576293945, "logps/chosen": -339.740966796875, "logps/rejected": -359.44805908203125, "loss": 1.1422, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8889830112457275, "rewards/margins": 0.3237646520137787, "rewards/rejected": 2.565218448638916, "step": 101350 }, { "epoch": 4.705882352941177, "grad_norm": 0.42902109026908875, "learning_rate": 1.777241283253633e-08, "logits/chosen": -19.264005661010742, "logits/rejected": -17.61772918701172, "logps/chosen": -348.57855224609375, "logps/rejected": -209.6759033203125, "loss": 0.3468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.986922264099121, "rewards/margins": 2.631408214569092, "rewards/rejected": 0.3555140197277069, "step": 101360 }, { "epoch": 4.706346627048609, "grad_norm": 
39.262603759765625, "learning_rate": 1.7744556386090348e-08, "logits/chosen": -18.69659996032715, "logits/rejected": -18.202625274658203, "logps/chosen": -381.8515319824219, "logps/rejected": -299.6292419433594, "loss": 0.582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9976654052734375, "rewards/margins": 0.9064770936965942, "rewards/rejected": 2.0911881923675537, "step": 101370 }, { "epoch": 4.706810901156042, "grad_norm": 157.40097045898438, "learning_rate": 1.7716699939644366e-08, "logits/chosen": -18.08860206604004, "logits/rejected": -17.157611846923828, "logps/chosen": -331.9723815917969, "logps/rejected": -292.8341979980469, "loss": 0.7249, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9275705814361572, "rewards/margins": 1.6079988479614258, "rewards/rejected": 1.319571852684021, "step": 101380 }, { "epoch": 4.707275175263476, "grad_norm": 0.3366442024707794, "learning_rate": 1.7688843493198384e-08, "logits/chosen": -19.455015182495117, "logits/rejected": -18.34872817993164, "logps/chosen": -400.5511779785156, "logps/rejected": -308.98583984375, "loss": 0.3617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.370606422424316, "rewards/margins": 2.916785955429077, "rewards/rejected": 1.4538211822509766, "step": 101390 }, { "epoch": 4.707739449370909, "grad_norm": 208.75994873046875, "learning_rate": 1.76609870467524e-08, "logits/chosen": -18.62282943725586, "logits/rejected": -17.970775604248047, "logps/chosen": -384.72479248046875, "logps/rejected": -257.55035400390625, "loss": 0.958, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0100226402282715, "rewards/margins": 1.3735426664352417, "rewards/rejected": 1.6364799737930298, "step": 101400 }, { "epoch": 4.708203723478341, "grad_norm": 0.161508709192276, "learning_rate": 1.763313060030642e-08, "logits/chosen": -20.12367057800293, "logits/rejected": -18.977035522460938, "logps/chosen": -427.4905700683594, "logps/rejected": -346.2237243652344, 
"loss": 0.4504, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.000060558319092, "rewards/margins": 3.118112087249756, "rewards/rejected": 1.881948709487915, "step": 101410 }, { "epoch": 4.708667997585774, "grad_norm": 48.047122955322266, "learning_rate": 1.760527415386044e-08, "logits/chosen": -18.862714767456055, "logits/rejected": -18.21584701538086, "logps/chosen": -451.11663818359375, "logps/rejected": -419.5824279785156, "loss": 0.4223, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9551620483398438, "rewards/margins": 1.5354585647583008, "rewards/rejected": 2.419703483581543, "step": 101420 }, { "epoch": 4.709132271693208, "grad_norm": 6.445578098297119, "learning_rate": 1.7577417707414458e-08, "logits/chosen": -20.410099029541016, "logits/rejected": -19.878482818603516, "logps/chosen": -392.89813232421875, "logps/rejected": -304.8877258300781, "loss": 0.995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.440309524536133, "rewards/margins": 0.49757710099220276, "rewards/rejected": 2.942732572555542, "step": 101430 }, { "epoch": 4.709596545800641, "grad_norm": 9.417543411254883, "learning_rate": 1.7549561260968473e-08, "logits/chosen": -19.208711624145508, "logits/rejected": -18.824657440185547, "logps/chosen": -406.96826171875, "logps/rejected": -377.8126525878906, "loss": 1.5685, "rewards/accuracies": 0.5, "rewards/chosen": 2.9405267238616943, "rewards/margins": -0.43611258268356323, "rewards/rejected": 3.3766396045684814, "step": 101440 }, { "epoch": 4.710060819908073, "grad_norm": 349.0458679199219, "learning_rate": 1.7521704814522494e-08, "logits/chosen": -18.254974365234375, "logits/rejected": -18.09769630432129, "logps/chosen": -393.86322021484375, "logps/rejected": -323.624267578125, "loss": 1.7775, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.985901355743408, "rewards/margins": -0.2445388287305832, "rewards/rejected": 3.230440139770508, "step": 101450 }, { "epoch": 4.710525094015507, 
"grad_norm": 201.3015899658203, "learning_rate": 1.7493848368076512e-08, "logits/chosen": -20.273664474487305, "logits/rejected": -19.540559768676758, "logps/chosen": -551.7603149414062, "logps/rejected": -457.8897399902344, "loss": 0.5801, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.224433898925781, "rewards/margins": 1.9171421527862549, "rewards/rejected": 3.3072915077209473, "step": 101460 }, { "epoch": 4.71098936812294, "grad_norm": 6.906149387359619, "learning_rate": 1.746599192163053e-08, "logits/chosen": -20.634092330932617, "logits/rejected": -20.4274845123291, "logps/chosen": -425.7440490722656, "logps/rejected": -337.9808044433594, "loss": 0.5727, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9906930923461914, "rewards/margins": 1.1179594993591309, "rewards/rejected": 1.87273371219635, "step": 101470 }, { "epoch": 4.7114536422303726, "grad_norm": 88.2753677368164, "learning_rate": 1.7438135475184547e-08, "logits/chosen": -18.440319061279297, "logits/rejected": -18.236051559448242, "logps/chosen": -361.63836669921875, "logps/rejected": -371.5021667480469, "loss": 0.8452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0937249660491943, "rewards/margins": 0.6179925799369812, "rewards/rejected": 2.4757323265075684, "step": 101480 }, { "epoch": 4.711917916337806, "grad_norm": 18.30582618713379, "learning_rate": 1.7410279028738565e-08, "logits/chosen": -18.242067337036133, "logits/rejected": -17.737260818481445, "logps/chosen": -306.8072204589844, "logps/rejected": -265.56390380859375, "loss": 0.7906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8852424621582031, "rewards/margins": 0.7690643072128296, "rewards/rejected": 1.1161781549453735, "step": 101490 }, { "epoch": 4.712382190445239, "grad_norm": 44.875274658203125, "learning_rate": 1.7382422582292583e-08, "logits/chosen": -19.075532913208008, "logits/rejected": -18.019418716430664, "logps/chosen": -326.76165771484375, "logps/rejected": 
-277.4644775390625, "loss": 0.315, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2640881538391113, "rewards/margins": 1.8459358215332031, "rewards/rejected": 1.418152093887329, "step": 101500 }, { "epoch": 4.712846464552672, "grad_norm": 11.609272956848145, "learning_rate": 1.7354566135846604e-08, "logits/chosen": -19.733287811279297, "logits/rejected": -19.032276153564453, "logps/chosen": -363.1104431152344, "logps/rejected": -352.54669189453125, "loss": 1.3436, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4827582836151123, "rewards/margins": -0.5129947066307068, "rewards/rejected": 2.9957528114318848, "step": 101510 }, { "epoch": 4.713310738660105, "grad_norm": 5.520967960357666, "learning_rate": 1.7326709689400622e-08, "logits/chosen": -19.296892166137695, "logits/rejected": -18.547801971435547, "logps/chosen": -432.8282775878906, "logps/rejected": -375.8868103027344, "loss": 0.3019, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.8904595375061035, "rewards/margins": 2.5062286853790283, "rewards/rejected": 2.3842310905456543, "step": 101520 }, { "epoch": 4.713775012767538, "grad_norm": 1.8046247959136963, "learning_rate": 1.729885324295464e-08, "logits/chosen": -20.221187591552734, "logits/rejected": -20.269126892089844, "logps/chosen": -371.8079528808594, "logps/rejected": -357.2674255371094, "loss": 0.5627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8795204162597656, "rewards/margins": 1.346821665763855, "rewards/rejected": 2.532698392868042, "step": 101530 }, { "epoch": 4.714239286874971, "grad_norm": 42.796958923339844, "learning_rate": 1.7270996796508658e-08, "logits/chosen": -18.79237937927246, "logits/rejected": -19.0020809173584, "logps/chosen": -319.08685302734375, "logps/rejected": -324.9586181640625, "loss": 1.5088, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.519200563430786, "rewards/margins": -0.9972529411315918, "rewards/rejected": 3.516453504562378, "step": 101540 
}, { "epoch": 4.714703560982404, "grad_norm": 114.1878662109375, "learning_rate": 1.7243140350062676e-08, "logits/chosen": -19.94107437133789, "logits/rejected": -19.62653160095215, "logps/chosen": -508.95367431640625, "logps/rejected": -444.01861572265625, "loss": 0.8526, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.471890687942505, "rewards/margins": 0.3205081522464752, "rewards/rejected": 3.1513824462890625, "step": 101550 }, { "epoch": 4.715167835089837, "grad_norm": 0.06283289939165115, "learning_rate": 1.7215283903616694e-08, "logits/chosen": -19.149272918701172, "logits/rejected": -18.98637580871582, "logps/chosen": -491.9972229003906, "logps/rejected": -414.49798583984375, "loss": 1.1023, "rewards/accuracies": 0.5, "rewards/chosen": 4.88282585144043, "rewards/margins": 1.0609514713287354, "rewards/rejected": 3.8218750953674316, "step": 101560 }, { "epoch": 4.71563210919727, "grad_norm": 6.784336566925049, "learning_rate": 1.7187427457170715e-08, "logits/chosen": -19.138864517211914, "logits/rejected": -18.177860260009766, "logps/chosen": -339.81390380859375, "logps/rejected": -276.0382080078125, "loss": 0.2947, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6152913570404053, "rewards/margins": 2.038369655609131, "rewards/rejected": 0.5769218802452087, "step": 101570 }, { "epoch": 4.716096383304703, "grad_norm": 1.326483130455017, "learning_rate": 1.7159571010724733e-08, "logits/chosen": -19.66101837158203, "logits/rejected": -18.167484283447266, "logps/chosen": -457.7826232910156, "logps/rejected": -355.6310119628906, "loss": 0.2033, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.817434787750244, "rewards/margins": 3.1551806926727295, "rewards/rejected": 1.6622540950775146, "step": 101580 }, { "epoch": 4.716560657412137, "grad_norm": 49.60355758666992, "learning_rate": 1.7131714564278747e-08, "logits/chosen": -19.11520767211914, "logits/rejected": -18.31018829345703, "logps/chosen": -346.8709716796875, 
"logps/rejected": -284.33624267578125, "loss": 0.3931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.483550548553467, "rewards/margins": 1.6458317041397095, "rewards/rejected": 0.8377188444137573, "step": 101590 }, { "epoch": 4.717024931519569, "grad_norm": 3.490231990814209, "learning_rate": 1.7103858117832768e-08, "logits/chosen": -19.09347152709961, "logits/rejected": -17.813512802124023, "logps/chosen": -334.80328369140625, "logps/rejected": -188.4346923828125, "loss": 0.3538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5293045043945312, "rewards/margins": 2.53601336479187, "rewards/rejected": 0.9932912588119507, "step": 101600 }, { "epoch": 4.717489205627002, "grad_norm": 213.1665802001953, "learning_rate": 1.7076001671386786e-08, "logits/chosen": -20.055343627929688, "logits/rejected": -18.610673904418945, "logps/chosen": -487.3953552246094, "logps/rejected": -393.2178955078125, "loss": 0.5049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7224979400634766, "rewards/margins": 1.2336481809616089, "rewards/rejected": 2.48884916305542, "step": 101610 }, { "epoch": 4.717953479734435, "grad_norm": 73.46961212158203, "learning_rate": 1.7048145224940804e-08, "logits/chosen": -21.010845184326172, "logits/rejected": -19.879253387451172, "logps/chosen": -403.66070556640625, "logps/rejected": -335.0516357421875, "loss": 0.4364, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.878302812576294, "rewards/margins": 1.544950246810913, "rewards/rejected": 2.333352565765381, "step": 101620 }, { "epoch": 4.718417753841869, "grad_norm": 0.38204970955848694, "learning_rate": 1.7020288778494825e-08, "logits/chosen": -18.649078369140625, "logits/rejected": -18.052270889282227, "logps/chosen": -327.95233154296875, "logps/rejected": -297.19598388671875, "loss": 0.4775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8208069801330566, "rewards/margins": 1.2658164501190186, "rewards/rejected": 1.554990530014038, 
"step": 101630 }, { "epoch": 4.718882027949301, "grad_norm": 222.6368865966797, "learning_rate": 1.699243233204884e-08, "logits/chosen": -20.64977264404297, "logits/rejected": -18.641605377197266, "logps/chosen": -520.0966796875, "logps/rejected": -365.3841857910156, "loss": 0.8301, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.15664005279541, "rewards/margins": 1.8370988368988037, "rewards/rejected": 2.3195412158966064, "step": 101640 }, { "epoch": 4.719346302056734, "grad_norm": 0.32214581966400146, "learning_rate": 1.6964575885602857e-08, "logits/chosen": -18.883018493652344, "logits/rejected": -17.651355743408203, "logps/chosen": -477.1795959472656, "logps/rejected": -330.9906311035156, "loss": 0.3014, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.316122531890869, "rewards/margins": 3.4446310997009277, "rewards/rejected": 1.8714911937713623, "step": 101650 }, { "epoch": 4.719810576164168, "grad_norm": 0.969282329082489, "learning_rate": 1.693671943915688e-08, "logits/chosen": -19.81812858581543, "logits/rejected": -17.558448791503906, "logps/chosen": -424.4988708496094, "logps/rejected": -288.715576171875, "loss": 0.4128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.713883399963379, "rewards/margins": 2.7097480297088623, "rewards/rejected": 2.0041348934173584, "step": 101660 }, { "epoch": 4.7202748502716005, "grad_norm": 2.835029363632202, "learning_rate": 1.6908862992710896e-08, "logits/chosen": -18.903194427490234, "logits/rejected": -17.39406394958496, "logps/chosen": -400.2528991699219, "logps/rejected": -269.3262939453125, "loss": 0.3316, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.226727247238159, "rewards/margins": 2.0641047954559326, "rewards/rejected": 1.1626226902008057, "step": 101670 }, { "epoch": 4.720739124379033, "grad_norm": 3.5029616355895996, "learning_rate": 1.6881006546264914e-08, "logits/chosen": -19.617677688598633, "logits/rejected": -18.216995239257812, "logps/chosen": 
-414.3626403808594, "logps/rejected": -263.64862060546875, "loss": 0.5436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.398606300354004, "rewards/margins": 2.1057868003845215, "rewards/rejected": 2.2928194999694824, "step": 101680 }, { "epoch": 4.721203398486466, "grad_norm": 24.37645149230957, "learning_rate": 1.6853150099818932e-08, "logits/chosen": -19.048019409179688, "logits/rejected": -16.933582305908203, "logps/chosen": -361.18414306640625, "logps/rejected": -223.9217071533203, "loss": 0.3101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.11405611038208, "rewards/margins": 2.8230865001678467, "rewards/rejected": 1.2909696102142334, "step": 101690 }, { "epoch": 4.7216676725939, "grad_norm": 186.3632354736328, "learning_rate": 1.682529365337295e-08, "logits/chosen": -18.602596282958984, "logits/rejected": -17.6060791015625, "logps/chosen": -400.40240478515625, "logps/rejected": -316.1517639160156, "loss": 0.524, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6519813537597656, "rewards/margins": 1.5474305152893066, "rewards/rejected": 2.104550838470459, "step": 101700 }, { "epoch": 4.7221319467013325, "grad_norm": 9.58753776550293, "learning_rate": 1.6797437206926968e-08, "logits/chosen": -18.434642791748047, "logits/rejected": -18.681537628173828, "logps/chosen": -280.4067687988281, "logps/rejected": -304.72589111328125, "loss": 1.0106, "rewards/accuracies": 0.5, "rewards/chosen": 2.2688369750976562, "rewards/margins": 0.09614710509777069, "rewards/rejected": 2.172689914703369, "step": 101710 }, { "epoch": 4.722596220808765, "grad_norm": 12.218168258666992, "learning_rate": 1.676958076048099e-08, "logits/chosen": -18.824338912963867, "logits/rejected": -18.7987003326416, "logps/chosen": -388.24609375, "logps/rejected": -388.54840087890625, "loss": 0.7194, "rewards/accuracies": 0.5, "rewards/chosen": 2.063347816467285, "rewards/margins": 0.35477590560913086, "rewards/rejected": 1.7085721492767334, "step": 101720 }, 
{ "epoch": 4.723060494916199, "grad_norm": 4.073273181915283, "learning_rate": 1.6741724314035007e-08, "logits/chosen": -18.72408676147461, "logits/rejected": -18.548782348632812, "logps/chosen": -400.80682373046875, "logps/rejected": -481.79345703125, "loss": 2.0731, "rewards/accuracies": 0.5, "rewards/chosen": 2.911881923675537, "rewards/margins": -0.40778273344039917, "rewards/rejected": 3.319664478302002, "step": 101730 }, { "epoch": 4.723524769023632, "grad_norm": 60.588951110839844, "learning_rate": 1.6713867867589025e-08, "logits/chosen": -18.67853355407715, "logits/rejected": -18.439802169799805, "logps/chosen": -319.90618896484375, "logps/rejected": -285.74627685546875, "loss": 0.807, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2651166915893555, "rewards/margins": 0.611108124256134, "rewards/rejected": 1.6540085077285767, "step": 101740 }, { "epoch": 4.7239890431310645, "grad_norm": 48.92962646484375, "learning_rate": 1.6686011421143043e-08, "logits/chosen": -19.242509841918945, "logits/rejected": -18.446035385131836, "logps/chosen": -335.64642333984375, "logps/rejected": -284.3177795410156, "loss": 0.306, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0706276893615723, "rewards/margins": 2.0582547187805176, "rewards/rejected": 1.0123733282089233, "step": 101750 }, { "epoch": 4.724453317238497, "grad_norm": 27.452438354492188, "learning_rate": 1.665815497469706e-08, "logits/chosen": -18.365741729736328, "logits/rejected": -18.462574005126953, "logps/chosen": -361.1827087402344, "logps/rejected": -371.29229736328125, "loss": 1.6174, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6539769172668457, "rewards/margins": -0.5970221161842346, "rewards/rejected": 3.2509994506835938, "step": 101760 }, { "epoch": 4.724917591345931, "grad_norm": 111.94651794433594, "learning_rate": 1.6630298528251078e-08, "logits/chosen": -18.244388580322266, "logits/rejected": -18.434200286865234, "logps/chosen": -267.3891906738281, 
"logps/rejected": -239.9048614501953, "loss": 1.2125, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.060723066329956, "rewards/margins": 0.03507862240076065, "rewards/rejected": 2.025644302368164, "step": 101770 }, { "epoch": 4.725381865453364, "grad_norm": 265.04302978515625, "learning_rate": 1.66024420818051e-08, "logits/chosen": -18.28921890258789, "logits/rejected": -18.099773406982422, "logps/chosen": -355.7720642089844, "logps/rejected": -433.7604064941406, "loss": 1.6053, "rewards/accuracies": 0.5, "rewards/chosen": 1.496228814125061, "rewards/margins": -0.6752028465270996, "rewards/rejected": 2.171431541442871, "step": 101780 }, { "epoch": 4.7258461395607965, "grad_norm": 46.861488342285156, "learning_rate": 1.6574585635359114e-08, "logits/chosen": -19.456741333007812, "logits/rejected": -18.215675354003906, "logps/chosen": -329.2850341796875, "logps/rejected": -219.97659301757812, "loss": 0.4046, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7911479473114014, "rewards/margins": 1.751220941543579, "rewards/rejected": 1.0399272441864014, "step": 101790 }, { "epoch": 4.72631041366823, "grad_norm": 16.22414779663086, "learning_rate": 1.6546729188913132e-08, "logits/chosen": -19.816699981689453, "logits/rejected": -18.23388671875, "logps/chosen": -446.4175720214844, "logps/rejected": -314.46478271484375, "loss": 0.2216, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.567750930786133, "rewards/margins": 2.121650457382202, "rewards/rejected": 2.4461004734039307, "step": 101800 }, { "epoch": 4.726774687775663, "grad_norm": 0.7771943211555481, "learning_rate": 1.6518872742467153e-08, "logits/chosen": -18.505441665649414, "logits/rejected": -18.11570167541504, "logps/chosen": -421.616943359375, "logps/rejected": -349.1580505371094, "loss": 0.7964, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.539158821105957, "rewards/margins": 1.4550632238388062, "rewards/rejected": 3.0840954780578613, "step": 101810 }, { 
"epoch": 4.727238961883096, "grad_norm": 18.87298011779785, "learning_rate": 1.649101629602117e-08, "logits/chosen": -20.667652130126953, "logits/rejected": -19.892101287841797, "logps/chosen": -530.9912109375, "logps/rejected": -403.6993713378906, "loss": 0.5262, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8597233295440674, "rewards/margins": 0.9193897247314453, "rewards/rejected": 2.940333127975464, "step": 101820 }, { "epoch": 4.727703235990528, "grad_norm": 31.859783172607422, "learning_rate": 1.646315984957519e-08, "logits/chosen": -18.905166625976562, "logits/rejected": -18.235326766967773, "logps/chosen": -365.40582275390625, "logps/rejected": -271.80267333984375, "loss": 0.2564, "rewards/accuracies": 1.0, "rewards/chosen": 3.0091724395751953, "rewards/margins": 1.4438247680664062, "rewards/rejected": 1.5653479099273682, "step": 101830 }, { "epoch": 4.728167510097962, "grad_norm": 187.4932098388672, "learning_rate": 1.6435303403129206e-08, "logits/chosen": -18.665969848632812, "logits/rejected": -19.204797744750977, "logps/chosen": -311.35369873046875, "logps/rejected": -405.22821044921875, "loss": 1.2457, "rewards/accuracies": 0.5, "rewards/chosen": 2.4457669258117676, "rewards/margins": -0.2997116446495056, "rewards/rejected": 2.745478630065918, "step": 101840 }, { "epoch": 4.728631784205395, "grad_norm": 34.28044891357422, "learning_rate": 1.6407446956683224e-08, "logits/chosen": -18.068540573120117, "logits/rejected": -17.255130767822266, "logps/chosen": -277.19439697265625, "logps/rejected": -214.8340301513672, "loss": 0.4773, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.0045721530914307, "rewards/margins": 1.789771318435669, "rewards/rejected": 0.21480071544647217, "step": 101850 }, { "epoch": 4.729096058312828, "grad_norm": 22.881689071655273, "learning_rate": 1.6379590510237242e-08, "logits/chosen": -18.62173843383789, "logits/rejected": -18.126243591308594, "logps/chosen": -362.78570556640625, "logps/rejected": 
-328.5087890625, "loss": 0.9643, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8806545734405518, "rewards/margins": 0.0769876018166542, "rewards/rejected": 2.8036670684814453, "step": 101860 }, { "epoch": 4.729560332420261, "grad_norm": 0.26738104224205017, "learning_rate": 1.6351734063791263e-08, "logits/chosen": -18.16469383239746, "logits/rejected": -17.254154205322266, "logps/chosen": -398.768310546875, "logps/rejected": -270.2041320800781, "loss": 0.4936, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6149837970733643, "rewards/margins": 1.8321317434310913, "rewards/rejected": 0.782852053642273, "step": 101870 }, { "epoch": 4.730024606527694, "grad_norm": 9.384196281433105, "learning_rate": 1.632387761734528e-08, "logits/chosen": -19.249536514282227, "logits/rejected": -19.055706024169922, "logps/chosen": -316.61004638671875, "logps/rejected": -333.2446594238281, "loss": 1.0488, "rewards/accuracies": 0.5, "rewards/chosen": 3.288128614425659, "rewards/margins": 0.9279816746711731, "rewards/rejected": 2.360146999359131, "step": 101880 }, { "epoch": 4.730488880635127, "grad_norm": 172.64598083496094, "learning_rate": 1.62960211708993e-08, "logits/chosen": -19.345836639404297, "logits/rejected": -19.380706787109375, "logps/chosen": -235.25851440429688, "logps/rejected": -201.00949096679688, "loss": 0.7876, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6618635654449463, "rewards/margins": 0.6673906445503235, "rewards/rejected": 0.994472861289978, "step": 101890 }, { "epoch": 4.73095315474256, "grad_norm": 97.19766235351562, "learning_rate": 1.6268164724453317e-08, "logits/chosen": -18.95948028564453, "logits/rejected": -17.62132453918457, "logps/chosen": -349.53057861328125, "logps/rejected": -262.2445983886719, "loss": 0.4923, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1571476459503174, "rewards/margins": 1.8558803796768188, "rewards/rejected": 1.3012670278549194, "step": 101900 }, { "epoch": 
4.731417428849993, "grad_norm": 94.85456085205078, "learning_rate": 1.6240308278007335e-08, "logits/chosen": -18.343244552612305, "logits/rejected": -17.730974197387695, "logps/chosen": -436.49188232421875, "logps/rejected": -322.992431640625, "loss": 0.43, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5372161865234375, "rewards/margins": 1.184070348739624, "rewards/rejected": 2.3531460762023926, "step": 101910 }, { "epoch": 4.731881702957426, "grad_norm": 42.1711540222168, "learning_rate": 1.6212451831561352e-08, "logits/chosen": -18.886770248413086, "logits/rejected": -18.101560592651367, "logps/chosen": -320.22930908203125, "logps/rejected": -248.3787384033203, "loss": 0.5462, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9112095832824707, "rewards/margins": 1.4251627922058105, "rewards/rejected": 1.486046552658081, "step": 101920 }, { "epoch": 4.732345977064859, "grad_norm": 85.27327728271484, "learning_rate": 1.6184595385115374e-08, "logits/chosen": -19.039661407470703, "logits/rejected": -18.239791870117188, "logps/chosen": -465.2900390625, "logps/rejected": -400.83795166015625, "loss": 0.3179, "rewards/accuracies": 1.0, "rewards/chosen": 4.468167304992676, "rewards/margins": 2.1874051094055176, "rewards/rejected": 2.28076171875, "step": 101930 }, { "epoch": 4.7328102511722925, "grad_norm": 13.544228553771973, "learning_rate": 1.6156738938669388e-08, "logits/chosen": -19.589656829833984, "logits/rejected": -17.58843994140625, "logps/chosen": -355.2657470703125, "logps/rejected": -254.35791015625, "loss": 0.1809, "rewards/accuracies": 1.0, "rewards/chosen": 3.102219820022583, "rewards/margins": 2.205064535140991, "rewards/rejected": 0.89715576171875, "step": 101940 }, { "epoch": 4.733274525279725, "grad_norm": 8.345237731933594, "learning_rate": 1.612888249222341e-08, "logits/chosen": -18.825603485107422, "logits/rejected": -18.070711135864258, "logps/chosen": -417.5958557128906, "logps/rejected": -348.93414306640625, "loss": 
0.4824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.691643476486206, "rewards/margins": 1.1806504726409912, "rewards/rejected": 2.510993003845215, "step": 101950 }, { "epoch": 4.733738799387158, "grad_norm": 75.74514770507812, "learning_rate": 1.6101026045777427e-08, "logits/chosen": -19.070024490356445, "logits/rejected": -19.15194320678711, "logps/chosen": -361.4212951660156, "logps/rejected": -384.7464294433594, "loss": 1.3205, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.3777211904525757, "rewards/margins": -0.4049222469329834, "rewards/rejected": 1.7826435565948486, "step": 101960 }, { "epoch": 4.734203073494591, "grad_norm": 15.407180786132812, "learning_rate": 1.6073169599331445e-08, "logits/chosen": -19.235248565673828, "logits/rejected": -18.343849182128906, "logps/chosen": -491.3294982910156, "logps/rejected": -423.35186767578125, "loss": 0.6746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.755863189697266, "rewards/margins": 1.8211971521377563, "rewards/rejected": 2.9346656799316406, "step": 101970 }, { "epoch": 4.7346673476020245, "grad_norm": 16.46158790588379, "learning_rate": 1.6045313152885463e-08, "logits/chosen": -19.19183921813965, "logits/rejected": -19.21550178527832, "logps/chosen": -376.2421875, "logps/rejected": -363.30908203125, "loss": 0.4037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6472277641296387, "rewards/margins": 1.2870383262634277, "rewards/rejected": 2.36018967628479, "step": 101980 }, { "epoch": 4.735131621709457, "grad_norm": 224.60400390625, "learning_rate": 1.601745670643948e-08, "logits/chosen": -18.60501480102539, "logits/rejected": -18.560367584228516, "logps/chosen": -371.4847717285156, "logps/rejected": -442.4488830566406, "loss": 0.7673, "rewards/accuracies": 0.5, "rewards/chosen": 3.172166109085083, "rewards/margins": 0.6499432325363159, "rewards/rejected": 2.5222227573394775, "step": 101990 }, { "epoch": 4.73559589581689, "grad_norm": 
158.53903198242188, "learning_rate": 1.59896002599935e-08, "logits/chosen": -19.05904769897461, "logits/rejected": -18.23603630065918, "logps/chosen": -443.947998046875, "logps/rejected": -345.3116760253906, "loss": 0.6682, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.168588638305664, "rewards/margins": 1.7669460773468018, "rewards/rejected": 2.401642322540283, "step": 102000 }, { "epoch": 4.736060169924324, "grad_norm": 68.65650939941406, "learning_rate": 1.5961743813547516e-08, "logits/chosen": -19.48274803161621, "logits/rejected": -18.101470947265625, "logps/chosen": -377.5003662109375, "logps/rejected": -297.34356689453125, "loss": 0.4526, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.787454128265381, "rewards/margins": 1.851633071899414, "rewards/rejected": 1.935821294784546, "step": 102010 }, { "epoch": 4.736524444031756, "grad_norm": 305.4709777832031, "learning_rate": 1.5933887367101538e-08, "logits/chosen": -18.777109146118164, "logits/rejected": -18.559173583984375, "logps/chosen": -351.386474609375, "logps/rejected": -404.1069641113281, "loss": 1.4445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6319050788879395, "rewards/margins": -0.08961744606494904, "rewards/rejected": 2.721522569656372, "step": 102020 }, { "epoch": 4.736988718139189, "grad_norm": 52.55447769165039, "learning_rate": 1.5906030920655552e-08, "logits/chosen": -19.438676834106445, "logits/rejected": -18.51689338684082, "logps/chosen": -553.6801147460938, "logps/rejected": -449.00347900390625, "loss": 0.4414, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.381226539611816, "rewards/margins": 1.6447538137435913, "rewards/rejected": 3.7364723682403564, "step": 102030 }, { "epoch": 4.737452992246622, "grad_norm": 42.78514862060547, "learning_rate": 1.5878174474209573e-08, "logits/chosen": -18.595294952392578, "logits/rejected": -18.374534606933594, "logps/chosen": -357.3130798339844, "logps/rejected": -322.5779724121094, "loss": 
1.0126, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.735255718231201, "rewards/margins": 0.7316559553146362, "rewards/rejected": 2.0035998821258545, "step": 102040 }, { "epoch": 4.737917266354056, "grad_norm": 230.84732055664062, "learning_rate": 1.585031802776359e-08, "logits/chosen": -18.900279998779297, "logits/rejected": -18.66499900817871, "logps/chosen": -437.85882568359375, "logps/rejected": -371.93292236328125, "loss": 1.5874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6507606506347656, "rewards/margins": -0.015400504693388939, "rewards/rejected": 3.666161060333252, "step": 102050 }, { "epoch": 4.738381540461488, "grad_norm": 107.0412826538086, "learning_rate": 1.582246158131761e-08, "logits/chosen": -20.36953353881836, "logits/rejected": -19.101490020751953, "logps/chosen": -454.81243896484375, "logps/rejected": -408.6810607910156, "loss": 0.5032, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.975177764892578, "rewards/margins": 1.3641633987426758, "rewards/rejected": 3.6110146045684814, "step": 102060 }, { "epoch": 4.738845814568921, "grad_norm": 0.07071617245674133, "learning_rate": 1.5794605134871627e-08, "logits/chosen": -19.501615524291992, "logits/rejected": -18.094654083251953, "logps/chosen": -345.7066955566406, "logps/rejected": -237.3732147216797, "loss": 0.3459, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.980548143386841, "rewards/margins": 2.791943073272705, "rewards/rejected": 0.18860502541065216, "step": 102070 }, { "epoch": 4.739310088676355, "grad_norm": 0.3672686815261841, "learning_rate": 1.5766748688425645e-08, "logits/chosen": -19.00164031982422, "logits/rejected": -18.336549758911133, "logps/chosen": -405.028076171875, "logps/rejected": -363.774658203125, "loss": 1.1241, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8495912551879883, "rewards/margins": 0.7530655860900879, "rewards/rejected": 2.0965256690979004, "step": 102080 }, { "epoch": 
4.739774362783788, "grad_norm": 116.4710922241211, "learning_rate": 1.5738892241979662e-08, "logits/chosen": -18.265024185180664, "logits/rejected": -17.908235549926758, "logps/chosen": -394.8293151855469, "logps/rejected": -371.3387756347656, "loss": 0.92, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1043782234191895, "rewards/margins": 1.0479291677474976, "rewards/rejected": 2.0564491748809814, "step": 102090 }, { "epoch": 4.74023863689122, "grad_norm": 184.3269805908203, "learning_rate": 1.5711035795533684e-08, "logits/chosen": -18.604328155517578, "logits/rejected": -17.660486221313477, "logps/chosen": -476.7716369628906, "logps/rejected": -367.8199462890625, "loss": 0.3302, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8184828758239746, "rewards/margins": 1.3486518859863281, "rewards/rejected": 1.469831109046936, "step": 102100 }, { "epoch": 4.740702910998653, "grad_norm": 220.87200927734375, "learning_rate": 1.56831793490877e-08, "logits/chosen": -18.51148223876953, "logits/rejected": -17.239116668701172, "logps/chosen": -458.3408203125, "logps/rejected": -340.4000549316406, "loss": 0.8626, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4662082195281982, "rewards/margins": 1.0801379680633545, "rewards/rejected": 1.3860702514648438, "step": 102110 }, { "epoch": 4.741167185106087, "grad_norm": 8.351347923278809, "learning_rate": 1.565532290264172e-08, "logits/chosen": -20.47528076171875, "logits/rejected": -18.9810791015625, "logps/chosen": -418.48175048828125, "logps/rejected": -342.247314453125, "loss": 0.5179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.08657169342041, "rewards/margins": 1.8581974506378174, "rewards/rejected": 2.2283740043640137, "step": 102120 }, { "epoch": 4.74163145921352, "grad_norm": 129.4063262939453, "learning_rate": 1.5627466456195737e-08, "logits/chosen": -18.462520599365234, "logits/rejected": -18.878631591796875, "logps/chosen": -419.4322814941406, "logps/rejected": 
-455.13885498046875, "loss": 1.1759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.933666229248047, "rewards/margins": -0.08982858806848526, "rewards/rejected": 4.023494720458984, "step": 102130 }, { "epoch": 4.742095733320952, "grad_norm": 1.203216314315796, "learning_rate": 1.5599610009749755e-08, "logits/chosen": -18.942039489746094, "logits/rejected": -18.19514274597168, "logps/chosen": -367.13873291015625, "logps/rejected": -242.40771484375, "loss": 0.7994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.854823589324951, "rewards/margins": 1.427183985710144, "rewards/rejected": 1.4276397228240967, "step": 102140 }, { "epoch": 4.742560007428386, "grad_norm": 166.94406127929688, "learning_rate": 1.5571753563303773e-08, "logits/chosen": -19.713451385498047, "logits/rejected": -19.786108016967773, "logps/chosen": -425.990234375, "logps/rejected": -395.1727600097656, "loss": 0.9983, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.417523145675659, "rewards/margins": -0.12792964279651642, "rewards/rejected": 3.5454533100128174, "step": 102150 }, { "epoch": 4.743024281535819, "grad_norm": 46.21321487426758, "learning_rate": 1.5543897116857794e-08, "logits/chosen": -19.34299087524414, "logits/rejected": -19.278968811035156, "logps/chosen": -422.1297302246094, "logps/rejected": -414.9944763183594, "loss": 0.5782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9020819664001465, "rewards/margins": 1.4440739154815674, "rewards/rejected": 2.4580078125, "step": 102160 }, { "epoch": 4.743488555643252, "grad_norm": 13.533160209655762, "learning_rate": 1.5516040670411812e-08, "logits/chosen": -18.748516082763672, "logits/rejected": -17.488296508789062, "logps/chosen": -584.3607177734375, "logps/rejected": -380.84063720703125, "loss": 0.3738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.946544170379639, "rewards/margins": 2.8268444538116455, "rewards/rejected": 2.1196999549865723, "step": 102170 }, { 
"epoch": 4.743952829750684, "grad_norm": 1.879694938659668, "learning_rate": 1.5488184223965826e-08, "logits/chosen": -19.59607696533203, "logits/rejected": -18.425477981567383, "logps/chosen": -349.37249755859375, "logps/rejected": -260.3959045410156, "loss": 0.3571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.128638744354248, "rewards/margins": 2.5883166790008545, "rewards/rejected": 0.5403220653533936, "step": 102180 }, { "epoch": 4.744417103858118, "grad_norm": 120.7302474975586, "learning_rate": 1.5460327777519848e-08, "logits/chosen": -19.6823673248291, "logits/rejected": -18.152727127075195, "logps/chosen": -486.14990234375, "logps/rejected": -420.51885986328125, "loss": 0.2807, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.665096759796143, "rewards/margins": 2.0306220054626465, "rewards/rejected": 2.634474515914917, "step": 102190 }, { "epoch": 4.744881377965551, "grad_norm": 151.8882293701172, "learning_rate": 1.5432471331073865e-08, "logits/chosen": -19.752334594726562, "logits/rejected": -18.813140869140625, "logps/chosen": -294.73822021484375, "logps/rejected": -251.9681854248047, "loss": 0.5004, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.49176287651062, "rewards/margins": 0.9335408210754395, "rewards/rejected": 1.5582220554351807, "step": 102200 }, { "epoch": 4.7453456520729835, "grad_norm": 49.04582214355469, "learning_rate": 1.5404614884627883e-08, "logits/chosen": -19.059711456298828, "logits/rejected": -18.849414825439453, "logps/chosen": -314.6887512207031, "logps/rejected": -306.94781494140625, "loss": 1.1391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2714977264404297, "rewards/margins": 0.6330147981643677, "rewards/rejected": 2.6384830474853516, "step": 102210 }, { "epoch": 4.745809926180417, "grad_norm": 0.03765975311398506, "learning_rate": 1.53767584381819e-08, "logits/chosen": -19.814056396484375, "logits/rejected": -18.661945343017578, "logps/chosen": -378.8005676269531, 
"logps/rejected": -296.877685546875, "loss": 0.4208, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.4472527503967285, "rewards/margins": 2.084785223007202, "rewards/rejected": 2.362467050552368, "step": 102220 }, { "epoch": 4.74627420028785, "grad_norm": 5.56821870803833, "learning_rate": 1.534890199173592e-08, "logits/chosen": -19.292577743530273, "logits/rejected": -18.207218170166016, "logps/chosen": -507.814697265625, "logps/rejected": -402.6264953613281, "loss": 0.2027, "rewards/accuracies": 1.0, "rewards/chosen": 5.1595001220703125, "rewards/margins": 2.6185977458953857, "rewards/rejected": 2.540902614593506, "step": 102230 }, { "epoch": 4.746738474395283, "grad_norm": 77.91426086425781, "learning_rate": 1.5321045545289937e-08, "logits/chosen": -19.488828659057617, "logits/rejected": -18.703540802001953, "logps/chosen": -368.7096252441406, "logps/rejected": -286.9546813964844, "loss": 0.7568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.419419765472412, "rewards/margins": 0.5457528233528137, "rewards/rejected": 1.873666763305664, "step": 102240 }, { "epoch": 4.7472027485027155, "grad_norm": 266.2245178222656, "learning_rate": 1.5293189098843958e-08, "logits/chosen": -20.307126998901367, "logits/rejected": -19.066755294799805, "logps/chosen": -519.8590087890625, "logps/rejected": -396.05352783203125, "loss": 1.0347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.251977443695068, "rewards/margins": 1.6908591985702515, "rewards/rejected": 2.5611183643341064, "step": 102250 }, { "epoch": 4.747667022610149, "grad_norm": 65.8892822265625, "learning_rate": 1.5265332652397976e-08, "logits/chosen": -19.380495071411133, "logits/rejected": -18.983848571777344, "logps/chosen": -424.3701171875, "logps/rejected": -433.6690368652344, "loss": 0.6072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8425133228302, "rewards/margins": 1.1720240116119385, "rewards/rejected": 2.6704893112182617, "step": 102260 }, { "epoch": 
4.748131296717582, "grad_norm": 251.56312561035156, "learning_rate": 1.5237476205951994e-08, "logits/chosen": -20.16607093811035, "logits/rejected": -18.903949737548828, "logps/chosen": -372.89996337890625, "logps/rejected": -394.1164245605469, "loss": 0.942, "rewards/accuracies": 0.5, "rewards/chosen": 4.1248979568481445, "rewards/margins": 1.0586662292480469, "rewards/rejected": 3.066232204437256, "step": 102270 }, { "epoch": 4.748595570825015, "grad_norm": 67.1966781616211, "learning_rate": 1.520961975950601e-08, "logits/chosen": -18.668420791625977, "logits/rejected": -18.38933753967285, "logps/chosen": -472.5711975097656, "logps/rejected": -415.7010192871094, "loss": 0.5906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.750838041305542, "rewards/margins": 1.0984216928482056, "rewards/rejected": 2.652416706085205, "step": 102280 }, { "epoch": 4.749059844932448, "grad_norm": 264.42181396484375, "learning_rate": 1.518176331306003e-08, "logits/chosen": -19.328041076660156, "logits/rejected": -18.997018814086914, "logps/chosen": -358.26751708984375, "logps/rejected": -307.13763427734375, "loss": 1.0381, "rewards/accuracies": 0.5, "rewards/chosen": 3.2012863159179688, "rewards/margins": 1.0218572616577148, "rewards/rejected": 2.179429054260254, "step": 102290 }, { "epoch": 4.749524119039881, "grad_norm": 4.268770217895508, "learning_rate": 1.5153906866614047e-08, "logits/chosen": -19.47877311706543, "logits/rejected": -18.290992736816406, "logps/chosen": -479.6795959472656, "logps/rejected": -324.0985412597656, "loss": 0.8982, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4984970092773438, "rewards/margins": 1.5251569747924805, "rewards/rejected": 1.9733397960662842, "step": 102300 }, { "epoch": 4.749988393147314, "grad_norm": 1.517784595489502, "learning_rate": 1.5126050420168068e-08, "logits/chosen": -19.38995933532715, "logits/rejected": -18.689062118530273, "logps/chosen": -367.59344482421875, "logps/rejected": -295.88299560546875, 
"loss": 0.8342, "rewards/accuracies": 0.5, "rewards/chosen": 3.595236301422119, "rewards/margins": 1.4550292491912842, "rewards/rejected": 2.140207052230835, "step": 102310 }, { "epoch": 4.750452667254748, "grad_norm": 131.1180419921875, "learning_rate": 1.5098193973722086e-08, "logits/chosen": -19.051162719726562, "logits/rejected": -18.2748966217041, "logps/chosen": -423.91815185546875, "logps/rejected": -290.62384033203125, "loss": 0.5277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.583550214767456, "rewards/margins": 1.7713136672973633, "rewards/rejected": 1.8122365474700928, "step": 102320 }, { "epoch": 4.75091694136218, "grad_norm": 56.600616455078125, "learning_rate": 1.50703375272761e-08, "logits/chosen": -19.46533966064453, "logits/rejected": -18.54181480407715, "logps/chosen": -434.2599182128906, "logps/rejected": -283.3871765136719, "loss": 0.43, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7711119651794434, "rewards/margins": 2.1515705585479736, "rewards/rejected": 1.619541883468628, "step": 102330 }, { "epoch": 4.751381215469613, "grad_norm": 29.531593322753906, "learning_rate": 1.5042481080830122e-08, "logits/chosen": -18.179983139038086, "logits/rejected": -17.435321807861328, "logps/chosen": -380.1119384765625, "logps/rejected": -301.0345764160156, "loss": 0.7701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.223369598388672, "rewards/margins": 1.2257009744644165, "rewards/rejected": 1.9976686239242554, "step": 102340 }, { "epoch": 4.751845489577047, "grad_norm": 138.91424560546875, "learning_rate": 1.501462463438414e-08, "logits/chosen": -19.98990249633789, "logits/rejected": -19.58103370666504, "logps/chosen": -422.36627197265625, "logps/rejected": -328.9236145019531, "loss": 0.8927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6070308685302734, "rewards/margins": 1.1060810089111328, "rewards/rejected": 2.5009493827819824, "step": 102350 }, { "epoch": 4.7523097636844795, 
"grad_norm": 37.38187789916992, "learning_rate": 1.4986768187938158e-08, "logits/chosen": -19.048660278320312, "logits/rejected": -18.905887603759766, "logps/chosen": -354.745849609375, "logps/rejected": -311.75, "loss": 1.6629, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.466921329498291, "rewards/margins": -0.13715426623821259, "rewards/rejected": 2.6040756702423096, "step": 102360 }, { "epoch": 4.752774037791912, "grad_norm": 89.6808090209961, "learning_rate": 1.495891174149218e-08, "logits/chosen": -18.714550018310547, "logits/rejected": -18.11423110961914, "logps/chosen": -517.4949951171875, "logps/rejected": -421.9004821777344, "loss": 0.6596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.762032508850098, "rewards/margins": 1.3568214178085327, "rewards/rejected": 3.405210494995117, "step": 102370 }, { "epoch": 4.753238311899345, "grad_norm": 3.6383533477783203, "learning_rate": 1.4931055295046193e-08, "logits/chosen": -19.832359313964844, "logits/rejected": -19.059001922607422, "logps/chosen": -392.812255859375, "logps/rejected": -318.2913818359375, "loss": 0.7949, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4059665203094482, "rewards/margins": 1.0442025661468506, "rewards/rejected": 2.3617641925811768, "step": 102380 }, { "epoch": 4.753702586006779, "grad_norm": 125.84420776367188, "learning_rate": 1.490319884860021e-08, "logits/chosen": -19.62099838256836, "logits/rejected": -18.93234634399414, "logps/chosen": -436.0956115722656, "logps/rejected": -402.3395080566406, "loss": 0.4569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.113428592681885, "rewards/margins": 1.555847406387329, "rewards/rejected": 2.5575804710388184, "step": 102390 }, { "epoch": 4.7541668601142115, "grad_norm": 0.2599298059940338, "learning_rate": 1.487534240215423e-08, "logits/chosen": -18.735084533691406, "logits/rejected": -18.325407028198242, "logps/chosen": -472.4788513183594, "logps/rejected": -392.6730651855469, 
"loss": 0.408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.077181816101074, "rewards/margins": 1.944464087486267, "rewards/rejected": 2.1327173709869385, "step": 102400 }, { "epoch": 4.754631134221644, "grad_norm": 33.47846221923828, "learning_rate": 1.484748595570825e-08, "logits/chosen": -19.731761932373047, "logits/rejected": -19.44852638244629, "logps/chosen": -450.66009521484375, "logps/rejected": -394.9076232910156, "loss": 0.3666, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.406205177307129, "rewards/margins": 1.29380464553833, "rewards/rejected": 3.112400770187378, "step": 102410 }, { "epoch": 4.755095408329078, "grad_norm": 30.019779205322266, "learning_rate": 1.4819629509262268e-08, "logits/chosen": -19.618867874145508, "logits/rejected": -19.29509735107422, "logps/chosen": -451.72027587890625, "logps/rejected": -368.26153564453125, "loss": 0.8323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.382737636566162, "rewards/margins": 0.3705469071865082, "rewards/rejected": 3.0121910572052, "step": 102420 }, { "epoch": 4.755559682436511, "grad_norm": 0.5540823340415955, "learning_rate": 1.4791773062816286e-08, "logits/chosen": -19.630840301513672, "logits/rejected": -19.084665298461914, "logps/chosen": -438.8941955566406, "logps/rejected": -379.0065002441406, "loss": 0.7091, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.183816432952881, "rewards/margins": 1.1148790121078491, "rewards/rejected": 2.068937301635742, "step": 102430 }, { "epoch": 4.7560239565439435, "grad_norm": 17.024206161499023, "learning_rate": 1.4763916616370304e-08, "logits/chosen": -19.165727615356445, "logits/rejected": -18.018041610717773, "logps/chosen": -414.44122314453125, "logps/rejected": -281.68731689453125, "loss": 0.4646, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.296025514602661, "rewards/margins": 1.535744547843933, "rewards/rejected": 1.760280966758728, "step": 102440 }, { "epoch": 
4.756488230651376, "grad_norm": 273.59637451171875, "learning_rate": 1.4736060169924323e-08, "logits/chosen": -18.38307762145996, "logits/rejected": -18.55668067932129, "logps/chosen": -381.9002380371094, "logps/rejected": -365.6972961425781, "loss": 1.007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.137270927429199, "rewards/margins": 0.8572543859481812, "rewards/rejected": 3.2800166606903076, "step": 102450 }, { "epoch": 4.75695250475881, "grad_norm": 57.752838134765625, "learning_rate": 1.4708203723478341e-08, "logits/chosen": -19.616512298583984, "logits/rejected": -18.165992736816406, "logps/chosen": -437.4908142089844, "logps/rejected": -299.9237060546875, "loss": 0.3506, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8662776947021484, "rewards/margins": 2.2850518226623535, "rewards/rejected": 1.5812256336212158, "step": 102460 }, { "epoch": 4.757416778866243, "grad_norm": 5.742398262023926, "learning_rate": 1.468034727703236e-08, "logits/chosen": -19.069156646728516, "logits/rejected": -17.74476432800293, "logps/chosen": -549.476318359375, "logps/rejected": -349.3492431640625, "loss": 0.2257, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.843728542327881, "rewards/margins": 2.564979076385498, "rewards/rejected": 2.278749465942383, "step": 102470 }, { "epoch": 4.7578810529736755, "grad_norm": 9.300435066223145, "learning_rate": 1.4652490830586377e-08, "logits/chosen": -19.274065017700195, "logits/rejected": -18.079303741455078, "logps/chosen": -306.7816467285156, "logps/rejected": -283.5577087402344, "loss": 0.8713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1594290733337402, "rewards/margins": 0.8904411196708679, "rewards/rejected": 1.268987774848938, "step": 102480 }, { "epoch": 4.758345327081109, "grad_norm": 0.8560877442359924, "learning_rate": 1.4624634384140396e-08, "logits/chosen": -19.978710174560547, "logits/rejected": -18.132814407348633, "logps/chosen": -416.44000244140625, 
"logps/rejected": -277.3124694824219, "loss": 0.8945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9983749389648438, "rewards/margins": 1.8678455352783203, "rewards/rejected": 2.1305289268493652, "step": 102490 }, { "epoch": 4.758809601188542, "grad_norm": 358.738037109375, "learning_rate": 1.4596777937694414e-08, "logits/chosen": -18.996112823486328, "logits/rejected": -20.69629669189453, "logps/chosen": -413.16162109375, "logps/rejected": -455.0384216308594, "loss": 1.5769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.850714921951294, "rewards/margins": -0.49622640013694763, "rewards/rejected": 4.346940994262695, "step": 102500 }, { "epoch": 4.759273875295975, "grad_norm": 0.19237014651298523, "learning_rate": 1.4568921491248434e-08, "logits/chosen": -19.09086799621582, "logits/rejected": -18.981773376464844, "logps/chosen": -319.65277099609375, "logps/rejected": -339.4582214355469, "loss": 0.8132, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.5207605361938477, "rewards/margins": 0.3611573278903961, "rewards/rejected": 2.1596033573150635, "step": 102510 }, { "epoch": 4.7597381494034074, "grad_norm": 77.49967193603516, "learning_rate": 1.4541065044802451e-08, "logits/chosen": -19.443626403808594, "logits/rejected": -19.456104278564453, "logps/chosen": -476.5018005371094, "logps/rejected": -485.900634765625, "loss": 0.9248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.884042739868164, "rewards/margins": 0.01598381996154785, "rewards/rejected": 3.8680591583251953, "step": 102520 }, { "epoch": 4.760202423510841, "grad_norm": 15.74815559387207, "learning_rate": 1.4513208598356468e-08, "logits/chosen": -19.668575286865234, "logits/rejected": -18.532201766967773, "logps/chosen": -378.3086853027344, "logps/rejected": -255.6016387939453, "loss": 0.4284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5460307598114014, "rewards/margins": 1.7365550994873047, "rewards/rejected": 
1.8094755411148071, "step": 102530 }, { "epoch": 4.760666697618274, "grad_norm": 52.02132034301758, "learning_rate": 1.4485352151910487e-08, "logits/chosen": -19.445049285888672, "logits/rejected": -18.161693572998047, "logps/chosen": -507.0428771972656, "logps/rejected": -351.44757080078125, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": 3.170421600341797, "rewards/margins": 2.137326717376709, "rewards/rejected": 1.0330945253372192, "step": 102540 }, { "epoch": 4.761130971725707, "grad_norm": 135.59605407714844, "learning_rate": 1.4457495705464505e-08, "logits/chosen": -19.406131744384766, "logits/rejected": -17.694255828857422, "logps/chosen": -345.17547607421875, "logps/rejected": -189.81289672851562, "loss": 0.5462, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.939342498779297, "rewards/margins": 2.284416675567627, "rewards/rejected": 0.6549261808395386, "step": 102550 }, { "epoch": 4.76159524583314, "grad_norm": 326.0520324707031, "learning_rate": 1.4429639259018524e-08, "logits/chosen": -19.781414031982422, "logits/rejected": -20.507204055786133, "logps/chosen": -352.89508056640625, "logps/rejected": -424.3787536621094, "loss": 1.2694, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.318925142288208, "rewards/margins": -0.5369406938552856, "rewards/rejected": 3.855865955352783, "step": 102560 }, { "epoch": 4.762059519940573, "grad_norm": 1.0450897216796875, "learning_rate": 1.4401782812572544e-08, "logits/chosen": -18.44595718383789, "logits/rejected": -17.554702758789062, "logps/chosen": -314.72882080078125, "logps/rejected": -187.00772094726562, "loss": 0.198, "rewards/accuracies": 1.0, "rewards/chosen": 3.035778522491455, "rewards/margins": 2.4394290447235107, "rewards/rejected": 0.596349835395813, "step": 102570 }, { "epoch": 4.762523794048006, "grad_norm": 109.33079528808594, "learning_rate": 1.437392636612656e-08, "logits/chosen": -19.023941040039062, "logits/rejected": -18.219865798950195, "logps/chosen": 
-332.3975524902344, "logps/rejected": -306.3489990234375, "loss": 0.7655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2783799171447754, "rewards/margins": 0.8909096717834473, "rewards/rejected": 1.3874701261520386, "step": 102580 }, { "epoch": 4.762988068155439, "grad_norm": 35.96925735473633, "learning_rate": 1.4346069919680578e-08, "logits/chosen": -18.614852905273438, "logits/rejected": -18.113540649414062, "logps/chosen": -390.75799560546875, "logps/rejected": -312.1559143066406, "loss": 1.0692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.987605571746826, "rewards/margins": 1.2826995849609375, "rewards/rejected": 2.7049059867858887, "step": 102590 }, { "epoch": 4.763452342262872, "grad_norm": 101.2287368774414, "learning_rate": 1.4318213473234597e-08, "logits/chosen": -18.5191707611084, "logits/rejected": -18.75442886352539, "logps/chosen": -427.9659118652344, "logps/rejected": -412.8196716308594, "loss": 0.721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.96441912651062, "rewards/margins": 0.5443271398544312, "rewards/rejected": 2.4200921058654785, "step": 102600 }, { "epoch": 4.763916616370305, "grad_norm": 27.752620697021484, "learning_rate": 1.4290357026788615e-08, "logits/chosen": -18.252452850341797, "logits/rejected": -18.375911712646484, "logps/chosen": -296.0806579589844, "logps/rejected": -361.90716552734375, "loss": 1.6942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.157743453979492, "rewards/margins": 0.5026928782463074, "rewards/rejected": 1.6550506353378296, "step": 102610 }, { "epoch": 4.764380890477738, "grad_norm": 347.3400573730469, "learning_rate": 1.4262500580342635e-08, "logits/chosen": -19.485668182373047, "logits/rejected": -19.587358474731445, "logps/chosen": -427.63165283203125, "logps/rejected": -512.116455078125, "loss": 0.9791, "rewards/accuracies": 0.5, "rewards/chosen": 3.3754794597625732, "rewards/margins": 0.06504769623279572, "rewards/rejected": 
3.310431718826294, "step": 102620 }, { "epoch": 4.7648451645851715, "grad_norm": 17.34591293334961, "learning_rate": 1.4234644133896651e-08, "logits/chosen": -19.731042861938477, "logits/rejected": -18.821517944335938, "logps/chosen": -483.4581604003906, "logps/rejected": -371.703369140625, "loss": 0.3319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.414685249328613, "rewards/margins": 1.48135507106781, "rewards/rejected": 2.9333298206329346, "step": 102630 }, { "epoch": 4.765309438692604, "grad_norm": 13.663334846496582, "learning_rate": 1.420678768745067e-08, "logits/chosen": -19.299964904785156, "logits/rejected": -18.695880889892578, "logps/chosen": -393.34271240234375, "logps/rejected": -376.52838134765625, "loss": 0.249, "rewards/accuracies": 1.0, "rewards/chosen": 3.796093702316284, "rewards/margins": 2.0737648010253906, "rewards/rejected": 1.7223291397094727, "step": 102640 }, { "epoch": 4.765773712800037, "grad_norm": 33.92008972167969, "learning_rate": 1.4178931241004688e-08, "logits/chosen": -19.586666107177734, "logits/rejected": -17.795228958129883, "logps/chosen": -469.6229553222656, "logps/rejected": -300.43341064453125, "loss": 0.3734, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.261761665344238, "rewards/margins": 2.5344741344451904, "rewards/rejected": 1.7272869348526, "step": 102650 }, { "epoch": 4.76623798690747, "grad_norm": 205.49905395507812, "learning_rate": 1.4151074794558708e-08, "logits/chosen": -19.517261505126953, "logits/rejected": -18.874616622924805, "logps/chosen": -494.598876953125, "logps/rejected": -426.5538635253906, "loss": 0.8905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.179558753967285, "rewards/margins": 1.1938693523406982, "rewards/rejected": 2.985689640045166, "step": 102660 }, { "epoch": 4.7667022610149035, "grad_norm": 42.33781051635742, "learning_rate": 1.4123218348112726e-08, "logits/chosen": -20.691648483276367, "logits/rejected": -19.259719848632812, 
"logps/chosen": -374.78997802734375, "logps/rejected": -353.5479431152344, "loss": 0.6425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.524137496948242, "rewards/margins": 2.0293076038360596, "rewards/rejected": 1.4948303699493408, "step": 102670 }, { "epoch": 4.767166535122336, "grad_norm": 4.06210994720459, "learning_rate": 1.4095361901666743e-08, "logits/chosen": -19.233583450317383, "logits/rejected": -18.05795669555664, "logps/chosen": -353.9645690917969, "logps/rejected": -209.14895629882812, "loss": 0.8706, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.924828290939331, "rewards/margins": 1.698311448097229, "rewards/rejected": 1.2265173196792603, "step": 102680 }, { "epoch": 4.767630809229769, "grad_norm": 2.1199281215667725, "learning_rate": 1.4067505455220761e-08, "logits/chosen": -19.589733123779297, "logits/rejected": -18.412033081054688, "logps/chosen": -434.58087158203125, "logps/rejected": -277.99090576171875, "loss": 0.6342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.246706962585449, "rewards/margins": 1.5609346628189087, "rewards/rejected": 2.6857714653015137, "step": 102690 }, { "epoch": 4.768095083337203, "grad_norm": 1.724840521812439, "learning_rate": 1.4039649008774781e-08, "logits/chosen": -18.348621368408203, "logits/rejected": -17.17501449584961, "logps/chosen": -278.6628112792969, "logps/rejected": -141.34738159179688, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": 1.9514474868774414, "rewards/margins": 2.5467443466186523, "rewards/rejected": -0.5952968597412109, "step": 102700 }, { "epoch": 4.768559357444635, "grad_norm": 80.01811981201172, "learning_rate": 1.4011792562328799e-08, "logits/chosen": -20.192684173583984, "logits/rejected": -18.774765014648438, "logps/chosen": -359.4157409667969, "logps/rejected": -273.370361328125, "loss": 0.6213, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.2364449501037598, "rewards/margins": 1.3601515293121338, 
"rewards/rejected": 0.8762936592102051, "step": 102710 }, { "epoch": 4.769023631552068, "grad_norm": 30.34010887145996, "learning_rate": 1.3983936115882818e-08, "logits/chosen": -19.853864669799805, "logits/rejected": -19.69109535217285, "logps/chosen": -433.8558044433594, "logps/rejected": -406.91156005859375, "loss": 0.4045, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.391236305236816, "rewards/margins": 1.6005083322525024, "rewards/rejected": 3.7907280921936035, "step": 102720 }, { "epoch": 4.769487905659501, "grad_norm": 119.86365509033203, "learning_rate": 1.3956079669436834e-08, "logits/chosen": -19.185922622680664, "logits/rejected": -18.546812057495117, "logps/chosen": -367.21026611328125, "logps/rejected": -343.4786071777344, "loss": 1.0556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9657645225524902, "rewards/margins": 0.4938623905181885, "rewards/rejected": 2.4719021320343018, "step": 102730 }, { "epoch": 4.769952179766935, "grad_norm": 0.9825778603553772, "learning_rate": 1.3928223222990852e-08, "logits/chosen": -19.820655822753906, "logits/rejected": -19.691606521606445, "logps/chosen": -407.0296325683594, "logps/rejected": -404.1709289550781, "loss": 0.7734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.932722568511963, "rewards/margins": 1.3981329202651978, "rewards/rejected": 3.5345897674560547, "step": 102740 }, { "epoch": 4.770416453874367, "grad_norm": 59.660362243652344, "learning_rate": 1.3900366776544872e-08, "logits/chosen": -19.367698669433594, "logits/rejected": -19.089208602905273, "logps/chosen": -472.4769592285156, "logps/rejected": -438.4583435058594, "loss": 1.0976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6513512134552, "rewards/margins": 0.5868166089057922, "rewards/rejected": 2.0645346641540527, "step": 102750 }, { "epoch": 4.7708807279818, "grad_norm": 127.90850067138672, "learning_rate": 1.387251033009889e-08, "logits/chosen": -19.007478713989258, 
"logits/rejected": -17.866580963134766, "logps/chosen": -410.59234619140625, "logps/rejected": -374.4369201660156, "loss": 0.5124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3436856269836426, "rewards/margins": 1.2134301662445068, "rewards/rejected": 2.1302552223205566, "step": 102760 }, { "epoch": 4.771345002089234, "grad_norm": 75.48934936523438, "learning_rate": 1.3844653883652909e-08, "logits/chosen": -18.959579467773438, "logits/rejected": -19.151065826416016, "logps/chosen": -358.60687255859375, "logps/rejected": -324.8934631347656, "loss": 1.0183, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.940650463104248, "rewards/margins": 0.19027230143547058, "rewards/rejected": 2.750378131866455, "step": 102770 }, { "epoch": 4.771809276196667, "grad_norm": 0.4784011244773865, "learning_rate": 1.3816797437206925e-08, "logits/chosen": -19.84957504272461, "logits/rejected": -18.965381622314453, "logps/chosen": -432.9231872558594, "logps/rejected": -288.4004821777344, "loss": 0.3232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.450137615203857, "rewards/margins": 2.529665470123291, "rewards/rejected": 1.9204719066619873, "step": 102780 }, { "epoch": 4.772273550304099, "grad_norm": 203.90740966796875, "learning_rate": 1.3788940990760945e-08, "logits/chosen": -19.962047576904297, "logits/rejected": -18.24128532409668, "logps/chosen": -476.3828125, "logps/rejected": -305.83856201171875, "loss": 0.3468, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.326024532318115, "rewards/margins": 2.722952365875244, "rewards/rejected": 1.6030718088150024, "step": 102790 }, { "epoch": 4.772737824411532, "grad_norm": 47.88821029663086, "learning_rate": 1.3761084544314963e-08, "logits/chosen": -19.79154396057129, "logits/rejected": -18.152273178100586, "logps/chosen": -428.4752502441406, "logps/rejected": -267.67401123046875, "loss": 0.4701, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.034829139709473, 
"rewards/margins": 2.5104613304138184, "rewards/rejected": 1.5243674516677856, "step": 102800 }, { "epoch": 4.773202098518966, "grad_norm": 21.8367977142334, "learning_rate": 1.3733228097868982e-08, "logits/chosen": -19.65096664428711, "logits/rejected": -18.58984375, "logps/chosen": -341.2853698730469, "logps/rejected": -305.7698059082031, "loss": 0.8087, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8112101554870605, "rewards/margins": 1.805107831954956, "rewards/rejected": 2.0061023235321045, "step": 102810 }, { "epoch": 4.773666372626399, "grad_norm": 140.33554077148438, "learning_rate": 1.3705371651423e-08, "logits/chosen": -18.560407638549805, "logits/rejected": -18.023548126220703, "logps/chosen": -342.31170654296875, "logps/rejected": -304.51190185546875, "loss": 1.3096, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8895025253295898, "rewards/margins": -0.018998075276613235, "rewards/rejected": 1.9085009098052979, "step": 102820 }, { "epoch": 4.774130646733831, "grad_norm": 141.1578369140625, "learning_rate": 1.3677515204977018e-08, "logits/chosen": -18.55881118774414, "logits/rejected": -17.359445571899414, "logps/chosen": -363.47259521484375, "logps/rejected": -255.0621795654297, "loss": 0.5574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.646928310394287, "rewards/margins": 1.3453547954559326, "rewards/rejected": 1.301573395729065, "step": 102830 }, { "epoch": 4.774594920841265, "grad_norm": 4.442070960998535, "learning_rate": 1.3649658758531036e-08, "logits/chosen": -19.842445373535156, "logits/rejected": -18.8594970703125, "logps/chosen": -439.1519470214844, "logps/rejected": -316.24932861328125, "loss": 0.4722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.205625057220459, "rewards/margins": 1.9648654460906982, "rewards/rejected": 2.2407593727111816, "step": 102840 }, { "epoch": 4.775059194948698, "grad_norm": 39.45405197143555, "learning_rate": 1.3621802312085055e-08, "logits/chosen": 
-18.41239356994629, "logits/rejected": -17.957843780517578, "logps/chosen": -326.8460693359375, "logps/rejected": -271.5775146484375, "loss": 1.0633, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.209232807159424, "rewards/margins": 0.8373963236808777, "rewards/rejected": 1.371836543083191, "step": 102850 }, { "epoch": 4.775523469056131, "grad_norm": 148.42149353027344, "learning_rate": 1.3593945865639073e-08, "logits/chosen": -19.205041885375977, "logits/rejected": -17.655319213867188, "logps/chosen": -298.6288757324219, "logps/rejected": -209.6723175048828, "loss": 0.4882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.532460927963257, "rewards/margins": 1.918828010559082, "rewards/rejected": 0.6136330962181091, "step": 102860 }, { "epoch": 4.775987743163563, "grad_norm": 137.18325805664062, "learning_rate": 1.356608941919309e-08, "logits/chosen": -18.39327049255371, "logits/rejected": -17.852779388427734, "logps/chosen": -388.0936584472656, "logps/rejected": -403.4120788574219, "loss": 1.3066, "rewards/accuracies": 0.5, "rewards/chosen": 2.8757119178771973, "rewards/margins": 0.11798863112926483, "rewards/rejected": 2.757723331451416, "step": 102870 }, { "epoch": 4.776452017270997, "grad_norm": 47.70829772949219, "learning_rate": 1.3538232972747109e-08, "logits/chosen": -19.184127807617188, "logits/rejected": -17.383272171020508, "logps/chosen": -489.3880920410156, "logps/rejected": -283.52825927734375, "loss": 0.2586, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.077136993408203, "rewards/margins": 2.253358840942383, "rewards/rejected": 1.8237783908843994, "step": 102880 }, { "epoch": 4.77691629137843, "grad_norm": 181.47817993164062, "learning_rate": 1.3510376526301128e-08, "logits/chosen": -18.506671905517578, "logits/rejected": -17.591384887695312, "logps/chosen": -388.9423828125, "logps/rejected": -280.46075439453125, "loss": 0.6808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8763270378112793, 
"rewards/margins": 1.1860806941986084, "rewards/rejected": 1.690246343612671, "step": 102890 }, { "epoch": 4.7773805654858625, "grad_norm": 0.0012665741378441453, "learning_rate": 1.3482520079855146e-08, "logits/chosen": -18.30458641052246, "logits/rejected": -17.779878616333008, "logps/chosen": -352.96087646484375, "logps/rejected": -303.08258056640625, "loss": 1.2509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.766697406768799, "rewards/margins": 1.2431929111480713, "rewards/rejected": 1.523504614830017, "step": 102900 }, { "epoch": 4.777844839593296, "grad_norm": 125.40072631835938, "learning_rate": 1.3454663633409166e-08, "logits/chosen": -18.851877212524414, "logits/rejected": -19.523099899291992, "logps/chosen": -263.5897216796875, "logps/rejected": -318.80145263671875, "loss": 1.4509, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6912847757339478, "rewards/margins": -0.08847391605377197, "rewards/rejected": 1.7797588109970093, "step": 102910 }, { "epoch": 4.778309113700729, "grad_norm": 156.8845977783203, "learning_rate": 1.3426807186963182e-08, "logits/chosen": -18.595462799072266, "logits/rejected": -18.425830841064453, "logps/chosen": -362.0240173339844, "logps/rejected": -358.6376037597656, "loss": 0.7106, "rewards/accuracies": 0.5, "rewards/chosen": 2.4957175254821777, "rewards/margins": 0.6733685731887817, "rewards/rejected": 1.8223488330841064, "step": 102920 }, { "epoch": 4.778773387808162, "grad_norm": 78.21109008789062, "learning_rate": 1.33989507405172e-08, "logits/chosen": -17.92856788635254, "logits/rejected": -18.355579376220703, "logps/chosen": -283.0749816894531, "logps/rejected": -334.0476989746094, "loss": 0.8548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7813342809677124, "rewards/margins": 0.20631377398967743, "rewards/rejected": 1.5750205516815186, "step": 102930 }, { "epoch": 4.7792376619155945, "grad_norm": 166.25270080566406, "learning_rate": 1.3373879938715817e-08, 
"logits/chosen": -19.84249496459961, "logits/rejected": -19.904338836669922, "logps/chosen": -407.40667724609375, "logps/rejected": -404.7592468261719, "loss": 1.107, "rewards/accuracies": 0.5, "rewards/chosen": 2.872662305831909, "rewards/margins": 0.10711336135864258, "rewards/rejected": 2.7655491828918457, "step": 102940 }, { "epoch": 4.779701936023028, "grad_norm": 29.058116912841797, "learning_rate": 1.3346023492269835e-08, "logits/chosen": -20.05256462097168, "logits/rejected": -18.148006439208984, "logps/chosen": -537.1580810546875, "logps/rejected": -324.7955627441406, "loss": 0.2782, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.077202320098877, "rewards/margins": 3.106064796447754, "rewards/rejected": 1.9711377620697021, "step": 102950 }, { "epoch": 4.780166210130461, "grad_norm": 159.2789764404297, "learning_rate": 1.3318167045823855e-08, "logits/chosen": -18.846521377563477, "logits/rejected": -19.16475486755371, "logps/chosen": -463.55157470703125, "logps/rejected": -490.6043395996094, "loss": 1.228, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.502823352813721, "rewards/margins": 0.31529712677001953, "rewards/rejected": 4.187525749206543, "step": 102960 }, { "epoch": 4.780630484237894, "grad_norm": 8.918608665466309, "learning_rate": 1.3290310599377872e-08, "logits/chosen": -19.187610626220703, "logits/rejected": -18.153417587280273, "logps/chosen": -459.4606018066406, "logps/rejected": -342.6302795410156, "loss": 0.3794, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.580409288406372, "rewards/margins": 1.5958545207977295, "rewards/rejected": 1.9845542907714844, "step": 102970 }, { "epoch": 4.781094758345327, "grad_norm": 6.484647750854492, "learning_rate": 1.3262454152931889e-08, "logits/chosen": -20.26506996154785, "logits/rejected": -19.56561851501465, "logps/chosen": -450.86724853515625, "logps/rejected": -398.003662109375, "loss": 0.6299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
4.896576881408691, "rewards/margins": 1.311406135559082, "rewards/rejected": 3.585170269012451, "step": 102980 }, { "epoch": 4.78155903245276, "grad_norm": 145.53431701660156, "learning_rate": 1.3234597706485908e-08, "logits/chosen": -18.29416275024414, "logits/rejected": -17.629108428955078, "logps/chosen": -370.4668884277344, "logps/rejected": -258.3194580078125, "loss": 0.4621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4502387046813965, "rewards/margins": 2.5816197395324707, "rewards/rejected": 0.8686192631721497, "step": 102990 }, { "epoch": 4.782023306560193, "grad_norm": 11.627789497375488, "learning_rate": 1.3206741260039926e-08, "logits/chosen": -18.417522430419922, "logits/rejected": -17.598583221435547, "logps/chosen": -398.0534973144531, "logps/rejected": -322.52032470703125, "loss": 0.8732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7330687046051025, "rewards/margins": 0.8912456631660461, "rewards/rejected": 1.841822862625122, "step": 103000 }, { "epoch": 4.782487580667626, "grad_norm": 231.63824462890625, "learning_rate": 1.3178884813593946e-08, "logits/chosen": -19.472415924072266, "logits/rejected": -19.318267822265625, "logps/chosen": -414.07830810546875, "logps/rejected": -403.20941162109375, "loss": 0.8185, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.302116870880127, "rewards/margins": 1.1536966562271118, "rewards/rejected": 3.1484198570251465, "step": 103010 }, { "epoch": 4.782951854775059, "grad_norm": 69.51700592041016, "learning_rate": 1.3151028367147965e-08, "logits/chosen": -19.78578758239746, "logits/rejected": -19.143966674804688, "logps/chosen": -344.07891845703125, "logps/rejected": -347.3509521484375, "loss": 0.8308, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1041464805603027, "rewards/margins": 0.8900488615036011, "rewards/rejected": 2.214097499847412, "step": 103020 }, { "epoch": 4.783416128882492, "grad_norm": 111.80684661865234, "learning_rate": 
1.3123171920701981e-08, "logits/chosen": -19.368858337402344, "logits/rejected": -18.850788116455078, "logps/chosen": -373.046630859375, "logps/rejected": -324.04949951171875, "loss": 1.1356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9397315979003906, "rewards/margins": 0.8541901707649231, "rewards/rejected": 3.0855417251586914, "step": 103030 }, { "epoch": 4.783880402989925, "grad_norm": 30.00389862060547, "learning_rate": 1.3095315474255999e-08, "logits/chosen": -18.824405670166016, "logits/rejected": -18.41196060180664, "logps/chosen": -489.6309509277344, "logps/rejected": -378.9781799316406, "loss": 0.4145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.743619441986084, "rewards/margins": 1.497280240058899, "rewards/rejected": 2.2463393211364746, "step": 103040 }, { "epoch": 4.7843446770973586, "grad_norm": 34.459808349609375, "learning_rate": 1.3067459027810019e-08, "logits/chosen": -18.861974716186523, "logits/rejected": -17.835186004638672, "logps/chosen": -315.7129211425781, "logps/rejected": -230.91165161132812, "loss": 0.2714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1435611248016357, "rewards/margins": 2.3912298679351807, "rewards/rejected": 0.7523314952850342, "step": 103050 }, { "epoch": 4.784808951204791, "grad_norm": 93.6471176147461, "learning_rate": 1.3039602581364036e-08, "logits/chosen": -18.825529098510742, "logits/rejected": -17.92806053161621, "logps/chosen": -423.29791259765625, "logps/rejected": -324.4726257324219, "loss": 0.5841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0969932079315186, "rewards/margins": 0.9612258672714233, "rewards/rejected": 2.1357672214508057, "step": 103060 }, { "epoch": 4.785273225312224, "grad_norm": 8.333464622497559, "learning_rate": 1.3011746134918056e-08, "logits/chosen": -19.717390060424805, "logits/rejected": -19.153827667236328, "logps/chosen": -334.57763671875, "logps/rejected": -324.4642028808594, "loss": 0.6238, "rewards/accuracies": 
0.5, "rewards/chosen": 3.516077756881714, "rewards/margins": 0.8127764463424683, "rewards/rejected": 2.703300952911377, "step": 103070 }, { "epoch": 4.785737499419657, "grad_norm": 1.2483981847763062, "learning_rate": 1.2983889688472072e-08, "logits/chosen": -19.312057495117188, "logits/rejected": -18.293272018432617, "logps/chosen": -359.89984130859375, "logps/rejected": -234.30093383789062, "loss": 0.6263, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2469642162323, "rewards/margins": 1.8131885528564453, "rewards/rejected": 1.4337761402130127, "step": 103080 }, { "epoch": 4.7862017735270905, "grad_norm": 110.7145004272461, "learning_rate": 1.2956033242026092e-08, "logits/chosen": -19.5908260345459, "logits/rejected": -17.840442657470703, "logps/chosen": -489.477783203125, "logps/rejected": -295.92181396484375, "loss": 0.3538, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7129547595977783, "rewards/margins": 1.9775091409683228, "rewards/rejected": 1.7354453802108765, "step": 103090 }, { "epoch": 4.786666047634523, "grad_norm": 11.061599731445312, "learning_rate": 1.292817679558011e-08, "logits/chosen": -18.709814071655273, "logits/rejected": -17.541290283203125, "logps/chosen": -420.732421875, "logps/rejected": -346.03228759765625, "loss": 0.4231, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.426860332489014, "rewards/margins": 1.8912458419799805, "rewards/rejected": 2.535614490509033, "step": 103100 }, { "epoch": 4.787130321741956, "grad_norm": 3.005265951156616, "learning_rate": 1.2900320349134129e-08, "logits/chosen": -18.758338928222656, "logits/rejected": -18.001346588134766, "logps/chosen": -364.17950439453125, "logps/rejected": -247.1376495361328, "loss": 0.3547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7173311710357666, "rewards/margins": 1.4023663997650146, "rewards/rejected": 1.314964771270752, "step": 103110 }, { "epoch": 4.78759459584939, "grad_norm": 59.0677490234375, "learning_rate": 
1.2872463902688147e-08, "logits/chosen": -20.30202865600586, "logits/rejected": -19.742984771728516, "logps/chosen": -453.00360107421875, "logps/rejected": -433.806396484375, "loss": 0.2489, "rewards/accuracies": 1.0, "rewards/chosen": 5.8769426345825195, "rewards/margins": 1.7940210103988647, "rewards/rejected": 4.082921504974365, "step": 103120 }, { "epoch": 4.7880588699568225, "grad_norm": 3.9454023838043213, "learning_rate": 1.2844607456242165e-08, "logits/chosen": -19.168407440185547, "logits/rejected": -18.467824935913086, "logps/chosen": -491.7452087402344, "logps/rejected": -313.3275146484375, "loss": 0.495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.056539058685303, "rewards/margins": 1.7039178609848022, "rewards/rejected": 2.35262131690979, "step": 103130 }, { "epoch": 4.788523144064255, "grad_norm": 122.2297134399414, "learning_rate": 1.2816751009796182e-08, "logits/chosen": -18.732593536376953, "logits/rejected": -18.401277542114258, "logps/chosen": -531.5280151367188, "logps/rejected": -397.82769775390625, "loss": 1.1058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.169100761413574, "rewards/margins": 0.8329057693481445, "rewards/rejected": 3.3361945152282715, "step": 103140 }, { "epoch": 4.788987418171689, "grad_norm": 155.5675506591797, "learning_rate": 1.2788894563350202e-08, "logits/chosen": -19.835765838623047, "logits/rejected": -17.89285659790039, "logps/chosen": -479.52093505859375, "logps/rejected": -252.65628051757812, "loss": 0.2988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.219479084014893, "rewards/margins": 3.8956799507141113, "rewards/rejected": 1.3237992525100708, "step": 103150 }, { "epoch": 4.789451692279122, "grad_norm": 46.53799057006836, "learning_rate": 1.276103811690422e-08, "logits/chosen": -19.080276489257812, "logits/rejected": -18.303157806396484, "logps/chosen": -388.0218811035156, "logps/rejected": -300.6468505859375, "loss": 0.8944, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.690528392791748, "rewards/margins": 1.2988611459732056, "rewards/rejected": 1.391667127609253, "step": 103160 }, { "epoch": 4.7899159663865545, "grad_norm": 37.3802604675293, "learning_rate": 1.273318167045824e-08, "logits/chosen": -18.657428741455078, "logits/rejected": -18.456573486328125, "logps/chosen": -368.9687805175781, "logps/rejected": -354.780029296875, "loss": 0.6662, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.323305606842041, "rewards/margins": 0.7862617373466492, "rewards/rejected": 2.537044048309326, "step": 103170 }, { "epoch": 4.790380240493988, "grad_norm": 109.5106201171875, "learning_rate": 1.2705325224012255e-08, "logits/chosen": -19.593381881713867, "logits/rejected": -18.539997100830078, "logps/chosen": -371.6683349609375, "logps/rejected": -254.5214385986328, "loss": 0.8717, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4743218421936035, "rewards/margins": 1.0658938884735107, "rewards/rejected": 1.4084278345108032, "step": 103180 }, { "epoch": 4.790844514601421, "grad_norm": 64.54995727539062, "learning_rate": 1.2677468777566273e-08, "logits/chosen": -19.152183532714844, "logits/rejected": -18.87449073791504, "logps/chosen": -439.82989501953125, "logps/rejected": -392.20904541015625, "loss": 0.5102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.660472869873047, "rewards/margins": 1.4596716165542603, "rewards/rejected": 2.200801134109497, "step": 103190 }, { "epoch": 4.791308788708854, "grad_norm": 35.53521728515625, "learning_rate": 1.2649612331120293e-08, "logits/chosen": -18.916851043701172, "logits/rejected": -18.148338317871094, "logps/chosen": -354.8771057128906, "logps/rejected": -290.6021423339844, "loss": 0.7275, "rewards/accuracies": 0.5, "rewards/chosen": 2.498081684112549, "rewards/margins": 0.7654987573623657, "rewards/rejected": 1.7325828075408936, "step": 103200 }, { "epoch": 4.7917730628162865, "grad_norm": 82.77017211914062, 
"learning_rate": 1.262175588467431e-08, "logits/chosen": -18.978473663330078, "logits/rejected": -18.357437133789062, "logps/chosen": -414.92010498046875, "logps/rejected": -367.66033935546875, "loss": 0.7545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4645562171936035, "rewards/margins": 0.715061366558075, "rewards/rejected": 1.7494949102401733, "step": 103210 }, { "epoch": 4.79223733692372, "grad_norm": 122.01979064941406, "learning_rate": 1.259389943822833e-08, "logits/chosen": -18.605045318603516, "logits/rejected": -18.335874557495117, "logps/chosen": -323.574462890625, "logps/rejected": -305.65399169921875, "loss": 1.2893, "rewards/accuracies": 0.5, "rewards/chosen": 1.6757179498672485, "rewards/margins": -0.596368134021759, "rewards/rejected": 2.2720861434936523, "step": 103220 }, { "epoch": 4.792701611031153, "grad_norm": 321.9695739746094, "learning_rate": 1.2566042991782346e-08, "logits/chosen": -19.46218490600586, "logits/rejected": -18.75613784790039, "logps/chosen": -488.06378173828125, "logps/rejected": -397.261474609375, "loss": 0.7337, "rewards/accuracies": 0.5, "rewards/chosen": 3.31559681892395, "rewards/margins": 0.9073112607002258, "rewards/rejected": 2.408285617828369, "step": 103230 }, { "epoch": 4.793165885138586, "grad_norm": 15.6445951461792, "learning_rate": 1.2538186545336366e-08, "logits/chosen": -19.371591567993164, "logits/rejected": -18.463958740234375, "logps/chosen": -441.9803771972656, "logps/rejected": -320.5142822265625, "loss": 0.5147, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0439863204956055, "rewards/margins": 1.5742580890655518, "rewards/rejected": 2.4697279930114746, "step": 103240 }, { "epoch": 4.793630159246019, "grad_norm": 147.0948944091797, "learning_rate": 1.2510330098890384e-08, "logits/chosen": -19.781097412109375, "logits/rejected": -19.958721160888672, "logps/chosen": -423.25604248046875, "logps/rejected": -425.64178466796875, "loss": 0.7264, "rewards/accuracies": 0.5, 
"rewards/chosen": 4.046314716339111, "rewards/margins": 0.3679903447628021, "rewards/rejected": 3.6783244609832764, "step": 103250 }, { "epoch": 4.794094433353452, "grad_norm": 80.53164672851562, "learning_rate": 1.2482473652444403e-08, "logits/chosen": -19.605636596679688, "logits/rejected": -18.79006576538086, "logps/chosen": -470.9891662597656, "logps/rejected": -395.5301818847656, "loss": 0.5061, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.296360969543457, "rewards/margins": 0.7875753045082092, "rewards/rejected": 3.5087857246398926, "step": 103260 }, { "epoch": 4.794558707460885, "grad_norm": 31.638023376464844, "learning_rate": 1.2454617205998421e-08, "logits/chosen": -18.87442398071289, "logits/rejected": -17.66813087463379, "logps/chosen": -345.0201416015625, "logps/rejected": -270.26251220703125, "loss": 0.5153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3525898456573486, "rewards/margins": 2.500333309173584, "rewards/rejected": 0.8522566556930542, "step": 103270 }, { "epoch": 4.795022981568318, "grad_norm": 47.62620544433594, "learning_rate": 1.2426760759552439e-08, "logits/chosen": -18.642269134521484, "logits/rejected": -17.303770065307617, "logps/chosen": -488.22808837890625, "logps/rejected": -321.9676818847656, "loss": 0.1605, "rewards/accuracies": 1.0, "rewards/chosen": 4.933077335357666, "rewards/margins": 3.0354859828948975, "rewards/rejected": 1.897591233253479, "step": 103280 }, { "epoch": 4.795487255675751, "grad_norm": 159.45018005371094, "learning_rate": 1.2398904313106457e-08, "logits/chosen": -19.301511764526367, "logits/rejected": -17.544933319091797, "logps/chosen": -498.9219665527344, "logps/rejected": -291.27337646484375, "loss": 0.1667, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.920407295227051, "rewards/margins": 2.9497313499450684, "rewards/rejected": 1.970676064491272, "step": 103290 }, { "epoch": 4.795951529783184, "grad_norm": 302.1192932128906, "learning_rate": 
1.2371047866660476e-08, "logits/chosen": -19.309240341186523, "logits/rejected": -18.94962501525879, "logps/chosen": -271.427490234375, "logps/rejected": -254.1217803955078, "loss": 1.1316, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9763970375061035, "rewards/margins": 0.639797568321228, "rewards/rejected": 2.336599588394165, "step": 103300 }, { "epoch": 4.796415803890617, "grad_norm": 179.5828399658203, "learning_rate": 1.2343191420214494e-08, "logits/chosen": -18.868366241455078, "logits/rejected": -18.73360252380371, "logps/chosen": -220.05923461914062, "logps/rejected": -193.27357482910156, "loss": 0.8668, "rewards/accuracies": 0.5, "rewards/chosen": 2.1734094619750977, "rewards/margins": 1.4182668924331665, "rewards/rejected": 0.7551423907279968, "step": 103310 }, { "epoch": 4.7968800779980505, "grad_norm": 23.648900985717773, "learning_rate": 1.2315334973768514e-08, "logits/chosen": -19.29324722290039, "logits/rejected": -18.712202072143555, "logps/chosen": -553.3497314453125, "logps/rejected": -471.1726989746094, "loss": 0.6671, "rewards/accuracies": 0.5, "rewards/chosen": 4.167089939117432, "rewards/margins": 0.5517362356185913, "rewards/rejected": 3.6153533458709717, "step": 103320 }, { "epoch": 4.797344352105483, "grad_norm": 5.845584869384766, "learning_rate": 1.228747852732253e-08, "logits/chosen": -18.868175506591797, "logits/rejected": -17.806848526000977, "logps/chosen": -378.9256896972656, "logps/rejected": -294.2257080078125, "loss": 0.3432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.128800630569458, "rewards/margins": 1.451622486114502, "rewards/rejected": 1.6771780252456665, "step": 103330 }, { "epoch": 4.797808626212916, "grad_norm": 48.31044387817383, "learning_rate": 1.225962208087655e-08, "logits/chosen": -18.43507194519043, "logits/rejected": -18.15618324279785, "logps/chosen": -329.5840759277344, "logps/rejected": -234.73974609375, "loss": 1.2844, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.309807300567627, "rewards/margins": 0.5420827269554138, "rewards/rejected": 1.7677247524261475, "step": 103340 }, { "epoch": 4.798272900320349, "grad_norm": 0.7868459820747375, "learning_rate": 1.2231765634430567e-08, "logits/chosen": -18.115962982177734, "logits/rejected": -17.391603469848633, "logps/chosen": -324.83221435546875, "logps/rejected": -329.17718505859375, "loss": 0.8906, "rewards/accuracies": 0.5, "rewards/chosen": 3.1144843101501465, "rewards/margins": 1.1026690006256104, "rewards/rejected": 2.0118155479431152, "step": 103350 }, { "epoch": 4.7987371744277825, "grad_norm": 99.60370635986328, "learning_rate": 1.2203909187984587e-08, "logits/chosen": -18.990596771240234, "logits/rejected": -18.379924774169922, "logps/chosen": -437.1214904785156, "logps/rejected": -386.0950927734375, "loss": 0.7925, "rewards/accuracies": 0.5, "rewards/chosen": 3.739046812057495, "rewards/margins": 0.9491533041000366, "rewards/rejected": 2.789893627166748, "step": 103360 }, { "epoch": 4.799201448535215, "grad_norm": 43.077239990234375, "learning_rate": 1.2176052741538604e-08, "logits/chosen": -20.335689544677734, "logits/rejected": -19.204349517822266, "logps/chosen": -324.123291015625, "logps/rejected": -313.9233093261719, "loss": 1.0261, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.033360004425049, "rewards/margins": 0.25416097044944763, "rewards/rejected": 2.7791988849639893, "step": 103370 }, { "epoch": 4.799665722642648, "grad_norm": 0.24132581055164337, "learning_rate": 1.214819629509262e-08, "logits/chosen": -18.188846588134766, "logits/rejected": -17.143491744995117, "logps/chosen": -495.72314453125, "logps/rejected": -409.94573974609375, "loss": 0.4651, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.222917079925537, "rewards/margins": 2.5082876682281494, "rewards/rejected": 2.7146291732788086, "step": 103380 }, { "epoch": 4.800129996750082, "grad_norm": 132.02374267578125, "learning_rate": 1.212033984864664e-08, 
"logits/chosen": -18.932636260986328, "logits/rejected": -18.250947952270508, "logps/chosen": -336.7974853515625, "logps/rejected": -293.17950439453125, "loss": 0.579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9964895248413086, "rewards/margins": 0.7406483888626099, "rewards/rejected": 1.2558410167694092, "step": 103390 }, { "epoch": 4.8005942708575144, "grad_norm": 90.73965454101562, "learning_rate": 1.2092483402200658e-08, "logits/chosen": -19.489652633666992, "logits/rejected": -19.045604705810547, "logps/chosen": -476.5193786621094, "logps/rejected": -411.1727600097656, "loss": 0.9069, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.701796054840088, "rewards/margins": 0.10352578014135361, "rewards/rejected": 2.5982701778411865, "step": 103400 }, { "epoch": 4.801058544964947, "grad_norm": 0.6366125345230103, "learning_rate": 1.2064626955754678e-08, "logits/chosen": -19.41469955444336, "logits/rejected": -17.96383285522461, "logps/chosen": -380.8189697265625, "logps/rejected": -254.3870086669922, "loss": 0.855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0079166889190674, "rewards/margins": 1.5880916118621826, "rewards/rejected": 1.4198250770568848, "step": 103410 }, { "epoch": 4.80152281907238, "grad_norm": 50.84339141845703, "learning_rate": 1.2036770509308695e-08, "logits/chosen": -18.886886596679688, "logits/rejected": -18.236818313598633, "logps/chosen": -355.10748291015625, "logps/rejected": -319.2774963378906, "loss": 0.5022, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5153720378875732, "rewards/margins": 1.0904673337936401, "rewards/rejected": 2.4249045848846436, "step": 103420 }, { "epoch": 4.801987093179814, "grad_norm": 279.89495849609375, "learning_rate": 1.2008914062862713e-08, "logits/chosen": -18.437244415283203, "logits/rejected": -18.189062118530273, "logps/chosen": -430.745849609375, "logps/rejected": -374.60308837890625, "loss": 0.9288, "rewards/accuracies": 0.5, 
"rewards/chosen": 2.63673734664917, "rewards/margins": 0.6980777978897095, "rewards/rejected": 1.938659429550171, "step": 103430 }, { "epoch": 4.802451367287246, "grad_norm": 78.8557357788086, "learning_rate": 1.1981057616416731e-08, "logits/chosen": -18.340576171875, "logits/rejected": -17.750667572021484, "logps/chosen": -371.05181884765625, "logps/rejected": -295.1938781738281, "loss": 0.3589, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6242568492889404, "rewards/margins": 2.2153282165527344, "rewards/rejected": 1.408928394317627, "step": 103440 }, { "epoch": 4.802915641394679, "grad_norm": 8.648871421813965, "learning_rate": 1.195320116997075e-08, "logits/chosen": -18.644445419311523, "logits/rejected": -17.707759857177734, "logps/chosen": -370.74664306640625, "logps/rejected": -319.3519592285156, "loss": 0.3482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.464205265045166, "rewards/margins": 1.752399206161499, "rewards/rejected": 1.7118057012557983, "step": 103450 }, { "epoch": 4.803379915502113, "grad_norm": 177.80947875976562, "learning_rate": 1.1925344723524768e-08, "logits/chosen": -19.2861328125, "logits/rejected": -18.481510162353516, "logps/chosen": -343.1539001464844, "logps/rejected": -299.37249755859375, "loss": 0.5784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4772419929504395, "rewards/margins": 1.496530294418335, "rewards/rejected": 1.9807116985321045, "step": 103460 }, { "epoch": 4.803844189609546, "grad_norm": 52.98664855957031, "learning_rate": 1.1897488277078788e-08, "logits/chosen": -19.208515167236328, "logits/rejected": -18.806310653686523, "logps/chosen": -359.2645568847656, "logps/rejected": -325.0874938964844, "loss": 0.3071, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2289767265319824, "rewards/margins": 1.7390458583831787, "rewards/rejected": 1.4899308681488037, "step": 103470 }, { "epoch": 4.804308463716978, "grad_norm": 51.08562088012695, "learning_rate": 
1.1869631830632804e-08, "logits/chosen": -20.10936164855957, "logits/rejected": -20.242916107177734, "logps/chosen": -420.87335205078125, "logps/rejected": -368.40789794921875, "loss": 1.2075, "rewards/accuracies": 0.5, "rewards/chosen": 3.4164295196533203, "rewards/margins": -0.38540658354759216, "rewards/rejected": 3.801835536956787, "step": 103480 }, { "epoch": 4.804772737824411, "grad_norm": 175.28463745117188, "learning_rate": 1.1841775384186824e-08, "logits/chosen": -19.89242172241211, "logits/rejected": -17.94301414489746, "logps/chosen": -398.9161376953125, "logps/rejected": -227.088623046875, "loss": 0.3743, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.983518600463867, "rewards/margins": 2.9413328170776367, "rewards/rejected": 1.0421857833862305, "step": 103490 }, { "epoch": 4.805237011931845, "grad_norm": 110.05052185058594, "learning_rate": 1.1813918937740841e-08, "logits/chosen": -20.410724639892578, "logits/rejected": -19.810077667236328, "logps/chosen": -385.6898193359375, "logps/rejected": -399.7833557128906, "loss": 0.8392, "rewards/accuracies": 0.5, "rewards/chosen": 4.632843017578125, "rewards/margins": 0.20774245262145996, "rewards/rejected": 4.425100803375244, "step": 103500 }, { "epoch": 4.805701286039278, "grad_norm": 128.34609985351562, "learning_rate": 1.1786062491294861e-08, "logits/chosen": -19.08790397644043, "logits/rejected": -18.82958984375, "logps/chosen": -305.25494384765625, "logps/rejected": -298.241455078125, "loss": 0.6307, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1023406982421875, "rewards/margins": 0.4908469319343567, "rewards/rejected": 1.611493706703186, "step": 103510 }, { "epoch": 4.80616556014671, "grad_norm": 169.64891052246094, "learning_rate": 1.1758206044848879e-08, "logits/chosen": -19.27545166015625, "logits/rejected": -17.850875854492188, "logps/chosen": -555.0651245117188, "logps/rejected": -411.2499084472656, "loss": 0.5169, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 5.370107650756836, "rewards/margins": 2.1056315898895264, "rewards/rejected": 3.2644762992858887, "step": 103520 }, { "epoch": 4.806629834254144, "grad_norm": 7.013324737548828, "learning_rate": 1.1730349598402895e-08, "logits/chosen": -20.180150985717773, "logits/rejected": -19.034704208374023, "logps/chosen": -452.54913330078125, "logps/rejected": -352.5868225097656, "loss": 0.3772, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.711404323577881, "rewards/margins": 1.8292901515960693, "rewards/rejected": 2.882114887237549, "step": 103530 }, { "epoch": 4.807094108361577, "grad_norm": 0.33492377400398254, "learning_rate": 1.1702493151956914e-08, "logits/chosen": -18.62747573852539, "logits/rejected": -17.683212280273438, "logps/chosen": -351.75604248046875, "logps/rejected": -308.93890380859375, "loss": 0.4629, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5109305381774902, "rewards/margins": 1.8276951313018799, "rewards/rejected": 1.6832354068756104, "step": 103540 }, { "epoch": 4.80755838246901, "grad_norm": 72.5340576171875, "learning_rate": 1.1674636705510934e-08, "logits/chosen": -19.9892635345459, "logits/rejected": -19.098896026611328, "logps/chosen": -416.6858825683594, "logps/rejected": -330.2012023925781, "loss": 0.8097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.453108310699463, "rewards/margins": 0.8903976678848267, "rewards/rejected": 2.5627102851867676, "step": 103550 }, { "epoch": 4.808022656576442, "grad_norm": 16.093759536743164, "learning_rate": 1.1646780259064952e-08, "logits/chosen": -19.351118087768555, "logits/rejected": -17.894004821777344, "logps/chosen": -455.39715576171875, "logps/rejected": -321.56207275390625, "loss": 0.2983, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.650726795196533, "rewards/margins": 1.9565080404281616, "rewards/rejected": 1.6942188739776611, "step": 103560 }, { "epoch": 4.808486930683876, "grad_norm": 105.65315246582031, 
"learning_rate": 1.1618923812618971e-08, "logits/chosen": -19.50545883178711, "logits/rejected": -18.773845672607422, "logps/chosen": -356.8642272949219, "logps/rejected": -326.6743469238281, "loss": 0.4124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0287888050079346, "rewards/margins": 1.2699366807937622, "rewards/rejected": 1.7588520050048828, "step": 103570 }, { "epoch": 4.808951204791309, "grad_norm": 42.06644821166992, "learning_rate": 1.1591067366172988e-08, "logits/chosen": -18.84446144104004, "logits/rejected": -18.5889835357666, "logps/chosen": -237.3713836669922, "logps/rejected": -192.982666015625, "loss": 0.4421, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.464782476425171, "rewards/margins": 1.4526078701019287, "rewards/rejected": 0.012174701318144798, "step": 103580 }, { "epoch": 4.8094154788987415, "grad_norm": 9.415879249572754, "learning_rate": 1.1563210919727005e-08, "logits/chosen": -18.7863712310791, "logits/rejected": -18.460670471191406, "logps/chosen": -431.0586853027344, "logps/rejected": -324.27728271484375, "loss": 0.7695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7246341705322266, "rewards/margins": 1.379176378250122, "rewards/rejected": 2.3454575538635254, "step": 103590 }, { "epoch": 4.809879753006175, "grad_norm": 52.28837966918945, "learning_rate": 1.1535354473281025e-08, "logits/chosen": -19.575075149536133, "logits/rejected": -19.52345085144043, "logps/chosen": -292.11968994140625, "logps/rejected": -301.1651306152344, "loss": 1.137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7889865636825562, "rewards/margins": -0.2424147129058838, "rewards/rejected": 2.0314013957977295, "step": 103600 }, { "epoch": 4.810344027113608, "grad_norm": 138.31900024414062, "learning_rate": 1.1507498026835043e-08, "logits/chosen": -19.040882110595703, "logits/rejected": -19.13766860961914, "logps/chosen": -328.259033203125, "logps/rejected": -337.0679626464844, "loss": 1.2291, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.593285083770752, "rewards/margins": 0.33010950684547424, "rewards/rejected": 2.2631754875183105, "step": 103610 }, { "epoch": 4.810808301221041, "grad_norm": 177.72323608398438, "learning_rate": 1.1479641580389062e-08, "logits/chosen": -19.03676986694336, "logits/rejected": -18.710575103759766, "logps/chosen": -533.1459350585938, "logps/rejected": -402.3586730957031, "loss": 1.2642, "rewards/accuracies": 0.5, "rewards/chosen": 3.941612720489502, "rewards/margins": -0.16427819430828094, "rewards/rejected": 4.105890274047852, "step": 103620 }, { "epoch": 4.8112725753284735, "grad_norm": 5.552217483520508, "learning_rate": 1.1451785133943078e-08, "logits/chosen": -19.666528701782227, "logits/rejected": -18.185527801513672, "logps/chosen": -426.658935546875, "logps/rejected": -349.20458984375, "loss": 0.6638, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.525916576385498, "rewards/margins": 1.3227555751800537, "rewards/rejected": 2.2031612396240234, "step": 103630 }, { "epoch": 4.811736849435907, "grad_norm": 28.75122833251953, "learning_rate": 1.1423928687497098e-08, "logits/chosen": -19.609905242919922, "logits/rejected": -18.00786590576172, "logps/chosen": -401.9101867675781, "logps/rejected": -271.19000244140625, "loss": 0.842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.168964862823486, "rewards/margins": 2.5385031700134277, "rewards/rejected": 1.6304616928100586, "step": 103640 }, { "epoch": 4.81220112354334, "grad_norm": 123.63626098632812, "learning_rate": 1.1396072241051116e-08, "logits/chosen": -19.499645233154297, "logits/rejected": -18.131412506103516, "logps/chosen": -387.6695251464844, "logps/rejected": -249.45266723632812, "loss": 0.5072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.50036358833313, "rewards/margins": 2.127885341644287, "rewards/rejected": 1.3724777698516846, "step": 103650 }, { "epoch": 4.812665397650773, "grad_norm": 
22.099355697631836, "learning_rate": 1.1368215794605135e-08, "logits/chosen": -18.500308990478516, "logits/rejected": -18.090084075927734, "logps/chosen": -263.1761474609375, "logps/rejected": -181.05690002441406, "loss": 0.8792, "rewards/accuracies": 0.5, "rewards/chosen": 1.4632298946380615, "rewards/margins": 0.9052574038505554, "rewards/rejected": 0.5579724907875061, "step": 103660 }, { "epoch": 4.813129671758206, "grad_norm": 26.71780014038086, "learning_rate": 1.1340359348159153e-08, "logits/chosen": -18.54467010498047, "logits/rejected": -18.65341567993164, "logps/chosen": -292.4141845703125, "logps/rejected": -336.6404113769531, "loss": 1.4021, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7024800777435303, "rewards/margins": -0.1975935399532318, "rewards/rejected": 2.900073528289795, "step": 103670 }, { "epoch": 4.813593945865639, "grad_norm": 130.4896240234375, "learning_rate": 1.1312502901713171e-08, "logits/chosen": -18.781436920166016, "logits/rejected": -18.759838104248047, "logps/chosen": -384.4936828613281, "logps/rejected": -334.43292236328125, "loss": 0.7977, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.392107963562012, "rewards/margins": 1.34951651096344, "rewards/rejected": 3.0425915718078613, "step": 103680 }, { "epoch": 4.814058219973072, "grad_norm": 133.31724548339844, "learning_rate": 1.1284646455267189e-08, "logits/chosen": -19.772499084472656, "logits/rejected": -19.377334594726562, "logps/chosen": -477.32452392578125, "logps/rejected": -483.84942626953125, "loss": 1.4856, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.8996047973632812, "rewards/margins": -0.2933613657951355, "rewards/rejected": 4.192965984344482, "step": 103690 }, { "epoch": 4.814522494080505, "grad_norm": 1.8903440237045288, "learning_rate": 1.1256790008821208e-08, "logits/chosen": -17.829513549804688, "logits/rejected": -17.851699829101562, "logps/chosen": -383.47003173828125, "logps/rejected": -272.82867431640625, 
"loss": 1.0793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.139174699783325, "rewards/margins": 1.3664692640304565, "rewards/rejected": 0.7727053761482239, "step": 103700 }, { "epoch": 4.814986768187938, "grad_norm": 67.38955688476562, "learning_rate": 1.1228933562375226e-08, "logits/chosen": -18.29513168334961, "logits/rejected": -18.50802993774414, "logps/chosen": -242.72799682617188, "logps/rejected": -255.9269561767578, "loss": 1.0961, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.7219831943511963, "rewards/margins": 0.17958326637744904, "rewards/rejected": 1.5423998832702637, "step": 103710 }, { "epoch": 4.815451042295371, "grad_norm": 113.21057891845703, "learning_rate": 1.1201077115929246e-08, "logits/chosen": -20.669734954833984, "logits/rejected": -19.805700302124023, "logps/chosen": -414.8753967285156, "logps/rejected": -362.02264404296875, "loss": 0.741, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.725553035736084, "rewards/margins": 1.2290149927139282, "rewards/rejected": 2.4965384006500244, "step": 103720 }, { "epoch": 4.815915316402804, "grad_norm": 1.579360008239746, "learning_rate": 1.1173220669483262e-08, "logits/chosen": -20.910879135131836, "logits/rejected": -19.116003036499023, "logps/chosen": -440.28277587890625, "logps/rejected": -318.5489807128906, "loss": 0.8748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.767815113067627, "rewards/margins": 2.0939812660217285, "rewards/rejected": 2.673833131790161, "step": 103730 }, { "epoch": 4.816379590510238, "grad_norm": 22.362579345703125, "learning_rate": 1.114536422303728e-08, "logits/chosen": -18.304218292236328, "logits/rejected": -19.387218475341797, "logps/chosen": -235.1760711669922, "logps/rejected": -323.3952941894531, "loss": 1.6919, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.3645870685577393, "rewards/margins": -1.0599219799041748, "rewards/rejected": 2.424509048461914, "step": 103740 }, { "epoch": 
4.81684386461767, "grad_norm": 16.61867904663086, "learning_rate": 1.1117507776591299e-08, "logits/chosen": -19.133914947509766, "logits/rejected": -18.13243865966797, "logps/chosen": -396.7567443847656, "logps/rejected": -305.15777587890625, "loss": 0.7562, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7249627113342285, "rewards/margins": 1.0404759645462036, "rewards/rejected": 2.684485912322998, "step": 103750 }, { "epoch": 4.817308138725103, "grad_norm": 124.75798034667969, "learning_rate": 1.1089651330145319e-08, "logits/chosen": -19.011079788208008, "logits/rejected": -18.890960693359375, "logps/chosen": -411.4388732910156, "logps/rejected": -438.3251037597656, "loss": 0.6521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.169180870056152, "rewards/margins": 0.628219485282898, "rewards/rejected": 3.540961742401123, "step": 103760 }, { "epoch": 4.817772412832536, "grad_norm": 218.3675537109375, "learning_rate": 1.1061794883699337e-08, "logits/chosen": -18.345775604248047, "logits/rejected": -17.72469139099121, "logps/chosen": -400.31927490234375, "logps/rejected": -268.1164855957031, "loss": 0.7928, "rewards/accuracies": 0.5, "rewards/chosen": 3.38879656791687, "rewards/margins": 1.470853567123413, "rewards/rejected": 1.9179426431655884, "step": 103770 }, { "epoch": 4.8182366869399695, "grad_norm": 13.752248764038086, "learning_rate": 1.1033938437253353e-08, "logits/chosen": -19.38079833984375, "logits/rejected": -17.94440269470215, "logps/chosen": -391.773193359375, "logps/rejected": -288.09716796875, "loss": 0.3733, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.106520652770996, "rewards/margins": 1.991708755493164, "rewards/rejected": 2.114811658859253, "step": 103780 }, { "epoch": 4.818700961047402, "grad_norm": 1.18289053440094, "learning_rate": 1.1006081990807372e-08, "logits/chosen": -19.541656494140625, "logits/rejected": -19.55278778076172, "logps/chosen": -426.07421875, "logps/rejected": -410.8642578125, 
"loss": 0.6432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.20510196685791, "rewards/margins": 1.546791672706604, "rewards/rejected": 3.658310651779175, "step": 103790 }, { "epoch": 4.819165235154835, "grad_norm": 48.792301177978516, "learning_rate": 1.097822554436139e-08, "logits/chosen": -17.93082046508789, "logits/rejected": -17.812923431396484, "logps/chosen": -242.9314422607422, "logps/rejected": -281.6182861328125, "loss": 0.6404, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9966695308685303, "rewards/margins": 0.8607848882675171, "rewards/rejected": 1.1358845233917236, "step": 103800 }, { "epoch": 4.819629509262269, "grad_norm": 141.3633575439453, "learning_rate": 1.095036909791541e-08, "logits/chosen": -18.856185913085938, "logits/rejected": -18.59397315979004, "logps/chosen": -448.405029296875, "logps/rejected": -402.0687255859375, "loss": 0.854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.484570264816284, "rewards/margins": 0.8902026414871216, "rewards/rejected": 2.5943679809570312, "step": 103810 }, { "epoch": 4.8200937833697015, "grad_norm": 61.39679718017578, "learning_rate": 1.0922512651469426e-08, "logits/chosen": -18.640819549560547, "logits/rejected": -17.85399055480957, "logps/chosen": -363.4749450683594, "logps/rejected": -270.4445495605469, "loss": 0.6357, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.653846502304077, "rewards/margins": 0.6482858657836914, "rewards/rejected": 2.005560874938965, "step": 103820 }, { "epoch": 4.820558057477134, "grad_norm": 102.6692123413086, "learning_rate": 1.0894656205023445e-08, "logits/chosen": -18.072755813598633, "logits/rejected": -17.997255325317383, "logps/chosen": -313.18731689453125, "logps/rejected": -319.7023010253906, "loss": 1.6801, "rewards/accuracies": 0.5, "rewards/chosen": 1.9750388860702515, "rewards/margins": -0.5757714509963989, "rewards/rejected": 2.5508103370666504, "step": 103830 }, { "epoch": 4.821022331584567, 
"grad_norm": 68.52536010742188, "learning_rate": 1.0866799758577463e-08, "logits/chosen": -19.458341598510742, "logits/rejected": -18.673282623291016, "logps/chosen": -340.9317626953125, "logps/rejected": -279.5306701660156, "loss": 0.9327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.20912504196167, "rewards/margins": 0.29369422793388367, "rewards/rejected": 1.915431022644043, "step": 103840 }, { "epoch": 4.821486605692001, "grad_norm": 140.0408172607422, "learning_rate": 1.0838943312131483e-08, "logits/chosen": -19.71254539489746, "logits/rejected": -19.512903213500977, "logps/chosen": -399.1171875, "logps/rejected": -379.3253173828125, "loss": 1.0059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4844653606414795, "rewards/margins": 0.5084847211837769, "rewards/rejected": 2.975980758666992, "step": 103850 }, { "epoch": 4.8219508797994335, "grad_norm": 19.57599639892578, "learning_rate": 1.08110868656855e-08, "logits/chosen": -19.028940200805664, "logits/rejected": -18.62038230895996, "logps/chosen": -304.90374755859375, "logps/rejected": -289.7201843261719, "loss": 0.7396, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6245977878570557, "rewards/margins": 0.868403434753418, "rewards/rejected": 1.7561943531036377, "step": 103860 }, { "epoch": 4.822415153906866, "grad_norm": 37.804195404052734, "learning_rate": 1.0783230419239518e-08, "logits/chosen": -19.376012802124023, "logits/rejected": -17.733312606811523, "logps/chosen": -496.66339111328125, "logps/rejected": -291.8506164550781, "loss": 0.1883, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.145406246185303, "rewards/margins": 2.804452657699585, "rewards/rejected": 1.3409535884857178, "step": 103870 }, { "epoch": 4.8228794280143, "grad_norm": 52.30915451049805, "learning_rate": 1.0755373972793536e-08, "logits/chosen": -19.720054626464844, "logits/rejected": -19.006561279296875, "logps/chosen": -345.14105224609375, "logps/rejected": 
-292.15985107421875, "loss": 0.8919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.310626268386841, "rewards/margins": 0.8284469842910767, "rewards/rejected": 2.4821794033050537, "step": 103880 }, { "epoch": 4.823343702121733, "grad_norm": 119.36366271972656, "learning_rate": 1.0727517526347556e-08, "logits/chosen": -19.60861587524414, "logits/rejected": -19.048099517822266, "logps/chosen": -421.5393981933594, "logps/rejected": -313.6898498535156, "loss": 0.623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5851657390594482, "rewards/margins": 1.2274589538574219, "rewards/rejected": 2.3577065467834473, "step": 103890 }, { "epoch": 4.8238079762291655, "grad_norm": 187.9040985107422, "learning_rate": 1.0699661079901573e-08, "logits/chosen": -19.053394317626953, "logits/rejected": -19.165498733520508, "logps/chosen": -441.3861389160156, "logps/rejected": -386.2094421386719, "loss": 0.9363, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.003729820251465, "rewards/margins": 0.4454132914543152, "rewards/rejected": 3.558316469192505, "step": 103900 }, { "epoch": 4.824272250336598, "grad_norm": 152.16705322265625, "learning_rate": 1.0671804633455593e-08, "logits/chosen": -17.946918487548828, "logits/rejected": -18.485179901123047, "logps/chosen": -319.84619140625, "logps/rejected": -391.02996826171875, "loss": 1.2933, "rewards/accuracies": 0.5, "rewards/chosen": 3.400982618331909, "rewards/margins": -0.04799994081258774, "rewards/rejected": 3.4489827156066895, "step": 103910 }, { "epoch": 4.824736524444032, "grad_norm": 35.042083740234375, "learning_rate": 1.0643948187009609e-08, "logits/chosen": -19.286529541015625, "logits/rejected": -17.7487735748291, "logps/chosen": -380.76776123046875, "logps/rejected": -243.1365509033203, "loss": 0.2431, "rewards/accuracies": 1.0, "rewards/chosen": 3.559530735015869, "rewards/margins": 2.7095131874084473, "rewards/rejected": 0.8500174283981323, "step": 103920 }, { "epoch": 
4.825200798551465, "grad_norm": 2.053035020828247, "learning_rate": 1.0616091740563627e-08, "logits/chosen": -20.000629425048828, "logits/rejected": -18.989858627319336, "logps/chosen": -400.63092041015625, "logps/rejected": -283.4719543457031, "loss": 0.2141, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.381353378295898, "rewards/margins": 3.3359057903289795, "rewards/rejected": 1.0454472303390503, "step": 103930 }, { "epoch": 4.825665072658897, "grad_norm": 179.70413208007812, "learning_rate": 1.0588235294117647e-08, "logits/chosen": -18.620656967163086, "logits/rejected": -18.43512535095215, "logps/chosen": -332.73162841796875, "logps/rejected": -313.15380859375, "loss": 1.0025, "rewards/accuracies": 0.5, "rewards/chosen": 1.8877004384994507, "rewards/margins": 0.07909099757671356, "rewards/rejected": 1.8086096048355103, "step": 103940 }, { "epoch": 4.826129346766331, "grad_norm": 61.85968780517578, "learning_rate": 1.0560378847671664e-08, "logits/chosen": -19.818645477294922, "logits/rejected": -19.249073028564453, "logps/chosen": -444.6826171875, "logps/rejected": -388.8050231933594, "loss": 0.357, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6939048767089844, "rewards/margins": 1.9017423391342163, "rewards/rejected": 1.7921626567840576, "step": 103950 }, { "epoch": 4.826593620873764, "grad_norm": 88.69893646240234, "learning_rate": 1.0532522401225684e-08, "logits/chosen": -20.54941749572754, "logits/rejected": -20.655345916748047, "logps/chosen": -399.8294372558594, "logps/rejected": -362.49359130859375, "loss": 0.9611, "rewards/accuracies": 0.5, "rewards/chosen": 2.6191015243530273, "rewards/margins": -0.03783092647790909, "rewards/rejected": 2.6569323539733887, "step": 103960 }, { "epoch": 4.827057894981197, "grad_norm": 2.7077107429504395, "learning_rate": 1.05046659547797e-08, "logits/chosen": -18.94505500793457, "logits/rejected": -17.961408615112305, "logps/chosen": -442.790771484375, "logps/rejected": -331.7043762207031, 
"loss": 0.4116, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.458004474639893, "rewards/margins": 2.039238691329956, "rewards/rejected": 2.4187655448913574, "step": 103970 }, { "epoch": 4.82752216908863, "grad_norm": 68.2052001953125, "learning_rate": 1.047680950833372e-08, "logits/chosen": -18.694326400756836, "logits/rejected": -18.55204200744629, "logps/chosen": -365.026123046875, "logps/rejected": -322.71795654296875, "loss": 0.9295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.785613536834717, "rewards/margins": 0.6240512132644653, "rewards/rejected": 2.161562442779541, "step": 103980 }, { "epoch": 4.827986443196063, "grad_norm": 25.02369499206543, "learning_rate": 1.0448953061887737e-08, "logits/chosen": -18.638092041015625, "logits/rejected": -18.054241180419922, "logps/chosen": -329.4621276855469, "logps/rejected": -244.8434600830078, "loss": 0.3622, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0925064086914062, "rewards/margins": 1.063516616821289, "rewards/rejected": 2.028989315032959, "step": 103990 }, { "epoch": 4.828450717303496, "grad_norm": 61.44371795654297, "learning_rate": 1.0421096615441757e-08, "logits/chosen": -19.76937484741211, "logits/rejected": -19.9609317779541, "logps/chosen": -365.33984375, "logps/rejected": -357.75640869140625, "loss": 0.7137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.861384630203247, "rewards/margins": 0.6791666150093079, "rewards/rejected": 2.182218074798584, "step": 104000 }, { "epoch": 4.8289149914109295, "grad_norm": 2.3554301261901855, "learning_rate": 1.0393240168995775e-08, "logits/chosen": -18.37891387939453, "logits/rejected": -17.772815704345703, "logps/chosen": -474.00054931640625, "logps/rejected": -367.1971740722656, "loss": 0.8529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3261799812316895, "rewards/margins": 1.1144084930419922, "rewards/rejected": 2.2117714881896973, "step": 104010 }, { "epoch": 4.829379265518362, 
"grad_norm": 34.99285888671875, "learning_rate": 1.0365383722549793e-08, "logits/chosen": -19.20697593688965, "logits/rejected": -18.193058013916016, "logps/chosen": -489.580078125, "logps/rejected": -391.80609130859375, "loss": 0.7779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.413876533508301, "rewards/margins": 1.1053950786590576, "rewards/rejected": 3.3084816932678223, "step": 104020 }, { "epoch": 4.829843539625795, "grad_norm": 43.38178634643555, "learning_rate": 1.033752727610381e-08, "logits/chosen": -19.859432220458984, "logits/rejected": -18.96112632751465, "logps/chosen": -380.241455078125, "logps/rejected": -371.06683349609375, "loss": 1.2978, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9710240364074707, "rewards/margins": 0.42864537239074707, "rewards/rejected": 3.5423789024353027, "step": 104030 }, { "epoch": 4.830307813733228, "grad_norm": 5.97300386428833, "learning_rate": 1.030967082965783e-08, "logits/chosen": -19.835113525390625, "logits/rejected": -19.634723663330078, "logps/chosen": -513.5489501953125, "logps/rejected": -417.5072326660156, "loss": 0.2194, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.743856906890869, "rewards/margins": 2.4175736904144287, "rewards/rejected": 2.3262829780578613, "step": 104040 }, { "epoch": 4.8307720878406615, "grad_norm": 1.641386866569519, "learning_rate": 1.0281814383211848e-08, "logits/chosen": -19.15986442565918, "logits/rejected": -18.220874786376953, "logps/chosen": -497.29962158203125, "logps/rejected": -383.7502746582031, "loss": 1.3459, "rewards/accuracies": 0.5, "rewards/chosen": 3.5413691997528076, "rewards/margins": 0.5547326803207397, "rewards/rejected": 2.9866368770599365, "step": 104050 }, { "epoch": 4.831236361948094, "grad_norm": 12.079140663146973, "learning_rate": 1.0253957936765867e-08, "logits/chosen": -19.088459014892578, "logits/rejected": -17.783336639404297, "logps/chosen": -364.86798095703125, "logps/rejected": -371.90411376953125, 
"loss": 0.3905, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8030121326446533, "rewards/margins": 2.0157623291015625, "rewards/rejected": 1.7872495651245117, "step": 104060 }, { "epoch": 4.831700636055527, "grad_norm": 38.205657958984375, "learning_rate": 1.0226101490319883e-08, "logits/chosen": -19.303020477294922, "logits/rejected": -19.654186248779297, "logps/chosen": -284.0971374511719, "logps/rejected": -338.36737060546875, "loss": 1.2307, "rewards/accuracies": 0.5, "rewards/chosen": 1.3050422668457031, "rewards/margins": -0.4805923104286194, "rewards/rejected": 1.7856343984603882, "step": 104070 }, { "epoch": 4.832164910162961, "grad_norm": 0.04473480209708214, "learning_rate": 1.0198245043873903e-08, "logits/chosen": -18.638505935668945, "logits/rejected": -17.131038665771484, "logps/chosen": -416.609375, "logps/rejected": -271.62298583984375, "loss": 0.4628, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.538499355316162, "rewards/margins": 2.333799123764038, "rewards/rejected": 1.2047007083892822, "step": 104080 }, { "epoch": 4.8326291842703935, "grad_norm": 56.04740905761719, "learning_rate": 1.017038859742792e-08, "logits/chosen": -19.679019927978516, "logits/rejected": -19.013408660888672, "logps/chosen": -463.3966369628906, "logps/rejected": -454.044921875, "loss": 0.7685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.761256694793701, "rewards/margins": 0.7599062919616699, "rewards/rejected": 3.0013504028320312, "step": 104090 }, { "epoch": 4.833093458377826, "grad_norm": 1.9851617813110352, "learning_rate": 1.014253215098194e-08, "logits/chosen": -18.923974990844727, "logits/rejected": -17.592477798461914, "logps/chosen": -474.71197509765625, "logps/rejected": -258.6503601074219, "loss": 0.4006, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8372013568878174, "rewards/margins": 2.614983081817627, "rewards/rejected": 1.2222182750701904, "step": 104100 }, { "epoch": 4.833557732485259, 
"grad_norm": 180.65673828125, "learning_rate": 1.0114675704535958e-08, "logits/chosen": -18.96038818359375, "logits/rejected": -19.402267456054688, "logps/chosen": -325.08441162109375, "logps/rejected": -360.86395263671875, "loss": 0.7492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9015165567398071, "rewards/margins": 0.4234698414802551, "rewards/rejected": 1.4780467748641968, "step": 104110 }, { "epoch": 4.834022006592693, "grad_norm": 74.27225494384766, "learning_rate": 1.0086819258089974e-08, "logits/chosen": -19.434749603271484, "logits/rejected": -18.547826766967773, "logps/chosen": -287.4359130859375, "logps/rejected": -214.10397338867188, "loss": 0.7031, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9965585470199585, "rewards/margins": 1.4959485530853271, "rewards/rejected": 0.5006102323532104, "step": 104120 }, { "epoch": 4.834486280700125, "grad_norm": 14.635103225708008, "learning_rate": 1.0058962811643994e-08, "logits/chosen": -19.796463012695312, "logits/rejected": -18.13075065612793, "logps/chosen": -434.67742919921875, "logps/rejected": -237.7408905029297, "loss": 0.7354, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.954214572906494, "rewards/margins": 2.410660982131958, "rewards/rejected": 1.5435532331466675, "step": 104130 }, { "epoch": 4.834950554807558, "grad_norm": 27.391454696655273, "learning_rate": 1.0031106365198012e-08, "logits/chosen": -18.251693725585938, "logits/rejected": -16.65053939819336, "logps/chosen": -311.35858154296875, "logps/rejected": -203.97421264648438, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 2.569258689880371, "rewards/margins": 2.3584532737731934, "rewards/rejected": 0.21080517768859863, "step": 104140 }, { "epoch": 4.835414828914992, "grad_norm": 82.06227111816406, "learning_rate": 1.0003249918752031e-08, "logits/chosen": -18.815534591674805, "logits/rejected": -18.81009292602539, "logps/chosen": -379.01239013671875, "logps/rejected": -360.5458984375, 
"loss": 1.3645, "rewards/accuracies": 0.5, "rewards/chosen": 2.786062717437744, "rewards/margins": -0.08369340747594833, "rewards/rejected": 2.8697562217712402, "step": 104150 }, { "epoch": 4.835879103022425, "grad_norm": 89.65494537353516, "learning_rate": 9.975393472306049e-09, "logits/chosen": -19.401206970214844, "logits/rejected": -19.56900405883789, "logps/chosen": -291.8601379394531, "logps/rejected": -300.96734619140625, "loss": 1.2829, "rewards/accuracies": 0.5, "rewards/chosen": 2.237854480743408, "rewards/margins": -0.5439707040786743, "rewards/rejected": 2.781825542449951, "step": 104160 }, { "epoch": 4.836343377129857, "grad_norm": 36.26376724243164, "learning_rate": 9.947537025860067e-09, "logits/chosen": -18.598976135253906, "logits/rejected": -18.27062225341797, "logps/chosen": -451.89569091796875, "logps/rejected": -386.6351623535156, "loss": 0.5643, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2763214111328125, "rewards/margins": 0.517264723777771, "rewards/rejected": 2.759056568145752, "step": 104170 }, { "epoch": 4.83680765123729, "grad_norm": 122.97626495361328, "learning_rate": 9.919680579414085e-09, "logits/chosen": -18.3554744720459, "logits/rejected": -17.462993621826172, "logps/chosen": -434.82061767578125, "logps/rejected": -326.4208984375, "loss": 0.4654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0591139793395996, "rewards/margins": 1.4082921743392944, "rewards/rejected": 1.6508219242095947, "step": 104180 }, { "epoch": 4.837271925344724, "grad_norm": 142.90737915039062, "learning_rate": 9.891824132968104e-09, "logits/chosen": -18.643787384033203, "logits/rejected": -18.180681228637695, "logps/chosen": -423.3753967285156, "logps/rejected": -346.8619079589844, "loss": 1.1195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7324764728546143, "rewards/margins": 1.4512131214141846, "rewards/rejected": 2.281263589859009, "step": 104190 }, { "epoch": 4.837736199452157, "grad_norm": 
133.52420043945312, "learning_rate": 9.863967686522122e-09, "logits/chosen": -19.696157455444336, "logits/rejected": -19.131885528564453, "logps/chosen": -409.837890625, "logps/rejected": -355.6719665527344, "loss": 1.1009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1552493572235107, "rewards/margins": 1.1222288608551025, "rewards/rejected": 2.033020496368408, "step": 104200 }, { "epoch": 4.838200473559589, "grad_norm": 25.92691993713379, "learning_rate": 9.836111240076142e-09, "logits/chosen": -19.315784454345703, "logits/rejected": -18.82879638671875, "logps/chosen": -396.81597900390625, "logps/rejected": -334.0236511230469, "loss": 0.5533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.084357261657715, "rewards/margins": 1.5631195306777954, "rewards/rejected": 2.521237850189209, "step": 104210 }, { "epoch": 4.838664747667023, "grad_norm": 41.352195739746094, "learning_rate": 9.808254793630158e-09, "logits/chosen": -18.46600914001465, "logits/rejected": -17.707881927490234, "logps/chosen": -431.4324645996094, "logps/rejected": -343.38470458984375, "loss": 0.437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.962740421295166, "rewards/margins": 1.3609157800674438, "rewards/rejected": 2.601824998855591, "step": 104220 }, { "epoch": 4.839129021774456, "grad_norm": 21.67246437072754, "learning_rate": 9.780398347184177e-09, "logits/chosen": -18.97049331665039, "logits/rejected": -18.128700256347656, "logps/chosen": -263.77130126953125, "logps/rejected": -230.2588348388672, "loss": 0.7627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9080889225006104, "rewards/margins": 0.8965532183647156, "rewards/rejected": 1.01153564453125, "step": 104230 }, { "epoch": 4.839593295881889, "grad_norm": 59.36201477050781, "learning_rate": 9.752541900738195e-09, "logits/chosen": -18.426279067993164, "logits/rejected": -18.956653594970703, "logps/chosen": -220.4408416748047, "logps/rejected": -287.08905029296875, "loss": 
1.2913, "rewards/accuracies": 0.5, "rewards/chosen": 2.245370388031006, "rewards/margins": -0.36773669719696045, "rewards/rejected": 2.613107204437256, "step": 104240 }, { "epoch": 4.840057569989321, "grad_norm": 80.36445617675781, "learning_rate": 9.724685454292215e-09, "logits/chosen": -18.744094848632812, "logits/rejected": -17.957805633544922, "logps/chosen": -397.35772705078125, "logps/rejected": -337.7639465332031, "loss": 0.8344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.195434808731079, "rewards/margins": 1.460601806640625, "rewards/rejected": 1.734832763671875, "step": 104250 }, { "epoch": 4.840521844096755, "grad_norm": 4.627072334289551, "learning_rate": 9.696829007846232e-09, "logits/chosen": -19.361019134521484, "logits/rejected": -18.472816467285156, "logps/chosen": -396.5597839355469, "logps/rejected": -345.6050109863281, "loss": 0.451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.034693956375122, "rewards/margins": 1.3046683073043823, "rewards/rejected": 1.7300258874893188, "step": 104260 }, { "epoch": 4.840986118204188, "grad_norm": 51.1353645324707, "learning_rate": 9.668972561400249e-09, "logits/chosen": -20.405874252319336, "logits/rejected": -19.61709213256836, "logps/chosen": -476.80242919921875, "logps/rejected": -454.13946533203125, "loss": 0.6808, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.870011806488037, "rewards/margins": 1.095146894454956, "rewards/rejected": 3.774864912033081, "step": 104270 }, { "epoch": 4.8414503923116206, "grad_norm": 9.125849723815918, "learning_rate": 9.641116114954268e-09, "logits/chosen": -19.382976531982422, "logits/rejected": -17.597209930419922, "logps/chosen": -322.3085632324219, "logps/rejected": -236.7389373779297, "loss": 0.2798, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4081149101257324, "rewards/margins": 1.9045076370239258, "rewards/rejected": 0.503607451915741, "step": 104280 }, { "epoch": 4.841914666419054, "grad_norm": 
126.57572174072266, "learning_rate": 9.613259668508288e-09, "logits/chosen": -18.179609298706055, "logits/rejected": -16.733003616333008, "logps/chosen": -429.254638671875, "logps/rejected": -199.19949340820312, "loss": 0.8116, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5501818656921387, "rewards/margins": 2.2799174785614014, "rewards/rejected": 0.27026426792144775, "step": 104290 }, { "epoch": 4.842378940526487, "grad_norm": 35.866580963134766, "learning_rate": 9.585403222062305e-09, "logits/chosen": -18.59963607788086, "logits/rejected": -18.723697662353516, "logps/chosen": -437.176513671875, "logps/rejected": -437.12530517578125, "loss": 1.2218, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.8441271781921387, "rewards/margins": -0.0015497565036639571, "rewards/rejected": 3.845676898956299, "step": 104300 }, { "epoch": 4.84284321463392, "grad_norm": 55.33005142211914, "learning_rate": 9.557546775616325e-09, "logits/chosen": -19.468109130859375, "logits/rejected": -18.186668395996094, "logps/chosen": -380.3255920410156, "logps/rejected": -240.1444091796875, "loss": 0.7364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4621264934539795, "rewards/margins": 1.7493435144424438, "rewards/rejected": 1.7127830982208252, "step": 104310 }, { "epoch": 4.8433074887413525, "grad_norm": 262.0105895996094, "learning_rate": 9.529690329170341e-09, "logits/chosen": -19.045791625976562, "logits/rejected": -18.959064483642578, "logps/chosen": -516.4940185546875, "logps/rejected": -466.0142517089844, "loss": 0.6134, "rewards/accuracies": 0.5, "rewards/chosen": 4.4203691482543945, "rewards/margins": 1.324664831161499, "rewards/rejected": 3.0957043170928955, "step": 104320 }, { "epoch": 4.843771762848786, "grad_norm": 123.42312622070312, "learning_rate": 9.501833882724359e-09, "logits/chosen": -19.173351287841797, "logits/rejected": -17.90523338317871, "logps/chosen": -413.59295654296875, "logps/rejected": -279.8177185058594, "loss": 
0.3931, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7230618000030518, "rewards/margins": 2.204054355621338, "rewards/rejected": 1.5190074443817139, "step": 104330 }, { "epoch": 4.844236036956219, "grad_norm": 182.37738037109375, "learning_rate": 9.473977436278379e-09, "logits/chosen": -20.100093841552734, "logits/rejected": -19.586027145385742, "logps/chosen": -492.9927673339844, "logps/rejected": -346.41717529296875, "loss": 1.1765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9068992137908936, "rewards/margins": 1.5402092933654785, "rewards/rejected": 2.366690158843994, "step": 104340 }, { "epoch": 4.844700311063652, "grad_norm": 37.37771987915039, "learning_rate": 9.446120989832396e-09, "logits/chosen": -18.89853286743164, "logits/rejected": -18.11355972290039, "logps/chosen": -374.2853088378906, "logps/rejected": -260.57061767578125, "loss": 0.3703, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.446923017501831, "rewards/margins": 2.193509340286255, "rewards/rejected": 1.2534136772155762, "step": 104350 }, { "epoch": 4.845164585171085, "grad_norm": 325.2473449707031, "learning_rate": 9.418264543386416e-09, "logits/chosen": -19.24104881286621, "logits/rejected": -18.366474151611328, "logps/chosen": -368.7205505371094, "logps/rejected": -407.45526123046875, "loss": 0.9642, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.124173164367676, "rewards/margins": 0.662867546081543, "rewards/rejected": 3.461306095123291, "step": 104360 }, { "epoch": 4.845628859278518, "grad_norm": 98.69873809814453, "learning_rate": 9.390408096940432e-09, "logits/chosen": -19.972576141357422, "logits/rejected": -18.409061431884766, "logps/chosen": -344.19793701171875, "logps/rejected": -267.1238708496094, "loss": 0.5515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9151782989501953, "rewards/margins": 1.4674837589263916, "rewards/rejected": 2.4476943016052246, "step": 104370 }, { "epoch": 4.846093133385951, 
"grad_norm": 0.44554030895233154, "learning_rate": 9.362551650494452e-09, "logits/chosen": -19.335765838623047, "logits/rejected": -18.57171058654785, "logps/chosen": -482.03668212890625, "logps/rejected": -398.8765563964844, "loss": 0.26, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.621478080749512, "rewards/margins": 2.6954569816589355, "rewards/rejected": 1.926020860671997, "step": 104380 }, { "epoch": 4.846557407493384, "grad_norm": 154.37942504882812, "learning_rate": 9.33469520404847e-09, "logits/chosen": -19.19396209716797, "logits/rejected": -17.939882278442383, "logps/chosen": -552.8973388671875, "logps/rejected": -346.80731201171875, "loss": 0.2768, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.898054599761963, "rewards/margins": 2.3491275310516357, "rewards/rejected": 1.5489269495010376, "step": 104390 }, { "epoch": 4.847021681600817, "grad_norm": 85.78471374511719, "learning_rate": 9.306838757602489e-09, "logits/chosen": -19.98115348815918, "logits/rejected": -20.28097152709961, "logps/chosen": -418.306884765625, "logps/rejected": -449.4574279785156, "loss": 0.9501, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.233264446258545, "rewards/margins": -0.04111982509493828, "rewards/rejected": 4.274383544921875, "step": 104400 }, { "epoch": 4.84748595570825, "grad_norm": 207.080322265625, "learning_rate": 9.278982311156507e-09, "logits/chosen": -19.327198028564453, "logits/rejected": -18.705020904541016, "logps/chosen": -437.5751037597656, "logps/rejected": -429.29754638671875, "loss": 0.5309, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.076106071472168, "rewards/margins": 0.7629905939102173, "rewards/rejected": 3.313114881515503, "step": 104410 }, { "epoch": 4.847950229815683, "grad_norm": 5.149695873260498, "learning_rate": 9.251125864710525e-09, "logits/chosen": -19.510730743408203, "logits/rejected": -17.929203033447266, "logps/chosen": -264.55328369140625, "logps/rejected": 
-188.28196716308594, "loss": 0.2804, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1005477905273438, "rewards/margins": 2.490490674972534, "rewards/rejected": 0.6100570559501648, "step": 104420 }, { "epoch": 4.848414503923117, "grad_norm": 244.0325164794922, "learning_rate": 9.223269418264542e-09, "logits/chosen": -19.1769962310791, "logits/rejected": -19.09147834777832, "logps/chosen": -313.2947692871094, "logps/rejected": -349.98895263671875, "loss": 1.1721, "rewards/accuracies": 0.5, "rewards/chosen": 2.8503565788269043, "rewards/margins": 0.2998359799385071, "rewards/rejected": 2.550520420074463, "step": 104430 }, { "epoch": 4.848878778030549, "grad_norm": 22.546449661254883, "learning_rate": 9.195412971818562e-09, "logits/chosen": -19.308673858642578, "logits/rejected": -17.88178825378418, "logps/chosen": -348.54949951171875, "logps/rejected": -241.9844512939453, "loss": 0.2665, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1916825771331787, "rewards/margins": 2.661921501159668, "rewards/rejected": 0.5297608375549316, "step": 104440 }, { "epoch": 4.849343052137982, "grad_norm": 36.79071807861328, "learning_rate": 9.16755652537258e-09, "logits/chosen": -18.003620147705078, "logits/rejected": -18.654224395751953, "logps/chosen": -267.10784912109375, "logps/rejected": -315.09954833984375, "loss": 0.8307, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.168240547180176, "rewards/margins": 0.2728734016418457, "rewards/rejected": 1.8953670263290405, "step": 104450 }, { "epoch": 4.849807326245415, "grad_norm": 10.544811248779297, "learning_rate": 9.139700078926598e-09, "logits/chosen": -18.449798583984375, "logits/rejected": -17.9141788482666, "logps/chosen": -383.660400390625, "logps/rejected": -314.51556396484375, "loss": 0.6958, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3765625953674316, "rewards/margins": 1.1295453310012817, "rewards/rejected": 2.2470173835754395, "step": 104460 }, { "epoch": 
4.8502716003528485, "grad_norm": 111.6297607421875, "learning_rate": 9.111843632480617e-09, "logits/chosen": -19.912755966186523, "logits/rejected": -18.927589416503906, "logps/chosen": -312.9543762207031, "logps/rejected": -314.39886474609375, "loss": 0.5507, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1988706588745117, "rewards/margins": 0.8610551953315735, "rewards/rejected": 1.337815523147583, "step": 104470 }, { "epoch": 4.850735874460281, "grad_norm": 38.918365478515625, "learning_rate": 9.083987186034633e-09, "logits/chosen": -18.645870208740234, "logits/rejected": -18.711210250854492, "logps/chosen": -249.5167694091797, "logps/rejected": -247.5252685546875, "loss": 0.9773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6074597835540771, "rewards/margins": 0.5975483655929565, "rewards/rejected": 1.0099114179611206, "step": 104480 }, { "epoch": 4.851200148567714, "grad_norm": 0.030014855787158012, "learning_rate": 9.056130739588653e-09, "logits/chosen": -20.108259201049805, "logits/rejected": -19.108287811279297, "logps/chosen": -374.55072021484375, "logps/rejected": -256.2652282714844, "loss": 0.3765, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.419110298156738, "rewards/margins": 2.3119778633117676, "rewards/rejected": 2.1071324348449707, "step": 104490 }, { "epoch": 4.851664422675148, "grad_norm": 116.51557922363281, "learning_rate": 9.02827429314267e-09, "logits/chosen": -20.021629333496094, "logits/rejected": -18.561004638671875, "logps/chosen": -438.2479553222656, "logps/rejected": -335.42559814453125, "loss": 0.2606, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.785515308380127, "rewards/margins": 2.184842109680176, "rewards/rejected": 2.600672960281372, "step": 104500 }, { "epoch": 4.8521286967825805, "grad_norm": 82.21214294433594, "learning_rate": 9.000417846696689e-09, "logits/chosen": -18.8533992767334, "logits/rejected": -18.557384490966797, "logps/chosen": -412.77984619140625, 
"logps/rejected": -289.6774597167969, "loss": 0.6544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.292948246002197, "rewards/margins": 1.9644172191619873, "rewards/rejected": 2.328530788421631, "step": 104510 }, { "epoch": 4.852592970890013, "grad_norm": 3.96774959564209, "learning_rate": 8.972561400250708e-09, "logits/chosen": -19.7972354888916, "logits/rejected": -18.425601959228516, "logps/chosen": -321.02001953125, "logps/rejected": -260.007080078125, "loss": 0.766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7291293144226074, "rewards/margins": 1.2478816509246826, "rewards/rejected": 1.4812475442886353, "step": 104520 }, { "epoch": 4.853057244997446, "grad_norm": 27.85364532470703, "learning_rate": 8.944704953804726e-09, "logits/chosen": -18.562299728393555, "logits/rejected": -18.56440544128418, "logps/chosen": -345.68255615234375, "logps/rejected": -346.8992614746094, "loss": 0.6188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7856385707855225, "rewards/margins": 0.5345649719238281, "rewards/rejected": 2.2510733604431152, "step": 104530 }, { "epoch": 4.85352151910488, "grad_norm": 32.29066848754883, "learning_rate": 8.916848507358744e-09, "logits/chosen": -18.648284912109375, "logits/rejected": -18.22614097595215, "logps/chosen": -328.7198486328125, "logps/rejected": -253.93896484375, "loss": 1.0629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.776118755340576, "rewards/margins": 1.123365044593811, "rewards/rejected": 1.6527538299560547, "step": 104540 }, { "epoch": 4.8539857932123125, "grad_norm": 230.387939453125, "learning_rate": 8.888992060912762e-09, "logits/chosen": -18.77699089050293, "logits/rejected": -18.17934226989746, "logps/chosen": -422.452392578125, "logps/rejected": -401.66070556640625, "loss": 1.415, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7548038959503174, "rewards/margins": 0.40989717841148376, "rewards/rejected": 3.3449065685272217, "step": 104550 }, 
{ "epoch": 4.854450067319745, "grad_norm": 130.74639892578125, "learning_rate": 8.861135614466781e-09, "logits/chosen": -20.41159439086914, "logits/rejected": -20.170400619506836, "logps/chosen": -459.10919189453125, "logps/rejected": -478.5023498535156, "loss": 0.8646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.390271186828613, "rewards/margins": 0.5395203828811646, "rewards/rejected": 3.8507513999938965, "step": 104560 }, { "epoch": 4.854914341427179, "grad_norm": 28.60757827758789, "learning_rate": 8.833279168020799e-09, "logits/chosen": -18.740802764892578, "logits/rejected": -17.66424560546875, "logps/chosen": -474.50518798828125, "logps/rejected": -362.649658203125, "loss": 0.5721, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.550562620162964, "rewards/margins": 0.8657127618789673, "rewards/rejected": 2.684849977493286, "step": 104570 }, { "epoch": 4.855378615534612, "grad_norm": 72.637451171875, "learning_rate": 8.805422721574817e-09, "logits/chosen": -19.607135772705078, "logits/rejected": -19.394573211669922, "logps/chosen": -551.231201171875, "logps/rejected": -445.6455078125, "loss": 0.8807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.331284046173096, "rewards/margins": 1.1525123119354248, "rewards/rejected": 4.178771018981934, "step": 104580 }, { "epoch": 4.8558428896420445, "grad_norm": 20.0360050201416, "learning_rate": 8.777566275128836e-09, "logits/chosen": -19.256689071655273, "logits/rejected": -18.070804595947266, "logps/chosen": -373.61981201171875, "logps/rejected": -290.34356689453125, "loss": 0.902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.064714193344116, "rewards/margins": 1.2816203832626343, "rewards/rejected": 1.7830936908721924, "step": 104590 }, { "epoch": 4.856307163749477, "grad_norm": 42.88118362426758, "learning_rate": 8.749709828682854e-09, "logits/chosen": -18.336729049682617, "logits/rejected": -17.651390075683594, "logps/chosen": -414.28448486328125, 
"logps/rejected": -314.0814514160156, "loss": 0.6478, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2740097045898438, "rewards/margins": 1.9524494409561157, "rewards/rejected": 1.3215596675872803, "step": 104600 }, { "epoch": 4.856771437856911, "grad_norm": 226.7032928466797, "learning_rate": 8.721853382236872e-09, "logits/chosen": -19.156570434570312, "logits/rejected": -18.23170280456543, "logps/chosen": -425.2464294433594, "logps/rejected": -318.2960510253906, "loss": 1.2332, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.634244918823242, "rewards/margins": 0.807562530040741, "rewards/rejected": 2.8266825675964355, "step": 104610 }, { "epoch": 4.857235711964344, "grad_norm": 190.6412353515625, "learning_rate": 8.693996935790891e-09, "logits/chosen": -18.56529426574707, "logits/rejected": -18.11483383178711, "logps/chosen": -283.71820068359375, "logps/rejected": -264.9897766113281, "loss": 1.1089, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.595931887626648, "rewards/margins": 0.20087237656116486, "rewards/rejected": 1.39505934715271, "step": 104620 }, { "epoch": 4.857699986071776, "grad_norm": 225.26876831054688, "learning_rate": 8.66614048934491e-09, "logits/chosen": -19.452192306518555, "logits/rejected": -18.640689849853516, "logps/chosen": -424.30535888671875, "logps/rejected": -340.65570068359375, "loss": 0.8427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.619619846343994, "rewards/margins": 1.6640475988388062, "rewards/rejected": 2.9555716514587402, "step": 104630 }, { "epoch": 4.85816426017921, "grad_norm": 11.375161170959473, "learning_rate": 8.638284042898927e-09, "logits/chosen": -18.99393081665039, "logits/rejected": -19.28957748413086, "logps/chosen": -342.36724853515625, "logps/rejected": -344.2049865722656, "loss": 1.7079, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.213395357131958, "rewards/margins": 0.2927168011665344, "rewards/rejected": 2.9206786155700684, "step": 
104640 }, { "epoch": 4.858628534286643, "grad_norm": 99.1910171508789, "learning_rate": 8.610427596452945e-09, "logits/chosen": -20.265214920043945, "logits/rejected": -20.08727264404297, "logps/chosen": -271.26263427734375, "logps/rejected": -319.343505859375, "loss": 0.8595, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.08544921875, "rewards/margins": -0.10728069394826889, "rewards/rejected": 3.192729949951172, "step": 104650 }, { "epoch": 4.859092808394076, "grad_norm": 41.21681594848633, "learning_rate": 8.582571150006964e-09, "logits/chosen": -18.540386199951172, "logits/rejected": -19.06414794921875, "logps/chosen": -417.2783203125, "logps/rejected": -408.7455749511719, "loss": 0.6869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1121764183044434, "rewards/margins": 0.8276666402816772, "rewards/rejected": 2.2845101356506348, "step": 104660 }, { "epoch": 4.859557082501508, "grad_norm": 67.31302642822266, "learning_rate": 8.554714703560982e-09, "logits/chosen": -19.0740909576416, "logits/rejected": -18.823162078857422, "logps/chosen": -499.9051818847656, "logps/rejected": -454.5897521972656, "loss": 0.3129, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.633814811706543, "rewards/margins": 1.4389806985855103, "rewards/rejected": 3.1948342323303223, "step": 104670 }, { "epoch": 4.860021356608942, "grad_norm": 272.4942626953125, "learning_rate": 8.526858257115e-09, "logits/chosen": -20.08415412902832, "logits/rejected": -19.32908058166504, "logps/chosen": -472.67742919921875, "logps/rejected": -433.1551818847656, "loss": 1.1931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.667227745056152, "rewards/margins": 0.4995102286338806, "rewards/rejected": 4.167717456817627, "step": 104680 }, { "epoch": 4.860485630716375, "grad_norm": 1.1816916465759277, "learning_rate": 8.499001810669018e-09, "logits/chosen": -18.786014556884766, "logits/rejected": -17.934551239013672, "logps/chosen": -494.5360412597656, 
"logps/rejected": -455.44195556640625, "loss": 0.5781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.475680351257324, "rewards/margins": 2.017033338546753, "rewards/rejected": 2.4586470127105713, "step": 104690 }, { "epoch": 4.860949904823808, "grad_norm": 0.12528091669082642, "learning_rate": 8.471145364223036e-09, "logits/chosen": -18.294374465942383, "logits/rejected": -18.58673858642578, "logps/chosen": -300.2687072753906, "logps/rejected": -344.0435485839844, "loss": 2.1258, "rewards/accuracies": 0.5, "rewards/chosen": 3.081441879272461, "rewards/margins": -0.30540376901626587, "rewards/rejected": 3.386845350265503, "step": 104700 }, { "epoch": 4.861414178931241, "grad_norm": 47.326473236083984, "learning_rate": 8.443288917777055e-09, "logits/chosen": -18.15957260131836, "logits/rejected": -18.78408432006836, "logps/chosen": -259.7828369140625, "logps/rejected": -349.5097351074219, "loss": 2.0748, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.371793746948242, "rewards/margins": -1.2630139589309692, "rewards/rejected": 3.634807586669922, "step": 104710 }, { "epoch": 4.861878453038674, "grad_norm": 209.11993408203125, "learning_rate": 8.415432471331073e-09, "logits/chosen": -19.335599899291992, "logits/rejected": -18.95656967163086, "logps/chosen": -434.897705078125, "logps/rejected": -394.00958251953125, "loss": 0.5801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.01301908493042, "rewards/margins": 0.8125143051147461, "rewards/rejected": 3.200505018234253, "step": 104720 }, { "epoch": 4.862342727146107, "grad_norm": 7.296746253967285, "learning_rate": 8.387576024885091e-09, "logits/chosen": -19.135513305664062, "logits/rejected": -18.139278411865234, "logps/chosen": -305.11175537109375, "logps/rejected": -216.79348754882812, "loss": 0.2186, "rewards/accuracies": 1.0, "rewards/chosen": 3.241779327392578, "rewards/margins": 2.449795961380005, "rewards/rejected": 0.7919834852218628, "step": 104730 }, { "epoch": 
4.86280700125354, "grad_norm": 138.14309692382812, "learning_rate": 8.35971957843911e-09, "logits/chosen": -19.7183780670166, "logits/rejected": -18.48940658569336, "logps/chosen": -451.75054931640625, "logps/rejected": -339.7420349121094, "loss": 1.4278, "rewards/accuracies": 0.5, "rewards/chosen": 3.6644349098205566, "rewards/margins": 0.714124321937561, "rewards/rejected": 2.950310230255127, "step": 104740 }, { "epoch": 4.863271275360973, "grad_norm": 1.2364938259124756, "learning_rate": 8.331863131993128e-09, "logits/chosen": -18.05252456665039, "logits/rejected": -17.242456436157227, "logps/chosen": -403.4250183105469, "logps/rejected": -316.77581787109375, "loss": 0.9774, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4714674949645996, "rewards/margins": 1.6824623346328735, "rewards/rejected": 1.7890050411224365, "step": 104750 }, { "epoch": 4.863735549468406, "grad_norm": 90.51407623291016, "learning_rate": 8.304006685547146e-09, "logits/chosen": -20.40289878845215, "logits/rejected": -19.6289005279541, "logps/chosen": -447.98931884765625, "logps/rejected": -387.8923034667969, "loss": 0.5504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.658140659332275, "rewards/margins": 1.1580336093902588, "rewards/rejected": 3.5001060962677, "step": 104760 }, { "epoch": 4.864199823575839, "grad_norm": 49.65109634399414, "learning_rate": 8.276150239101166e-09, "logits/chosen": -19.04505157470703, "logits/rejected": -17.946949005126953, "logps/chosen": -405.7286376953125, "logps/rejected": -297.46728515625, "loss": 0.5304, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0546517372131348, "rewards/margins": 1.3498191833496094, "rewards/rejected": 1.704832673072815, "step": 104770 }, { "epoch": 4.8646640976832725, "grad_norm": 1.8390333652496338, "learning_rate": 8.248293792655184e-09, "logits/chosen": -19.512794494628906, "logits/rejected": -18.7603702545166, "logps/chosen": -510.24041748046875, "logps/rejected": 
-450.1512145996094, "loss": 0.6934, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.693633079528809, "rewards/margins": 1.0577232837677002, "rewards/rejected": 3.6359105110168457, "step": 104780 }, { "epoch": 4.865128371790705, "grad_norm": 69.88972473144531, "learning_rate": 8.220437346209201e-09, "logits/chosen": -19.068267822265625, "logits/rejected": -19.142202377319336, "logps/chosen": -323.41839599609375, "logps/rejected": -343.88238525390625, "loss": 1.1201, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7818078994750977, "rewards/margins": 0.03830919414758682, "rewards/rejected": 2.7434990406036377, "step": 104790 }, { "epoch": 4.865592645898138, "grad_norm": 100.20359802246094, "learning_rate": 8.19258089976322e-09, "logits/chosen": -19.255393981933594, "logits/rejected": -18.465375900268555, "logps/chosen": -505.3948669433594, "logps/rejected": -364.48651123046875, "loss": 0.4024, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.529571056365967, "rewards/margins": 1.5578792095184326, "rewards/rejected": 2.9716920852661133, "step": 104800 }, { "epoch": 4.866056920005572, "grad_norm": 32.0314826965332, "learning_rate": 8.164724453317239e-09, "logits/chosen": -18.92603302001953, "logits/rejected": -18.769630432128906, "logps/chosen": -272.8373107910156, "logps/rejected": -273.06048583984375, "loss": 0.6168, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.428220510482788, "rewards/margins": 0.7901321649551392, "rewards/rejected": 0.6380882859230042, "step": 104810 }, { "epoch": 4.866521194113004, "grad_norm": 45.90009689331055, "learning_rate": 8.136868006871257e-09, "logits/chosen": -18.441293716430664, "logits/rejected": -17.98930549621582, "logps/chosen": -314.11151123046875, "logps/rejected": -269.4458312988281, "loss": 0.5673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5032765865325928, "rewards/margins": 1.768865942955017, "rewards/rejected": 0.7344108819961548, "step": 104820 }, { 
"epoch": 4.866985468220437, "grad_norm": 94.23467254638672, "learning_rate": 8.109011560425274e-09, "logits/chosen": -19.45330047607422, "logits/rejected": -19.7755069732666, "logps/chosen": -456.5093688964844, "logps/rejected": -461.6893615722656, "loss": 1.1425, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0864486694335938, "rewards/margins": -0.4230080544948578, "rewards/rejected": 3.5094571113586426, "step": 104830 }, { "epoch": 4.867449742327871, "grad_norm": 2.913959503173828, "learning_rate": 8.081155113979294e-09, "logits/chosen": -19.629575729370117, "logits/rejected": -17.78848648071289, "logps/chosen": -477.00445556640625, "logps/rejected": -283.50933837890625, "loss": 0.2206, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.507376194000244, "rewards/margins": 3.335303544998169, "rewards/rejected": 1.1720726490020752, "step": 104840 }, { "epoch": 4.867914016435304, "grad_norm": 126.28170013427734, "learning_rate": 8.05329866753331e-09, "logits/chosen": -19.452302932739258, "logits/rejected": -18.76788330078125, "logps/chosen": -355.58935546875, "logps/rejected": -349.1398620605469, "loss": 0.8093, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.24627685546875, "rewards/margins": 0.6604106426239014, "rewards/rejected": 2.5858664512634277, "step": 104850 }, { "epoch": 4.868378290542736, "grad_norm": 103.03642272949219, "learning_rate": 8.02544222108733e-09, "logits/chosen": -19.39649200439453, "logits/rejected": -18.562997817993164, "logps/chosen": -365.0350036621094, "logps/rejected": -298.3086853027344, "loss": 0.4097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.680842161178589, "rewards/margins": 1.7718760967254639, "rewards/rejected": 1.908966064453125, "step": 104860 }, { "epoch": 4.868842564650169, "grad_norm": 34.976600646972656, "learning_rate": 7.997585774641349e-09, "logits/chosen": -19.48236656188965, "logits/rejected": -18.161094665527344, "logps/chosen": -304.77618408203125, 
"logps/rejected": -199.63107299804688, "loss": 0.2933, "rewards/accuracies": 1.0, "rewards/chosen": 2.9467926025390625, "rewards/margins": 2.060368776321411, "rewards/rejected": 0.8864237070083618, "step": 104870 }, { "epoch": 4.869306838757603, "grad_norm": 24.20262908935547, "learning_rate": 7.969729328195365e-09, "logits/chosen": -18.480266571044922, "logits/rejected": -17.27488899230957, "logps/chosen": -413.2290954589844, "logps/rejected": -274.2573547363281, "loss": 0.3898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.39073371887207, "rewards/margins": 2.853038787841797, "rewards/rejected": 1.5376951694488525, "step": 104880 }, { "epoch": 4.869771112865036, "grad_norm": 137.16127014160156, "learning_rate": 7.941872881749385e-09, "logits/chosen": -18.530244827270508, "logits/rejected": -18.261404037475586, "logps/chosen": -295.4600524902344, "logps/rejected": -266.0697937011719, "loss": 0.8722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8070242404937744, "rewards/margins": 0.4435787796974182, "rewards/rejected": 1.363445520401001, "step": 104890 }, { "epoch": 4.870235386972468, "grad_norm": 37.547489166259766, "learning_rate": 7.914016435303403e-09, "logits/chosen": -19.323022842407227, "logits/rejected": -18.885578155517578, "logps/chosen": -344.91558837890625, "logps/rejected": -294.9709167480469, "loss": 0.4306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.869379758834839, "rewards/margins": 1.393525242805481, "rewards/rejected": 1.4758546352386475, "step": 104900 }, { "epoch": 4.870699661079902, "grad_norm": 57.58405685424805, "learning_rate": 7.88615998885742e-09, "logits/chosen": -19.734416961669922, "logits/rejected": -18.98133659362793, "logps/chosen": -425.6490783691406, "logps/rejected": -390.1399230957031, "loss": 0.9827, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.218722105026245, "rewards/margins": 0.21695904433727264, "rewards/rejected": 3.001762866973877, "step": 104910 }, { 
"epoch": 4.871163935187335, "grad_norm": 5.637880802154541, "learning_rate": 7.85830354241144e-09, "logits/chosen": -19.884706497192383, "logits/rejected": -19.769899368286133, "logps/chosen": -395.867431640625, "logps/rejected": -348.28143310546875, "loss": 0.6986, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0891053676605225, "rewards/margins": 0.951668381690979, "rewards/rejected": 2.137437343597412, "step": 104920 }, { "epoch": 4.871628209294768, "grad_norm": 329.645263671875, "learning_rate": 7.830447095965458e-09, "logits/chosen": -18.935775756835938, "logits/rejected": -17.839885711669922, "logps/chosen": -465.247314453125, "logps/rejected": -365.6248474121094, "loss": 0.4231, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.255087852478027, "rewards/margins": 2.130314350128174, "rewards/rejected": 2.1247735023498535, "step": 104930 }, { "epoch": 4.8720924834022, "grad_norm": 54.92684555053711, "learning_rate": 7.802590649519476e-09, "logits/chosen": -19.99282455444336, "logits/rejected": -19.038005828857422, "logps/chosen": -423.96002197265625, "logps/rejected": -366.0341796875, "loss": 0.4608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.503387928009033, "rewards/margins": 1.7192500829696655, "rewards/rejected": 2.78413724899292, "step": 104940 }, { "epoch": 4.872556757509634, "grad_norm": 78.51773834228516, "learning_rate": 7.774734203073494e-09, "logits/chosen": -18.80695343017578, "logits/rejected": -18.097793579101562, "logps/chosen": -280.71795654296875, "logps/rejected": -224.9883270263672, "loss": 0.4478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9917428493499756, "rewards/margins": 1.0711621046066284, "rewards/rejected": 0.9205808639526367, "step": 104950 }, { "epoch": 4.873021031617067, "grad_norm": 12.381976127624512, "learning_rate": 7.746877756627513e-09, "logits/chosen": -19.444988250732422, "logits/rejected": -18.875389099121094, "logps/chosen": -381.82904052734375, 
"logps/rejected": -339.302734375, "loss": 0.7581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.826476573944092, "rewards/margins": 1.1519845724105835, "rewards/rejected": 2.674492359161377, "step": 104960 }, { "epoch": 4.8734853057245, "grad_norm": 0.17579691112041473, "learning_rate": 7.719021310181531e-09, "logits/chosen": -18.912599563598633, "logits/rejected": -17.68818473815918, "logps/chosen": -394.5362854003906, "logps/rejected": -258.08172607421875, "loss": 1.0664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.204576015472412, "rewards/margins": 1.687445878982544, "rewards/rejected": 1.5171306133270264, "step": 104970 }, { "epoch": 4.873949579831933, "grad_norm": 72.41505432128906, "learning_rate": 7.691164863735549e-09, "logits/chosen": -18.195993423461914, "logits/rejected": -17.841938018798828, "logps/chosen": -366.15716552734375, "logps/rejected": -343.27862548828125, "loss": 1.4903, "rewards/accuracies": 0.5, "rewards/chosen": 3.169076442718506, "rewards/margins": 0.8078473806381226, "rewards/rejected": 2.361229181289673, "step": 104980 }, { "epoch": 4.874413853939366, "grad_norm": 10.61536979675293, "learning_rate": 7.663308417289568e-09, "logits/chosen": -20.121597290039062, "logits/rejected": -19.111026763916016, "logps/chosen": -413.4061584472656, "logps/rejected": -316.79229736328125, "loss": 0.5681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.2115864753723145, "rewards/margins": 2.9517393112182617, "rewards/rejected": 2.2598471641540527, "step": 104990 }, { "epoch": 4.874878128046799, "grad_norm": 46.965843200683594, "learning_rate": 7.635451970843586e-09, "logits/chosen": -18.84054183959961, "logits/rejected": -18.951297760009766, "logps/chosen": -303.57159423828125, "logps/rejected": -337.5380554199219, "loss": 1.0553, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.273693084716797, "rewards/margins": -0.120195172727108, "rewards/rejected": 2.393887996673584, "step": 105000 }, { 
"epoch": 4.8753424021542315, "grad_norm": 0.6436887979507446, "learning_rate": 7.607595524397604e-09, "logits/chosen": -18.346467971801758, "logits/rejected": -17.958894729614258, "logps/chosen": -326.8791198730469, "logps/rejected": -306.664306640625, "loss": 0.8161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.011265277862549, "rewards/margins": 0.8067000508308411, "rewards/rejected": 1.2045652866363525, "step": 105010 }, { "epoch": 4.875806676261665, "grad_norm": 159.10467529296875, "learning_rate": 7.579739077951622e-09, "logits/chosen": -18.798738479614258, "logits/rejected": -18.490514755249023, "logps/chosen": -405.27655029296875, "logps/rejected": -358.1882629394531, "loss": 0.6572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1293907165527344, "rewards/margins": 1.162187933921814, "rewards/rejected": 1.9672024250030518, "step": 105020 }, { "epoch": 4.876270950369098, "grad_norm": 244.3288116455078, "learning_rate": 7.551882631505641e-09, "logits/chosen": -18.94583511352539, "logits/rejected": -20.08778953552246, "logps/chosen": -308.6053771972656, "logps/rejected": -420.7084045410156, "loss": 1.5938, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.508186101913452, "rewards/margins": -0.8327435255050659, "rewards/rejected": 3.3409297466278076, "step": 105030 }, { "epoch": 4.876735224476531, "grad_norm": 32.254600524902344, "learning_rate": 7.524026185059659e-09, "logits/chosen": -18.889583587646484, "logits/rejected": -18.396148681640625, "logps/chosen": -374.491943359375, "logps/rejected": -312.6405944824219, "loss": 0.8221, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.307842254638672, "rewards/margins": 1.490416169166565, "rewards/rejected": 1.817426323890686, "step": 105040 }, { "epoch": 4.877199498583964, "grad_norm": 0.003152282675728202, "learning_rate": 7.496169738613677e-09, "logits/chosen": -19.74250602722168, "logits/rejected": -19.141204833984375, "logps/chosen": -426.2618103027344, 
"logps/rejected": -383.10015869140625, "loss": 0.348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.961733341217041, "rewards/margins": 2.542410373687744, "rewards/rejected": 2.419322967529297, "step": 105050 }, { "epoch": 4.877663772691397, "grad_norm": 84.62781524658203, "learning_rate": 7.468313292167695e-09, "logits/chosen": -19.38345718383789, "logits/rejected": -18.81570816040039, "logps/chosen": -355.482421875, "logps/rejected": -272.0403747558594, "loss": 0.3452, "rewards/accuracies": 1.0, "rewards/chosen": 3.059945821762085, "rewards/margins": 1.3556500673294067, "rewards/rejected": 1.7042958736419678, "step": 105060 }, { "epoch": 4.87812804679883, "grad_norm": 0.06634466350078583, "learning_rate": 7.4404568457217135e-09, "logits/chosen": -19.206233978271484, "logits/rejected": -17.492279052734375, "logps/chosen": -432.3714904785156, "logps/rejected": -250.72799682617188, "loss": 0.4496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5055859088897705, "rewards/margins": 2.7238011360168457, "rewards/rejected": 0.7817845344543457, "step": 105070 }, { "epoch": 4.878592320906263, "grad_norm": 15.887876510620117, "learning_rate": 7.412600399275732e-09, "logits/chosen": -19.49679946899414, "logits/rejected": -18.698482513427734, "logps/chosen": -351.79693603515625, "logps/rejected": -277.1446228027344, "loss": 0.4212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.709235668182373, "rewards/margins": 2.252480983734131, "rewards/rejected": 1.4567549228668213, "step": 105080 }, { "epoch": 4.879056595013696, "grad_norm": 61.353668212890625, "learning_rate": 7.384743952829751e-09, "logits/chosen": -18.92905616760254, "logits/rejected": -18.164342880249023, "logps/chosen": -391.38702392578125, "logps/rejected": -341.3816223144531, "loss": 0.8636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.526959180831909, "rewards/margins": 0.6025275588035583, "rewards/rejected": 2.924431324005127, "step": 105090 }, { 
"epoch": 4.879520869121129, "grad_norm": 105.6778793334961, "learning_rate": 7.356887506383769e-09, "logits/chosen": -18.998371124267578, "logits/rejected": -18.649799346923828, "logps/chosen": -308.37823486328125, "logps/rejected": -312.4455261230469, "loss": 0.9573, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5331056118011475, "rewards/margins": 0.13550327718257904, "rewards/rejected": 2.3976025581359863, "step": 105100 }, { "epoch": 4.879985143228562, "grad_norm": 10.956999778747559, "learning_rate": 7.329031059937787e-09, "logits/chosen": -19.31582260131836, "logits/rejected": -18.221729278564453, "logps/chosen": -434.18682861328125, "logps/rejected": -327.60247802734375, "loss": 0.6689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.882375717163086, "rewards/margins": 1.8650602102279663, "rewards/rejected": 2.017315626144409, "step": 105110 }, { "epoch": 4.880449417335996, "grad_norm": 51.498966217041016, "learning_rate": 7.301174613491804e-09, "logits/chosen": -18.57515525817871, "logits/rejected": -17.62516975402832, "logps/chosen": -486.21026611328125, "logps/rejected": -416.7535705566406, "loss": 0.5398, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1404547691345215, "rewards/margins": 1.7724406719207764, "rewards/rejected": 1.3680139780044556, "step": 105120 }, { "epoch": 4.880913691443428, "grad_norm": 221.96241760253906, "learning_rate": 7.273318167045824e-09, "logits/chosen": -18.172027587890625, "logits/rejected": -17.446706771850586, "logps/chosen": -309.6239013671875, "logps/rejected": -235.4556121826172, "loss": 1.1302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.745164155960083, "rewards/margins": 0.4507996141910553, "rewards/rejected": 1.2943646907806396, "step": 105130 }, { "epoch": 4.881377965550861, "grad_norm": 5.403036594390869, "learning_rate": 7.2454617205998426e-09, "logits/chosen": -18.588092803955078, "logits/rejected": -18.090343475341797, "logps/chosen": 
-405.7712097167969, "logps/rejected": -304.17120361328125, "loss": 0.7108, "rewards/accuracies": 0.5, "rewards/chosen": 3.1824004650115967, "rewards/margins": 0.9493333101272583, "rewards/rejected": 2.233067035675049, "step": 105140 }, { "epoch": 4.881842239658294, "grad_norm": 43.4757194519043, "learning_rate": 7.2176052741538596e-09, "logits/chosen": -19.211780548095703, "logits/rejected": -18.850231170654297, "logps/chosen": -464.5835876464844, "logps/rejected": -442.8968200683594, "loss": 0.8043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.9097466468811035, "rewards/margins": 1.4203064441680908, "rewards/rejected": 3.489440441131592, "step": 105150 }, { "epoch": 4.8823065137657276, "grad_norm": 106.84761810302734, "learning_rate": 7.189748827707878e-09, "logits/chosen": -19.567968368530273, "logits/rejected": -18.755769729614258, "logps/chosen": -423.31256103515625, "logps/rejected": -355.72833251953125, "loss": 0.6766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.321633815765381, "rewards/margins": 2.1234288215637207, "rewards/rejected": 3.19820499420166, "step": 105160 }, { "epoch": 4.88277078787316, "grad_norm": 47.032958984375, "learning_rate": 7.161892381261896e-09, "logits/chosen": -19.846721649169922, "logits/rejected": -19.310239791870117, "logps/chosen": -480.40447998046875, "logps/rejected": -382.32452392578125, "loss": 0.8293, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.13850736618042, "rewards/margins": 0.7819744348526001, "rewards/rejected": 2.356532573699951, "step": 105170 }, { "epoch": 4.883235061980593, "grad_norm": 26.977014541625977, "learning_rate": 7.134035934815915e-09, "logits/chosen": -19.310081481933594, "logits/rejected": -18.1658878326416, "logps/chosen": -432.90155029296875, "logps/rejected": -316.78302001953125, "loss": 0.6765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.154751300811768, "rewards/margins": 1.4236968755722046, "rewards/rejected": 2.7310543060302734, 
"step": 105180 }, { "epoch": 4.883699336088027, "grad_norm": 161.6255645751953, "learning_rate": 7.1061794883699334e-09, "logits/chosen": -19.05394744873047, "logits/rejected": -19.481237411499023, "logps/chosen": -356.3622741699219, "logps/rejected": -406.8377685546875, "loss": 1.0393, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.469484806060791, "rewards/margins": -0.01525813341140747, "rewards/rejected": 3.4847426414489746, "step": 105190 }, { "epoch": 4.8841636101954595, "grad_norm": 6.23218297958374, "learning_rate": 7.078323041923951e-09, "logits/chosen": -19.86496353149414, "logits/rejected": -18.536781311035156, "logps/chosen": -426.9901428222656, "logps/rejected": -329.76104736328125, "loss": 0.9936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6218743324279785, "rewards/margins": 1.4492844343185425, "rewards/rejected": 2.1725897789001465, "step": 105200 }, { "epoch": 4.884627884302892, "grad_norm": 110.17256927490234, "learning_rate": 7.05046659547797e-09, "logits/chosen": -18.80810546875, "logits/rejected": -18.770721435546875, "logps/chosen": -407.0056457519531, "logps/rejected": -307.62750244140625, "loss": 0.6343, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.172882556915283, "rewards/margins": 0.8221433758735657, "rewards/rejected": 2.350739002227783, "step": 105210 }, { "epoch": 4.885092158410325, "grad_norm": 95.14973449707031, "learning_rate": 7.022610149031988e-09, "logits/chosen": -19.029836654663086, "logits/rejected": -18.502716064453125, "logps/chosen": -357.69842529296875, "logps/rejected": -341.1549987792969, "loss": 0.784, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1628832817077637, "rewards/margins": 0.9479893445968628, "rewards/rejected": 2.2148935794830322, "step": 105220 }, { "epoch": 4.885556432517759, "grad_norm": 93.77677154541016, "learning_rate": 6.9947537025860065e-09, "logits/chosen": -18.6021728515625, "logits/rejected": -18.012794494628906, "logps/chosen": 
-396.5498046875, "logps/rejected": -338.0314636230469, "loss": 0.7451, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.698974132537842, "rewards/margins": 1.0199204683303833, "rewards/rejected": 1.6790539026260376, "step": 105230 }, { "epoch": 4.8860207066251915, "grad_norm": 2.7791459560394287, "learning_rate": 6.966897256140025e-09, "logits/chosen": -19.2802791595459, "logits/rejected": -19.29705238342285, "logps/chosen": -326.14959716796875, "logps/rejected": -310.48284912109375, "loss": 0.8285, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.848006248474121, "rewards/margins": 0.45471224188804626, "rewards/rejected": 2.393294095993042, "step": 105240 }, { "epoch": 4.886484980732624, "grad_norm": 156.532470703125, "learning_rate": 6.939040809694043e-09, "logits/chosen": -18.832324981689453, "logits/rejected": -18.093177795410156, "logps/chosen": -357.18157958984375, "logps/rejected": -345.4703063964844, "loss": 0.8372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2542717456817627, "rewards/margins": 1.0101112127304077, "rewards/rejected": 1.2441602945327759, "step": 105250 }, { "epoch": 4.886949254840058, "grad_norm": 94.9797592163086, "learning_rate": 6.911184363248062e-09, "logits/chosen": -18.40603256225586, "logits/rejected": -18.27228546142578, "logps/chosen": -356.11016845703125, "logps/rejected": -331.9550476074219, "loss": 1.036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.480271100997925, "rewards/margins": 0.5933412313461304, "rewards/rejected": 1.8869298696517944, "step": 105260 }, { "epoch": 4.887413528947491, "grad_norm": 0.026638906449079514, "learning_rate": 6.8833279168020795e-09, "logits/chosen": -19.157514572143555, "logits/rejected": -17.773710250854492, "logps/chosen": -381.4060363769531, "logps/rejected": -248.37728881835938, "loss": 0.945, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8341617584228516, "rewards/margins": 1.7141536474227905, "rewards/rejected": 
1.1200082302093506, "step": 105270 }, { "epoch": 4.8878778030549235, "grad_norm": 270.6261901855469, "learning_rate": 6.855471470356098e-09, "logits/chosen": -18.956409454345703, "logits/rejected": -17.74433708190918, "logps/chosen": -424.4808654785156, "logps/rejected": -250.8268280029297, "loss": 0.7718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.280846118927002, "rewards/margins": 1.382934808731079, "rewards/rejected": 1.8979108333587646, "step": 105280 }, { "epoch": 4.888342077162356, "grad_norm": 133.0601348876953, "learning_rate": 6.827615023910117e-09, "logits/chosen": -19.125356674194336, "logits/rejected": -18.028242111206055, "logps/chosen": -341.33392333984375, "logps/rejected": -331.88421630859375, "loss": 0.8154, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8838255405426025, "rewards/margins": 1.5012537240982056, "rewards/rejected": 1.382571816444397, "step": 105290 }, { "epoch": 4.88880635126979, "grad_norm": 73.61956024169922, "learning_rate": 6.799758577464135e-09, "logits/chosen": -18.43825340270996, "logits/rejected": -18.031234741210938, "logps/chosen": -270.8459167480469, "logps/rejected": -272.19378662109375, "loss": 1.2709, "rewards/accuracies": 0.5, "rewards/chosen": 2.387755870819092, "rewards/margins": 0.27171212434768677, "rewards/rejected": 2.1160433292388916, "step": 105300 }, { "epoch": 4.889270625377223, "grad_norm": 91.7934799194336, "learning_rate": 6.771902131018153e-09, "logits/chosen": -19.035249710083008, "logits/rejected": -18.740671157836914, "logps/chosen": -363.4992980957031, "logps/rejected": -320.80694580078125, "loss": 0.5897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9636902809143066, "rewards/margins": 0.9723526239395142, "rewards/rejected": 1.9913381338119507, "step": 105310 }, { "epoch": 4.8897348994846555, "grad_norm": 40.352046966552734, "learning_rate": 6.74404568457217e-09, "logits/chosen": -18.98797035217285, "logits/rejected": -17.933610916137695, 
"logps/chosen": -363.707275390625, "logps/rejected": -256.8927001953125, "loss": 0.944, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9362924098968506, "rewards/margins": 0.950616717338562, "rewards/rejected": 1.9856754541397095, "step": 105320 }, { "epoch": 4.890199173592089, "grad_norm": 73.1723403930664, "learning_rate": 6.716189238126189e-09, "logits/chosen": -19.090206146240234, "logits/rejected": -18.513124465942383, "logps/chosen": -444.798095703125, "logps/rejected": -402.27838134765625, "loss": 0.4221, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.585772752761841, "rewards/margins": 1.7687995433807373, "rewards/rejected": 1.816973328590393, "step": 105330 }, { "epoch": 4.890663447699522, "grad_norm": 0.7584170699119568, "learning_rate": 6.6883327916802086e-09, "logits/chosen": -19.997554779052734, "logits/rejected": -17.76529312133789, "logps/chosen": -485.6549377441406, "logps/rejected": -293.9249572753906, "loss": 0.5401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.043003559112549, "rewards/margins": 3.1432595252990723, "rewards/rejected": 1.8997443914413452, "step": 105340 }, { "epoch": 4.891127721806955, "grad_norm": 80.34156036376953, "learning_rate": 6.660476345234226e-09, "logits/chosen": -19.202672958374023, "logits/rejected": -18.240005493164062, "logps/chosen": -433.59381103515625, "logps/rejected": -368.6165466308594, "loss": 0.4742, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.462807655334473, "rewards/margins": 2.040358304977417, "rewards/rejected": 2.4224488735198975, "step": 105350 }, { "epoch": 4.891591995914387, "grad_norm": 8.202531814575195, "learning_rate": 6.632619898788244e-09, "logits/chosen": -19.150306701660156, "logits/rejected": -18.659194946289062, "logps/chosen": -367.38751220703125, "logps/rejected": -307.9078063964844, "loss": 1.0015, "rewards/accuracies": 0.5, "rewards/chosen": 2.4511914253234863, "rewards/margins": 0.5138231515884399, "rewards/rejected": 
1.9373681545257568, "step": 105360 }, { "epoch": 4.892056270021821, "grad_norm": 233.67662048339844, "learning_rate": 6.604763452342262e-09, "logits/chosen": -18.975971221923828, "logits/rejected": -18.679426193237305, "logps/chosen": -359.631103515625, "logps/rejected": -334.4809265136719, "loss": 1.2097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.452868700027466, "rewards/margins": 0.7093092203140259, "rewards/rejected": 1.74355947971344, "step": 105370 }, { "epoch": 4.892520544129254, "grad_norm": 14.737022399902344, "learning_rate": 6.576907005896281e-09, "logits/chosen": -19.0389347076416, "logits/rejected": -18.979007720947266, "logps/chosen": -334.6864318847656, "logps/rejected": -350.49200439453125, "loss": 0.9681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9625463485717773, "rewards/margins": 0.290688693523407, "rewards/rejected": 2.6718575954437256, "step": 105380 }, { "epoch": 4.892984818236687, "grad_norm": 23.65949249267578, "learning_rate": 6.549050559450299e-09, "logits/chosen": -19.08715057373047, "logits/rejected": -18.617509841918945, "logps/chosen": -382.2731018066406, "logps/rejected": -329.90380859375, "loss": 0.6268, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5615406036376953, "rewards/margins": 1.5950101613998413, "rewards/rejected": 1.9665305614471436, "step": 105390 }, { "epoch": 4.89344909234412, "grad_norm": 7.322986125946045, "learning_rate": 6.521194113004317e-09, "logits/chosen": -19.49941635131836, "logits/rejected": -17.462472915649414, "logps/chosen": -406.55340576171875, "logps/rejected": -231.70419311523438, "loss": 0.2512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6528515815734863, "rewards/margins": 3.2504210472106934, "rewards/rejected": 0.4024302363395691, "step": 105400 }, { "epoch": 4.893913366451553, "grad_norm": 93.9773941040039, "learning_rate": 6.493337666558336e-09, "logits/chosen": -19.97732925415039, "logits/rejected": -19.287086486816406, 
"logps/chosen": -388.4584045410156, "logps/rejected": -331.36529541015625, "loss": 0.5535, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.366652250289917, "rewards/margins": 1.1947786808013916, "rewards/rejected": 2.1718738079071045, "step": 105410 }, { "epoch": 4.894377640558986, "grad_norm": 168.13555908203125, "learning_rate": 6.465481220112354e-09, "logits/chosen": -19.22012710571289, "logits/rejected": -17.587100982666016, "logps/chosen": -369.3387145996094, "logps/rejected": -258.3365173339844, "loss": 0.8994, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8818118572235107, "rewards/margins": 2.4038915634155273, "rewards/rejected": 1.4779205322265625, "step": 105420 }, { "epoch": 4.894841914666419, "grad_norm": 106.34661865234375, "learning_rate": 6.4376247736663725e-09, "logits/chosen": -19.286968231201172, "logits/rejected": -18.74686622619629, "logps/chosen": -334.5728454589844, "logps/rejected": -298.49102783203125, "loss": 0.5215, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.616258144378662, "rewards/margins": 1.617415189743042, "rewards/rejected": 1.9988428354263306, "step": 105430 }, { "epoch": 4.895306188773852, "grad_norm": 0.02235693857073784, "learning_rate": 6.40976832722039e-09, "logits/chosen": -20.029434204101562, "logits/rejected": -18.474071502685547, "logps/chosen": -422.7864685058594, "logps/rejected": -287.66900634765625, "loss": 0.5661, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8295276165008545, "rewards/margins": 2.280034303665161, "rewards/rejected": 1.5494939088821411, "step": 105440 }, { "epoch": 4.895770462881285, "grad_norm": 116.71409606933594, "learning_rate": 6.381911880774409e-09, "logits/chosen": -18.563690185546875, "logits/rejected": -17.65329360961914, "logps/chosen": -401.02734375, "logps/rejected": -343.2996520996094, "loss": 0.3764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.4273600578308105, "rewards/margins": 1.9383125305175781, 
"rewards/rejected": 2.4890475273132324, "step": 105450 }, { "epoch": 4.896234736988718, "grad_norm": 51.523441314697266, "learning_rate": 6.354055434328428e-09, "logits/chosen": -20.448583602905273, "logits/rejected": -18.42586898803711, "logps/chosen": -369.354736328125, "logps/rejected": -210.4259033203125, "loss": 0.2458, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.912969589233398, "rewards/margins": 3.7105770111083984, "rewards/rejected": 1.202392339706421, "step": 105460 }, { "epoch": 4.8966990110961515, "grad_norm": 97.20824432373047, "learning_rate": 6.3261989878824455e-09, "logits/chosen": -19.376745223999023, "logits/rejected": -17.866289138793945, "logps/chosen": -525.3159790039062, "logps/rejected": -385.43023681640625, "loss": 0.3901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.134469509124756, "rewards/margins": 2.6157164573669434, "rewards/rejected": 1.518753170967102, "step": 105470 }, { "epoch": 4.897163285203584, "grad_norm": 99.90867614746094, "learning_rate": 6.298342541436464e-09, "logits/chosen": -19.5413761138916, "logits/rejected": -18.966598510742188, "logps/chosen": -459.220458984375, "logps/rejected": -363.43450927734375, "loss": 0.5725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4484004974365234, "rewards/margins": 0.5794556140899658, "rewards/rejected": 2.8689451217651367, "step": 105480 }, { "epoch": 4.897627559311017, "grad_norm": 7.4850921630859375, "learning_rate": 6.270486094990481e-09, "logits/chosen": -19.037988662719727, "logits/rejected": -17.062114715576172, "logps/chosen": -362.9877624511719, "logps/rejected": -189.6659393310547, "loss": 0.229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.08925199508667, "rewards/margins": 3.481365919113159, "rewards/rejected": -0.39211368560791016, "step": 105490 }, { "epoch": 4.89809183341845, "grad_norm": 17.251707077026367, "learning_rate": 6.242629648544501e-09, "logits/chosen": -19.482891082763672, "logits/rejected": 
-19.12074851989746, "logps/chosen": -400.51580810546875, "logps/rejected": -308.7181396484375, "loss": 0.3354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3097198009490967, "rewards/margins": 1.7857122421264648, "rewards/rejected": 1.5240073204040527, "step": 105500 }, { "epoch": 4.898556107525883, "grad_norm": 78.33822631835938, "learning_rate": 6.214773202098519e-09, "logits/chosen": -18.806591033935547, "logits/rejected": -18.938335418701172, "logps/chosen": -425.04486083984375, "logps/rejected": -446.26519775390625, "loss": 1.2264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7635626792907715, "rewards/margins": 0.20527341961860657, "rewards/rejected": 3.5582892894744873, "step": 105510 }, { "epoch": 4.899020381633316, "grad_norm": 39.468570709228516, "learning_rate": 6.186916755652536e-09, "logits/chosen": -18.804054260253906, "logits/rejected": -17.957120895385742, "logps/chosen": -394.82525634765625, "logps/rejected": -271.2740783691406, "loss": 0.3459, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1890835762023926, "rewards/margins": 1.7915070056915283, "rewards/rejected": 1.397576928138733, "step": 105520 }, { "epoch": 4.899484655740749, "grad_norm": 65.56525421142578, "learning_rate": 6.159060309206555e-09, "logits/chosen": -18.809497833251953, "logits/rejected": -18.098194122314453, "logps/chosen": -507.65789794921875, "logps/rejected": -378.7494201660156, "loss": 0.5182, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7178397178649902, "rewards/margins": 1.3601101636886597, "rewards/rejected": 2.35772967338562, "step": 105530 }, { "epoch": 4.899948929848183, "grad_norm": 1.2912235260009766, "learning_rate": 6.131203862760573e-09, "logits/chosen": -19.464265823364258, "logits/rejected": -18.58565902709961, "logps/chosen": -355.328369140625, "logps/rejected": -266.04302978515625, "loss": 0.3577, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1179428100585938, "rewards/margins": 
2.066681146621704, "rewards/rejected": 1.0512616634368896, "step": 105540 }, { "epoch": 4.900413203955615, "grad_norm": 3.588839054107666, "learning_rate": 6.103347416314592e-09, "logits/chosen": -19.206329345703125, "logits/rejected": -18.405628204345703, "logps/chosen": -459.4141540527344, "logps/rejected": -402.2716064453125, "loss": 0.7828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.305156230926514, "rewards/margins": 1.7804231643676758, "rewards/rejected": 2.524733066558838, "step": 105550 }, { "epoch": 4.900877478063048, "grad_norm": 63.67308807373047, "learning_rate": 6.07549096986861e-09, "logits/chosen": -18.273231506347656, "logits/rejected": -18.064287185668945, "logps/chosen": -537.0814208984375, "logps/rejected": -459.3717346191406, "loss": 0.7186, "rewards/accuracies": 0.5, "rewards/chosen": 5.110654830932617, "rewards/margins": 1.0306142568588257, "rewards/rejected": 4.080040454864502, "step": 105560 }, { "epoch": 4.901341752170481, "grad_norm": 10.742388725280762, "learning_rate": 6.047634523422628e-09, "logits/chosen": -19.128686904907227, "logits/rejected": -19.025127410888672, "logps/chosen": -297.8824462890625, "logps/rejected": -226.99197387695312, "loss": 0.8004, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.059580087661743, "rewards/margins": 0.8030886650085449, "rewards/rejected": 1.2564914226531982, "step": 105570 }, { "epoch": 4.901806026277915, "grad_norm": 144.6060333251953, "learning_rate": 6.019778076976647e-09, "logits/chosen": -18.329288482666016, "logits/rejected": -17.463315963745117, "logps/chosen": -369.3853759765625, "logps/rejected": -279.3589172363281, "loss": 0.7092, "rewards/accuracies": 0.5, "rewards/chosen": 1.7761681079864502, "rewards/margins": 1.0901224613189697, "rewards/rejected": 0.6860455274581909, "step": 105580 }, { "epoch": 4.902270300385347, "grad_norm": 62.14728927612305, "learning_rate": 5.991921630530665e-09, "logits/chosen": -18.977344512939453, "logits/rejected": 
-17.988271713256836, "logps/chosen": -425.5376892089844, "logps/rejected": -301.2617492675781, "loss": 0.4395, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.520477056503296, "rewards/margins": 1.0736315250396729, "rewards/rejected": 1.4468456506729126, "step": 105590 }, { "epoch": 4.90273457449278, "grad_norm": 40.01555633544922, "learning_rate": 5.964065184084683e-09, "logits/chosen": -19.32040786743164, "logits/rejected": -18.06610679626465, "logps/chosen": -452.48968505859375, "logps/rejected": -328.48565673828125, "loss": 0.1794, "rewards/accuracies": 1.0, "rewards/chosen": 5.260941982269287, "rewards/margins": 2.419340133666992, "rewards/rejected": 2.841602087020874, "step": 105600 }, { "epoch": 4.903198848600214, "grad_norm": 37.888038635253906, "learning_rate": 5.936208737638702e-09, "logits/chosen": -18.41057777404785, "logits/rejected": -17.468299865722656, "logps/chosen": -375.44647216796875, "logps/rejected": -245.404541015625, "loss": 0.5231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5969791412353516, "rewards/margins": 1.9039100408554077, "rewards/rejected": 1.6930692195892334, "step": 105610 }, { "epoch": 4.903663122707647, "grad_norm": 8.830299377441406, "learning_rate": 5.90835229119272e-09, "logits/chosen": -18.83713150024414, "logits/rejected": -17.98263931274414, "logps/chosen": -480.1580505371094, "logps/rejected": -374.729248046875, "loss": 0.8552, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.030165195465088, "rewards/margins": 1.1801133155822754, "rewards/rejected": 2.8500521183013916, "step": 105620 }, { "epoch": 4.904127396815079, "grad_norm": 116.60619354248047, "learning_rate": 5.8804958447467385e-09, "logits/chosen": -19.76325035095215, "logits/rejected": -18.189477920532227, "logps/chosen": -397.924560546875, "logps/rejected": -291.17254638671875, "loss": 0.3478, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.039044380187988, "rewards/margins": 2.3277409076690674, 
"rewards/rejected": 1.711303472518921, "step": 105630 }, { "epoch": 4.904591670922513, "grad_norm": 123.97721862792969, "learning_rate": 5.852639398300756e-09, "logits/chosen": -19.58247184753418, "logits/rejected": -18.726192474365234, "logps/chosen": -464.91485595703125, "logps/rejected": -351.7415466308594, "loss": 0.3275, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.102548122406006, "rewards/margins": 1.8666677474975586, "rewards/rejected": 3.235879898071289, "step": 105640 }, { "epoch": 4.905055945029946, "grad_norm": 121.50389099121094, "learning_rate": 5.824782951854775e-09, "logits/chosen": -18.168560028076172, "logits/rejected": -17.469791412353516, "logps/chosen": -426.551513671875, "logps/rejected": -392.75579833984375, "loss": 1.018, "rewards/accuracies": 0.5, "rewards/chosen": 3.954613447189331, "rewards/margins": 0.386930912733078, "rewards/rejected": 3.5676827430725098, "step": 105650 }, { "epoch": 4.905520219137379, "grad_norm": 73.18236541748047, "learning_rate": 5.796926505408794e-09, "logits/chosen": -18.565095901489258, "logits/rejected": -18.638463973999023, "logps/chosen": -429.7755432128906, "logps/rejected": -400.7147216796875, "loss": 1.0987, "rewards/accuracies": 0.5, "rewards/chosen": 2.6703569889068604, "rewards/margins": 0.10328960418701172, "rewards/rejected": 2.5670669078826904, "step": 105660 }, { "epoch": 4.905984493244812, "grad_norm": 156.81979370117188, "learning_rate": 5.7690700589628115e-09, "logits/chosen": -18.506023406982422, "logits/rejected": -18.482629776000977, "logps/chosen": -267.3078308105469, "logps/rejected": -305.343017578125, "loss": 0.9049, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.887403130531311, "rewards/margins": 0.3961030840873718, "rewards/rejected": 1.4912998676300049, "step": 105670 }, { "epoch": 4.906448767352245, "grad_norm": 138.1957550048828, "learning_rate": 5.74121361251683e-09, "logits/chosen": -18.796947479248047, "logits/rejected": -18.157489776611328, 
"logps/chosen": -425.02764892578125, "logps/rejected": -295.6107482910156, "loss": 0.7426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2228236198425293, "rewards/margins": 0.9949444532394409, "rewards/rejected": 2.227879047393799, "step": 105680 }, { "epoch": 4.906913041459678, "grad_norm": 76.7997055053711, "learning_rate": 5.713357166070847e-09, "logits/chosen": -19.699234008789062, "logits/rejected": -18.16739845275879, "logps/chosen": -424.49554443359375, "logps/rejected": -236.44149780273438, "loss": 0.7299, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.372969627380371, "rewards/margins": 2.8657138347625732, "rewards/rejected": 1.5072557926177979, "step": 105690 }, { "epoch": 4.9073773155671105, "grad_norm": 32.34992218017578, "learning_rate": 5.685500719624866e-09, "logits/chosen": -18.73488998413086, "logits/rejected": -17.796031951904297, "logps/chosen": -505.77911376953125, "logps/rejected": -341.09747314453125, "loss": 0.7205, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.328173875808716, "rewards/margins": 1.0663893222808838, "rewards/rejected": 2.261784076690674, "step": 105700 }, { "epoch": 4.907841589674544, "grad_norm": 61.381866455078125, "learning_rate": 5.657644273178885e-09, "logits/chosen": -19.089569091796875, "logits/rejected": -18.11847496032715, "logps/chosen": -380.42108154296875, "logps/rejected": -332.4173583984375, "loss": 0.531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6801018714904785, "rewards/margins": 1.2988773584365845, "rewards/rejected": 2.3812246322631836, "step": 105710 }, { "epoch": 4.908305863781977, "grad_norm": 68.2836685180664, "learning_rate": 5.6297878267329024e-09, "logits/chosen": -19.15264320373535, "logits/rejected": -18.60326385498047, "logps/chosen": -330.502197265625, "logps/rejected": -327.6416015625, "loss": 1.1341, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6172990798950195, "rewards/margins": 0.4836192727088928, 
"rewards/rejected": 2.1336798667907715, "step": 105720 }, { "epoch": 4.90877013788941, "grad_norm": 0.32274630665779114, "learning_rate": 5.601931380286921e-09, "logits/chosen": -18.508176803588867, "logits/rejected": -17.483074188232422, "logps/chosen": -465.86651611328125, "logps/rejected": -351.9482116699219, "loss": 0.8297, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.210425853729248, "rewards/margins": 1.5696556568145752, "rewards/rejected": 2.6407697200775146, "step": 105730 }, { "epoch": 4.909234411996843, "grad_norm": 48.701622009277344, "learning_rate": 5.574074933840939e-09, "logits/chosen": -19.249679565429688, "logits/rejected": -18.76401138305664, "logps/chosen": -291.96929931640625, "logps/rejected": -283.18896484375, "loss": 1.1263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9487085342407227, "rewards/margins": -0.014075112529098988, "rewards/rejected": 1.9627834558486938, "step": 105740 }, { "epoch": 4.909698686104276, "grad_norm": 145.40858459472656, "learning_rate": 5.546218487394958e-09, "logits/chosen": -18.12470054626465, "logits/rejected": -17.845365524291992, "logps/chosen": -409.88751220703125, "logps/rejected": -375.74932861328125, "loss": 0.687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1377713680267334, "rewards/margins": 0.747463583946228, "rewards/rejected": 2.390307903289795, "step": 105750 }, { "epoch": 4.910162960211709, "grad_norm": 25.363481521606445, "learning_rate": 5.518362040948976e-09, "logits/chosen": -20.152902603149414, "logits/rejected": -19.505395889282227, "logps/chosen": -380.6194763183594, "logps/rejected": -297.91864013671875, "loss": 0.2763, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5512070655822754, "rewards/margins": 2.0732059478759766, "rewards/rejected": 1.4780009984970093, "step": 105760 }, { "epoch": 4.910627234319142, "grad_norm": 184.9598846435547, "learning_rate": 5.490505594502994e-09, "logits/chosen": -18.56573486328125, 
"logits/rejected": -18.309032440185547, "logps/chosen": -371.2743835449219, "logps/rejected": -320.6539001464844, "loss": 0.5393, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0177276134490967, "rewards/margins": 1.1575286388397217, "rewards/rejected": 1.860199213027954, "step": 105770 }, { "epoch": 4.911091508426575, "grad_norm": 160.7023162841797, "learning_rate": 5.462649148057013e-09, "logits/chosen": -19.822708129882812, "logits/rejected": -19.47140884399414, "logps/chosen": -365.9219665527344, "logps/rejected": -369.3000793457031, "loss": 0.7739, "rewards/accuracies": 0.5, "rewards/chosen": 3.0941083431243896, "rewards/margins": 0.6687833666801453, "rewards/rejected": 2.4253249168395996, "step": 105780 }, { "epoch": 4.911555782534008, "grad_norm": 14.575833320617676, "learning_rate": 5.434792701611031e-09, "logits/chosen": -19.624536514282227, "logits/rejected": -18.55908966064453, "logps/chosen": -293.4256286621094, "logps/rejected": -272.5821838378906, "loss": 0.648, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3156142234802246, "rewards/margins": 1.8246681690216064, "rewards/rejected": 1.4909464120864868, "step": 105790 }, { "epoch": 4.912020056641441, "grad_norm": 17.991533279418945, "learning_rate": 5.406936255165049e-09, "logits/chosen": -19.37578010559082, "logits/rejected": -19.681699752807617, "logps/chosen": -293.7196044921875, "logps/rejected": -329.16009521484375, "loss": 1.0361, "rewards/accuracies": 0.5, "rewards/chosen": 2.4853975772857666, "rewards/margins": -0.19350266456604004, "rewards/rejected": 2.6789000034332275, "step": 105800 }, { "epoch": 4.912484330748875, "grad_norm": 245.51455688476562, "learning_rate": 5.379079808719068e-09, "logits/chosen": -18.110408782958984, "logits/rejected": -18.81167984008789, "logps/chosen": -275.5584716796875, "logps/rejected": -377.46875, "loss": 2.3376, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.4368530809879303, "rewards/margins": -1.956170678138733, 
"rewards/rejected": 2.393023729324341, "step": 105810 }, { "epoch": 4.912948604856307, "grad_norm": 152.5496368408203, "learning_rate": 5.351223362273086e-09, "logits/chosen": -19.596004486083984, "logits/rejected": -19.35904884338379, "logps/chosen": -499.22845458984375, "logps/rejected": -451.0914611816406, "loss": 0.21, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.830809116363525, "rewards/margins": 2.371201992034912, "rewards/rejected": 3.459606885910034, "step": 105820 }, { "epoch": 4.91341287896374, "grad_norm": 34.08015060424805, "learning_rate": 5.3233669158271045e-09, "logits/chosen": -18.699630737304688, "logits/rejected": -17.567903518676758, "logps/chosen": -346.62615966796875, "logps/rejected": -278.9568786621094, "loss": 1.0598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.081489086151123, "rewards/margins": 0.9653903245925903, "rewards/rejected": 2.1160988807678223, "step": 105830 }, { "epoch": 4.913877153071173, "grad_norm": 188.97268676757812, "learning_rate": 5.295510469381122e-09, "logits/chosen": -18.847843170166016, "logits/rejected": -17.981189727783203, "logps/chosen": -436.4452209472656, "logps/rejected": -341.8465270996094, "loss": 0.4826, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.399332046508789, "rewards/margins": 1.6091629266738892, "rewards/rejected": 1.7901694774627686, "step": 105840 }, { "epoch": 4.914341427178607, "grad_norm": 67.16703796386719, "learning_rate": 5.267654022935141e-09, "logits/chosen": -18.53890037536621, "logits/rejected": -17.64635467529297, "logps/chosen": -418.51953125, "logps/rejected": -228.34909057617188, "loss": 0.4876, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.523333787918091, "rewards/margins": 2.0567915439605713, "rewards/rejected": 1.4665426015853882, "step": 105850 }, { "epoch": 4.914805701286039, "grad_norm": 10.227031707763672, "learning_rate": 5.239797576489158e-09, "logits/chosen": -18.50657844543457, "logits/rejected": 
-19.043975830078125, "logps/chosen": -472.2982482910156, "logps/rejected": -465.5310974121094, "loss": 1.7984, "rewards/accuracies": 0.5, "rewards/chosen": 3.3666720390319824, "rewards/margins": -0.032364439219236374, "rewards/rejected": 3.399036407470703, "step": 105860 }, { "epoch": 4.915269975393472, "grad_norm": 135.40464782714844, "learning_rate": 5.2119411300431775e-09, "logits/chosen": -19.24985122680664, "logits/rejected": -19.1571044921875, "logps/chosen": -296.72674560546875, "logps/rejected": -335.5807189941406, "loss": 1.1734, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3658840656280518, "rewards/margins": -0.46778473258018494, "rewards/rejected": 2.8336687088012695, "step": 105870 }, { "epoch": 4.915734249500906, "grad_norm": 80.51732635498047, "learning_rate": 5.184084683597196e-09, "logits/chosen": -18.643657684326172, "logits/rejected": -18.300966262817383, "logps/chosen": -418.9908752441406, "logps/rejected": -319.119140625, "loss": 0.538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6442971229553223, "rewards/margins": 2.543192148208618, "rewards/rejected": 1.1011043787002563, "step": 105880 }, { "epoch": 4.9161985236083385, "grad_norm": 248.5968017578125, "learning_rate": 5.156228237151213e-09, "logits/chosen": -17.954059600830078, "logits/rejected": -17.806718826293945, "logps/chosen": -332.92431640625, "logps/rejected": -326.36859130859375, "loss": 0.9785, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8817520141601562, "rewards/margins": 0.8380603790283203, "rewards/rejected": 2.043691635131836, "step": 105890 }, { "epoch": 4.916662797715771, "grad_norm": 29.190948486328125, "learning_rate": 5.128371790705232e-09, "logits/chosen": -18.40248680114746, "logits/rejected": -19.65606689453125, "logps/chosen": -349.3621520996094, "logps/rejected": -288.5097961425781, "loss": 0.8223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.081040620803833, "rewards/margins": 0.9729534983634949, 
"rewards/rejected": 2.1080870628356934, "step": 105900 }, { "epoch": 4.917127071823204, "grad_norm": 251.54660034179688, "learning_rate": 5.10051534425925e-09, "logits/chosen": -19.538854598999023, "logits/rejected": -18.685556411743164, "logps/chosen": -466.61578369140625, "logps/rejected": -428.08599853515625, "loss": 0.8382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.373638868331909, "rewards/margins": 0.8316643834114075, "rewards/rejected": 2.5419745445251465, "step": 105910 }, { "epoch": 4.917591345930638, "grad_norm": 58.19011306762695, "learning_rate": 5.0726588978132684e-09, "logits/chosen": -18.348844528198242, "logits/rejected": -17.495874404907227, "logps/chosen": -378.76806640625, "logps/rejected": -274.1330871582031, "loss": 0.5296, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.852057933807373, "rewards/margins": 1.1515060663223267, "rewards/rejected": 1.700551986694336, "step": 105920 }, { "epoch": 4.9180556200380705, "grad_norm": 199.36102294921875, "learning_rate": 5.044802451367287e-09, "logits/chosen": -18.702951431274414, "logits/rejected": -18.514244079589844, "logps/chosen": -447.59112548828125, "logps/rejected": -380.1578063964844, "loss": 0.8709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9577057361602783, "rewards/margins": 0.7418127059936523, "rewards/rejected": 3.215892791748047, "step": 105930 }, { "epoch": 4.918519894145503, "grad_norm": 188.0076446533203, "learning_rate": 5.016946004921305e-09, "logits/chosen": -20.050460815429688, "logits/rejected": -19.503366470336914, "logps/chosen": -464.99969482421875, "logps/rejected": -427.60882568359375, "loss": 0.5425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.426735877990723, "rewards/margins": 0.9913175702095032, "rewards/rejected": 3.435417890548706, "step": 105940 }, { "epoch": 4.918984168252937, "grad_norm": 54.810523986816406, "learning_rate": 4.989089558475324e-09, "logits/chosen": -19.125110626220703, 
"logits/rejected": -18.0382022857666, "logps/chosen": -418.517333984375, "logps/rejected": -327.2312316894531, "loss": 0.3291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1653637886047363, "rewards/margins": 1.6323010921478271, "rewards/rejected": 1.5330625772476196, "step": 105950 }, { "epoch": 4.91944844236037, "grad_norm": 1.3214696645736694, "learning_rate": 4.9612331120293415e-09, "logits/chosen": -18.817747116088867, "logits/rejected": -17.43819808959961, "logps/chosen": -374.71563720703125, "logps/rejected": -255.7344207763672, "loss": 0.8408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.058988094329834, "rewards/margins": 1.6388180255889893, "rewards/rejected": 1.4201700687408447, "step": 105960 }, { "epoch": 4.9199127164678025, "grad_norm": 136.6318817138672, "learning_rate": 4.93337666558336e-09, "logits/chosen": -18.893064498901367, "logits/rejected": -18.69521141052246, "logps/chosen": -443.9226989746094, "logps/rejected": -458.78741455078125, "loss": 0.6742, "rewards/accuracies": 0.5, "rewards/chosen": 4.176724433898926, "rewards/margins": 0.9050512313842773, "rewards/rejected": 3.2716736793518066, "step": 105970 }, { "epoch": 4.920376990575235, "grad_norm": 16.62032127380371, "learning_rate": 4.905520219137379e-09, "logits/chosen": -18.839452743530273, "logits/rejected": -18.47650909423828, "logps/chosen": -337.58184814453125, "logps/rejected": -277.22479248046875, "loss": 1.0436, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.552184581756592, "rewards/margins": -0.06760495901107788, "rewards/rejected": 2.6197896003723145, "step": 105980 }, { "epoch": 4.920841264682669, "grad_norm": 31.60563087463379, "learning_rate": 4.877663772691397e-09, "logits/chosen": -18.20343017578125, "logits/rejected": -17.693531036376953, "logps/chosen": -331.49041748046875, "logps/rejected": -287.7189636230469, "loss": 0.9608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9597694873809814, "rewards/margins": 
1.0570249557495117, "rewards/rejected": 0.9027444124221802, "step": 105990 }, { "epoch": 4.921305538790102, "grad_norm": 10.779338836669922, "learning_rate": 4.849807326245415e-09, "logits/chosen": -19.271770477294922, "logits/rejected": -17.3789119720459, "logps/chosen": -336.5004577636719, "logps/rejected": -199.04501342773438, "loss": 0.4382, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0956404209136963, "rewards/margins": 1.9993282556533813, "rewards/rejected": 1.096312165260315, "step": 106000 }, { "epoch": 4.9217698128975345, "grad_norm": 54.713653564453125, "learning_rate": 4.821950879799433e-09, "logits/chosen": -19.301868438720703, "logits/rejected": -18.282400131225586, "logps/chosen": -402.7153625488281, "logps/rejected": -355.4459228515625, "loss": 0.7355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2298903465271, "rewards/margins": 1.3790932893753052, "rewards/rejected": 2.850797176361084, "step": 106010 }, { "epoch": 4.922234087004968, "grad_norm": 0.31007030606269836, "learning_rate": 4.794094433353452e-09, "logits/chosen": -19.519498825073242, "logits/rejected": -17.497177124023438, "logps/chosen": -493.2217712402344, "logps/rejected": -316.94091796875, "loss": 0.2456, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.302649021148682, "rewards/margins": 3.3423264026641846, "rewards/rejected": 0.9603222012519836, "step": 106020 }, { "epoch": 4.922698361112401, "grad_norm": 34.19065475463867, "learning_rate": 4.7662379869074705e-09, "logits/chosen": -19.294902801513672, "logits/rejected": -18.716156005859375, "logps/chosen": -427.788818359375, "logps/rejected": -407.40966796875, "loss": 0.6646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.355854034423828, "rewards/margins": 0.5434153079986572, "rewards/rejected": 2.812439203262329, "step": 106030 }, { "epoch": 4.923162635219834, "grad_norm": 48.73501205444336, "learning_rate": 4.738381540461488e-09, "logits/chosen": -20.139007568359375, 
"logits/rejected": -18.954790115356445, "logps/chosen": -359.76666259765625, "logps/rejected": -301.5754089355469, "loss": 0.5419, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2914981842041016, "rewards/margins": 1.1693369150161743, "rewards/rejected": 2.1221611499786377, "step": 106040 }, { "epoch": 4.923626909327266, "grad_norm": 75.66746520996094, "learning_rate": 4.710525094015507e-09, "logits/chosen": -19.821956634521484, "logits/rejected": -18.63329315185547, "logps/chosen": -357.3146667480469, "logps/rejected": -285.8651428222656, "loss": 0.4322, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3772590160369873, "rewards/margins": 1.4125896692276, "rewards/rejected": 1.9646694660186768, "step": 106050 }, { "epoch": 4.9240911834347, "grad_norm": 48.791053771972656, "learning_rate": 4.682668647569525e-09, "logits/chosen": -19.76652717590332, "logits/rejected": -18.586904525756836, "logps/chosen": -340.71661376953125, "logps/rejected": -239.26840209960938, "loss": 0.501, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1542294025421143, "rewards/margins": 1.3009333610534668, "rewards/rejected": 0.8532959222793579, "step": 106060 }, { "epoch": 4.924555457542133, "grad_norm": 73.65951538085938, "learning_rate": 4.654812201123543e-09, "logits/chosen": -19.219640731811523, "logits/rejected": -17.929569244384766, "logps/chosen": -373.0710144042969, "logps/rejected": -301.3561096191406, "loss": 0.4411, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.779966354370117, "rewards/margins": 2.0083651542663574, "rewards/rejected": 1.7716013193130493, "step": 106070 }, { "epoch": 4.925019731649566, "grad_norm": 119.47216796875, "learning_rate": 4.626955754677561e-09, "logits/chosen": -19.536945343017578, "logits/rejected": -19.614471435546875, "logps/chosen": -275.92999267578125, "logps/rejected": -285.0408935546875, "loss": 0.8258, "rewards/accuracies": 0.5, "rewards/chosen": 1.7985687255859375, "rewards/margins": 
0.47798481583595276, "rewards/rejected": 1.3205838203430176, "step": 106080 }, { "epoch": 4.925484005756999, "grad_norm": 0.2299785614013672, "learning_rate": 4.599099308231579e-09, "logits/chosen": -19.08283042907715, "logits/rejected": -18.502531051635742, "logps/chosen": -293.0805358886719, "logps/rejected": -269.8620300292969, "loss": 0.8199, "rewards/accuracies": 0.5, "rewards/chosen": 2.380342960357666, "rewards/margins": 0.9281433820724487, "rewards/rejected": 1.4521995782852173, "step": 106090 }, { "epoch": 4.925948279864432, "grad_norm": 26.645689010620117, "learning_rate": 4.571242861785598e-09, "logits/chosen": -19.944019317626953, "logits/rejected": -19.202281951904297, "logps/chosen": -455.72393798828125, "logps/rejected": -569.1038208007812, "loss": 0.4447, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.221161365509033, "rewards/margins": 1.855507254600525, "rewards/rejected": 3.3656539916992188, "step": 106100 }, { "epoch": 4.926412553971865, "grad_norm": 41.979766845703125, "learning_rate": 4.543386415339617e-09, "logits/chosen": -19.36250877380371, "logits/rejected": -19.07781982421875, "logps/chosen": -327.82061767578125, "logps/rejected": -331.91851806640625, "loss": 0.5831, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7016727924346924, "rewards/margins": 0.8695882558822632, "rewards/rejected": 1.8320844173431396, "step": 106110 }, { "epoch": 4.926876828079298, "grad_norm": 7.386874198913574, "learning_rate": 4.5155299688936344e-09, "logits/chosen": -19.293161392211914, "logits/rejected": -18.77981948852539, "logps/chosen": -381.08868408203125, "logps/rejected": -302.4364013671875, "loss": 0.418, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.295444965362549, "rewards/margins": 1.8027359247207642, "rewards/rejected": 1.4927092790603638, "step": 106120 }, { "epoch": 4.927341102186731, "grad_norm": 55.629329681396484, "learning_rate": 4.487673522447653e-09, "logits/chosen": -19.08733558654785, 
"logits/rejected": -18.421009063720703, "logps/chosen": -429.53631591796875, "logps/rejected": -324.0295715332031, "loss": 0.4904, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.574877977371216, "rewards/margins": 1.5118706226348877, "rewards/rejected": 2.063007354736328, "step": 106130 }, { "epoch": 4.927805376294164, "grad_norm": 224.1404266357422, "learning_rate": 4.459817076001671e-09, "logits/chosen": -19.790292739868164, "logits/rejected": -17.97762107849121, "logps/chosen": -388.51416015625, "logps/rejected": -310.4582214355469, "loss": 0.7394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7139248847961426, "rewards/margins": 1.794434905052185, "rewards/rejected": 1.9194896221160889, "step": 106140 }, { "epoch": 4.928269650401597, "grad_norm": 62.16980743408203, "learning_rate": 4.43196062955569e-09, "logits/chosen": -19.177034378051758, "logits/rejected": -18.5771541595459, "logps/chosen": -332.08197021484375, "logps/rejected": -334.21563720703125, "loss": 0.709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.004908800125122, "rewards/margins": 1.085355520248413, "rewards/rejected": 1.9195531606674194, "step": 106150 }, { "epoch": 4.9287339245090305, "grad_norm": 28.211557388305664, "learning_rate": 4.404104183109708e-09, "logits/chosen": -19.523117065429688, "logits/rejected": -19.001346588134766, "logps/chosen": -484.6415100097656, "logps/rejected": -438.62969970703125, "loss": 0.3989, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.079245567321777, "rewards/margins": 2.1247243881225586, "rewards/rejected": 2.9545211791992188, "step": 106160 }, { "epoch": 4.929198198616463, "grad_norm": 12.76988410949707, "learning_rate": 4.376247736663726e-09, "logits/chosen": -18.506851196289062, "logits/rejected": -18.28786277770996, "logps/chosen": -316.43475341796875, "logps/rejected": -309.34051513671875, "loss": 0.4985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.373692274093628, 
"rewards/margins": 1.6838972568511963, "rewards/rejected": 1.6897951364517212, "step": 106170 }, { "epoch": 4.929662472723896, "grad_norm": 73.8425521850586, "learning_rate": 4.348391290217744e-09, "logits/chosen": -20.169612884521484, "logits/rejected": -19.154253005981445, "logps/chosen": -486.9402770996094, "logps/rejected": -407.58349609375, "loss": 0.6325, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.53756046295166, "rewards/margins": 1.2337706089019775, "rewards/rejected": 3.3037898540496826, "step": 106180 }, { "epoch": 4.930126746831329, "grad_norm": 300.3466491699219, "learning_rate": 4.320534843771763e-09, "logits/chosen": -20.15044593811035, "logits/rejected": -18.814313888549805, "logps/chosen": -395.3680725097656, "logps/rejected": -364.9399108886719, "loss": 1.0624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.294276714324951, "rewards/margins": 1.0904055833816528, "rewards/rejected": 3.2038707733154297, "step": 106190 }, { "epoch": 4.9305910209387624, "grad_norm": 2.5801494121551514, "learning_rate": 4.2926783973257805e-09, "logits/chosen": -18.53673553466797, "logits/rejected": -17.910167694091797, "logps/chosen": -371.78607177734375, "logps/rejected": -253.2945556640625, "loss": 0.9208, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.245703935623169, "rewards/margins": 1.262096643447876, "rewards/rejected": 1.9836066961288452, "step": 106200 }, { "epoch": 4.931055295046195, "grad_norm": 10.259693145751953, "learning_rate": 4.264821950879799e-09, "logits/chosen": -19.373367309570312, "logits/rejected": -17.357921600341797, "logps/chosen": -322.60003662109375, "logps/rejected": -183.4161834716797, "loss": 0.1385, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3932909965515137, "rewards/margins": 3.5018222332000732, "rewards/rejected": -0.10853134095668793, "step": 106210 }, { "epoch": 4.931519569153628, "grad_norm": 40.63618087768555, "learning_rate": 4.236965504433818e-09, 
"logits/chosen": -19.76540184020996, "logits/rejected": -18.809947967529297, "logps/chosen": -425.1863708496094, "logps/rejected": -336.7233581542969, "loss": 0.5583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8788115978240967, "rewards/margins": 1.7122398614883423, "rewards/rejected": 2.166572093963623, "step": 106220 }, { "epoch": 4.931983843261062, "grad_norm": 60.37303924560547, "learning_rate": 4.209109057987836e-09, "logits/chosen": -19.230613708496094, "logits/rejected": -19.414569854736328, "logps/chosen": -473.6053161621094, "logps/rejected": -476.70025634765625, "loss": 1.281, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.793445587158203, "rewards/margins": 0.05291438102722168, "rewards/rejected": 3.7405312061309814, "step": 106230 }, { "epoch": 4.932448117368494, "grad_norm": 17.37449073791504, "learning_rate": 4.181252611541854e-09, "logits/chosen": -19.218130111694336, "logits/rejected": -19.05297088623047, "logps/chosen": -321.08074951171875, "logps/rejected": -377.5881652832031, "loss": 0.7376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1193368434906006, "rewards/margins": 0.8663884997367859, "rewards/rejected": 2.25294828414917, "step": 106240 }, { "epoch": 4.932912391475927, "grad_norm": 82.20423126220703, "learning_rate": 4.153396165095872e-09, "logits/chosen": -18.684528350830078, "logits/rejected": -18.025455474853516, "logps/chosen": -360.8583984375, "logps/rejected": -239.6913299560547, "loss": 0.7273, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.761228322982788, "rewards/margins": 0.8839554786682129, "rewards/rejected": 1.8772728443145752, "step": 106250 }, { "epoch": 4.93337666558336, "grad_norm": 16.741470336914062, "learning_rate": 4.125539718649891e-09, "logits/chosen": -20.378833770751953, "logits/rejected": -19.548154830932617, "logps/chosen": -319.38885498046875, "logps/rejected": -282.6914978027344, "loss": 0.8844, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 1.9836618900299072, "rewards/margins": 0.6033520698547363, "rewards/rejected": 1.3803095817565918, "step": 106260 }, { "epoch": 4.933840939690794, "grad_norm": 101.68052673339844, "learning_rate": 4.097683272203909e-09, "logits/chosen": -19.291595458984375, "logits/rejected": -17.872516632080078, "logps/chosen": -408.19708251953125, "logps/rejected": -305.0116882324219, "loss": 0.7725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.008727550506592, "rewards/margins": 1.8765922784805298, "rewards/rejected": 1.132135272026062, "step": 106270 }, { "epoch": 4.934305213798226, "grad_norm": 58.038143157958984, "learning_rate": 4.0698268257579274e-09, "logits/chosen": -19.706275939941406, "logits/rejected": -19.19623374938965, "logps/chosen": -309.87750244140625, "logps/rejected": -282.1827087402344, "loss": 0.8489, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.207805633544922, "rewards/margins": 0.7203564047813416, "rewards/rejected": 2.4874491691589355, "step": 106280 }, { "epoch": 4.934769487905659, "grad_norm": 30.700153350830078, "learning_rate": 4.041970379311945e-09, "logits/chosen": -19.587871551513672, "logits/rejected": -18.93514633178711, "logps/chosen": -482.43701171875, "logps/rejected": -396.60791015625, "loss": 0.831, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.692859649658203, "rewards/margins": 1.7918239831924438, "rewards/rejected": 2.901036024093628, "step": 106290 }, { "epoch": 4.935233762013093, "grad_norm": 242.62879943847656, "learning_rate": 4.014113932865964e-09, "logits/chosen": -19.08889389038086, "logits/rejected": -18.291168212890625, "logps/chosen": -379.41033935546875, "logps/rejected": -360.134765625, "loss": 0.8687, "rewards/accuracies": 0.5, "rewards/chosen": 3.8449268341064453, "rewards/margins": 0.6414166688919067, "rewards/rejected": 3.203510284423828, "step": 106300 }, { "epoch": 4.935698036120526, "grad_norm": 27.064924240112305, "learning_rate": 3.986257486419983e-09, 
"logits/chosen": -18.946975708007812, "logits/rejected": -17.113332748413086, "logps/chosen": -379.48541259765625, "logps/rejected": -218.5131378173828, "loss": 0.3824, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.897012233734131, "rewards/margins": 2.3841378688812256, "rewards/rejected": 1.512874960899353, "step": 106310 }, { "epoch": 4.936162310227958, "grad_norm": 150.31649780273438, "learning_rate": 3.9584010399740005e-09, "logits/chosen": -19.330074310302734, "logits/rejected": -18.99494171142578, "logps/chosen": -411.4988708496094, "logps/rejected": -367.1881103515625, "loss": 0.612, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8020145893096924, "rewards/margins": 0.8653114438056946, "rewards/rejected": 2.9367032051086426, "step": 106320 }, { "epoch": 4.936626584335391, "grad_norm": 6.232062339782715, "learning_rate": 3.930544593528019e-09, "logits/chosen": -18.730609893798828, "logits/rejected": -17.318933486938477, "logps/chosen": -360.5459289550781, "logps/rejected": -274.0505065917969, "loss": 0.4449, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.080679893493652, "rewards/margins": 2.3358492851257324, "rewards/rejected": 1.7448304891586304, "step": 106330 }, { "epoch": 4.937090858442825, "grad_norm": 41.47214126586914, "learning_rate": 3.902688147082037e-09, "logits/chosen": -18.600624084472656, "logits/rejected": -17.470661163330078, "logps/chosen": -389.3685607910156, "logps/rejected": -285.07867431640625, "loss": 0.462, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.610903263092041, "rewards/margins": 2.488558292388916, "rewards/rejected": 1.1223450899124146, "step": 106340 }, { "epoch": 4.937555132550258, "grad_norm": 110.53677368164062, "learning_rate": 3.874831700636055e-09, "logits/chosen": -19.128305435180664, "logits/rejected": -18.80039405822754, "logps/chosen": -474.3075256347656, "logps/rejected": -391.59417724609375, "loss": 0.9744, "rewards/accuracies": 0.5, "rewards/chosen": 
3.664525270462036, "rewards/margins": 0.4078551232814789, "rewards/rejected": 3.256669521331787, "step": 106350 }, { "epoch": 4.93801940665769, "grad_norm": 45.1489143371582, "learning_rate": 3.8469752541900735e-09, "logits/chosen": -19.322509765625, "logits/rejected": -18.902050018310547, "logps/chosen": -352.6865539550781, "logps/rejected": -325.90740966796875, "loss": 1.1791, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8610634803771973, "rewards/margins": 1.0556390285491943, "rewards/rejected": 2.805424213409424, "step": 106360 }, { "epoch": 4.938483680765124, "grad_norm": 135.71112060546875, "learning_rate": 3.819118807744092e-09, "logits/chosen": -18.455249786376953, "logits/rejected": -17.394908905029297, "logps/chosen": -451.11297607421875, "logps/rejected": -356.2857666015625, "loss": 1.0457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.0356125831604, "rewards/margins": 1.5379002094268799, "rewards/rejected": 2.4977126121520996, "step": 106370 }, { "epoch": 4.938947954872557, "grad_norm": 6.403566360473633, "learning_rate": 3.79126236129811e-09, "logits/chosen": -18.79178810119629, "logits/rejected": -18.766929626464844, "logps/chosen": -334.7382507324219, "logps/rejected": -311.03216552734375, "loss": 0.9396, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3810524940490723, "rewards/margins": 0.7815597653388977, "rewards/rejected": 2.5994925498962402, "step": 106380 }, { "epoch": 4.9394122289799895, "grad_norm": 3.325356960296631, "learning_rate": 3.763405914852129e-09, "logits/chosen": -19.02191925048828, "logits/rejected": -16.928586959838867, "logps/chosen": -453.59814453125, "logps/rejected": -260.2793884277344, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": 3.871394634246826, "rewards/margins": 3.2861011028289795, "rewards/rejected": 0.5852934122085571, "step": 106390 }, { "epoch": 4.939876503087422, "grad_norm": 52.06482696533203, "learning_rate": 3.7355494684061465e-09, "logits/chosen": 
-18.97951889038086, "logits/rejected": -18.951557159423828, "logps/chosen": -299.3197937011719, "logps/rejected": -257.58642578125, "loss": 0.5564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1844778060913086, "rewards/margins": 0.8893097043037415, "rewards/rejected": 1.2951676845550537, "step": 106400 }, { "epoch": 4.940340777194856, "grad_norm": 19.410213470458984, "learning_rate": 3.7076930219601648e-09, "logits/chosen": -19.370378494262695, "logits/rejected": -18.431631088256836, "logps/chosen": -447.203125, "logps/rejected": -349.8328552246094, "loss": 0.312, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.253750801086426, "rewards/margins": 1.8223222494125366, "rewards/rejected": 2.4314286708831787, "step": 106410 }, { "epoch": 4.940805051302289, "grad_norm": 28.804643630981445, "learning_rate": 3.6798365755141835e-09, "logits/chosen": -19.730884552001953, "logits/rejected": -18.525617599487305, "logps/chosen": -373.1263732910156, "logps/rejected": -321.88250732421875, "loss": 0.4165, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8520407676696777, "rewards/margins": 1.9760265350341797, "rewards/rejected": 1.8760141134262085, "step": 106420 }, { "epoch": 4.9412693254097215, "grad_norm": 108.05924987792969, "learning_rate": 3.6519801290682017e-09, "logits/chosen": -19.472753524780273, "logits/rejected": -18.100675582885742, "logps/chosen": -452.4134826660156, "logps/rejected": -403.9898681640625, "loss": 0.7961, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.370017051696777, "rewards/margins": 0.4063708782196045, "rewards/rejected": 3.9636459350585938, "step": 106430 }, { "epoch": 4.941733599517155, "grad_norm": 205.04708862304688, "learning_rate": 3.62412368262222e-09, "logits/chosen": -19.061304092407227, "logits/rejected": -18.36437225341797, "logps/chosen": -406.8621826171875, "logps/rejected": -408.4677734375, "loss": 0.6225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
4.262772560119629, "rewards/margins": 1.277361273765564, "rewards/rejected": 2.9854114055633545, "step": 106440 }, { "epoch": 4.942197873624588, "grad_norm": 188.6513214111328, "learning_rate": 3.5962672361762382e-09, "logits/chosen": -19.02520751953125, "logits/rejected": -18.375995635986328, "logps/chosen": -293.23895263671875, "logps/rejected": -224.3872528076172, "loss": 0.9467, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2175848484039307, "rewards/margins": 1.1412465572357178, "rewards/rejected": 1.0763381719589233, "step": 106450 }, { "epoch": 4.942662147732021, "grad_norm": 84.70828247070312, "learning_rate": 3.5684107897302565e-09, "logits/chosen": -19.36687469482422, "logits/rejected": -18.666545867919922, "logps/chosen": -366.08587646484375, "logps/rejected": -301.4176940917969, "loss": 0.5133, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9482808113098145, "rewards/margins": 1.4278299808502197, "rewards/rejected": 2.520451068878174, "step": 106460 }, { "epoch": 4.943126421839454, "grad_norm": 0.7358551025390625, "learning_rate": 3.540554343284275e-09, "logits/chosen": -19.71137046813965, "logits/rejected": -18.197900772094727, "logps/chosen": -370.6177062988281, "logps/rejected": -261.25042724609375, "loss": 0.2058, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.980064868927002, "rewards/margins": 3.464104413986206, "rewards/rejected": 0.5159597396850586, "step": 106470 }, { "epoch": 4.943590695946887, "grad_norm": 12.118955612182617, "learning_rate": 3.5126978968382934e-09, "logits/chosen": -18.329870223999023, "logits/rejected": -17.59909439086914, "logps/chosen": -346.25164794921875, "logps/rejected": -304.070556640625, "loss": 0.8999, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7194793224334717, "rewards/margins": 1.4476284980773926, "rewards/rejected": 2.271850824356079, "step": 106480 }, { "epoch": 4.94405497005432, "grad_norm": 36.96175765991211, "learning_rate": 
3.4848414503923113e-09, "logits/chosen": -20.054622650146484, "logits/rejected": -19.375455856323242, "logps/chosen": -423.315673828125, "logps/rejected": -409.982421875, "loss": 0.4542, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4481098651885986, "rewards/margins": 0.8691040277481079, "rewards/rejected": 2.579005718231201, "step": 106490 }, { "epoch": 4.944519244161754, "grad_norm": 53.51072311401367, "learning_rate": 3.4569850039463295e-09, "logits/chosen": -19.136608123779297, "logits/rejected": -18.582000732421875, "logps/chosen": -362.80157470703125, "logps/rejected": -317.71661376953125, "loss": 0.8524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3243465423583984, "rewards/margins": 0.7059985399246216, "rewards/rejected": 2.618347644805908, "step": 106500 }, { "epoch": 4.944983518269186, "grad_norm": 121.42098236083984, "learning_rate": 3.4291285575003478e-09, "logits/chosen": -20.013349533081055, "logits/rejected": -19.55099105834961, "logps/chosen": -362.13287353515625, "logps/rejected": -285.3177490234375, "loss": 0.5734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8628411293029785, "rewards/margins": 1.4726346731185913, "rewards/rejected": 2.3902063369750977, "step": 106510 }, { "epoch": 4.945447792376619, "grad_norm": 57.662322998046875, "learning_rate": 3.4012721110543665e-09, "logits/chosen": -17.766454696655273, "logits/rejected": -17.875551223754883, "logps/chosen": -285.6650695800781, "logps/rejected": -258.86602783203125, "loss": 0.4708, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.2445900440216064, "rewards/margins": 1.364292860031128, "rewards/rejected": 0.8802968263626099, "step": 106520 }, { "epoch": 4.945912066484052, "grad_norm": 148.8299560546875, "learning_rate": 3.3734156646083847e-09, "logits/chosen": -19.842824935913086, "logits/rejected": -19.388071060180664, "logps/chosen": -361.08087158203125, "logps/rejected": -342.2245178222656, "loss": 0.707, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.4969615936279297, "rewards/margins": 0.7727974057197571, "rewards/rejected": 2.7241642475128174, "step": 106530 }, { "epoch": 4.946376340591486, "grad_norm": 276.15325927734375, "learning_rate": 3.345559218162403e-09, "logits/chosen": -18.874313354492188, "logits/rejected": -18.541223526000977, "logps/chosen": -386.7704162597656, "logps/rejected": -315.9248046875, "loss": 0.8693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7858328819274902, "rewards/margins": 0.585037112236023, "rewards/rejected": 2.2007956504821777, "step": 106540 }, { "epoch": 4.946840614698918, "grad_norm": 28.17853546142578, "learning_rate": 3.3177027717164212e-09, "logits/chosen": -19.560758590698242, "logits/rejected": -18.738739013671875, "logps/chosen": -395.2996520996094, "logps/rejected": -341.069091796875, "loss": 0.6965, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.515472888946533, "rewards/margins": 1.0010974407196045, "rewards/rejected": 2.514375925064087, "step": 106550 }, { "epoch": 4.947304888806351, "grad_norm": 86.0505599975586, "learning_rate": 3.2898463252704395e-09, "logits/chosen": -19.229686737060547, "logits/rejected": -18.592235565185547, "logps/chosen": -327.8915100097656, "logps/rejected": -379.17620849609375, "loss": 0.8571, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0555100440979004, "rewards/margins": 1.3624688386917114, "rewards/rejected": 1.6930415630340576, "step": 106560 }, { "epoch": 4.947769162913785, "grad_norm": 60.28705978393555, "learning_rate": 3.261989878824458e-09, "logits/chosen": -19.3082218170166, "logits/rejected": -18.387054443359375, "logps/chosen": -496.4640197753906, "logps/rejected": -437.3536682128906, "loss": 0.3989, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.298897743225098, "rewards/margins": 1.2340105772018433, "rewards/rejected": 3.064887523651123, "step": 106570 }, { "epoch": 4.9482334370212175, "grad_norm": 38.68281555175781, 
"learning_rate": 3.2341334323784764e-09, "logits/chosen": -19.335250854492188, "logits/rejected": -18.210956573486328, "logps/chosen": -319.693603515625, "logps/rejected": -243.4214324951172, "loss": 0.3224, "rewards/accuracies": 1.0, "rewards/chosen": 2.3173840045928955, "rewards/margins": 2.0584826469421387, "rewards/rejected": 0.2589011788368225, "step": 106580 }, { "epoch": 4.94869771112865, "grad_norm": 3.2777979373931885, "learning_rate": 3.2062769859324943e-09, "logits/chosen": -19.363378524780273, "logits/rejected": -18.19382667541504, "logps/chosen": -393.9573059082031, "logps/rejected": -235.6734619140625, "loss": 0.4843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5035202503204346, "rewards/margins": 2.290750026702881, "rewards/rejected": 1.2127699851989746, "step": 106590 }, { "epoch": 4.949161985236083, "grad_norm": 22.544139862060547, "learning_rate": 3.1784205394865125e-09, "logits/chosen": -19.468605041503906, "logits/rejected": -17.737834930419922, "logps/chosen": -448.14129638671875, "logps/rejected": -264.0495910644531, "loss": 0.34, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.678666114807129, "rewards/margins": 2.973926544189453, "rewards/rejected": 1.7047393321990967, "step": 106600 }, { "epoch": 4.949626259343517, "grad_norm": 10.385727882385254, "learning_rate": 3.150564093040531e-09, "logits/chosen": -19.398990631103516, "logits/rejected": -19.464548110961914, "logps/chosen": -483.75347900390625, "logps/rejected": -391.3924865722656, "loss": 0.6867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.897424221038818, "rewards/margins": 2.2624568939208984, "rewards/rejected": 2.63496732711792, "step": 106610 }, { "epoch": 4.9500905334509495, "grad_norm": 130.1932373046875, "learning_rate": 3.122707646594549e-09, "logits/chosen": -19.72896385192871, "logits/rejected": -17.6572265625, "logps/chosen": -385.050537109375, "logps/rejected": -257.5832214355469, "loss": 0.4688, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.892054796218872, "rewards/margins": 2.5393543243408203, "rewards/rejected": 1.3527004718780518, "step": 106620 }, { "epoch": 4.950554807558382, "grad_norm": 114.1189956665039, "learning_rate": 3.0948512001485677e-09, "logits/chosen": -19.331029891967773, "logits/rejected": -19.777095794677734, "logps/chosen": -353.52752685546875, "logps/rejected": -478.0616149902344, "loss": 1.2975, "rewards/accuracies": 0.5, "rewards/chosen": 3.8913276195526123, "rewards/margins": -0.3460841178894043, "rewards/rejected": 4.237411975860596, "step": 106630 }, { "epoch": 4.951019081665816, "grad_norm": 24.001684188842773, "learning_rate": 3.066994753702586e-09, "logits/chosen": -18.787731170654297, "logits/rejected": -18.136960983276367, "logps/chosen": -344.2251892089844, "logps/rejected": -291.96282958984375, "loss": 0.4047, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.431434154510498, "rewards/margins": 1.7203556299209595, "rewards/rejected": 0.711078405380249, "step": 106640 }, { "epoch": 4.951483355773249, "grad_norm": 51.96311569213867, "learning_rate": 3.0391383072566042e-09, "logits/chosen": -18.937339782714844, "logits/rejected": -18.61210823059082, "logps/chosen": -325.1612548828125, "logps/rejected": -326.3365783691406, "loss": 1.3178, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7919843196868896, "rewards/margins": 0.15901437401771545, "rewards/rejected": 2.632969856262207, "step": 106650 }, { "epoch": 4.9519476298806815, "grad_norm": 93.44189453125, "learning_rate": 3.0112818608106225e-09, "logits/chosen": -19.765377044677734, "logits/rejected": -18.72671890258789, "logps/chosen": -400.27984619140625, "logps/rejected": -377.0763244628906, "loss": 0.8594, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6996471881866455, "rewards/margins": 1.22077214717865, "rewards/rejected": 2.4788753986358643, "step": 106660 }, { "epoch": 4.952411903988114, "grad_norm": 67.93561553955078, 
"learning_rate": 2.9834254143646404e-09, "logits/chosen": -20.142681121826172, "logits/rejected": -18.810184478759766, "logps/chosen": -325.7348937988281, "logps/rejected": -207.61679077148438, "loss": 0.4141, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.16092586517334, "rewards/margins": 2.4471123218536377, "rewards/rejected": 1.7138137817382812, "step": 106670 }, { "epoch": 4.952876178095548, "grad_norm": 7.746964931488037, "learning_rate": 2.9555689679186594e-09, "logits/chosen": -20.320823669433594, "logits/rejected": -19.38774871826172, "logps/chosen": -426.0533142089844, "logps/rejected": -337.6181945800781, "loss": 0.493, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.354488372802734, "rewards/margins": 1.8385531902313232, "rewards/rejected": 2.5159354209899902, "step": 106680 }, { "epoch": 4.953340452202981, "grad_norm": 11.951624870300293, "learning_rate": 2.9277125214726773e-09, "logits/chosen": -19.472599029541016, "logits/rejected": -19.227922439575195, "logps/chosen": -363.6936950683594, "logps/rejected": -358.1866760253906, "loss": 0.69, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4407880306243896, "rewards/margins": 0.5897364020347595, "rewards/rejected": 1.851051688194275, "step": 106690 }, { "epoch": 4.9538047263104135, "grad_norm": 94.96717071533203, "learning_rate": 2.8998560750266955e-09, "logits/chosen": -18.55491065979004, "logits/rejected": -17.912866592407227, "logps/chosen": -377.7619934082031, "logps/rejected": -357.2741394042969, "loss": 0.7034, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.893122911453247, "rewards/margins": 1.035582423210144, "rewards/rejected": 2.8575406074523926, "step": 106700 }, { "epoch": 4.954269000417847, "grad_norm": 171.14370727539062, "learning_rate": 2.871999628580714e-09, "logits/chosen": -19.405973434448242, "logits/rejected": -18.0767822265625, "logps/chosen": -385.89666748046875, "logps/rejected": -344.4328918457031, "loss": 0.6361, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2595794200897217, "rewards/margins": 1.1371331214904785, "rewards/rejected": 2.122446298599243, "step": 106710 }, { "epoch": 4.95473327452528, "grad_norm": 11.284955978393555, "learning_rate": 2.844143182134732e-09, "logits/chosen": -19.229907989501953, "logits/rejected": -18.804485321044922, "logps/chosen": -446.536376953125, "logps/rejected": -437.77252197265625, "loss": 0.2648, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.587874412536621, "rewards/margins": 1.7143948078155518, "rewards/rejected": 2.8734798431396484, "step": 106720 }, { "epoch": 4.955197548632713, "grad_norm": 7.398829936981201, "learning_rate": 2.8162867356887507e-09, "logits/chosen": -18.351882934570312, "logits/rejected": -18.530853271484375, "logps/chosen": -309.21807861328125, "logps/rejected": -325.758544921875, "loss": 0.8453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.690952777862549, "rewards/margins": 0.5835952758789062, "rewards/rejected": 2.1073575019836426, "step": 106730 }, { "epoch": 4.955661822740145, "grad_norm": 1.139999508857727, "learning_rate": 2.788430289242769e-09, "logits/chosen": -19.189159393310547, "logits/rejected": -18.581377029418945, "logps/chosen": -493.89166259765625, "logps/rejected": -360.96148681640625, "loss": 0.4911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.099492073059082, "rewards/margins": 1.651642084121704, "rewards/rejected": 2.447849988937378, "step": 106740 }, { "epoch": 4.956126096847579, "grad_norm": 54.64194869995117, "learning_rate": 2.7605738427967873e-09, "logits/chosen": -18.961339950561523, "logits/rejected": -18.61403465270996, "logps/chosen": -306.04107666015625, "logps/rejected": -314.97283935546875, "loss": 1.7247, "rewards/accuracies": 0.5, "rewards/chosen": 1.4478329420089722, "rewards/margins": -0.9348308444023132, "rewards/rejected": 2.3826639652252197, "step": 106750 }, { "epoch": 4.956590370955012, "grad_norm": 
7.502991676330566, "learning_rate": 2.7327173963508055e-09, "logits/chosen": -19.764633178710938, "logits/rejected": -18.783790588378906, "logps/chosen": -360.28277587890625, "logps/rejected": -277.36053466796875, "loss": 0.1848, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.891835689544678, "rewards/margins": 3.201341152191162, "rewards/rejected": 1.690494179725647, "step": 106760 }, { "epoch": 4.957054645062445, "grad_norm": 0.06577183306217194, "learning_rate": 2.7048609499048234e-09, "logits/chosen": -18.482545852661133, "logits/rejected": -18.112091064453125, "logps/chosen": -427.43194580078125, "logps/rejected": -364.0897521972656, "loss": 0.654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.5997090339660645, "rewards/margins": 2.231456756591797, "rewards/rejected": 2.3682520389556885, "step": 106770 }, { "epoch": 4.957518919169878, "grad_norm": 190.00048828125, "learning_rate": 2.677004503458842e-09, "logits/chosen": -18.61142349243164, "logits/rejected": -17.55270004272461, "logps/chosen": -408.6079406738281, "logps/rejected": -291.5352783203125, "loss": 0.7255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.281921863555908, "rewards/margins": 1.3799073696136475, "rewards/rejected": 1.902014136314392, "step": 106780 }, { "epoch": 4.957983193277311, "grad_norm": 54.37395095825195, "learning_rate": 2.6491480570128603e-09, "logits/chosen": -18.693782806396484, "logits/rejected": -18.63856315612793, "logps/chosen": -431.89703369140625, "logps/rejected": -397.45703125, "loss": 1.1475, "rewards/accuracies": 0.5, "rewards/chosen": 4.127294063568115, "rewards/margins": -0.015303564257919788, "rewards/rejected": 4.142597675323486, "step": 106790 }, { "epoch": 4.958447467384744, "grad_norm": 131.6414794921875, "learning_rate": 2.6212916105668785e-09, "logits/chosen": -20.000261306762695, "logits/rejected": -18.856054306030273, "logps/chosen": -491.6025390625, "logps/rejected": -396.1558532714844, "loss": 0.3666, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9765572547912598, "rewards/margins": 2.004254102706909, "rewards/rejected": 1.972303032875061, "step": 106800 }, { "epoch": 4.958911741492177, "grad_norm": 169.0497589111328, "learning_rate": 2.593435164120897e-09, "logits/chosen": -19.265117645263672, "logits/rejected": -18.21457862854004, "logps/chosen": -527.6400756835938, "logps/rejected": -361.1492614746094, "loss": 0.984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.477625608444214, "rewards/margins": 0.9165645837783813, "rewards/rejected": 2.561060905456543, "step": 106810 }, { "epoch": 4.95937601559961, "grad_norm": 141.58743286132812, "learning_rate": 2.565578717674915e-09, "logits/chosen": -19.781970977783203, "logits/rejected": -18.343090057373047, "logps/chosen": -604.9304809570312, "logps/rejected": -449.09259033203125, "loss": 0.4747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.086678504943848, "rewards/margins": 1.823425531387329, "rewards/rejected": 3.2632534503936768, "step": 106820 }, { "epoch": 4.959840289707043, "grad_norm": 71.37458038330078, "learning_rate": 2.5377222712289333e-09, "logits/chosen": -18.88642692565918, "logits/rejected": -18.883411407470703, "logps/chosen": -347.97100830078125, "logps/rejected": -346.8794860839844, "loss": 0.9989, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2194247245788574, "rewards/margins": 0.24884358048439026, "rewards/rejected": 1.970581293106079, "step": 106830 }, { "epoch": 4.960304563814476, "grad_norm": 56.31679916381836, "learning_rate": 2.509865824782952e-09, "logits/chosen": -19.000377655029297, "logits/rejected": -17.927806854248047, "logps/chosen": -386.9193115234375, "logps/rejected": -301.02838134765625, "loss": 0.4477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.511615037918091, "rewards/margins": 1.4430478811264038, "rewards/rejected": 2.0685672760009766, "step": 106840 }, { "epoch": 4.9607688379219095, "grad_norm": 
243.65530395507812, "learning_rate": 2.4820093783369703e-09, "logits/chosen": -18.65079116821289, "logits/rejected": -17.988122940063477, "logps/chosen": -355.0517883300781, "logps/rejected": -334.4058532714844, "loss": 1.0852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.545994520187378, "rewards/margins": 0.8217609524726868, "rewards/rejected": 1.724233627319336, "step": 106850 }, { "epoch": 4.961233112029342, "grad_norm": 72.12641143798828, "learning_rate": 2.454152931890988e-09, "logits/chosen": -19.5738468170166, "logits/rejected": -18.41300392150879, "logps/chosen": -355.6358337402344, "logps/rejected": -377.7715759277344, "loss": 0.473, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.614129066467285, "rewards/margins": 1.0750229358673096, "rewards/rejected": 1.539106011390686, "step": 106860 }, { "epoch": 4.961697386136775, "grad_norm": 37.91426086425781, "learning_rate": 2.4262964854450064e-09, "logits/chosen": -19.335424423217773, "logits/rejected": -18.76380729675293, "logps/chosen": -432.853759765625, "logps/rejected": -366.02716064453125, "loss": 0.7156, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3727831840515137, "rewards/margins": 1.3397276401519775, "rewards/rejected": 2.0330557823181152, "step": 106870 }, { "epoch": 4.962161660244208, "grad_norm": 143.57754516601562, "learning_rate": 2.3984400389990246e-09, "logits/chosen": -18.52756118774414, "logits/rejected": -17.974952697753906, "logps/chosen": -390.0348205566406, "logps/rejected": -312.3617248535156, "loss": 0.7721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.859839916229248, "rewards/margins": 1.1518170833587646, "rewards/rejected": 1.7080227136611938, "step": 106880 }, { "epoch": 4.9626259343516415, "grad_norm": 21.081565856933594, "learning_rate": 2.3705835925530433e-09, "logits/chosen": -20.537715911865234, "logits/rejected": -20.53949546813965, "logps/chosen": -427.44366455078125, "logps/rejected": -498.8251037597656, 
"loss": 0.624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.8489460945129395, "rewards/margins": 0.7970312237739563, "rewards/rejected": 4.051915168762207, "step": 106890 }, { "epoch": 4.963090208459074, "grad_norm": 115.52596282958984, "learning_rate": 2.3427271461070616e-09, "logits/chosen": -18.93328285217285, "logits/rejected": -18.074459075927734, "logps/chosen": -319.21087646484375, "logps/rejected": -281.5035095214844, "loss": 0.4515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4612655639648438, "rewards/margins": 1.3336626291275024, "rewards/rejected": 1.1276029348373413, "step": 106900 }, { "epoch": 4.963554482566507, "grad_norm": 166.542236328125, "learning_rate": 2.31487069966108e-09, "logits/chosen": -18.816007614135742, "logits/rejected": -18.475231170654297, "logps/chosen": -269.3500061035156, "logps/rejected": -292.6517639160156, "loss": 0.8339, "rewards/accuracies": 0.5, "rewards/chosen": 1.847832441329956, "rewards/margins": 0.2934408485889435, "rewards/rejected": 1.554391622543335, "step": 106910 }, { "epoch": 4.964018756673941, "grad_norm": 156.3797607421875, "learning_rate": 2.287014253215098e-09, "logits/chosen": -18.234952926635742, "logits/rejected": -17.567358016967773, "logps/chosen": -353.4020080566406, "logps/rejected": -257.89544677734375, "loss": 0.4622, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6143956184387207, "rewards/margins": 2.146588087081909, "rewards/rejected": 1.467807650566101, "step": 106920 }, { "epoch": 4.964483030781373, "grad_norm": 49.98554611206055, "learning_rate": 2.2591578067691163e-09, "logits/chosen": -18.19879150390625, "logits/rejected": -17.352025985717773, "logps/chosen": -291.400146484375, "logps/rejected": -237.0041961669922, "loss": 0.681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.373901844024658, "rewards/margins": 0.7806205749511719, "rewards/rejected": 1.5932815074920654, "step": 106930 }, { "epoch": 4.964947304888806, "grad_norm": 
1.9453333616256714, "learning_rate": 2.2313013603231346e-09, "logits/chosen": -19.196407318115234, "logits/rejected": -18.34537124633789, "logps/chosen": -558.5645751953125, "logps/rejected": -406.7052917480469, "loss": 0.6631, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.680274963378906, "rewards/margins": 1.6235824823379517, "rewards/rejected": 3.056692123413086, "step": 106940 }, { "epoch": 4.965411578996239, "grad_norm": 0.9769436120986938, "learning_rate": 2.2034449138771533e-09, "logits/chosen": -19.979480743408203, "logits/rejected": -18.793254852294922, "logps/chosen": -363.65380859375, "logps/rejected": -283.5318908691406, "loss": 0.3875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9698147773742676, "rewards/margins": 1.6483064889907837, "rewards/rejected": 2.3215088844299316, "step": 106950 }, { "epoch": 4.965875853103673, "grad_norm": 22.1031494140625, "learning_rate": 2.175588467431171e-09, "logits/chosen": -19.144908905029297, "logits/rejected": -19.12461280822754, "logps/chosen": -538.4017333984375, "logps/rejected": -445.79541015625, "loss": 0.4889, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.750115394592285, "rewards/margins": 1.2667890787124634, "rewards/rejected": 3.4833266735076904, "step": 106960 }, { "epoch": 4.966340127211105, "grad_norm": 135.26657104492188, "learning_rate": 2.1477320209851894e-09, "logits/chosen": -18.910642623901367, "logits/rejected": -17.720134735107422, "logps/chosen": -387.3340148925781, "logps/rejected": -284.6042175292969, "loss": 0.3093, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0275890827178955, "rewards/margins": 2.378060817718506, "rewards/rejected": 0.6495282053947449, "step": 106970 }, { "epoch": 4.966804401318538, "grad_norm": 37.4912223815918, "learning_rate": 2.119875574539208e-09, "logits/chosen": -18.688243865966797, "logits/rejected": -17.040010452270508, "logps/chosen": -363.0845031738281, "logps/rejected": -273.23248291015625, "loss": 
0.7375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.218773365020752, "rewards/margins": 3.8745903968811035, "rewards/rejected": 1.3441827297210693, "step": 106980 }, { "epoch": 4.967268675425972, "grad_norm": 6.059486389160156, "learning_rate": 2.0920191280932263e-09, "logits/chosen": -19.008411407470703, "logits/rejected": -18.083410263061523, "logps/chosen": -306.00750732421875, "logps/rejected": -225.34481811523438, "loss": 0.8694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2087149620056152, "rewards/margins": 1.4113715887069702, "rewards/rejected": 0.7973432540893555, "step": 106990 }, { "epoch": 4.967732949533405, "grad_norm": 43.7933235168457, "learning_rate": 2.064162681647244e-09, "logits/chosen": -19.095104217529297, "logits/rejected": -17.371854782104492, "logps/chosen": -382.85345458984375, "logps/rejected": -202.23117065429688, "loss": 0.473, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.889582633972168, "rewards/margins": 3.4266762733459473, "rewards/rejected": 1.462906837463379, "step": 107000 }, { "epoch": 4.968197223640837, "grad_norm": 148.96543884277344, "learning_rate": 2.036306235201263e-09, "logits/chosen": -18.66201400756836, "logits/rejected": -18.17832374572754, "logps/chosen": -464.68023681640625, "logps/rejected": -310.128662109375, "loss": 0.9098, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2264435291290283, "rewards/margins": 0.94038325548172, "rewards/rejected": 2.286060094833374, "step": 107010 }, { "epoch": 4.96866149774827, "grad_norm": 42.22763442993164, "learning_rate": 2.008449788755281e-09, "logits/chosen": -19.879844665527344, "logits/rejected": -19.293853759765625, "logps/chosen": -402.47674560546875, "logps/rejected": -343.406494140625, "loss": 0.5428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.875028610229492, "rewards/margins": 0.9526656866073608, "rewards/rejected": 2.922362804412842, "step": 107020 }, { "epoch": 4.969125771855704, 
"grad_norm": 38.904911041259766, "learning_rate": 1.9805933423092993e-09, "logits/chosen": -19.078048706054688, "logits/rejected": -18.595012664794922, "logps/chosen": -282.63067626953125, "logps/rejected": -226.5565185546875, "loss": 0.536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.259182095527649, "rewards/margins": 0.9188075065612793, "rewards/rejected": 0.34037455916404724, "step": 107030 }, { "epoch": 4.969590045963137, "grad_norm": 158.5774383544922, "learning_rate": 1.9527368958633176e-09, "logits/chosen": -17.64682388305664, "logits/rejected": -18.09718894958496, "logps/chosen": -383.5264892578125, "logps/rejected": -341.464111328125, "loss": 0.603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5794589519500732, "rewards/margins": 1.5597069263458252, "rewards/rejected": 2.019752264022827, "step": 107040 }, { "epoch": 4.970054320070569, "grad_norm": 66.34724426269531, "learning_rate": 1.924880449417336e-09, "logits/chosen": -18.992361068725586, "logits/rejected": -18.704776763916016, "logps/chosen": -395.15704345703125, "logps/rejected": -325.7265625, "loss": 0.6934, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3353867530822754, "rewards/margins": 1.184348225593567, "rewards/rejected": 2.151038408279419, "step": 107050 }, { "epoch": 4.970518594178003, "grad_norm": 0.01630234159529209, "learning_rate": 1.897024002971354e-09, "logits/chosen": -18.53219985961914, "logits/rejected": -17.672969818115234, "logps/chosen": -396.40692138671875, "logps/rejected": -349.8703308105469, "loss": 0.5344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.694383144378662, "rewards/margins": 2.123640537261963, "rewards/rejected": 1.5707426071166992, "step": 107060 }, { "epoch": 4.970982868285436, "grad_norm": 70.51361846923828, "learning_rate": 1.8691675565253724e-09, "logits/chosen": -19.271873474121094, "logits/rejected": -18.488941192626953, "logps/chosen": -367.188720703125, "logps/rejected": 
-381.36614990234375, "loss": 0.9684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.016364574432373, "rewards/margins": 0.6427764892578125, "rewards/rejected": 2.3735880851745605, "step": 107070 }, { "epoch": 4.971447142392869, "grad_norm": 6.379614353179932, "learning_rate": 1.8413111100793908e-09, "logits/chosen": -19.13944435119629, "logits/rejected": -18.400941848754883, "logps/chosen": -435.6966247558594, "logps/rejected": -410.89990234375, "loss": 0.6014, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.379586219787598, "rewards/margins": 1.5835270881652832, "rewards/rejected": 3.7960593700408936, "step": 107080 }, { "epoch": 4.971911416500301, "grad_norm": 127.53435516357422, "learning_rate": 1.813454663633409e-09, "logits/chosen": -19.393354415893555, "logits/rejected": -18.564861297607422, "logps/chosen": -397.0693359375, "logps/rejected": -296.68670654296875, "loss": 0.4607, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.223032474517822, "rewards/margins": 2.4222633838653564, "rewards/rejected": 1.8007690906524658, "step": 107090 }, { "epoch": 4.972375690607735, "grad_norm": 262.0791320800781, "learning_rate": 1.7855982171874274e-09, "logits/chosen": -18.7641658782959, "logits/rejected": -18.35310935974121, "logps/chosen": -290.4114074707031, "logps/rejected": -301.6611633300781, "loss": 1.3226, "rewards/accuracies": 0.5, "rewards/chosen": 3.4084842205047607, "rewards/margins": 0.7028648257255554, "rewards/rejected": 2.7056193351745605, "step": 107100 }, { "epoch": 4.972839964715168, "grad_norm": 16.562313079833984, "learning_rate": 1.7577417707414458e-09, "logits/chosen": -18.931913375854492, "logits/rejected": -18.807111740112305, "logps/chosen": -333.33526611328125, "logps/rejected": -313.84039306640625, "loss": 0.7793, "rewards/accuracies": 0.5, "rewards/chosen": 2.7426505088806152, "rewards/margins": 0.7568010091781616, "rewards/rejected": 1.9858496189117432, "step": 107110 }, { "epoch": 4.9733042388226005, 
"grad_norm": 84.80201721191406, "learning_rate": 1.7298853242954639e-09, "logits/chosen": -19.31952476501465, "logits/rejected": -18.899967193603516, "logps/chosen": -379.5279846191406, "logps/rejected": -312.84735107421875, "loss": 0.8838, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2285735607147217, "rewards/margins": 0.6270228624343872, "rewards/rejected": 2.601550579071045, "step": 107120 }, { "epoch": 4.973768512930034, "grad_norm": 17.580869674682617, "learning_rate": 1.7048145224940803e-09, "logits/chosen": -19.342262268066406, "logits/rejected": -19.080259323120117, "logps/chosen": -486.04339599609375, "logps/rejected": -478.6731872558594, "loss": 0.695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.261535167694092, "rewards/margins": 0.33093157410621643, "rewards/rejected": 2.9306037425994873, "step": 107130 }, { "epoch": 4.974232787037467, "grad_norm": 145.64205932617188, "learning_rate": 1.6769580760480988e-09, "logits/chosen": -18.45473289489746, "logits/rejected": -18.122102737426758, "logps/chosen": -356.05865478515625, "logps/rejected": -242.74203491210938, "loss": 0.6134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3097338676452637, "rewards/margins": 1.8372442722320557, "rewards/rejected": 1.472489595413208, "step": 107140 }, { "epoch": 4.9746970611449, "grad_norm": 5.041347980499268, "learning_rate": 1.649101629602117e-09, "logits/chosen": -20.020240783691406, "logits/rejected": -18.779052734375, "logps/chosen": -409.51324462890625, "logps/rejected": -351.971435546875, "loss": 0.4278, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.928302049636841, "rewards/margins": 1.905491828918457, "rewards/rejected": 2.022810459136963, "step": 107150 }, { "epoch": 4.9751613352523325, "grad_norm": 39.22528076171875, "learning_rate": 1.6212451831561351e-09, "logits/chosen": -19.103240966796875, "logits/rejected": -17.792217254638672, "logps/chosen": -362.06744384765625, "logps/rejected": 
-216.2305450439453, "loss": 0.4227, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9286022186279297, "rewards/margins": 1.9220632314682007, "rewards/rejected": 1.0065388679504395, "step": 107160 }, { "epoch": 4.975625609359766, "grad_norm": 258.9122619628906, "learning_rate": 1.5933887367101536e-09, "logits/chosen": -18.678932189941406, "logits/rejected": -17.460399627685547, "logps/chosen": -412.44378662109375, "logps/rejected": -308.99078369140625, "loss": 0.5771, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2798213958740234, "rewards/margins": 1.7532684803009033, "rewards/rejected": 1.5265529155731201, "step": 107170 }, { "epoch": 4.976089883467199, "grad_norm": 28.93855094909668, "learning_rate": 1.5655322902641719e-09, "logits/chosen": -21.393936157226562, "logits/rejected": -19.884122848510742, "logps/chosen": -458.2230529785156, "logps/rejected": -302.281982421875, "loss": 0.2774, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.559284687042236, "rewards/margins": 2.0857415199279785, "rewards/rejected": 2.473543405532837, "step": 107180 }, { "epoch": 4.976554157574632, "grad_norm": 0.27913111448287964, "learning_rate": 1.5376758438181903e-09, "logits/chosen": -19.54317283630371, "logits/rejected": -18.422332763671875, "logps/chosen": -526.8020629882812, "logps/rejected": -396.3459777832031, "loss": 0.5056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.533321380615234, "rewards/margins": 1.5401450395584106, "rewards/rejected": 2.9931764602661133, "step": 107190 }, { "epoch": 4.977018431682065, "grad_norm": 149.48977661132812, "learning_rate": 1.5098193973722086e-09, "logits/chosen": -19.65460205078125, "logits/rejected": -19.470388412475586, "logps/chosen": -388.33636474609375, "logps/rejected": -351.8223571777344, "loss": 0.6162, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.069493293762207, "rewards/margins": 1.4032708406448364, "rewards/rejected": 2.666222333908081, "step": 107200 }, 
{ "epoch": 4.977482705789498, "grad_norm": 31.025362014770508, "learning_rate": 1.4819629509262266e-09, "logits/chosen": -19.786624908447266, "logits/rejected": -19.265674591064453, "logps/chosen": -404.9823913574219, "logps/rejected": -321.7244567871094, "loss": 0.7269, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.228480339050293, "rewards/margins": 2.442328929901123, "rewards/rejected": 1.7861512899398804, "step": 107210 }, { "epoch": 4.977946979896931, "grad_norm": 131.2696533203125, "learning_rate": 1.454106504480245e-09, "logits/chosen": -18.23571014404297, "logits/rejected": -18.987316131591797, "logps/chosen": -397.2742004394531, "logps/rejected": -472.1031188964844, "loss": 0.7355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9652061462402344, "rewards/margins": 0.4878840446472168, "rewards/rejected": 2.4773221015930176, "step": 107220 }, { "epoch": 4.978411254004364, "grad_norm": 42.551788330078125, "learning_rate": 1.4262500580342634e-09, "logits/chosen": -18.477005004882812, "logits/rejected": -18.859609603881836, "logps/chosen": -398.0459899902344, "logps/rejected": -396.1434020996094, "loss": 0.926, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7159228324890137, "rewards/margins": 0.30994096398353577, "rewards/rejected": 3.405982255935669, "step": 107230 }, { "epoch": 4.978875528111797, "grad_norm": 245.34786987304688, "learning_rate": 1.3983936115882816e-09, "logits/chosen": -18.398780822753906, "logits/rejected": -18.480478286743164, "logps/chosen": -361.40118408203125, "logps/rejected": -402.039306640625, "loss": 0.9675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3140079975128174, "rewards/margins": 0.7953842282295227, "rewards/rejected": 2.5186238288879395, "step": 107240 }, { "epoch": 4.97933980221923, "grad_norm": 14.191728591918945, "learning_rate": 1.3705371651423e-09, "logits/chosen": -19.41138458251953, "logits/rejected": -19.18738555908203, "logps/chosen": -357.538818359375, 
"logps/rejected": -352.72259521484375, "loss": 0.8746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.483231067657471, "rewards/margins": 0.43863630294799805, "rewards/rejected": 4.0445942878723145, "step": 107250 }, { "epoch": 4.979804076326663, "grad_norm": 21.528457641601562, "learning_rate": 1.3426807186963181e-09, "logits/chosen": -20.395282745361328, "logits/rejected": -19.51568031311035, "logps/chosen": -324.3050231933594, "logps/rejected": -262.08734130859375, "loss": 0.4693, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.616936445236206, "rewards/margins": 1.6200268268585205, "rewards/rejected": 1.9969093799591064, "step": 107260 }, { "epoch": 4.9802683504340965, "grad_norm": 93.66338348388672, "learning_rate": 1.3148242722503366e-09, "logits/chosen": -18.805910110473633, "logits/rejected": -18.826656341552734, "logps/chosen": -348.3624572753906, "logps/rejected": -335.78765869140625, "loss": 1.4323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4529645442962646, "rewards/margins": 0.18228575587272644, "rewards/rejected": 2.270678997039795, "step": 107270 }, { "epoch": 4.980732624541529, "grad_norm": 163.9763946533203, "learning_rate": 1.2869678258043549e-09, "logits/chosen": -18.634952545166016, "logits/rejected": -18.23342514038086, "logps/chosen": -312.25103759765625, "logps/rejected": -272.64703369140625, "loss": 0.9534, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8623337745666504, "rewards/margins": 0.10978834331035614, "rewards/rejected": 2.7525453567504883, "step": 107280 }, { "epoch": 4.981196898648962, "grad_norm": 6.606189250946045, "learning_rate": 1.2591113793583731e-09, "logits/chosen": -18.03702735900879, "logits/rejected": -18.678831100463867, "logps/chosen": -358.9299621582031, "logps/rejected": -433.46343994140625, "loss": 1.5086, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3116822242736816, "rewards/margins": -0.38508403301239014, "rewards/rejected": 
3.6967663764953613, "step": 107290 }, { "epoch": 4.981661172756396, "grad_norm": 2.963886260986328, "learning_rate": 1.2312549329123916e-09, "logits/chosen": -19.506067276000977, "logits/rejected": -18.778806686401367, "logps/chosen": -317.75909423828125, "logps/rejected": -208.6356964111328, "loss": 0.2442, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.063265562057495, "rewards/margins": 2.1777844429016113, "rewards/rejected": 0.8854813575744629, "step": 107300 }, { "epoch": 4.9821254468638285, "grad_norm": 44.224246978759766, "learning_rate": 1.2033984864664096e-09, "logits/chosen": -19.327627182006836, "logits/rejected": -18.845890045166016, "logps/chosen": -558.185302734375, "logps/rejected": -469.0091247558594, "loss": 0.2994, "rewards/accuracies": 1.0, "rewards/chosen": 5.521939754486084, "rewards/margins": 1.794638991355896, "rewards/rejected": 3.7273011207580566, "step": 107310 }, { "epoch": 4.982589720971261, "grad_norm": 20.028018951416016, "learning_rate": 1.1755420400204279e-09, "logits/chosen": -19.83089828491211, "logits/rejected": -19.310558319091797, "logps/chosen": -411.36785888671875, "logps/rejected": -348.68023681640625, "loss": 0.6985, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.589799404144287, "rewards/margins": 1.0666742324829102, "rewards/rejected": 2.523125171661377, "step": 107320 }, { "epoch": 4.983053995078695, "grad_norm": 50.03937530517578, "learning_rate": 1.1476855935744464e-09, "logits/chosen": -19.830196380615234, "logits/rejected": -19.16403579711914, "logps/chosen": -330.37640380859375, "logps/rejected": -298.28204345703125, "loss": 0.6206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.866506576538086, "rewards/margins": 1.498964786529541, "rewards/rejected": 2.367542266845703, "step": 107330 }, { "epoch": 4.983518269186128, "grad_norm": 55.95602035522461, "learning_rate": 1.1198291471284646e-09, "logits/chosen": -19.165645599365234, "logits/rejected": -18.839702606201172, 
"logps/chosen": -444.69757080078125, "logps/rejected": -418.9769592285156, "loss": 0.3935, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7943522930145264, "rewards/margins": 0.972450852394104, "rewards/rejected": 2.821901321411133, "step": 107340 }, { "epoch": 4.9839825432935605, "grad_norm": 137.13247680664062, "learning_rate": 1.0919727006824829e-09, "logits/chosen": -19.94458770751953, "logits/rejected": -19.603641510009766, "logps/chosen": -473.94610595703125, "logps/rejected": -389.5128479003906, "loss": 0.5332, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.727512836456299, "rewards/margins": 1.9669231176376343, "rewards/rejected": 1.7605892419815063, "step": 107350 }, { "epoch": 4.984446817400993, "grad_norm": 2.6823067665100098, "learning_rate": 1.0641162542365011e-09, "logits/chosen": -19.658931732177734, "logits/rejected": -19.154438018798828, "logps/chosen": -410.27752685546875, "logps/rejected": -296.4893798828125, "loss": 0.7012, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.310314655303955, "rewards/margins": 1.4493051767349243, "rewards/rejected": 1.8610093593597412, "step": 107360 }, { "epoch": 4.984911091508427, "grad_norm": 54.821556091308594, "learning_rate": 1.0362598077905196e-09, "logits/chosen": -19.286794662475586, "logits/rejected": -19.32839012145996, "logps/chosen": -392.748291015625, "logps/rejected": -357.26458740234375, "loss": 0.8981, "rewards/accuracies": 0.5, "rewards/chosen": 3.2636287212371826, "rewards/margins": 0.40366047620773315, "rewards/rejected": 2.859968662261963, "step": 107370 }, { "epoch": 4.98537536561586, "grad_norm": 134.51918029785156, "learning_rate": 1.0084033613445379e-09, "logits/chosen": -18.51555824279785, "logits/rejected": -18.214662551879883, "logps/chosen": -291.29888916015625, "logps/rejected": -270.0440368652344, "loss": 0.9283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5090566873550415, "rewards/margins": 0.661693811416626, 
"rewards/rejected": 0.8473628759384155, "step": 107380 }, { "epoch": 4.9858396397232925, "grad_norm": 161.8158416748047, "learning_rate": 9.805469148985561e-10, "logits/chosen": -19.477014541625977, "logits/rejected": -19.313486099243164, "logps/chosen": -342.128173828125, "logps/rejected": -245.7740936279297, "loss": 0.739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3577213287353516, "rewards/margins": 0.297088086605072, "rewards/rejected": 2.0606331825256348, "step": 107390 }, { "epoch": 4.986303913830726, "grad_norm": 79.67122650146484, "learning_rate": 9.526904684525744e-10, "logits/chosen": -18.337604522705078, "logits/rejected": -19.001087188720703, "logps/chosen": -402.33953857421875, "logps/rejected": -458.9049377441406, "loss": 1.3311, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5295066833496094, "rewards/margins": -0.5183725953102112, "rewards/rejected": 4.047879219055176, "step": 107400 }, { "epoch": 4.986768187938159, "grad_norm": 85.42291259765625, "learning_rate": 9.248340220065926e-10, "logits/chosen": -19.59676170349121, "logits/rejected": -17.71099281311035, "logps/chosen": -387.2118225097656, "logps/rejected": -305.35540771484375, "loss": 0.6542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.0101470947265625, "rewards/margins": 2.598576068878174, "rewards/rejected": 1.4115711450576782, "step": 107410 }, { "epoch": 4.987232462045592, "grad_norm": 1.2666332721710205, "learning_rate": 8.96977575560611e-10, "logits/chosen": -19.719562530517578, "logits/rejected": -18.541362762451172, "logps/chosen": -346.74468994140625, "logps/rejected": -248.7245330810547, "loss": 0.5084, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.118029832839966, "rewards/margins": 2.1296589374542236, "rewards/rejected": 0.9883708953857422, "step": 107420 }, { "epoch": 4.9876967361530244, "grad_norm": 14.541522979736328, "learning_rate": 8.691211291146292e-10, "logits/chosen": -19.61827278137207, 
"logits/rejected": -19.65127944946289, "logps/chosen": -385.9414367675781, "logps/rejected": -396.83770751953125, "loss": 1.2261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2129642963409424, "rewards/margins": 0.19616155326366425, "rewards/rejected": 3.0168025493621826, "step": 107430 }, { "epoch": 4.988161010260458, "grad_norm": 180.27488708496094, "learning_rate": 8.412646826686475e-10, "logits/chosen": -19.3287296295166, "logits/rejected": -19.859432220458984, "logps/chosen": -517.9002075195312, "logps/rejected": -407.29742431640625, "loss": 1.1206, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.066891670227051, "rewards/margins": 0.5603061318397522, "rewards/rejected": 3.5065855979919434, "step": 107440 }, { "epoch": 4.988625284367891, "grad_norm": 162.52703857421875, "learning_rate": 8.134082362226659e-10, "logits/chosen": -17.980344772338867, "logits/rejected": -18.129606246948242, "logps/chosen": -418.0421447753906, "logps/rejected": -394.04443359375, "loss": 1.2693, "rewards/accuracies": 0.5, "rewards/chosen": 2.567997932434082, "rewards/margins": -0.1903354823589325, "rewards/rejected": 2.758333444595337, "step": 107450 }, { "epoch": 4.989089558475324, "grad_norm": 0.11769233644008636, "learning_rate": 7.855517897766841e-10, "logits/chosen": -19.37372398376465, "logits/rejected": -18.822708129882812, "logps/chosen": -346.0331115722656, "logps/rejected": -358.50164794921875, "loss": 0.774, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.679427623748779, "rewards/margins": 1.2048074007034302, "rewards/rejected": 3.4746193885803223, "step": 107460 }, { "epoch": 4.989553832582757, "grad_norm": 121.03546905517578, "learning_rate": 7.576953433307024e-10, "logits/chosen": -20.410850524902344, "logits/rejected": -20.616247177124023, "logps/chosen": -352.99456787109375, "logps/rejected": -349.3279724121094, "loss": 1.2358, "rewards/accuracies": 0.5, "rewards/chosen": 3.4497838020324707, "rewards/margins": 
0.20461967587471008, "rewards/rejected": 3.245164155960083, "step": 107470 }, { "epoch": 4.99001810669019, "grad_norm": 92.76680755615234, "learning_rate": 7.298388968847207e-10, "logits/chosen": -18.686969757080078, "logits/rejected": -18.191524505615234, "logps/chosen": -499.7765197753906, "logps/rejected": -361.4754638671875, "loss": 0.8599, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.029726028442383, "rewards/margins": 0.9642857313156128, "rewards/rejected": 2.0654404163360596, "step": 107480 }, { "epoch": 4.990482380797623, "grad_norm": 146.30613708496094, "learning_rate": 7.01982450438739e-10, "logits/chosen": -18.263263702392578, "logits/rejected": -18.090978622436523, "logps/chosen": -408.76861572265625, "logps/rejected": -389.49505615234375, "loss": 0.6867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.045555353164673, "rewards/margins": 0.7455049753189087, "rewards/rejected": 2.3000502586364746, "step": 107490 }, { "epoch": 4.990946654905056, "grad_norm": 16.025320053100586, "learning_rate": 6.741260039927573e-10, "logits/chosen": -19.275793075561523, "logits/rejected": -19.138521194458008, "logps/chosen": -336.9376220703125, "logps/rejected": -266.25653076171875, "loss": 0.4506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.558053493499756, "rewards/margins": 1.4946720600128174, "rewards/rejected": 2.0633816719055176, "step": 107500 }, { "epoch": 4.991410929012489, "grad_norm": 0.2665066719055176, "learning_rate": 6.462695575467755e-10, "logits/chosen": -19.007139205932617, "logits/rejected": -17.07083511352539, "logps/chosen": -433.9690856933594, "logps/rejected": -256.6383361816406, "loss": 0.2695, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.352328777313232, "rewards/margins": 3.270265579223633, "rewards/rejected": 1.0820629596710205, "step": 107510 }, { "epoch": 4.991875203119922, "grad_norm": 23.634492874145508, "learning_rate": 6.184131111007938e-10, "logits/chosen": 
-18.98926544189453, "logits/rejected": -19.47519302368164, "logps/chosen": -431.68341064453125, "logps/rejected": -517.5543823242188, "loss": 1.2593, "rewards/accuracies": 0.5, "rewards/chosen": 3.9541728496551514, "rewards/margins": -0.4478476643562317, "rewards/rejected": 4.4020209312438965, "step": 107520 }, { "epoch": 4.992339477227355, "grad_norm": 59.89978790283203, "learning_rate": 5.905566646548122e-10, "logits/chosen": -19.125164031982422, "logits/rejected": -19.537822723388672, "logps/chosen": -337.40985107421875, "logps/rejected": -369.97735595703125, "loss": 0.6026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.658818483352661, "rewards/margins": 1.0326979160308838, "rewards/rejected": 2.626120090484619, "step": 107530 }, { "epoch": 4.9928037513347885, "grad_norm": 31.53331184387207, "learning_rate": 5.627002182088305e-10, "logits/chosen": -19.20381736755371, "logits/rejected": -18.172077178955078, "logps/chosen": -281.79254150390625, "logps/rejected": -204.876220703125, "loss": 0.9995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6815996170043945, "rewards/margins": 0.8292368054389954, "rewards/rejected": 0.8523628115653992, "step": 107540 }, { "epoch": 4.993268025442221, "grad_norm": 15.402141571044922, "learning_rate": 5.348437717628488e-10, "logits/chosen": -18.81649398803711, "logits/rejected": -18.76742172241211, "logps/chosen": -335.2027893066406, "logps/rejected": -334.48895263671875, "loss": 0.9733, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.334394931793213, "rewards/margins": 0.13754865527153015, "rewards/rejected": 2.1968464851379395, "step": 107550 }, { "epoch": 4.993732299549654, "grad_norm": 38.72039794921875, "learning_rate": 5.06987325316867e-10, "logits/chosen": -18.79323959350586, "logits/rejected": -17.827919006347656, "logps/chosen": -329.3941345214844, "logps/rejected": -232.48526000976562, "loss": 0.3956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6200127601623535, 
"rewards/margins": 1.8771381378173828, "rewards/rejected": 0.7428744435310364, "step": 107560 }, { "epoch": 4.994196573657087, "grad_norm": 59.931396484375, "learning_rate": 4.791308788708853e-10, "logits/chosen": -18.143177032470703, "logits/rejected": -18.050159454345703, "logps/chosen": -297.0707092285156, "logps/rejected": -335.2476806640625, "loss": 0.5802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.645817995071411, "rewards/margins": 0.48387861251831055, "rewards/rejected": 2.1619391441345215, "step": 107570 }, { "epoch": 4.9946608477645205, "grad_norm": 166.57618713378906, "learning_rate": 4.5127443242490366e-10, "logits/chosen": -19.548473358154297, "logits/rejected": -19.00276756286621, "logps/chosen": -468.67779541015625, "logps/rejected": -349.13592529296875, "loss": 0.6103, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.031797170639038, "rewards/margins": 0.9427558779716492, "rewards/rejected": 2.089041233062744, "step": 107580 }, { "epoch": 4.995125121871953, "grad_norm": 38.30690383911133, "learning_rate": 4.234179859789219e-10, "logits/chosen": -18.449398040771484, "logits/rejected": -17.292396545410156, "logps/chosen": -318.96124267578125, "logps/rejected": -234.33236694335938, "loss": 0.3233, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8811089992523193, "rewards/margins": 1.40642511844635, "rewards/rejected": 0.47468385100364685, "step": 107590 }, { "epoch": 4.995589395979386, "grad_norm": 5.137350082397461, "learning_rate": 3.9556153953294023e-10, "logits/chosen": -20.1363525390625, "logits/rejected": -19.8283634185791, "logps/chosen": -336.7950134277344, "logps/rejected": -313.1929626464844, "loss": 0.5624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9755663871765137, "rewards/margins": 1.1044366359710693, "rewards/rejected": 1.8711293935775757, "step": 107600 }, { "epoch": 4.99605367008682, "grad_norm": 39.998619079589844, "learning_rate": 3.677050930869585e-10, "logits/chosen": 
-19.508060455322266, "logits/rejected": -18.93862533569336, "logps/chosen": -421.67828369140625, "logps/rejected": -355.8097839355469, "loss": 0.6107, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.803377151489258, "rewards/margins": 1.0967108011245728, "rewards/rejected": 2.7066659927368164, "step": 107610 }, { "epoch": 4.996517944194252, "grad_norm": 41.93593978881836, "learning_rate": 3.3984864664097686e-10, "logits/chosen": -20.325084686279297, "logits/rejected": -19.96661376953125, "logps/chosen": -496.07037353515625, "logps/rejected": -480.14373779296875, "loss": 0.2934, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.058510780334473, "rewards/margins": 1.7029043436050415, "rewards/rejected": 2.3556060791015625, "step": 107620 }, { "epoch": 4.996982218301685, "grad_norm": 41.18568420410156, "learning_rate": 3.119922001949951e-10, "logits/chosen": -20.66689682006836, "logits/rejected": -18.98281478881836, "logps/chosen": -482.42205810546875, "logps/rejected": -367.9514465332031, "loss": 0.5104, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.394071578979492, "rewards/margins": 2.3245363235473633, "rewards/rejected": 2.06953501701355, "step": 107630 }, { "epoch": 4.997446492409118, "grad_norm": 101.70088958740234, "learning_rate": 2.841357537490134e-10, "logits/chosen": -18.224918365478516, "logits/rejected": -17.862979888916016, "logps/chosen": -426.29632568359375, "logps/rejected": -362.1079406738281, "loss": 0.4101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2816975116729736, "rewards/margins": 1.200165867805481, "rewards/rejected": 2.0815317630767822, "step": 107640 }, { "epoch": 4.997910766516552, "grad_norm": 42.09020233154297, "learning_rate": 2.562793073030317e-10, "logits/chosen": -19.61794662475586, "logits/rejected": -18.490650177001953, "logps/chosen": -470.28839111328125, "logps/rejected": -331.1594543457031, "loss": 0.2057, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
3.6550049781799316, "rewards/margins": 2.270134925842285, "rewards/rejected": 1.3848698139190674, "step": 107650 }, { "epoch": 4.998375040623984, "grad_norm": 5.862757682800293, "learning_rate": 2.2842286085705e-10, "logits/chosen": -18.439523696899414, "logits/rejected": -18.218868255615234, "logps/chosen": -451.74468994140625, "logps/rejected": -399.1405944824219, "loss": 0.8304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.648617744445801, "rewards/margins": 1.0047491788864136, "rewards/rejected": 3.6438674926757812, "step": 107660 }, { "epoch": 4.998839314731417, "grad_norm": 35.03554916381836, "learning_rate": 2.0056641441106828e-10, "logits/chosen": -19.252986907958984, "logits/rejected": -17.870567321777344, "logps/chosen": -351.8274230957031, "logps/rejected": -213.23306274414062, "loss": 0.3028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.08750057220459, "rewards/margins": 2.89631986618042, "rewards/rejected": 1.1911808252334595, "step": 107670 }, { "epoch": 4.999303588838851, "grad_norm": 61.36976623535156, "learning_rate": 1.727099679650866e-10, "logits/chosen": -19.657238006591797, "logits/rejected": -19.43453598022461, "logps/chosen": -475.9689025878906, "logps/rejected": -434.62646484375, "loss": 0.653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.127521991729736, "rewards/margins": 0.5174439549446106, "rewards/rejected": 3.6100783348083496, "step": 107680 }, { "epoch": 4.999767862946284, "grad_norm": 32.90167236328125, "learning_rate": 1.4485352151910488e-10, "logits/chosen": -19.426143646240234, "logits/rejected": -18.56009864807129, "logps/chosen": -318.7349853515625, "logps/rejected": -280.3258361816406, "loss": 0.6247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.037973642349243, "rewards/margins": 1.3486363887786865, "rewards/rejected": 1.689337134361267, "step": 107690 } ], "logging_steps": 10, "max_steps": 107695, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 
500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }