{ "best_metric": null, "best_model_checkpoint": null, "epoch": 35.95443037974684, "eval_steps": 100, "global_step": 7092, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005063291139240506, "grad_norm": 520145.53108452284, "learning_rate": 1.4084507042253521e-09, "logits/chosen": -16.270591735839844, "logits/rejected": -16.343984603881836, "logps/chosen": -186.17276000976562, "logps/rejected": -175.8095703125, "loss": 122464.3125, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05063291139240506, "grad_norm": 501181.05233525805, "learning_rate": 1.408450704225352e-08, "logits/chosen": -17.194263458251953, "logits/rejected": -17.04476547241211, "logps/chosen": -220.64031982421875, "logps/rejected": -220.79531860351562, "loss": 124716.2917, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 1.5937095554363623e-07, "rewards/margins": 1.5358187738456763e-05, "rewards/rejected": -1.5198814253380988e-05, "step": 10 }, { "epoch": 0.10126582278481013, "grad_norm": 537058.8643033113, "learning_rate": 2.816901408450704e-08, "logits/chosen": -16.468345642089844, "logits/rejected": -16.397050857543945, "logps/chosen": -238.16464233398438, "logps/rejected": -234.13320922851562, "loss": 125132.075, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -7.482715773221571e-06, "rewards/margins": 1.4337347238324583e-05, "rewards/rejected": -2.1820069378009066e-05, "step": 20 }, { "epoch": 0.1518987341772152, "grad_norm": 457257.68659374124, "learning_rate": 4.2253521126760564e-08, "logits/chosen": -16.952747344970703, "logits/rejected": -16.70650863647461, "logps/chosen": -242.9259490966797, "logps/rejected": -242.9457244873047, "loss": 124660.25, "rewards/accuracies": 0.4375, "rewards/chosen": -3.21022052958142e-05, "rewards/margins": 3.5706521885003895e-05, "rewards/rejected": -6.780872354283929e-05, "step": 30 }, { "epoch": 0.20253164556962025, "grad_norm": 520967.9129238899, "learning_rate": 5.633802816901408e-08, "logits/chosen": -16.920284271240234, "logits/rejected": -16.8529052734375, "logps/chosen": -243.7992706298828, "logps/rejected": -244.38906860351562, "loss": 124148.0625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0001235240779351443, "rewards/margins": 8.850651647662744e-05, "rewards/rejected": -0.00021203060168772936, "step": 40 }, { "epoch": 0.25316455696202533, "grad_norm": 722258.4292859514, "learning_rate": 7.042253521126761e-08, "logits/chosen": -16.24307632446289, "logits/rejected": -16.294937133789062, "logps/chosen": -238.68148803710938, "logps/rejected": -240.46337890625, "loss": 125272.85, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00024854010553099215, "rewards/margins": -7.368279329966754e-05, "rewards/rejected": -0.00017485734133515507, "step": 50 }, { "epoch": 0.3037974683544304, "grad_norm": 521725.51159479923, "learning_rate": 8.450704225352113e-08, "logits/chosen": -16.547048568725586, "logits/rejected": -16.562244415283203, "logps/chosen": -234.24453735351562, "logps/rejected": -236.03823852539062, "loss": 123692.1, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00036723288940265775, "rewards/margins": 6.122588274592999e-06, "rewards/rejected": -0.00037335552042350173, "step": 60 }, { "epoch": 0.35443037974683544, "grad_norm": 446768.20251500694, "learning_rate": 9.859154929577463e-08, "logits/chosen": -16.514156341552734, "logits/rejected": -16.41303062438965, "logps/chosen": -240.8957061767578, "logps/rejected": -235.2915496826172, "loss": 125937.8, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00028800699510611594, "rewards/margins": 0.00021416530944406986, "rewards/rejected": -0.0005021723336540163, "step": 70 }, { "epoch": 0.4050632911392405, "grad_norm": 463557.5011981856, "learning_rate": 1.1267605633802817e-07, "logits/chosen": -16.711376190185547, "logits/rejected": -16.489612579345703, "logps/chosen": -243.5523681640625, "logps/rejected": -228.8307342529297, "loss": 125818.525, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0005035396316088736, "rewards/margins": 6.90509841660969e-05, "rewards/rejected": -0.0005725906230509281, "step": 80 }, { "epoch": 0.45569620253164556, "grad_norm": 465137.87035599066, "learning_rate": 1.2676056338028167e-07, "logits/chosen": -17.326900482177734, "logits/rejected": -17.396936416625977, "logps/chosen": -240.1623077392578, "logps/rejected": -234.27578735351562, "loss": 123894.4, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0008088796166703105, "rewards/margins": -0.00010466824460308999, "rewards/rejected": -0.0007042114739306271, "step": 90 }, { "epoch": 0.5063291139240507, "grad_norm": 505006.4054603859, "learning_rate": 1.4084507042253522e-07, "logits/chosen": -16.5346736907959, "logits/rejected": -16.46234893798828, "logps/chosen": -238.9674530029297, "logps/rejected": -235.36239624023438, "loss": 126640.2125, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0006280581001192331, "rewards/margins": 2.4443055735900998e-05, "rewards/rejected": -0.0006525011267513037, "step": 100 }, { "epoch": 0.5569620253164557, "grad_norm": 475489.46555727004, "learning_rate": 1.549295774647887e-07, "logits/chosen": -16.67499351501465, "logits/rejected": -16.584075927734375, "logps/chosen": -240.5388946533203, "logps/rejected": -239.03366088867188, "loss": 122706.3, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0006213908782228827, "rewards/margins": 0.00010261077841278166, "rewards/rejected": -0.0007240016711875796, "step": 110 }, { "epoch": 0.6075949367088608, "grad_norm": 492764.07090207015, "learning_rate": 1.6901408450704225e-07, "logits/chosen": -16.746532440185547, "logits/rejected": -16.617717742919922, "logps/chosen": -227.05398559570312, "logps/rejected": -225.60214233398438, "loss": 126588.925, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0008889889577403665, "rewards/margins": 2.5076475139940158e-05, "rewards/rejected": -0.0009140653419308364, "step": 120 }, { "epoch": 0.6582278481012658, "grad_norm": 511084.4558498889, "learning_rate": 1.8309859154929577e-07, "logits/chosen": -16.747934341430664, "logits/rejected": -16.733430862426758, "logps/chosen": -240.7227325439453, "logps/rejected": -240.2967529296875, "loss": 125175.5125, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0010187395382672548, "rewards/margins": 0.002705145161598921, "rewards/rejected": -0.003723885165527463, "step": 130 }, { "epoch": 0.7088607594936709, "grad_norm": 540454.6644647518, "learning_rate": 1.9718309859154927e-07, "logits/chosen": -16.1859073638916, "logits/rejected": -16.264835357666016, "logps/chosen": -231.37173461914062, "logps/rejected": -227.0606689453125, "loss": 126058.6375, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0008466474828310311, "rewards/margins": -4.7403918870259076e-05, "rewards/rejected": -0.0007992436294443905, "step": 140 }, { "epoch": 0.759493670886076, "grad_norm": 503077.16971538117, "learning_rate": 2.112676056338028e-07, "logits/chosen": -17.280269622802734, "logits/rejected": -17.093780517578125, "logps/chosen": -238.0977325439453, "logps/rejected": -238.93212890625, "loss": 126646.125, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0008783842204138637, "rewards/margins": 0.00026031016022898257, "rewards/rejected": -0.0011386943515390158, "step": 150 }, { "epoch": 0.810126582278481, "grad_norm": 541715.9624559938, "learning_rate": 2.2535211267605633e-07, "logits/chosen": -16.782550811767578, "logits/rejected": -16.79593276977539, "logps/chosen": -250.48593139648438, "logps/rejected": -249.44924926757812, "loss": 124718.425, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0012445250758901238, "rewards/margins": 5.686017539119348e-05, "rewards/rejected": -0.0013013852294534445, "step": 160 }, { "epoch": 0.8607594936708861, "grad_norm": 548905.0358445289, "learning_rate": 2.394366197183098e-07, "logits/chosen": -17.04167938232422, "logits/rejected": -16.985572814941406, "logps/chosen": -255.06942749023438, "logps/rejected": -260.38128662109375, "loss": 125650.7625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0021144188940525055, "rewards/margins": 0.001583110773935914, "rewards/rejected": -0.003697529900819063, "step": 170 }, { "epoch": 0.9113924050632911, "grad_norm": 561549.4959644328, "learning_rate": 2.5352112676056334e-07, "logits/chosen": -16.703407287597656, "logits/rejected": -16.487037658691406, "logps/chosen": -232.529052734375, "logps/rejected": -228.3297576904297, "loss": 127298.1375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.002397818025201559, "rewards/margins": 0.0011578220874071121, "rewards/rejected": -0.003555640112608671, "step": 180 }, { "epoch": 0.9620253164556962, "grad_norm": 565071.0053763993, "learning_rate": 2.6760563380281686e-07, "logits/chosen": -16.11090660095215, "logits/rejected": -16.053157806396484, "logps/chosen": -239.39205932617188, "logps/rejected": -235.435791015625, "loss": 127009.225, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0010291519574820995, "rewards/margins": 0.00019036220328416675, "rewards/rejected": -0.0012195140589028597, "step": 190 }, { "epoch": 1.0126582278481013, "grad_norm": 497332.98430491646, "learning_rate": 2.8169014084507043e-07, "logits/chosen": -16.127140045166016, "logits/rejected": -15.988116264343262, "logps/chosen": -225.9070587158203, "logps/rejected": -227.90145874023438, "loss": 126358.875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0012379485415294766, "rewards/margins": 0.0006675361073575914, "rewards/rejected": -0.001905484707094729, "step": 200 }, { "epoch": 1.0632911392405062, "grad_norm": 935063.760892245, "learning_rate": 2.957746478873239e-07, "logits/chosen": -16.701793670654297, "logits/rejected": -16.669902801513672, "logps/chosen": -230.3677520751953, "logps/rejected": -229.03921508789062, "loss": 124250.775, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0006614397279918194, "rewards/margins": 0.003238010685890913, "rewards/rejected": -0.003899450646713376, "step": 210 }, { "epoch": 1.1139240506329113, "grad_norm": 517399.2020129059, "learning_rate": 3.098591549295774e-07, "logits/chosen": -16.413972854614258, "logits/rejected": -16.371458053588867, "logps/chosen": -247.8984832763672, "logps/rejected": -249.5322723388672, "loss": 124993.7375, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.0012706981506198645, "rewards/margins": 0.003060612827539444, "rewards/rejected": -0.004331310745328665, "step": 220 }, { "epoch": 1.1645569620253164, "grad_norm": 499036.7717944408, "learning_rate": 3.23943661971831e-07, "logits/chosen": -15.908624649047852, "logits/rejected": -15.847338676452637, "logps/chosen": -236.7013397216797, "logps/rejected": -239.3136749267578, "loss": 122842.5, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0006655483739450574, "rewards/margins": 0.0032406128011643887, "rewards/rejected": -0.0039061610586941242, "step": 230 }, { "epoch": 1.2151898734177216, "grad_norm": 540681.7856619481, "learning_rate": 3.380281690140845e-07, "logits/chosen": -16.052249908447266, "logits/rejected": -15.99653148651123, "logps/chosen": -229.74832153320312, "logps/rejected": -230.9803009033203, "loss": 124587.3625, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0007962372037582099, "rewards/margins": 0.0025483998470008373, "rewards/rejected": -0.003344637108966708, "step": 240 }, { "epoch": 1.2658227848101267, "grad_norm": 1023950.8355601664, "learning_rate": 3.52112676056338e-07, "logits/chosen": -15.299288749694824, "logits/rejected": -15.215815544128418, "logps/chosen": -231.2301788330078, "logps/rejected": -232.03359985351562, "loss": 121822.4, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -7.512583579227794e-06, "rewards/margins": 0.003883513854816556, "rewards/rejected": -0.0038910270668566227, "step": 250 }, { "epoch": 1.3164556962025316, "grad_norm": 620253.8184950812, "learning_rate": 3.6619718309859155e-07, "logits/chosen": -16.167770385742188, "logits/rejected": -15.915590286254883, "logps/chosen": -238.9904327392578, "logps/rejected": -239.73953247070312, "loss": 123388.8625, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.00017356239550281316, "rewards/margins": 0.0050824107602238655, "rewards/rejected": -0.005255972500890493, "step": 260 }, { "epoch": 1.3670886075949367, "grad_norm": 575104.3218096169, "learning_rate": 3.8028169014084507e-07, "logits/chosen": -15.480558395385742, "logits/rejected": -15.386639595031738, "logps/chosen": -241.60879516601562, "logps/rejected": -250.003173828125, "loss": 123555.7, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.001139859901741147, "rewards/margins": 0.005077657289803028, "rewards/rejected": -0.0062175169587135315, "step": 270 }, { "epoch": 1.4177215189873418, "grad_norm": 601224.4433091934, "learning_rate": 3.9436619718309853e-07, "logits/chosen": -15.266016960144043, "logits/rejected": -15.313554763793945, "logps/chosen": -230.73397827148438, "logps/rejected": -237.3317108154297, "loss": 125556.675, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0007209269679151475, "rewards/margins": 0.00534270191565156, "rewards/rejected": -0.004621774889528751, "step": 280 }, { "epoch": 1.4683544303797469, "grad_norm": 751936.3077706753, "learning_rate": 4.084507042253521e-07, "logits/chosen": -14.600263595581055, "logits/rejected": -14.538311958312988, "logps/chosen": -224.1177520751953, "logps/rejected": -226.97879028320312, "loss": 123584.675, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.0011863496620208025, "rewards/margins": 0.007649322040379047, "rewards/rejected": -0.006462973542511463, "step": 290 }, { "epoch": 1.518987341772152, "grad_norm": 575660.5828565176, "learning_rate": 4.225352112676056e-07, "logits/chosen": -14.935551643371582, "logits/rejected": -15.062429428100586, "logps/chosen": -235.7123565673828, "logps/rejected": -245.36181640625, "loss": 122562.1375, "rewards/accuracies": 0.75, "rewards/chosen": 0.0014863747637718916, "rewards/margins": 0.0057060932740569115, "rewards/rejected": -0.0042197187431156635, "step": 300 }, { "epoch": 1.5696202531645569, "grad_norm": 619514.1083852616, "learning_rate": 4.366197183098591e-07, "logits/chosen": -14.678690910339355, "logits/rejected": -14.617218017578125, "logps/chosen": -229.6386260986328, "logps/rejected": -234.1474151611328, "loss": 123630.225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0006864996394142509, "rewards/margins": 0.004933560267090797, "rewards/rejected": -0.004247060976922512, "step": 310 }, { "epoch": 1.620253164556962, "grad_norm": 738538.1512211321, "learning_rate": 4.5070422535211266e-07, "logits/chosen": -14.131611824035645, "logits/rejected": -14.156657218933105, "logps/chosen": -241.20156860351562, "logps/rejected": -248.2321319580078, "loss": 124158.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0009155808947980404, "rewards/margins": 0.006913213524967432, "rewards/rejected": -0.007828795351088047, "step": 320 }, { "epoch": 1.6708860759493671, "grad_norm": 688317.7143989427, "learning_rate": 4.647887323943662e-07, "logits/chosen": -13.791796684265137, "logits/rejected": -13.970884323120117, "logps/chosen": -228.53079223632812, "logps/rejected": -235.5008087158203, "loss": 123378.175, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0017698236042633653, "rewards/margins": 0.006004182621836662, "rewards/rejected": -0.004234359599649906, "step": 330 }, { "epoch": 1.721518987341772, "grad_norm": 693314.5034252935, "learning_rate": 4.788732394366196e-07, "logits/chosen": -13.555567741394043, "logits/rejected": -13.32630729675293, "logps/chosen": -227.0249481201172, "logps/rejected": -232.2772216796875, "loss": 122521.475, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.001143553527072072, "rewards/margins": 0.009070896543562412, "rewards/rejected": -0.00792734231799841, "step": 340 }, { "epoch": 1.7721518987341773, "grad_norm": 758709.6120906892, "learning_rate": 4.929577464788733e-07, "logits/chosen": -13.520563125610352, "logits/rejected": -13.633130073547363, "logps/chosen": -234.7182159423828, "logps/rejected": -248.12890625, "loss": 121557.575, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.00047356385039165616, "rewards/margins": 0.00813873577862978, "rewards/rejected": -0.008612299337983131, "step": 350 }, { "epoch": 1.8227848101265822, "grad_norm": 689974.393201542, "learning_rate": 4.992165465371357e-07, "logits/chosen": -12.841153144836426, "logits/rejected": -12.86094856262207, "logps/chosen": -232.314697265625, "logps/rejected": -232.64297485351562, "loss": 121436.65, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0036194869317114353, "rewards/margins": 0.009506477043032646, "rewards/rejected": -0.005886988714337349, "step": 360 }, { "epoch": 1.8734177215189873, "grad_norm": 883375.543329047, "learning_rate": 4.976496396114071e-07, "logits/chosen": -12.77904224395752, "logits/rejected": -12.76900577545166, "logps/chosen": -239.8730010986328, "logps/rejected": -251.4569549560547, "loss": 122456.925, "rewards/accuracies": 0.75, "rewards/chosen": -0.0006393647054210305, "rewards/margins": 0.008665768429636955, "rewards/rejected": -0.009305133484303951, "step": 370 }, { "epoch": 1.9240506329113924, "grad_norm": 797554.0864386982, "learning_rate": 4.960827326856785e-07, "logits/chosen": -13.028135299682617, "logits/rejected": -13.148831367492676, "logps/chosen": -237.040771484375, "logps/rejected": -244.45181274414062, "loss": 124907.725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0025544934906065464, "rewards/margins": 0.008132859133183956, "rewards/rejected": -0.005578366108238697, "step": 380 }, { "epoch": 1.9746835443037973, "grad_norm": 793120.1180084129, "learning_rate": 4.945158257599498e-07, "logits/chosen": -12.312803268432617, "logits/rejected": -12.135167121887207, "logps/chosen": -235.60360717773438, "logps/rejected": -242.9219207763672, "loss": 121583.8, "rewards/accuracies": 0.75, "rewards/chosen": 0.003660207614302635, "rewards/margins": 0.011001082137227058, "rewards/rejected": -0.007340874522924423, "step": 390 }, { "epoch": 2.0253164556962027, "grad_norm": 767339.6192091529, "learning_rate": 4.929489188342212e-07, "logits/chosen": -12.052891731262207, "logits/rejected": -11.94625473022461, "logps/chosen": -225.0377197265625, "logps/rejected": -243.81039428710938, "loss": 119737.85, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.006956162396818399, "rewards/margins": 0.01727995090186596, "rewards/rejected": -0.010323788039386272, "step": 400 }, { "epoch": 2.0759493670886076, "grad_norm": 936793.207320047, "learning_rate": 4.913820119084926e-07, "logits/chosen": -11.38767147064209, "logits/rejected": -11.339715957641602, "logps/chosen": -219.8796844482422, "logps/rejected": -252.80581665039062, "loss": 114021.05, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.008199459873139858, "rewards/margins": 0.031510110944509506, "rewards/rejected": -0.023310650140047073, "step": 410 }, { "epoch": 2.1265822784810124, "grad_norm": 1035986.8564166825, "learning_rate": 4.89815104982764e-07, "logits/chosen": -10.819408416748047, "logits/rejected": -10.774351119995117, "logps/chosen": -231.78854370117188, "logps/rejected": -260.20355224609375, "loss": 116051.6, "rewards/accuracies": 0.875, "rewards/chosen": 0.0056950985454022884, "rewards/margins": 0.027868490666151047, "rewards/rejected": -0.02217339165508747, "step": 420 }, { "epoch": 2.1772151898734178, "grad_norm": 1036991.7861177241, "learning_rate": 4.882481980570354e-07, "logits/chosen": -10.84526252746582, "logits/rejected": -10.708145141601562, "logps/chosen": -221.5430908203125, "logps/rejected": -257.36114501953125, "loss": 113501.175, "rewards/accuracies": 0.875, "rewards/chosen": 0.005057200789451599, "rewards/margins": 0.038923002779483795, "rewards/rejected": -0.033865805715322495, "step": 430 }, { "epoch": 2.2278481012658227, "grad_norm": 1227488.243303788, "learning_rate": 4.866812911313068e-07, "logits/chosen": -10.5010986328125, "logits/rejected": -10.63232135772705, "logps/chosen": -233.42373657226562, "logps/rejected": -276.0982666015625, "loss": 112100.4, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.004779786802828312, "rewards/margins": 0.040522992610931396, "rewards/rejected": -0.03574320673942566, "step": 440 }, { "epoch": 2.278481012658228, "grad_norm": 1079397.6974786038, "learning_rate": 4.851143842055782e-07, "logits/chosen": -10.104026794433594, "logits/rejected": -10.142271995544434, "logps/chosen": -216.66940307617188, "logps/rejected": -258.98858642578125, "loss": 112483.4, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0053299954161047935, "rewards/margins": 0.03484385460615158, "rewards/rejected": -0.029513856396079063, "step": 450 }, { "epoch": 2.329113924050633, "grad_norm": 1367054.8438774655, "learning_rate": 4.835474772798496e-07, "logits/chosen": -10.148681640625, "logits/rejected": -10.183786392211914, "logps/chosen": -233.730224609375, "logps/rejected": -278.64349365234375, "loss": 111561.6625, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.011530257761478424, "rewards/margins": 0.04578756168484688, "rewards/rejected": -0.034257303923368454, "step": 460 }, { "epoch": 2.379746835443038, "grad_norm": 1298484.9349088285, "learning_rate": 4.819805703541209e-07, "logits/chosen": -10.018949508666992, "logits/rejected": -10.097805976867676, "logps/chosen": -224.6026153564453, "logps/rejected": -270.0591735839844, "loss": 112710.1875, "rewards/accuracies": 0.875, "rewards/chosen": 0.011801879853010178, "rewards/margins": 0.040784891694784164, "rewards/rejected": -0.028983011841773987, "step": 470 }, { "epoch": 2.430379746835443, "grad_norm": 1428524.6930006845, "learning_rate": 4.804136634283923e-07, "logits/chosen": -9.595979690551758, "logits/rejected": -9.634994506835938, "logps/chosen": -265.3009338378906, "logps/rejected": -315.98541259765625, "loss": 110031.3, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0027348275762051344, "rewards/margins": 0.05151837319135666, "rewards/rejected": -0.048783544450998306, "step": 480 }, { "epoch": 2.481012658227848, "grad_norm": 1467649.8441612076, "learning_rate": 4.788467565026637e-07, "logits/chosen": -8.871723175048828, "logits/rejected": -8.764354705810547, "logps/chosen": -203.2312774658203, "logps/rejected": -241.612548828125, "loss": 110534.325, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.01302252896130085, "rewards/margins": 0.03907207027077675, "rewards/rejected": -0.02604953944683075, "step": 490 }, { "epoch": 2.5316455696202533, "grad_norm": 1382959.9591988046, "learning_rate": 4.772798495769351e-07, "logits/chosen": -8.468270301818848, "logits/rejected": -8.384966850280762, "logps/chosen": -226.46237182617188, "logps/rejected": -269.6461181640625, "loss": 110480.175, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.015191495418548584, "rewards/margins": 0.0456535741686821, "rewards/rejected": -0.030462080612778664, "step": 500 }, { "epoch": 2.5822784810126582, "grad_norm": 1369494.2190603705, "learning_rate": 4.757129426512065e-07, "logits/chosen": -8.634099006652832, "logits/rejected": -8.640868186950684, "logps/chosen": -232.20022583007812, "logps/rejected": -304.80352783203125, "loss": 109921.975, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.013228721916675568, "rewards/margins": 0.07378505170345306, "rewards/rejected": -0.060556329786777496, "step": 510 }, { "epoch": 2.632911392405063, "grad_norm": 1750255.0550240122, "learning_rate": 4.741460357254779e-07, "logits/chosen": -7.8379316329956055, "logits/rejected": -7.4784440994262695, "logps/chosen": -213.3401641845703, "logps/rejected": -258.43743896484375, "loss": 111730.3875, "rewards/accuracies": 0.875, "rewards/chosen": 0.016342563554644585, "rewards/margins": 0.048144370317459106, "rewards/rejected": -0.03180180490016937, "step": 520 }, { "epoch": 2.6835443037974684, "grad_norm": 1447093.2174814222, "learning_rate": 4.7257912879974927e-07, "logits/chosen": -8.354089736938477, "logits/rejected": -7.889782905578613, "logps/chosen": -225.5243682861328, "logps/rejected": -276.7877502441406, "loss": 109226.9625, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.013667022809386253, "rewards/margins": 0.05627403408288956, "rewards/rejected": -0.042607005685567856, "step": 530 }, { "epoch": 2.7341772151898733, "grad_norm": 1477083.7533012358, "learning_rate": 4.710122218740207e-07, "logits/chosen": -7.921019077301025, "logits/rejected": -7.979846000671387, "logps/chosen": -237.23715209960938, "logps/rejected": -285.4289855957031, "loss": 109592.125, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.014952963218092918, "rewards/margins": 0.055934417992830276, "rewards/rejected": -0.040981464087963104, "step": 540 }, { "epoch": 2.7848101265822782, "grad_norm": 1486366.6324330876, "learning_rate": 4.6944531494829204e-07, "logits/chosen": -7.12634801864624, "logits/rejected": -7.396058082580566, "logps/chosen": -226.1304168701172, "logps/rejected": -276.8672790527344, "loss": 108245.925, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.00994019117206335, "rewards/margins": 0.0537477545440197, "rewards/rejected": -0.04380756989121437, "step": 550 }, { "epoch": 2.8354430379746836, "grad_norm": 1560304.698196799, "learning_rate": 4.6787840802256345e-07, "logits/chosen": -7.268878936767578, "logits/rejected": -7.414219856262207, "logps/chosen": -215.24661254882812, "logps/rejected": -276.79437255859375, "loss": 110187.5125, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.016926631331443787, "rewards/margins": 0.05572710186243057, "rewards/rejected": -0.03880046680569649, "step": 560 }, { "epoch": 2.8860759493670884, "grad_norm": 1647695.8714812996, "learning_rate": 4.663115010968348e-07, "logits/chosen": -8.584083557128906, "logits/rejected": -8.43793773651123, "logps/chosen": -239.3496856689453, "logps/rejected": -301.948974609375, "loss": 108493.15, "rewards/accuracies": 0.875, "rewards/chosen": 0.007640582975000143, "rewards/margins": 0.06335236132144928, "rewards/rejected": -0.0557117760181427, "step": 570 }, { "epoch": 2.9367088607594938, "grad_norm": 1523200.3846012072, "learning_rate": 4.647445941711062e-07, "logits/chosen": -8.875934600830078, "logits/rejected": -8.860316276550293, "logps/chosen": -234.2982635498047, "logps/rejected": -293.39727783203125, "loss": 107204.65, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0077833631075918674, "rewards/margins": 0.061719853430986404, "rewards/rejected": -0.05393648147583008, "step": 580 }, { "epoch": 2.9873417721518987, "grad_norm": 1605115.356703113, "learning_rate": 4.631776872453776e-07, "logits/chosen": -8.788633346557617, "logits/rejected": -8.637460708618164, "logps/chosen": -257.7025146484375, "logps/rejected": -303.82147216796875, "loss": 108959.225, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0057020229287445545, "rewards/margins": 0.053022872656583786, "rewards/rejected": -0.04732084274291992, "step": 590 }, { "epoch": 3.037974683544304, "grad_norm": 1435515.2852262415, "learning_rate": 4.61610780319649e-07, "logits/chosen": -7.956998348236084, "logits/rejected": -7.496169090270996, "logps/chosen": -219.92410278320312, "logps/rejected": -310.20123291015625, "loss": 95986.4875, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.016220271587371826, "rewards/margins": 0.09167212247848511, "rewards/rejected": -0.07545184344053268, "step": 600 }, { "epoch": 3.088607594936709, "grad_norm": 1646011.901841717, "learning_rate": 4.6004387339392035e-07, "logits/chosen": -7.747580051422119, "logits/rejected": -7.5227952003479, "logps/chosen": -217.8295440673828, "logps/rejected": -343.4312438964844, "loss": 91538.925, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02667585015296936, "rewards/margins": 0.12547221779823303, "rewards/rejected": -0.09879636764526367, "step": 610 }, { "epoch": 3.1392405063291138, "grad_norm": 1631989.4144731541, "learning_rate": 4.5847696646819176e-07, "logits/chosen": -6.8127121925354, "logits/rejected": -6.8090972900390625, "logps/chosen": -209.46859741210938, "logps/rejected": -332.0594482421875, "loss": 92242.9, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.026208167895674706, "rewards/margins": 0.12268342822790146, "rewards/rejected": -0.0964752584695816, "step": 620 }, { "epoch": 3.189873417721519, "grad_norm": 1627589.9925143481, "learning_rate": 4.569100595424631e-07, "logits/chosen": -6.631221771240234, "logits/rejected": -6.502354621887207, "logps/chosen": -211.57974243164062, "logps/rejected": -333.447265625, "loss": 89921.25, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.022689208388328552, "rewards/margins": 0.12395058572292328, "rewards/rejected": -0.10126137733459473, "step": 630 }, { "epoch": 3.240506329113924, "grad_norm": 1780107.5787213328, "learning_rate": 4.5534315261673453e-07, "logits/chosen": -7.868208885192871, "logits/rejected": -7.755393981933594, "logps/chosen": -209.3970184326172, "logps/rejected": -341.9508056640625, "loss": 89608.1875, "rewards/accuracies": 0.9375, "rewards/chosen": 0.027028566226363182, "rewards/margins": 0.133165642619133, "rewards/rejected": -0.10613708198070526, "step": 640 }, { "epoch": 3.291139240506329, "grad_norm": 1730512.4518714033, "learning_rate": 4.5377624569100595e-07, "logits/chosen": -7.359053134918213, "logits/rejected": -7.324367523193359, "logps/chosen": -193.1954803466797, "logps/rejected": -309.5513610839844, "loss": 93257.225, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.028996175155043602, "rewards/margins": 0.11760006099939346, "rewards/rejected": -0.08860386908054352, "step": 650 }, { "epoch": 3.3417721518987342, "grad_norm": 1692816.769511115, "learning_rate": 4.5220933876527736e-07, "logits/chosen": -8.043203353881836, "logits/rejected": -8.003018379211426, "logps/chosen": -211.73648071289062, "logps/rejected": -336.10455322265625, "loss": 88400.4688, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.024640550836920738, "rewards/margins": 0.12655004858970642, "rewards/rejected": -0.10190950334072113, "step": 660 }, { "epoch": 3.392405063291139, "grad_norm": 1906377.7496358757, "learning_rate": 4.506424318395487e-07, "logits/chosen": -7.25619649887085, "logits/rejected": -7.37869119644165, "logps/chosen": -197.8258819580078, "logps/rejected": -324.2138671875, "loss": 89983.5688, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.026263948529958725, "rewards/margins": 0.12702925503253937, "rewards/rejected": -0.10076530277729034, "step": 670 }, { "epoch": 3.4430379746835444, "grad_norm": 1785643.0594316572, "learning_rate": 4.4907552491382013e-07, "logits/chosen": -6.798577785491943, "logits/rejected": -6.7768073081970215, "logps/chosen": -208.5835723876953, "logps/rejected": -323.3017883300781, "loss": 89767.5, "rewards/accuracies": 0.9375, "rewards/chosen": 0.025741413235664368, "rewards/margins": 0.1167701929807663, "rewards/rejected": -0.09102877229452133, "step": 680 }, { "epoch": 3.4936708860759493, "grad_norm": 2393957.296937455, "learning_rate": 4.475086179880915e-07, "logits/chosen": -6.352355480194092, "logits/rejected": -6.526197910308838, "logps/chosen": -187.56597900390625, "logps/rejected": -306.5972595214844, "loss": 89036.6875, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.024061182513833046, "rewards/margins": 0.11990946531295776, "rewards/rejected": -0.09584827721118927, "step": 690 }, { "epoch": 3.5443037974683547, "grad_norm": 1811486.2204670438, "learning_rate": 4.459417110623629e-07, "logits/chosen": -5.7466630935668945, "logits/rejected": -5.797163486480713, "logps/chosen": -212.6585235595703, "logps/rejected": -364.36199951171875, "loss": 88031.3, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.027677077800035477, "rewards/margins": 0.14764061570167542, "rewards/rejected": -0.11996352672576904, "step": 700 }, { "epoch": 3.5949367088607596, "grad_norm": 1724684.5755440604, "learning_rate": 4.4437480413663426e-07, "logits/chosen": -5.412962436676025, "logits/rejected": -5.541121959686279, "logps/chosen": -202.39065551757812, "logps/rejected": -333.0758056640625, "loss": 86956.675, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0247800350189209, "rewards/margins": 0.12825721502304077, "rewards/rejected": -0.10347716510295868, "step": 710 }, { "epoch": 3.6455696202531644, "grad_norm": 1933271.7611355048, "learning_rate": 4.4280789721090567e-07, "logits/chosen": -5.053005218505859, "logits/rejected": -4.886711597442627, "logps/chosen": -199.10885620117188, "logps/rejected": -317.7257385253906, "loss": 86655.0125, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.02152046002447605, "rewards/margins": 0.11774978786706924, "rewards/rejected": -0.09622932970523834, "step": 720 }, { "epoch": 3.6962025316455698, "grad_norm": 2267463.489494214, "learning_rate": 4.4124099028517703e-07, "logits/chosen": -6.616279602050781, "logits/rejected": -6.9615797996521, "logps/chosen": -200.58961486816406, "logps/rejected": -351.6376953125, "loss": 86181.3938, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.032253801822662354, "rewards/margins": 0.14937567710876465, "rewards/rejected": -0.1171218603849411, "step": 730 }, { "epoch": 3.7468354430379747, "grad_norm": 1734288.0953653858, "learning_rate": 4.3967408335944844e-07, "logits/chosen": -5.873335361480713, "logits/rejected": -5.689335823059082, "logps/chosen": -217.43637084960938, "logps/rejected": -350.2752990722656, "loss": 86780.825, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.031159091740846634, "rewards/margins": 0.13692796230316162, "rewards/rejected": -0.10576887428760529, "step": 740 }, { "epoch": 3.7974683544303796, "grad_norm": 1741715.9901586007, "learning_rate": 4.381071764337198e-07, "logits/chosen": -7.123785972595215, "logits/rejected": -7.188807487487793, "logps/chosen": -207.00045776367188, "logps/rejected": -336.5976867675781, "loss": 86139.5625, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03052128478884697, "rewards/margins": 0.13043463230133057, "rewards/rejected": -0.0999133437871933, "step": 750 }, { "epoch": 3.848101265822785, "grad_norm": 1879351.8394690978, "learning_rate": 4.365402695079912e-07, "logits/chosen": -7.820990085601807, "logits/rejected": -7.7128729820251465, "logps/chosen": -213.57388305664062, "logps/rejected": -362.5634460449219, "loss": 87478.3625, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.03660900145769119, "rewards/margins": 0.1480773240327835, "rewards/rejected": -0.11146833002567291, "step": 760 }, { "epoch": 3.8987341772151898, "grad_norm": 1968713.4204386624, "learning_rate": 4.349733625822626e-07, "logits/chosen": -7.314540863037109, "logits/rejected": -7.363668918609619, "logps/chosen": -213.6930694580078, "logps/rejected": -367.44073486328125, "loss": 86825.5813, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.026752913370728493, "rewards/margins": 0.15061405301094055, "rewards/rejected": -0.1238611489534378, "step": 770 }, { "epoch": 3.9493670886075947, "grad_norm": 2163439.406665409, "learning_rate": 4.33406455656534e-07, "logits/chosen": -7.67099666595459, "logits/rejected": -7.536408424377441, "logps/chosen": -213.9747772216797, "logps/rejected": -344.7560119628906, "loss": 86913.0375, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.029844319447875023, "rewards/margins": 0.12930825352668762, "rewards/rejected": -0.09946390986442566, "step": 780 }, { "epoch": 4.0, "grad_norm": 1866234.1823014135, "learning_rate": 4.3183954873080535e-07, "logits/chosen": -7.922532081604004, "logits/rejected": -7.692726135253906, "logps/chosen": -211.41653442382812, "logps/rejected": -349.7116394042969, "loss": 86592.8938, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.027728911489248276, "rewards/margins": 0.1435452550649643, "rewards/rejected": -0.11581633985042572, "step": 790 }, { "epoch": 4.050632911392405, "grad_norm": 1782853.8797277175, "learning_rate": 4.3027264180507676e-07, "logits/chosen": -8.29829216003418, "logits/rejected": -8.205643653869629, "logps/chosen": -178.8797149658203, "logps/rejected": -378.06121826171875, "loss": 69143.425, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05098045617341995, "rewards/margins": 0.1993386447429657, "rewards/rejected": -0.14835818111896515, "step": 800 }, { "epoch": 4.10126582278481, "grad_norm": 1719472.9461235409, "learning_rate": 4.287057348793481e-07, "logits/chosen": -7.558290958404541, "logits/rejected": -7.646592617034912, "logps/chosen": -186.36911010742188, "logps/rejected": -386.6961975097656, "loss": 67634.3375, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04189852252602577, "rewards/margins": 0.19968575239181519, "rewards/rejected": -0.1577872335910797, "step": 810 }, { "epoch": 4.151898734177215, "grad_norm": 1571399.8942716653, "learning_rate": 4.2713882795361953e-07, "logits/chosen": -7.811161994934082, "logits/rejected": -7.783130645751953, "logps/chosen": -181.81602478027344, "logps/rejected": -402.1683654785156, "loss": 66806.9187, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.049001529812812805, "rewards/margins": 0.21849961578845978, "rewards/rejected": -0.16949808597564697, "step": 820 }, { "epoch": 4.2025316455696204, "grad_norm": 1992030.3917670588, "learning_rate": 4.255719210278909e-07, "logits/chosen": -7.349759101867676, "logits/rejected": -7.380797386169434, "logps/chosen": -175.21702575683594, "logps/rejected": -396.2167053222656, "loss": 67021.875, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.05283821374177933, "rewards/margins": 0.22190704941749573, "rewards/rejected": -0.169068843126297, "step": 830 }, { "epoch": 4.253164556962025, "grad_norm": 1859879.670487208, "learning_rate": 4.2400501410216235e-07, "logits/chosen": -7.482248783111572, "logits/rejected": -7.252910614013672, "logps/chosen": -187.070556640625, "logps/rejected": -401.1556701660156, "loss": 68463.9, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.05697192624211311, "rewards/margins": 0.21645841002464294, "rewards/rejected": -0.15948647260665894, "step": 840 }, { "epoch": 4.30379746835443, "grad_norm": 1688181.1410657803, "learning_rate": 4.224381071764337e-07, "logits/chosen": -5.693742275238037, "logits/rejected": -5.435591697692871, "logps/chosen": -198.21900939941406, "logps/rejected": -398.49981689453125, "loss": 67266.2, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.04546400159597397, "rewards/margins": 0.20465342700481415, "rewards/rejected": -0.15918943285942078, "step": 850 }, { "epoch": 4.3544303797468356, "grad_norm": 1750431.6432656392, "learning_rate": 4.208712002507051e-07, "logits/chosen": -8.664016723632812, "logits/rejected": -8.082508087158203, "logps/chosen": -178.05966186523438, "logps/rejected": -402.77093505859375, "loss": 65760.2625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.056066203862428665, "rewards/margins": 0.22950176894664764, "rewards/rejected": -0.17343556880950928, "step": 860 }, { "epoch": 4.405063291139241, "grad_norm": 1904336.610304837, "learning_rate": 4.193042933249765e-07, "logits/chosen": -5.778517723083496, "logits/rejected": -5.432709693908691, "logps/chosen": -176.563720703125, "logps/rejected": -379.2276916503906, "loss": 67058.1125, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.05091014504432678, "rewards/margins": 0.2058809995651245, "rewards/rejected": -0.15497085452079773, "step": 870 }, { "epoch": 4.455696202531645, "grad_norm": 1779397.1811982268, "learning_rate": 4.177373863992479e-07, "logits/chosen": -6.937778472900391, "logits/rejected": -6.611588954925537, "logps/chosen": -180.23001098632812, "logps/rejected": -400.9800720214844, "loss": 67019.0875, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.05085798352956772, "rewards/margins": 0.2235671728849411, "rewards/rejected": -0.17270918190479279, "step": 880 }, { "epoch": 4.506329113924051, "grad_norm": 1755630.994265544, "learning_rate": 4.1617047947351925e-07, "logits/chosen": -6.663479804992676, "logits/rejected": -6.144991397857666, "logps/chosen": -189.93707275390625, "logps/rejected": -383.9622802734375, "loss": 66060.8813, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.053109876811504364, "rewards/margins": 0.20497091114521027, "rewards/rejected": -0.1518610268831253, "step": 890 }, { "epoch": 4.556962025316456, "grad_norm": 1729683.010514938, "learning_rate": 4.1460357254779067e-07, "logits/chosen": -7.10635232925415, "logits/rejected": -7.227837562561035, "logps/chosen": -184.3021240234375, "logps/rejected": -391.59930419921875, "loss": 67231.6313, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.050502438098192215, "rewards/margins": 0.20674797892570496, "rewards/rejected": -0.15624557435512543, "step": 900 }, { "epoch": 4.6075949367088604, "grad_norm": 1921064.671845176, "learning_rate": 4.13036665622062e-07, "logits/chosen": -7.409733772277832, "logits/rejected": -7.2668256759643555, "logps/chosen": -184.89645385742188, "logps/rejected": -395.2364501953125, "loss": 67370.1875, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.047733135521411896, "rewards/margins": 0.2108074128627777, "rewards/rejected": -0.1630742847919464, "step": 910 }, { "epoch": 4.658227848101266, "grad_norm": 1780170.6356310213, "learning_rate": 4.1146975869633344e-07, "logits/chosen": -8.294339179992676, "logits/rejected": -8.312765121459961, "logps/chosen": -185.74949645996094, "logps/rejected": -405.0606689453125, "loss": 64484.2438, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.05801473185420036, "rewards/margins": 0.21365991234779358, "rewards/rejected": -0.15564517676830292, "step": 920 }, { "epoch": 4.708860759493671, "grad_norm": 1755118.627079852, "learning_rate": 4.099028517706048e-07, "logits/chosen": -8.692441940307617, "logits/rejected": -8.729148864746094, "logps/chosen": -177.8703155517578, "logps/rejected": -410.15179443359375, "loss": 65960.6812, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.061922211199998856, "rewards/margins": 0.2333444058895111, "rewards/rejected": -0.17142215371131897, "step": 930 }, { "epoch": 4.759493670886076, "grad_norm": 1801666.0452341542, "learning_rate": 4.083359448448762e-07, "logits/chosen": -8.838138580322266, "logits/rejected": -8.679426193237305, "logps/chosen": -160.35488891601562, "logps/rejected": -387.3427429199219, "loss": 65957.3, "rewards/accuracies": 1.0, "rewards/chosen": 0.061734091490507126, "rewards/margins": 0.2303626835346222, "rewards/rejected": -0.16862855851650238, "step": 940 }, { "epoch": 4.810126582278481, "grad_norm": 1823914.1164093877, "learning_rate": 4.0676903791914757e-07, "logits/chosen": -8.039133071899414, "logits/rejected": -8.235550880432129, "logps/chosen": -181.90818786621094, "logps/rejected": -390.46075439453125, "loss": 65100.0437, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05453425645828247, "rewards/margins": 0.20622405409812927, "rewards/rejected": -0.1516897976398468, "step": 950 }, { "epoch": 4.860759493670886, "grad_norm": 2552504.752187401, "learning_rate": 4.05202130993419e-07, "logits/chosen": -8.228861808776855, "logits/rejected": -8.044200897216797, "logps/chosen": -175.62306213378906, "logps/rejected": -387.7801818847656, "loss": 65251.5563, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05643890053033829, "rewards/margins": 0.2162017822265625, "rewards/rejected": -0.15976287424564362, "step": 960 }, { "epoch": 4.911392405063291, "grad_norm": 2112562.829549655, "learning_rate": 4.0363522406769034e-07, "logits/chosen": -8.678482055664062, "logits/rejected": -8.680012702941895, "logps/chosen": -180.9581298828125, "logps/rejected": -402.48944091796875, "loss": 65731.7188, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.05988938361406326, "rewards/margins": 0.22270476818084717, "rewards/rejected": -0.1628153920173645, "step": 970 }, { "epoch": 4.962025316455696, "grad_norm": 1800725.2761679955, "learning_rate": 4.0206831714196175e-07, "logits/chosen": -9.068916320800781, "logits/rejected": -8.908533096313477, "logps/chosen": -191.30018615722656, "logps/rejected": -433.2850036621094, "loss": 64987.5125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.0664498582482338, "rewards/margins": 0.24509286880493164, "rewards/rejected": -0.17864301800727844, "step": 980 }, { "epoch": 5.012658227848101, "grad_norm": 1442340.8531233447, "learning_rate": 4.005014102162331e-07, "logits/chosen": -7.928460121154785, "logits/rejected": -7.941502571105957, "logps/chosen": -175.59664916992188, "logps/rejected": -406.7601623535156, "loss": 62010.275, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.06751301139593124, "rewards/margins": 0.23539571464061737, "rewards/rejected": -0.16788268089294434, "step": 990 }, { "epoch": 5.063291139240507, "grad_norm": 1557498.8859861568, "learning_rate": 3.989345032905045e-07, "logits/chosen": -7.7452850341796875, "logits/rejected": -8.02453899383545, "logps/chosen": -154.46292114257812, "logps/rejected": -469.1910095214844, "loss": 49347.1687, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.08384937047958374, "rewards/margins": 0.31221631169319153, "rewards/rejected": -0.2283669412136078, "step": 1000 }, { "epoch": 5.113924050632911, "grad_norm": 1581238.5613807905, "learning_rate": 3.973675963647759e-07, "logits/chosen": -7.881131649017334, "logits/rejected": -7.651412010192871, "logps/chosen": -169.71153259277344, "logps/rejected": -476.58477783203125, "loss": 49390.7562, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.08512581884860992, "rewards/margins": 0.3120972514152527, "rewards/rejected": -0.22697141766548157, "step": 1010 }, { "epoch": 5.1645569620253164, "grad_norm": 1497324.3970905554, "learning_rate": 3.958006894390473e-07, "logits/chosen": -6.736274719238281, "logits/rejected": -6.750421047210693, "logps/chosen": -151.04129028320312, "logps/rejected": -459.47808837890625, "loss": 49656.7812, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.07378469407558441, "rewards/margins": 0.3127291798591614, "rewards/rejected": -0.23894445598125458, "step": 1020 }, { "epoch": 5.215189873417722, "grad_norm": 1898671.7222835466, "learning_rate": 3.942337825133187e-07, "logits/chosen": -7.030360221862793, "logits/rejected": -6.9101104736328125, "logps/chosen": -168.35183715820312, "logps/rejected": -469.60235595703125, "loss": 49247.5312, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.08571706712245941, "rewards/margins": 0.3044472634792328, "rewards/rejected": -0.21873018145561218, "step": 1030 }, { "epoch": 5.265822784810126, "grad_norm": 1859831.3291458376, "learning_rate": 3.926668755875901e-07, "logits/chosen": -6.842263698577881, "logits/rejected": -6.943556308746338, "logps/chosen": -153.25328063964844, "logps/rejected": -473.513427734375, "loss": 51145.4938, "rewards/accuracies": 1.0, "rewards/chosen": 0.08420612663030624, "rewards/margins": 0.3194884657859802, "rewards/rejected": -0.235282301902771, "step": 1040 }, { "epoch": 5.3164556962025316, "grad_norm": 1855378.6614461695, "learning_rate": 3.910999686618615e-07, "logits/chosen": -7.331165313720703, "logits/rejected": -7.468164920806885, "logps/chosen": -162.1797637939453, "logps/rejected": -474.08074951171875, "loss": 50799.1687, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0886077731847763, "rewards/margins": 0.31340503692626953, "rewards/rejected": -0.22479727864265442, "step": 1050 }, { "epoch": 5.367088607594937, "grad_norm": 1600231.8694471747, "learning_rate": 3.895330617361329e-07, "logits/chosen": -7.2842841148376465, "logits/rejected": -7.146345615386963, "logps/chosen": -140.54055786132812, "logps/rejected": -446.4241638183594, "loss": 49384.9875, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.08458932489156723, "rewards/margins": 0.3061215877532959, "rewards/rejected": -0.22153222560882568, "step": 1060 }, { "epoch": 5.417721518987342, "grad_norm": 1820648.707460815, "learning_rate": 3.8796615481040425e-07, "logits/chosen": -7.4867706298828125, "logits/rejected": -7.318013668060303, "logps/chosen": -162.54937744140625, "logps/rejected": -469.13433837890625, "loss": 48744.0469, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.0876765102148056, "rewards/margins": 0.31078898906707764, "rewards/rejected": -0.22311246395111084, "step": 1070 }, { "epoch": 5.468354430379747, "grad_norm": 1629981.2772913359, "learning_rate": 3.8639924788467566e-07, "logits/chosen": -8.141877174377441, "logits/rejected": -7.992497444152832, "logps/chosen": -151.8604736328125, "logps/rejected": -496.25201416015625, "loss": 46868.6719, "rewards/accuracies": 1.0, "rewards/chosen": 0.09172078222036362, "rewards/margins": 0.3495192527770996, "rewards/rejected": -0.257798433303833, "step": 1080 }, { "epoch": 5.518987341772152, "grad_norm": 1843259.5793917184, "learning_rate": 3.84832340958947e-07, "logits/chosen": -7.577700614929199, "logits/rejected": -7.340989589691162, "logps/chosen": -152.68710327148438, "logps/rejected": -466.3287048339844, "loss": 48765.2375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.08904045075178146, "rewards/margins": 0.31981557607650757, "rewards/rejected": -0.2307751476764679, "step": 1090 }, { "epoch": 5.569620253164557, "grad_norm": 1848670.003471961, "learning_rate": 3.8326543403321843e-07, "logits/chosen": -5.992789268493652, "logits/rejected": -5.831528663635254, "logps/chosen": -131.7107696533203, "logps/rejected": -433.0040588378906, "loss": 48441.2188, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.08974520117044449, "rewards/margins": 0.2995590269565582, "rewards/rejected": -0.20981380343437195, "step": 1100 }, { "epoch": 5.620253164556962, "grad_norm": 1834994.3527284127, "learning_rate": 3.816985271074898e-07, "logits/chosen": -6.8782501220703125, "logits/rejected": -7.123211860656738, "logps/chosen": -143.1776885986328, "logps/rejected": -439.9363708496094, "loss": 50301.1625, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.0787430927157402, "rewards/margins": 0.29441121220588684, "rewards/rejected": -0.21566812694072723, "step": 1110 }, { "epoch": 5.670886075949367, "grad_norm": 2055858.9168272892, "learning_rate": 3.801316201817612e-07, "logits/chosen": -7.6317338943481445, "logits/rejected": -7.619107723236084, "logps/chosen": -152.3334503173828, "logps/rejected": -453.30120849609375, "loss": 49359.2312, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.0867711529135704, "rewards/margins": 0.2968466281890869, "rewards/rejected": -0.2100754976272583, "step": 1120 }, { "epoch": 5.7215189873417724, "grad_norm": 1760917.726879333, "learning_rate": 3.7856471325603256e-07, "logits/chosen": -6.669379234313965, "logits/rejected": -6.568717002868652, "logps/chosen": -152.34774780273438, "logps/rejected": -439.8075256347656, "loss": 48808.2812, "rewards/accuracies": 1.0, "rewards/chosen": 0.08005286753177643, "rewards/margins": 0.28860196471214294, "rewards/rejected": -0.20854909718036652, "step": 1130 }, { "epoch": 5.772151898734177, "grad_norm": 1793917.574084858, "learning_rate": 3.76997806330304e-07, "logits/chosen": -7.020206451416016, "logits/rejected": -6.4513840675354, "logps/chosen": -126.99436950683594, "logps/rejected": -429.0069274902344, "loss": 48991.9938, "rewards/accuracies": 1.0, "rewards/chosen": 0.08981131762266159, "rewards/margins": 0.3046417832374573, "rewards/rejected": -0.21483047306537628, "step": 1140 }, { "epoch": 5.822784810126582, "grad_norm": 1856995.4726512374, "learning_rate": 3.7543089940457533e-07, "logits/chosen": -7.1540846824646, "logits/rejected": -7.103608131408691, "logps/chosen": -150.0362548828125, "logps/rejected": -459.3680114746094, "loss": 45240.3094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.08858338743448257, "rewards/margins": 0.3066866397857666, "rewards/rejected": -0.21810325980186462, "step": 1150 }, { "epoch": 5.8734177215189876, "grad_norm": 2252812.5376150296, "learning_rate": 3.7386399247884675e-07, "logits/chosen": -6.23285436630249, "logits/rejected": -5.795694351196289, "logps/chosen": -145.6466827392578, "logps/rejected": -485.41229248046875, "loss": 46892.1625, "rewards/accuracies": 1.0, "rewards/chosen": 0.09205026924610138, "rewards/margins": 0.34098342061042786, "rewards/rejected": -0.24893316626548767, "step": 1160 }, { "epoch": 5.924050632911392, "grad_norm": 1669143.1623524264, "learning_rate": 3.722970855531181e-07, "logits/chosen": -7.314904689788818, "logits/rejected": -7.455816745758057, "logps/chosen": -133.58151245117188, "logps/rejected": -482.9154357910156, "loss": 46493.0938, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.09256922453641891, "rewards/margins": 0.34824666380882263, "rewards/rejected": -0.2556774616241455, "step": 1170 }, { "epoch": 5.974683544303797, "grad_norm": 1914279.6891733713, "learning_rate": 3.707301786273895e-07, "logits/chosen": -6.429854393005371, "logits/rejected": -5.985020160675049, "logps/chosen": -142.39651489257812, "logps/rejected": -442.7286682128906, "loss": 47640.0813, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.08776311576366425, "rewards/margins": 0.30018630623817444, "rewards/rejected": -0.2124231606721878, "step": 1180 }, { "epoch": 6.025316455696203, "grad_norm": 1287103.6124582873, "learning_rate": 3.691632717016609e-07, "logits/chosen": -6.58931827545166, "logits/rejected": -6.494097709655762, "logps/chosen": -136.68003845214844, "logps/rejected": -493.61822509765625, "loss": 41587.3125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.10335598886013031, "rewards/margins": 0.36172229051589966, "rewards/rejected": -0.25836625695228577, "step": 1190 }, { "epoch": 6.075949367088608, "grad_norm": 1654691.3160849167, "learning_rate": 3.675963647759323e-07, "logits/chosen": -5.342609882354736, "logits/rejected": -5.393660545349121, "logps/chosen": -116.93675231933594, "logps/rejected": -476.22833251953125, "loss": 38118.9437, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.10985767841339111, "rewards/margins": 0.3632175922393799, "rewards/rejected": -0.25335997343063354, "step": 1200 }, { "epoch": 6.1265822784810124, "grad_norm": 1390108.9081190277, "learning_rate": 3.6602945785020365e-07, "logits/chosen": -5.185478687286377, "logits/rejected": -4.843894958496094, "logps/chosen": -128.81143188476562, "logps/rejected": -519.8304443359375, "loss": 36511.2875, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1154375821352005, "rewards/margins": 0.3926604092121124, "rewards/rejected": -0.27722278237342834, "step": 1210 }, { "epoch": 6.177215189873418, "grad_norm": 1502780.5568957475, "learning_rate": 3.644625509244751e-07, "logits/chosen": -4.163270473480225, "logits/rejected": -3.8083653450012207, "logps/chosen": -120.57966613769531, "logps/rejected": -497.63226318359375, "loss": 37966.2937, "rewards/accuracies": 1.0, "rewards/chosen": 0.11406160891056061, "rewards/margins": 0.37608999013900757, "rewards/rejected": -0.2620283365249634, "step": 1220 }, { "epoch": 6.227848101265823, "grad_norm": 1846607.9980803088, "learning_rate": 3.6289564399874647e-07, "logits/chosen": -4.317009925842285, "logits/rejected": -4.062619209289551, "logps/chosen": -112.0468521118164, "logps/rejected": -490.73974609375, "loss": 36750.4688, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.11853437125682831, "rewards/margins": 0.37694281339645386, "rewards/rejected": -0.25840842723846436, "step": 1230 }, { "epoch": 6.2784810126582276, "grad_norm": 1432477.9223833755, "learning_rate": 3.613287370730179e-07, "logits/chosen": -4.580340385437012, "logits/rejected": -4.493284225463867, "logps/chosen": -123.97422790527344, "logps/rejected": -509.47076416015625, "loss": 37540.4875, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.11854572594165802, "rewards/margins": 0.38835546374320984, "rewards/rejected": -0.2698097229003906, "step": 1240 }, { "epoch": 6.329113924050633, "grad_norm": 1551602.6793086384, "learning_rate": 3.5976183014728924e-07, "logits/chosen": -3.541313886642456, "logits/rejected": -3.6754157543182373, "logps/chosen": -120.3751220703125, "logps/rejected": -483.46221923828125, "loss": 35927.6062, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.10841184854507446, "rewards/margins": 0.3652178645133972, "rewards/rejected": -0.25680604577064514, "step": 1250 }, { "epoch": 6.379746835443038, "grad_norm": 1628016.050343189, "learning_rate": 3.5819492322156066e-07, "logits/chosen": -3.570946216583252, "logits/rejected": -3.6950716972351074, "logps/chosen": -134.7080535888672, "logps/rejected": -500.80108642578125, "loss": 36467.1375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1038375124335289, "rewards/margins": 0.36301389336586, "rewards/rejected": -0.2591763734817505, "step": 1260 }, { "epoch": 6.430379746835443, "grad_norm": 1416336.114974791, "learning_rate": 3.56628016295832e-07, "logits/chosen": -2.9958808422088623, "logits/rejected": -3.158600330352783, "logps/chosen": -120.319580078125, "logps/rejected": -493.46075439453125, "loss": 35704.05, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.11720545589923859, "rewards/margins": 0.3729427754878998, "rewards/rejected": -0.2557373046875, "step": 1270 }, { "epoch": 6.481012658227848, "grad_norm": 1429276.465119334, "learning_rate": 3.5506110937010343e-07, "logits/chosen": -5.23915958404541, "logits/rejected": -5.513189792633057, "logps/chosen": -106.6229476928711, "logps/rejected": -512.9346923828125, "loss": 37476.4688, "rewards/accuracies": 1.0, "rewards/chosen": 0.1187194362282753, "rewards/margins": 0.4039131700992584, "rewards/rejected": -0.2851937413215637, "step": 1280 }, { "epoch": 6.531645569620253, "grad_norm": 1838991.6289765981, "learning_rate": 3.534942024443748e-07, "logits/chosen": -3.1320407390594482, "logits/rejected": -3.531493663787842, "logps/chosen": -114.69315338134766, "logps/rejected": -521.70458984375, "loss": 37236.3688, "rewards/accuracies": 1.0, "rewards/chosen": 0.12156815826892853, "rewards/margins": 0.39552414417266846, "rewards/rejected": -0.2739560008049011, "step": 1290 }, { "epoch": 6.582278481012658, "grad_norm": 1965294.5428377022, "learning_rate": 3.519272955186462e-07, "logits/chosen": -3.1404528617858887, "logits/rejected": -3.159364938735962, "logps/chosen": -108.1359634399414, "logps/rejected": -441.573486328125, "loss": 35760.8688, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.10433737188577652, "rewards/margins": 0.3334364593029022, "rewards/rejected": -0.2290991097688675, "step": 1300 }, { "epoch": 6.632911392405063, "grad_norm": 1744782.725381992, "learning_rate": 3.5036038859291756e-07, "logits/chosen": -5.149240970611572, "logits/rejected": -4.872938632965088, "logps/chosen": -110.17635345458984, "logps/rejected": -462.6591796875, "loss": 38854.3313, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.10797703266143799, "rewards/margins": 0.35402077436447144, "rewards/rejected": -0.24604372680187225, "step": 1310 }, { "epoch": 6.6835443037974684, "grad_norm": 1449584.094036676, "learning_rate": 3.4879348166718897e-07, "logits/chosen": -5.302030086517334, "logits/rejected": -5.005532264709473, "logps/chosen": -114.39412689208984, "logps/rejected": -497.2879943847656, "loss": 37031.9281, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.11388063430786133, "rewards/margins": 0.38410684466362, "rewards/rejected": -0.27022621035575867, "step": 1320 }, { "epoch": 6.734177215189874, "grad_norm": 1655726.3529691189, "learning_rate": 3.4722657474146033e-07, "logits/chosen": -5.846579074859619, "logits/rejected": -5.164810657501221, "logps/chosen": -122.16035461425781, "logps/rejected": -490.97503662109375, "loss": 35881.3438, "rewards/accuracies": 1.0, "rewards/chosen": 0.11242518573999405, "rewards/margins": 0.3698340058326721, "rewards/rejected": -0.2574087679386139, "step": 1330 }, { "epoch": 6.784810126582278, "grad_norm": 1473850.8586688952, "learning_rate": 3.4565966781573174e-07, "logits/chosen": -6.604684352874756, "logits/rejected": -6.540472984313965, "logps/chosen": -141.56655883789062, "logps/rejected": -504.536865234375, "loss": 35791.1937, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.11088699102401733, "rewards/margins": 0.36103492975234985, "rewards/rejected": -0.2501479685306549, "step": 1340 }, { "epoch": 6.8354430379746836, "grad_norm": 1716575.4855753484, "learning_rate": 3.440927608900031e-07, "logits/chosen": -5.3845696449279785, "logits/rejected": -5.094508647918701, "logps/chosen": -126.5009536743164, "logps/rejected": -501.36407470703125, "loss": 36855.7281, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.12023582309484482, "rewards/margins": 0.3794700503349304, "rewards/rejected": -0.2592342793941498, "step": 1350 }, { "epoch": 6.886075949367089, "grad_norm": 1860603.9086510486, "learning_rate": 3.425258539642745e-07, "logits/chosen": -5.825100898742676, "logits/rejected": -5.165715217590332, "logps/chosen": -123.0651626586914, "logps/rejected": -519.5916748046875, "loss": 37158.7969, "rewards/accuracies": 1.0, "rewards/chosen": 0.11998645961284637, "rewards/margins": 0.40252119302749634, "rewards/rejected": -0.28253474831581116, "step": 1360 }, { "epoch": 6.936708860759493, "grad_norm": 1781429.39957367, "learning_rate": 3.4095894703854587e-07, "logits/chosen": -5.593798637390137, "logits/rejected": -5.400781631469727, "logps/chosen": -122.57585144042969, "logps/rejected": -500.21844482421875, "loss": 36281.8938, "rewards/accuracies": 1.0, "rewards/chosen": 0.11947381496429443, "rewards/margins": 0.377518892288208, "rewards/rejected": -0.25804510712623596, "step": 1370 }, { "epoch": 6.987341772151899, "grad_norm": 1883344.192547866, "learning_rate": 3.393920401128173e-07, "logits/chosen": -5.272061347961426, "logits/rejected": -5.000374794006348, "logps/chosen": -109.66764831542969, "logps/rejected": -471.388916015625, "loss": 37081.4062, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.11030924320220947, "rewards/margins": 0.36379513144493103, "rewards/rejected": -0.25348588824272156, "step": 1380 }, { "epoch": 7.037974683544304, "grad_norm": 1158283.9951295503, "learning_rate": 3.3782513318708864e-07, "logits/chosen": -4.4635396003723145, "logits/rejected": -4.055373668670654, "logps/chosen": -126.25242614746094, "logps/rejected": -513.0021362304688, "loss": 32182.2562, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.12364669889211655, "rewards/margins": 0.39015716314315796, "rewards/rejected": -0.2665104568004608, "step": 1390 }, { "epoch": 7.0886075949367084, "grad_norm": 1635336.0000705447, "learning_rate": 3.3625822626136005e-07, "logits/chosen": -3.2711379528045654, "logits/rejected": -2.849708080291748, "logps/chosen": -120.3502426147461, "logps/rejected": -554.61669921875, "loss": 28154.0125, "rewards/accuracies": 1.0, "rewards/chosen": 0.1400633156299591, "rewards/margins": 0.4437219500541687, "rewards/rejected": -0.3036586344242096, "step": 1400 }, { "epoch": 7.139240506329114, "grad_norm": 1478880.6175367055, "learning_rate": 3.346913193356314e-07, "logits/chosen": -1.498684048652649, "logits/rejected": -1.5719478130340576, "logps/chosen": -97.41731262207031, "logps/rejected": -528.29833984375, "loss": 30443.8531, "rewards/accuracies": 1.0, "rewards/chosen": 0.13250485062599182, "rewards/margins": 0.4276755452156067, "rewards/rejected": -0.29517072439193726, "step": 1410 }, { "epoch": 7.189873417721519, "grad_norm": 1190966.9261622827, "learning_rate": 3.331244124099029e-07, "logits/chosen": -3.576815366744995, "logits/rejected": -3.1508662700653076, "logps/chosen": -92.4610595703125, "logps/rejected": -499.2225646972656, "loss": 30200.7656, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1342589408159256, "rewards/margins": 0.40714582800865173, "rewards/rejected": -0.2728869318962097, "step": 1420 }, { "epoch": 7.2405063291139244, "grad_norm": 1654460.4321586012, "learning_rate": 3.3155750548417424e-07, "logits/chosen": -3.6517982482910156, "logits/rejected": -2.912386894226074, "logps/chosen": -113.77073669433594, "logps/rejected": -548.2919921875, "loss": 29291.1719, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.13462531566619873, "rewards/margins": 0.435891717672348, "rewards/rejected": -0.3012663722038269, "step": 1430 }, { "epoch": 7.291139240506329, "grad_norm": 1547048.8074025025, "learning_rate": 3.2999059855844565e-07, "logits/chosen": -4.762998580932617, "logits/rejected": -4.417517185211182, "logps/chosen": -103.59019470214844, "logps/rejected": -516.0870361328125, "loss": 30597.95, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1253672093153, "rewards/margins": 0.4090943932533264, "rewards/rejected": -0.28372713923454285, "step": 1440 }, { "epoch": 7.341772151898734, "grad_norm": 1083334.846955902, "learning_rate": 3.28423691632717e-07, "logits/chosen": -4.341902732849121, "logits/rejected": -3.4809889793395996, "logps/chosen": -105.1113052368164, "logps/rejected": -537.7858276367188, "loss": 28933.9125, "rewards/accuracies": 1.0, "rewards/chosen": 0.139817476272583, "rewards/margins": 0.4371423125267029, "rewards/rejected": -0.2973248362541199, "step": 1450 }, { "epoch": 7.3924050632911396, "grad_norm": 1583721.4157786674, "learning_rate": 3.268567847069884e-07, "logits/chosen": -5.8856353759765625, "logits/rejected": -5.3746867179870605, "logps/chosen": -94.76522827148438, "logps/rejected": -525.3110961914062, "loss": 29575.7844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.13582661747932434, "rewards/margins": 0.4354213774204254, "rewards/rejected": -0.29959478974342346, "step": 1460 }, { "epoch": 7.443037974683544, "grad_norm": 1391896.6733071958, "learning_rate": 3.252898777812598e-07, "logits/chosen": -3.2749342918395996, "logits/rejected": -3.6061177253723145, "logps/chosen": -99.21089172363281, "logps/rejected": -534.4422607421875, "loss": 29207.5719, "rewards/accuracies": 1.0, "rewards/chosen": 0.1312985122203827, "rewards/margins": 0.433136522769928, "rewards/rejected": -0.3018379807472229, "step": 1470 }, { "epoch": 7.493670886075949, "grad_norm": 1294960.5242478126, "learning_rate": 3.237229708555312e-07, "logits/chosen": -2.985567808151245, "logits/rejected": -1.8726612329483032, "logps/chosen": -112.32755279541016, "logps/rejected": -509.37286376953125, "loss": 29187.1594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1289207637310028, "rewards/margins": 0.4079267978668213, "rewards/rejected": -0.27900606393814087, "step": 1480 }, { "epoch": 7.544303797468355, "grad_norm": 1193173.6877739348, "learning_rate": 3.2215606392980255e-07, "logits/chosen": -2.0656161308288574, "logits/rejected": -2.3443799018859863, "logps/chosen": -97.64754486083984, "logps/rejected": -511.40576171875, "loss": 29322.4313, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.13589712977409363, "rewards/margins": 0.413860946893692, "rewards/rejected": -0.2779638171195984, "step": 1490 }, { "epoch": 7.594936708860759, "grad_norm": 1279108.0637389964, "learning_rate": 3.2058915700407396e-07, "logits/chosen": -3.5005557537078857, "logits/rejected": -3.4204413890838623, "logps/chosen": -107.39742279052734, "logps/rejected": -530.2638549804688, "loss": 27542.3625, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.13995657861232758, "rewards/margins": 0.42647701501846313, "rewards/rejected": -0.28652042150497437, "step": 1500 }, { "epoch": 7.6455696202531644, "grad_norm": 2707102.044355496, "learning_rate": 3.190222500783453e-07, "logits/chosen": -4.715664863586426, "logits/rejected": -4.245431900024414, "logps/chosen": -101.01532745361328, "logps/rejected": -561.7377319335938, "loss": 29571.3625, "rewards/accuracies": 1.0, "rewards/chosen": 0.14493677020072937, "rewards/margins": 0.4646069407463074, "rewards/rejected": -0.3196701109409332, "step": 1510 }, { "epoch": 7.69620253164557, "grad_norm": 1346703.2802720347, "learning_rate": 3.1745534315261674e-07, "logits/chosen": -2.4094414710998535, "logits/rejected": -2.316082715988159, "logps/chosen": -90.64556121826172, "logps/rejected": -524.6895751953125, "loss": 29962.2875, "rewards/accuracies": 1.0, "rewards/chosen": 0.1430484652519226, "rewards/margins": 0.4339544177055359, "rewards/rejected": -0.2909059524536133, "step": 1520 }, { "epoch": 7.746835443037975, "grad_norm": 1570681.8076612286, "learning_rate": 3.158884362268881e-07, "logits/chosen": -1.977839708328247, "logits/rejected": -1.748456597328186, "logps/chosen": -95.17073822021484, "logps/rejected": -536.3465576171875, "loss": 29005.075, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.13247540593147278, "rewards/margins": 0.44195109605789185, "rewards/rejected": -0.3094756603240967, "step": 1530 }, { "epoch": 7.7974683544303796, "grad_norm": 1321655.562082779, "learning_rate": 3.143215293011595e-07, "logits/chosen": -5.75424861907959, "logits/rejected": -5.283251762390137, "logps/chosen": -109.5367202758789, "logps/rejected": -538.626220703125, "loss": 29057.1688, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.14621947705745697, "rewards/margins": 0.43537068367004395, "rewards/rejected": -0.2891511619091034, "step": 1540 }, { "epoch": 7.848101265822785, "grad_norm": 1360253.1191038797, "learning_rate": 3.1275462237543087e-07, "logits/chosen": -3.4590229988098145, "logits/rejected": -3.5962212085723877, "logps/chosen": -114.27938079833984, "logps/rejected": -566.5555419921875, "loss": 29716.3094, "rewards/accuracies": 1.0, "rewards/chosen": 0.1347774863243103, "rewards/margins": 0.44886675477027893, "rewards/rejected": -0.314089298248291, "step": 1550 }, { "epoch": 7.89873417721519, "grad_norm": 1269167.0621019504, "learning_rate": 3.111877154497023e-07, "logits/chosen": -1.0884647369384766, "logits/rejected": -0.7194244265556335, "logps/chosen": -89.07111358642578, "logps/rejected": -494.15789794921875, "loss": 29335.9875, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1284293383359909, "rewards/margins": 0.4071559011936188, "rewards/rejected": -0.2787265181541443, "step": 1560 }, { "epoch": 7.949367088607595, "grad_norm": 1453875.4579149496, "learning_rate": 3.0962080852397364e-07, "logits/chosen": -2.750883102416992, "logits/rejected": -3.123683452606201, "logps/chosen": -98.0600357055664, "logps/rejected": -508.206298828125, "loss": 29392.4875, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.13056252896785736, "rewards/margins": 0.4083867073059082, "rewards/rejected": -0.2778242230415344, "step": 1570 }, { "epoch": 8.0, "grad_norm": 1764041.9454831716, "learning_rate": 3.0805390159824505e-07, "logits/chosen": -3.7020182609558105, "logits/rejected": -2.8675622940063477, "logps/chosen": -112.20640563964844, "logps/rejected": -527.1363525390625, "loss": 30214.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.1370132714509964, "rewards/margins": 0.42148295044898987, "rewards/rejected": -0.2844696640968323, "step": 1580 }, { "epoch": 8.050632911392405, "grad_norm": 1502727.0577222395, "learning_rate": 3.064869946725164e-07, "logits/chosen": -2.0656542778015137, "logits/rejected": -1.5985521078109741, "logps/chosen": -84.60444641113281, "logps/rejected": -520.1857299804688, "loss": 24723.275, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1492975652217865, "rewards/margins": 0.4404692053794861, "rewards/rejected": -0.2911716103553772, "step": 1590 }, { "epoch": 8.10126582278481, "grad_norm": 838369.9468876831, "learning_rate": 3.049200877467878e-07, "logits/chosen": -1.758178949356079, "logits/rejected": -0.7727742791175842, "logps/chosen": -83.45867919921875, "logps/rejected": -530.3883666992188, "loss": 25817.0203, "rewards/accuracies": 1.0, "rewards/chosen": 0.14538443088531494, "rewards/margins": 0.45367687940597534, "rewards/rejected": -0.3082924485206604, "step": 1600 }, { "epoch": 8.151898734177216, "grad_norm": 1012852.54550217, "learning_rate": 3.0335318082105923e-07, "logits/chosen": -2.217496156692505, "logits/rejected": -2.0143866539001465, "logps/chosen": -100.38580322265625, "logps/rejected": -549.8438720703125, "loss": 25090.8891, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.13634233176708221, "rewards/margins": 0.44348135590553284, "rewards/rejected": -0.30713900923728943, "step": 1610 }, { "epoch": 8.20253164556962, "grad_norm": 1056784.1797241461, "learning_rate": 3.0178627389533064e-07, "logits/chosen": -1.1953948736190796, "logits/rejected": -0.2751680910587311, "logps/chosen": -89.64523315429688, "logps/rejected": -510.4059143066406, "loss": 24456.725, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.14029642939567566, "rewards/margins": 0.4281511902809143, "rewards/rejected": -0.28785476088523865, "step": 1620 }, { "epoch": 8.253164556962025, "grad_norm": 1147595.1251004518, "learning_rate": 3.00219366969602e-07, "logits/chosen": -2.550518035888672, "logits/rejected": -2.5027434825897217, "logps/chosen": -76.6513442993164, "logps/rejected": -524.4201049804688, "loss": 23486.5594, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.15493164956569672, "rewards/margins": 0.44891220331192017, "rewards/rejected": -0.29398053884506226, "step": 1630 }, { "epoch": 8.30379746835443, "grad_norm": 1390175.0732444616, "learning_rate": 2.986524600438734e-07, "logits/chosen": -0.059876419603824615, "logits/rejected": 0.00422248849645257, "logps/chosen": -74.77996063232422, "logps/rejected": -544.7862548828125, "loss": 24176.6094, "rewards/accuracies": 1.0, "rewards/chosen": 0.151381716132164, "rewards/margins": 0.4694734215736389, "rewards/rejected": -0.3180916905403137, "step": 1640 }, { "epoch": 8.354430379746836, "grad_norm": 1846159.1203677754, "learning_rate": 2.970855531181448e-07, "logits/chosen": -3.206434726715088, "logits/rejected": -2.6545357704162598, "logps/chosen": -79.13458251953125, "logps/rejected": -529.1912841796875, "loss": 25560.5344, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.14862783253192902, "rewards/margins": 0.4489147663116455, "rewards/rejected": -0.3002868890762329, "step": 1650 }, { "epoch": 8.405063291139241, "grad_norm": 1294602.7153889702, "learning_rate": 2.955186461924162e-07, "logits/chosen": -1.0581172704696655, "logits/rejected": -0.6744507551193237, "logps/chosen": -78.69017028808594, "logps/rejected": -526.4840087890625, "loss": 25549.9125, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.14595063030719757, "rewards/margins": 0.44837069511413574, "rewards/rejected": -0.302420049905777, "step": 1660 }, { "epoch": 8.455696202531646, "grad_norm": 1653521.5239311927, "learning_rate": 2.9395173926668755e-07, "logits/chosen": -0.9036309123039246, "logits/rejected": -0.16554176807403564, "logps/chosen": -83.71012878417969, "logps/rejected": -525.7719116210938, "loss": 25089.5516, "rewards/accuracies": 1.0, "rewards/chosen": 0.14826878905296326, "rewards/margins": 0.4438709616661072, "rewards/rejected": -0.2956022024154663, "step": 1670 }, { "epoch": 8.50632911392405, "grad_norm": 1371497.4089594388, "learning_rate": 2.9238483234095896e-07, "logits/chosen": -1.423182725906372, "logits/rejected": -1.0717556476593018, "logps/chosen": -89.4638671875, "logps/rejected": -577.1199340820312, "loss": 24558.0953, "rewards/accuracies": 1.0, "rewards/chosen": 0.15898647904396057, "rewards/margins": 0.48913446068763733, "rewards/rejected": -0.330147922039032, "step": 1680 }, { "epoch": 8.556962025316455, "grad_norm": 1476867.0955964676, "learning_rate": 2.908179254152303e-07, "logits/chosen": -3.2004425525665283, "logits/rejected": -2.7161200046539307, "logps/chosen": -86.7264633178711, "logps/rejected": -543.3889770507812, "loss": 26642.4781, "rewards/accuracies": 1.0, "rewards/chosen": 0.1485292911529541, "rewards/margins": 0.4551934599876404, "rewards/rejected": -0.3066641688346863, "step": 1690 }, { "epoch": 8.60759493670886, "grad_norm": 1134090.4892000444, "learning_rate": 2.8925101848950173e-07, "logits/chosen": -0.274528443813324, "logits/rejected": 0.4862538278102875, "logps/chosen": -79.16570281982422, "logps/rejected": -513.53173828125, "loss": 23741.9938, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.15034614503383636, "rewards/margins": 0.43597039580345154, "rewards/rejected": -0.28562426567077637, "step": 1700 }, { "epoch": 8.658227848101266, "grad_norm": 1314089.2981008843, "learning_rate": 2.876841115637731e-07, "logits/chosen": 0.6013806462287903, "logits/rejected": 1.2335985898971558, "logps/chosen": -90.46197509765625, "logps/rejected": -551.8345947265625, "loss": 24216.4281, "rewards/accuracies": 1.0, "rewards/chosen": 0.1541350781917572, "rewards/margins": 0.47102633118629456, "rewards/rejected": -0.3168913424015045, "step": 1710 }, { "epoch": 8.708860759493671, "grad_norm": 1622019.967143891, "learning_rate": 2.861172046380445e-07, "logits/chosen": 0.2407432496547699, "logits/rejected": 0.4264713227748871, "logps/chosen": -93.0431900024414, "logps/rejected": -564.0677490234375, "loss": 23649.3016, "rewards/accuracies": 1.0, "rewards/chosen": 0.147947758436203, "rewards/margins": 0.4662678837776184, "rewards/rejected": -0.3183201253414154, "step": 1720 }, { "epoch": 8.759493670886076, "grad_norm": 1520791.345848389, "learning_rate": 2.8455029771231586e-07, "logits/chosen": 0.6626393795013428, "logits/rejected": 0.7864507436752319, "logps/chosen": -94.95128631591797, "logps/rejected": -540.1358642578125, "loss": 25224.3125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.14551883935928345, "rewards/margins": 0.4529417157173157, "rewards/rejected": -0.3074227571487427, "step": 1730 }, { "epoch": 8.810126582278482, "grad_norm": 1625465.2135884068, "learning_rate": 2.8298339078658727e-07, "logits/chosen": -0.07786345481872559, "logits/rejected": -0.031427524983882904, "logps/chosen": -90.72882843017578, "logps/rejected": -539.1676025390625, "loss": 24133.7531, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.15023007988929749, "rewards/margins": 0.4491490423679352, "rewards/rejected": -0.2989189624786377, "step": 1740 }, { "epoch": 8.860759493670885, "grad_norm": 1330490.8036484018, "learning_rate": 2.8141648386085863e-07, "logits/chosen": 0.1896178424358368, "logits/rejected": 1.3701179027557373, "logps/chosen": -78.11041259765625, "logps/rejected": -545.9954833984375, "loss": 24713.5375, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.15004639327526093, "rewards/margins": 0.4731353223323822, "rewards/rejected": -0.32308894395828247, "step": 1750 }, { "epoch": 8.91139240506329, "grad_norm": 1240332.5244059283, "learning_rate": 2.7984957693513004e-07, "logits/chosen": 0.09949211776256561, "logits/rejected": 0.6086061596870422, "logps/chosen": -84.04310607910156, "logps/rejected": -550.8171997070312, "loss": 24452.55, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.14817103743553162, "rewards/margins": 0.47146469354629517, "rewards/rejected": -0.32329362630844116, "step": 1760 }, { "epoch": 8.962025316455696, "grad_norm": 1279998.0524960216, "learning_rate": 2.782826700094014e-07, "logits/chosen": -1.9250777959823608, "logits/rejected": -1.7448539733886719, "logps/chosen": -92.84037780761719, "logps/rejected": -539.1063232421875, "loss": 25664.2531, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1440330594778061, "rewards/margins": 0.45180240273475647, "rewards/rejected": -0.3077693581581116, "step": 1770 }, { "epoch": 9.012658227848101, "grad_norm": 1042157.0097295721, "learning_rate": 2.767157630836728e-07, "logits/chosen": -2.344456911087036, "logits/rejected": -2.174999713897705, "logps/chosen": -74.14456939697266, "logps/rejected": -549.884033203125, "loss": 22791.725, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.16015887260437012, "rewards/margins": 0.47513628005981445, "rewards/rejected": -0.31497737765312195, "step": 1780 }, { "epoch": 9.063291139240507, "grad_norm": 1604328.8989550385, "learning_rate": 2.751488561579442e-07, "logits/chosen": -0.4028230607509613, "logits/rejected": -0.017443586140871048, "logps/chosen": -78.17924499511719, "logps/rejected": -555.5220947265625, "loss": 21934.7781, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.161887988448143, "rewards/margins": 0.47605371475219727, "rewards/rejected": -0.3141656517982483, "step": 1790 }, { "epoch": 9.113924050632912, "grad_norm": 930218.7877013405, "learning_rate": 2.7358194923221564e-07, "logits/chosen": -0.10258030891418457, "logits/rejected": -0.2491408884525299, "logps/chosen": -67.35882568359375, "logps/rejected": -562.8963623046875, "loss": 20609.7047, "rewards/accuracies": 1.0, "rewards/chosen": 0.1559842973947525, "rewards/margins": 0.4920543134212494, "rewards/rejected": -0.33607012033462524, "step": 1800 }, { "epoch": 9.164556962025316, "grad_norm": 1965412.9139898522, "learning_rate": 2.72015042306487e-07, "logits/chosen": 0.5992544889450073, "logits/rejected": 0.6971222162246704, "logps/chosen": -68.12413024902344, "logps/rejected": -546.7501220703125, "loss": 21574.0656, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.16274484992027283, "rewards/margins": 0.475511372089386, "rewards/rejected": -0.31276652216911316, "step": 1810 }, { "epoch": 9.215189873417721, "grad_norm": 1012215.1362345209, "learning_rate": 2.704481353807584e-07, "logits/chosen": -0.252922922372818, "logits/rejected": 0.7370151281356812, "logps/chosen": -68.61247253417969, "logps/rejected": -545.773193359375, "loss": 21584.0, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.15732263028621674, "rewards/margins": 0.47610074281692505, "rewards/rejected": -0.3187780976295471, "step": 1820 }, { "epoch": 9.265822784810126, "grad_norm": 1317328.2635211374, "learning_rate": 2.6888122845502977e-07, "logits/chosen": -0.5902656316757202, "logits/rejected": -0.200765460729599, "logps/chosen": -72.17051696777344, "logps/rejected": -560.718994140625, "loss": 20662.6562, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.16373535990715027, "rewards/margins": 0.49004659056663513, "rewards/rejected": -0.32631123065948486, "step": 1830 }, { "epoch": 9.316455696202532, "grad_norm": 1202220.669797323, "learning_rate": 2.673143215293012e-07, "logits/chosen": -0.9152681231498718, "logits/rejected": -0.46515974402427673, "logps/chosen": -71.53898620605469, "logps/rejected": -545.0053100585938, "loss": 22147.6375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1599002182483673, "rewards/margins": 0.47435054183006287, "rewards/rejected": -0.31445032358169556, "step": 1840 }, { "epoch": 9.367088607594937, "grad_norm": 858793.4443150639, "learning_rate": 2.6574741460357254e-07, "logits/chosen": 0.8187123537063599, "logits/rejected": 0.9660876393318176, "logps/chosen": -68.53959655761719, "logps/rejected": -533.693603515625, "loss": 22383.2656, "rewards/accuracies": 1.0, "rewards/chosen": 0.15871909260749817, "rewards/margins": 0.46780315041542053, "rewards/rejected": -0.30908405780792236, "step": 1850 }, { "epoch": 9.417721518987342, "grad_norm": 753710.4553891663, "learning_rate": 2.6418050767784395e-07, "logits/chosen": 0.07855646312236786, "logits/rejected": -0.0003270745219197124, "logps/chosen": -71.92098236083984, "logps/rejected": -532.4739990234375, "loss": 22731.7687, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.16191932559013367, "rewards/margins": 0.4635027348995209, "rewards/rejected": -0.3015834391117096, "step": 1860 }, { "epoch": 9.468354430379748, "grad_norm": 1208088.8106737435, "learning_rate": 2.626136007521153e-07, "logits/chosen": -0.23646318912506104, "logits/rejected": 0.0054475306533277035, "logps/chosen": -66.38209533691406, "logps/rejected": -541.2474365234375, "loss": 22257.4375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.15835285186767578, "rewards/margins": 0.47471290826797485, "rewards/rejected": -0.3163600265979767, "step": 1870 }, { "epoch": 9.518987341772151, "grad_norm": 1301078.6439378709, "learning_rate": 2.610466938263867e-07, "logits/chosen": -1.2212382555007935, "logits/rejected": -1.2270792722702026, "logps/chosen": -69.9106674194336, "logps/rejected": -537.7271728515625, "loss": 22528.825, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.15471485257148743, "rewards/margins": 0.46409493684768677, "rewards/rejected": -0.30938002467155457, "step": 1880 }, { "epoch": 9.569620253164556, "grad_norm": 1146807.5987679055, "learning_rate": 2.594797869006581e-07, "logits/chosen": -1.618896484375, "logits/rejected": -1.3599251508712769, "logps/chosen": -77.14048767089844, "logps/rejected": -519.0086059570312, "loss": 20937.9, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1560250073671341, "rewards/margins": 0.44421762228012085, "rewards/rejected": -0.28819265961647034, "step": 1890 }, { "epoch": 9.620253164556962, "grad_norm": 1143412.3516794874, "learning_rate": 2.579128799749295e-07, "logits/chosen": -0.6647695302963257, "logits/rejected": -0.6680254936218262, "logps/chosen": -85.31086730957031, "logps/rejected": -573.4449462890625, "loss": 21446.8719, "rewards/accuracies": 1.0, "rewards/chosen": 0.16069479286670685, "rewards/margins": 0.486908495426178, "rewards/rejected": -0.32621368765830994, "step": 1900 }, { "epoch": 9.670886075949367, "grad_norm": 874554.4726819041, "learning_rate": 2.5634597304920085e-07, "logits/chosen": -2.4332644939422607, "logits/rejected": -2.143573522567749, "logps/chosen": -73.66841125488281, "logps/rejected": -567.8841552734375, "loss": 21540.7203, "rewards/accuracies": 1.0, "rewards/chosen": 0.1681254804134369, "rewards/margins": 0.49868589639663696, "rewards/rejected": -0.3305602967739105, "step": 1910 }, { "epoch": 9.721518987341772, "grad_norm": 1796698.8005837006, "learning_rate": 2.5477906612347227e-07, "logits/chosen": 1.2071720361709595, "logits/rejected": 1.811336874961853, "logps/chosen": -68.67604064941406, "logps/rejected": -531.2750244140625, "loss": 22819.1078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1492142677307129, "rewards/margins": 0.4690275192260742, "rewards/rejected": -0.31981322169303894, "step": 1920 }, { "epoch": 9.772151898734178, "grad_norm": 1652289.4059097564, "learning_rate": 2.532121591977436e-07, "logits/chosen": -0.47033196687698364, "logits/rejected": -0.13743743300437927, "logps/chosen": -58.46977996826172, "logps/rejected": -548.3218383789062, "loss": 22147.9906, "rewards/accuracies": 1.0, "rewards/chosen": 0.16477254033088684, "rewards/margins": 0.4882374703884125, "rewards/rejected": -0.32346493005752563, "step": 1930 }, { "epoch": 9.822784810126583, "grad_norm": 1031570.3956932048, "learning_rate": 2.5164525227201504e-07, "logits/chosen": -1.3281480073928833, "logits/rejected": -0.6028780937194824, "logps/chosen": -71.20520782470703, "logps/rejected": -560.7177124023438, "loss": 21547.1453, "rewards/accuracies": 1.0, "rewards/chosen": 0.16829116642475128, "rewards/margins": 0.4920671880245209, "rewards/rejected": -0.3237760066986084, "step": 1940 }, { "epoch": 9.873417721518987, "grad_norm": 997159.4818661372, "learning_rate": 2.500783453462864e-07, "logits/chosen": 0.0865519791841507, "logits/rejected": 1.0491398572921753, "logps/chosen": -66.77009582519531, "logps/rejected": -538.1752319335938, "loss": 21311.2047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.15671603381633759, "rewards/margins": 0.4763658046722412, "rewards/rejected": -0.3196497857570648, "step": 1950 }, { "epoch": 9.924050632911392, "grad_norm": 2765789.1484618983, "learning_rate": 2.485114384205578e-07, "logits/chosen": 0.05377687141299248, "logits/rejected": 0.6552912592887878, "logps/chosen": -67.99398803710938, "logps/rejected": -554.9031982421875, "loss": 20360.5656, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.16012230515480042, "rewards/margins": 0.48966652154922485, "rewards/rejected": -0.3295442461967468, "step": 1960 }, { "epoch": 9.974683544303797, "grad_norm": 778456.3899893347, "learning_rate": 2.4694453149482917e-07, "logits/chosen": -1.8621749877929688, "logits/rejected": -0.9629243612289429, "logps/chosen": -76.34040832519531, "logps/rejected": -570.4073486328125, "loss": 20853.2188, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.16522939503192902, "rewards/margins": 0.4955335259437561, "rewards/rejected": -0.3303041160106659, "step": 1970 }, { "epoch": 10.025316455696203, "grad_norm": 1813632.2248899846, "learning_rate": 2.453776245691006e-07, "logits/chosen": -1.164282202720642, "logits/rejected": -1.4965863227844238, "logps/chosen": -64.31637573242188, "logps/rejected": -555.818115234375, "loss": 20145.1469, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17363281548023224, "rewards/margins": 0.4858935475349426, "rewards/rejected": -0.3122607469558716, "step": 1980 }, { "epoch": 10.075949367088608, "grad_norm": 1332924.3073966086, "learning_rate": 2.4381071764337194e-07, "logits/chosen": -0.629298746585846, "logits/rejected": -0.301331102848053, "logps/chosen": -63.670082092285156, "logps/rejected": -531.6769409179688, "loss": 19644.3969, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.17206737399101257, "rewards/margins": 0.46868830919265747, "rewards/rejected": -0.29662084579467773, "step": 1990 }, { "epoch": 10.126582278481013, "grad_norm": 1851357.970280298, "learning_rate": 2.4224381071764335e-07, "logits/chosen": -0.845658004283905, "logits/rejected": -0.27886706590652466, "logps/chosen": -64.26731872558594, "logps/rejected": -551.6077270507812, "loss": 19949.7859, "rewards/accuracies": 1.0, "rewards/chosen": 0.16876797378063202, "rewards/margins": 0.4869278073310852, "rewards/rejected": -0.318159818649292, "step": 2000 }, { "epoch": 10.177215189873417, "grad_norm": 711674.3296077562, "learning_rate": 2.4067690379191476e-07, "logits/chosen": -0.5739536285400391, "logits/rejected": -0.18802312016487122, "logps/chosen": -68.64383697509766, "logps/rejected": -557.96484375, "loss": 18812.7141, "rewards/accuracies": 1.0, "rewards/chosen": 0.16457512974739075, "rewards/margins": 0.48935467004776, "rewards/rejected": -0.3247795104980469, "step": 2010 }, { "epoch": 10.227848101265822, "grad_norm": 1174456.0466990366, "learning_rate": 2.391099968661861e-07, "logits/chosen": -1.7381559610366821, "logits/rejected": -0.10386524349451065, "logps/chosen": -62.054725646972656, "logps/rejected": -570.2944946289062, "loss": 19933.8734, "rewards/accuracies": 1.0, "rewards/chosen": 0.17416052520275116, "rewards/margins": 0.515259861946106, "rewards/rejected": -0.3410993218421936, "step": 2020 }, { "epoch": 10.278481012658228, "grad_norm": 1096601.381122318, "learning_rate": 2.375430899404575e-07, "logits/chosen": -0.8541361093521118, "logits/rejected": -0.3781866133213043, "logps/chosen": -56.479164123535156, "logps/rejected": -554.1986694335938, "loss": 19863.0656, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17042097449302673, "rewards/margins": 0.4970123767852783, "rewards/rejected": -0.3265914022922516, "step": 2030 }, { "epoch": 10.329113924050633, "grad_norm": 840837.174069809, "learning_rate": 2.3597618301472892e-07, "logits/chosen": -1.694748878479004, "logits/rejected": -1.400233268737793, "logps/chosen": -75.36106872558594, "logps/rejected": -590.9649047851562, "loss": 19475.8531, "rewards/accuracies": 1.0, "rewards/chosen": 0.1772707998752594, "rewards/margins": 0.5156514644622803, "rewards/rejected": -0.33838069438934326, "step": 2040 }, { "epoch": 10.379746835443038, "grad_norm": 803470.3932805886, "learning_rate": 2.344092760890003e-07, "logits/chosen": -1.5708272457122803, "logits/rejected": -1.7595367431640625, "logps/chosen": -67.97745513916016, "logps/rejected": -574.4043579101562, "loss": 20348.9719, "rewards/accuracies": 1.0, "rewards/chosen": 0.1695125252008438, "rewards/margins": 0.5006899237632751, "rewards/rejected": -0.33117741346359253, "step": 2050 }, { "epoch": 10.430379746835444, "grad_norm": 1004663.9694340345, "learning_rate": 2.328423691632717e-07, "logits/chosen": -1.962457299232483, "logits/rejected": -1.3877923488616943, "logps/chosen": -68.37916564941406, "logps/rejected": -552.3621826171875, "loss": 19908.175, "rewards/accuracies": 1.0, "rewards/chosen": 0.17361022531986237, "rewards/margins": 0.4852239489555359, "rewards/rejected": -0.31161370873451233, "step": 2060 }, { "epoch": 10.481012658227849, "grad_norm": 907359.5128728322, "learning_rate": 2.3127546223754308e-07, "logits/chosen": -0.9135034680366516, "logits/rejected": -0.6688288450241089, "logps/chosen": -66.34752655029297, "logps/rejected": -564.5549926757812, "loss": 19321.1625, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.17101381719112396, "rewards/margins": 0.49828824400901794, "rewards/rejected": -0.3272744417190552, "step": 2070 }, { "epoch": 10.531645569620252, "grad_norm": 846691.3768602766, "learning_rate": 2.2970855531181446e-07, "logits/chosen": -0.16942422091960907, "logits/rejected": 0.07732643932104111, "logps/chosen": -70.53272247314453, "logps/rejected": -561.4410400390625, "loss": 20015.3156, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.16945675015449524, "rewards/margins": 0.4873596131801605, "rewards/rejected": -0.31790289282798767, "step": 2080 }, { "epoch": 10.582278481012658, "grad_norm": 1071818.197184184, "learning_rate": 2.2814164838608585e-07, "logits/chosen": -3.7376797199249268, "logits/rejected": -3.6469883918762207, "logps/chosen": -74.2595443725586, "logps/rejected": -581.4434814453125, "loss": 19872.15, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17747794091701508, "rewards/margins": 0.5011757612228394, "rewards/rejected": -0.3236978054046631, "step": 2090 }, { "epoch": 10.632911392405063, "grad_norm": 1004256.8801454039, "learning_rate": 2.2657474146035723e-07, "logits/chosen": -3.6712310314178467, "logits/rejected": -2.93229603767395, "logps/chosen": -71.2375259399414, "logps/rejected": -567.6956787109375, "loss": 19287.7531, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17674970626831055, "rewards/margins": 0.4991089403629303, "rewards/rejected": -0.322359174489975, "step": 2100 }, { "epoch": 10.683544303797468, "grad_norm": 1106280.1198113484, "learning_rate": 2.2500783453462862e-07, "logits/chosen": -0.8213077783584595, "logits/rejected": 0.14719510078430176, "logps/chosen": -63.996498107910156, "logps/rejected": -572.0599975585938, "loss": 19310.0187, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17046719789505005, "rewards/margins": 0.5157285928726196, "rewards/rejected": -0.34526145458221436, "step": 2110 }, { "epoch": 10.734177215189874, "grad_norm": 1464811.168383667, "learning_rate": 2.234409276089e-07, "logits/chosen": -0.11996922641992569, "logits/rejected": 0.22597141563892365, "logps/chosen": -76.11662292480469, "logps/rejected": -563.3325805664062, "loss": 19843.4688, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.16797539591789246, "rewards/margins": 0.4910767078399658, "rewards/rejected": -0.32310131192207336, "step": 2120 }, { "epoch": 10.784810126582279, "grad_norm": 1138818.9289973595, "learning_rate": 2.218740206831714e-07, "logits/chosen": -1.5511647462844849, "logits/rejected": -0.5638203620910645, "logps/chosen": -54.634178161621094, "logps/rejected": -540.9115600585938, "loss": 19217.1797, "rewards/accuracies": 1.0, "rewards/chosen": 0.16898050904273987, "rewards/margins": 0.4923567771911621, "rewards/rejected": -0.32337623834609985, "step": 2130 }, { "epoch": 10.835443037974684, "grad_norm": 927204.123761474, "learning_rate": 2.203071137574428e-07, "logits/chosen": -0.11952924728393555, "logits/rejected": 0.11829443275928497, "logps/chosen": -68.7413101196289, "logps/rejected": -549.9212036132812, "loss": 19664.7969, "rewards/accuracies": 1.0, "rewards/chosen": 0.16766998171806335, "rewards/margins": 0.4846928119659424, "rewards/rejected": -0.31702274084091187, "step": 2140 }, { "epoch": 10.886075949367088, "grad_norm": 987326.8653252205, "learning_rate": 2.187402068317142e-07, "logits/chosen": -0.7956012487411499, "logits/rejected": -0.13277845084667206, "logps/chosen": -66.15580749511719, "logps/rejected": -539.9743041992188, "loss": 19319.2359, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.16284069418907166, "rewards/margins": 0.47170519828796387, "rewards/rejected": -0.3088645040988922, "step": 2150 }, { "epoch": 10.936708860759493, "grad_norm": 1490559.10150877, "learning_rate": 2.1717329990598557e-07, "logits/chosen": 0.23329691588878632, "logits/rejected": 0.3798617720603943, "logps/chosen": -65.20997619628906, "logps/rejected": -566.1799926757812, "loss": 18358.2687, "rewards/accuracies": 1.0, "rewards/chosen": 0.15990014374256134, "rewards/margins": 0.502142608165741, "rewards/rejected": -0.34224241971969604, "step": 2160 }, { "epoch": 10.987341772151899, "grad_norm": 910221.5561234908, "learning_rate": 2.1560639298025696e-07, "logits/chosen": -1.0350775718688965, "logits/rejected": -0.4896017909049988, "logps/chosen": -80.80205535888672, "logps/rejected": -605.629150390625, "loss": 19122.5234, "rewards/accuracies": 1.0, "rewards/chosen": 0.17700453102588654, "rewards/margins": 0.5280236601829529, "rewards/rejected": -0.35101914405822754, "step": 2170 }, { "epoch": 11.037974683544304, "grad_norm": 852169.287356795, "learning_rate": 2.1403948605452835e-07, "logits/chosen": -1.0383515357971191, "logits/rejected": 0.3044077157974243, "logps/chosen": -60.7518196105957, "logps/rejected": -550.4581909179688, "loss": 18261.975, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.16871869564056396, "rewards/margins": 0.49391689896583557, "rewards/rejected": -0.32519814372062683, "step": 2180 }, { "epoch": 11.08860759493671, "grad_norm": 850664.061578799, "learning_rate": 2.1247257912879973e-07, "logits/chosen": -0.5247487425804138, "logits/rejected": -0.718704342842102, "logps/chosen": -48.23347473144531, "logps/rejected": -571.79296875, "loss": 17780.6719, "rewards/accuracies": 1.0, "rewards/chosen": 0.17942146956920624, "rewards/margins": 0.5196394920349121, "rewards/rejected": -0.34021803736686707, "step": 2190 }, { "epoch": 11.139240506329115, "grad_norm": 795813.8223153341, "learning_rate": 2.1090567220307112e-07, "logits/chosen": 0.2913626730442047, "logits/rejected": 0.3964959681034088, "logps/chosen": -57.057777404785156, "logps/rejected": -553.8439331054688, "loss": 19198.0062, "rewards/accuracies": 1.0, "rewards/chosen": 0.1739949882030487, "rewards/margins": 0.49791765213012695, "rewards/rejected": -0.32392266392707825, "step": 2200 }, { "epoch": 11.189873417721518, "grad_norm": 1113023.3688515616, "learning_rate": 2.093387652773425e-07, "logits/chosen": 1.5053379535675049, "logits/rejected": 2.2073726654052734, "logps/chosen": -52.245140075683594, "logps/rejected": -549.0379028320312, "loss": 18112.9031, "rewards/accuracies": 1.0, "rewards/chosen": 0.1701221615076065, "rewards/margins": 0.49869513511657715, "rewards/rejected": -0.32857298851013184, "step": 2210 }, { "epoch": 11.240506329113924, "grad_norm": 1112437.2131689412, "learning_rate": 2.077718583516139e-07, "logits/chosen": -0.7113906741142273, "logits/rejected": -0.593052089214325, "logps/chosen": -56.02216720581055, "logps/rejected": -588.62255859375, "loss": 18765.7359, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18194417655467987, "rewards/margins": 0.529647707939148, "rewards/rejected": -0.3477035462856293, "step": 2220 }, { "epoch": 11.291139240506329, "grad_norm": 735799.2580717172, "learning_rate": 2.0620495142588527e-07, "logits/chosen": -0.9520748257637024, "logits/rejected": -0.6387659907341003, "logps/chosen": -58.523109436035156, "logps/rejected": -582.5303344726562, "loss": 17604.2656, "rewards/accuracies": 1.0, "rewards/chosen": 0.17585232853889465, "rewards/margins": 0.522950291633606, "rewards/rejected": -0.3470980226993561, "step": 2230 }, { "epoch": 11.341772151898734, "grad_norm": 716407.5247360148, "learning_rate": 2.0463804450015669e-07, "logits/chosen": 1.4925919771194458, "logits/rejected": 1.6499805450439453, "logps/chosen": -63.138038635253906, "logps/rejected": -546.4395751953125, "loss": 18588.6406, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1618063747882843, "rewards/margins": 0.48370370268821716, "rewards/rejected": -0.3218972980976105, "step": 2240 }, { "epoch": 11.39240506329114, "grad_norm": 598500.3265676593, "learning_rate": 2.0307113757442807e-07, "logits/chosen": 0.6475615501403809, "logits/rejected": 1.338098406791687, "logps/chosen": -58.75787353515625, "logps/rejected": -563.3907470703125, "loss": 18119.6031, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17143133282661438, "rewards/margins": 0.5086871981620789, "rewards/rejected": -0.3372558653354645, "step": 2250 }, { "epoch": 11.443037974683545, "grad_norm": 1221314.1531539639, "learning_rate": 2.0150423064869946e-07, "logits/chosen": -0.327157199382782, "logits/rejected": 0.03896377235651016, "logps/chosen": -58.68574905395508, "logps/rejected": -558.2637329101562, "loss": 17534.2281, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17224976420402527, "rewards/margins": 0.49942049384117126, "rewards/rejected": -0.327170729637146, "step": 2260 }, { "epoch": 11.49367088607595, "grad_norm": 456316.6263000263, "learning_rate": 1.9993732372297084e-07, "logits/chosen": -0.07340321689844131, "logits/rejected": 0.9581168293952942, "logps/chosen": -56.39067459106445, "logps/rejected": -567.6375732421875, "loss": 17502.8781, "rewards/accuracies": 1.0, "rewards/chosen": 0.1778116524219513, "rewards/margins": 0.5131680965423584, "rewards/rejected": -0.3353564143180847, "step": 2270 }, { "epoch": 11.544303797468354, "grad_norm": 711686.0768962563, "learning_rate": 1.9837041679724223e-07, "logits/chosen": -0.8106869459152222, "logits/rejected": -0.6330159902572632, "logps/chosen": -61.687591552734375, "logps/rejected": -573.0241088867188, "loss": 17796.2391, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18241460621356964, "rewards/margins": 0.5145494937896729, "rewards/rejected": -0.3321349024772644, "step": 2280 }, { "epoch": 11.594936708860759, "grad_norm": 1355769.5974116765, "learning_rate": 1.9680350987151361e-07, "logits/chosen": 2.7271580696105957, "logits/rejected": 3.408385753631592, "logps/chosen": -53.9175910949707, "logps/rejected": -532.6714477539062, "loss": 18442.0969, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.16783255338668823, "rewards/margins": 0.4785786271095276, "rewards/rejected": -0.31074607372283936, "step": 2290 }, { "epoch": 11.645569620253164, "grad_norm": 1885360.6056858273, "learning_rate": 1.95236602945785e-07, "logits/chosen": -0.4679819941520691, "logits/rejected": 0.16113388538360596, "logps/chosen": -63.9486198425293, "logps/rejected": -550.3961181640625, "loss": 17411.3969, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.17148110270500183, "rewards/margins": 0.4901048243045807, "rewards/rejected": -0.3186236619949341, "step": 2300 }, { "epoch": 11.69620253164557, "grad_norm": 758901.4037823884, "learning_rate": 1.9366969602005639e-07, "logits/chosen": 0.85181725025177, "logits/rejected": 1.3077051639556885, "logps/chosen": -73.22114562988281, "logps/rejected": -575.5013427734375, "loss": 17968.0844, "rewards/accuracies": 1.0, "rewards/chosen": 0.1745305359363556, "rewards/margins": 0.5058612823486328, "rewards/rejected": -0.33133071660995483, "step": 2310 }, { "epoch": 11.746835443037975, "grad_norm": 520118.42882549425, "learning_rate": 1.9210278909432777e-07, "logits/chosen": -0.6327224969863892, "logits/rejected": 0.7259325385093689, "logps/chosen": -60.48676681518555, "logps/rejected": -574.37939453125, "loss": 18215.2938, "rewards/accuracies": 1.0, "rewards/chosen": 0.18099671602249146, "rewards/margins": 0.5182011127471924, "rewards/rejected": -0.33720433712005615, "step": 2320 }, { "epoch": 11.79746835443038, "grad_norm": 743117.6330674689, "learning_rate": 1.9053588216859918e-07, "logits/chosen": 1.2280547618865967, "logits/rejected": 1.3038314580917358, "logps/chosen": -59.2470817565918, "logps/rejected": -559.13916015625, "loss": 17567.2906, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17356745898723602, "rewards/margins": 0.49933862686157227, "rewards/rejected": -0.32577118277549744, "step": 2330 }, { "epoch": 11.848101265822784, "grad_norm": 730673.5249396141, "learning_rate": 1.8896897524287057e-07, "logits/chosen": 1.2314859628677368, "logits/rejected": 1.3703396320343018, "logps/chosen": -58.14827346801758, "logps/rejected": -552.53759765625, "loss": 17758.8719, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1748059093952179, "rewards/margins": 0.4981175363063812, "rewards/rejected": -0.3233116567134857, "step": 2340 }, { "epoch": 11.89873417721519, "grad_norm": 597117.4885736415, "learning_rate": 1.8740206831714195e-07, "logits/chosen": -0.7092142105102539, "logits/rejected": -0.0756240040063858, "logps/chosen": -62.97068405151367, "logps/rejected": -567.6489868164062, "loss": 18044.8, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17830543220043182, "rewards/margins": 0.5064790844917297, "rewards/rejected": -0.3281736969947815, "step": 2350 }, { "epoch": 11.949367088607595, "grad_norm": 687586.0618323467, "learning_rate": 1.8583516139141334e-07, "logits/chosen": -1.2183369398117065, "logits/rejected": -1.056317925453186, "logps/chosen": -65.71519470214844, "logps/rejected": -578.7620239257812, "loss": 18082.8625, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18341727554798126, "rewards/margins": 0.5148480534553528, "rewards/rejected": -0.33143073320388794, "step": 2360 }, { "epoch": 12.0, "grad_norm": 748926.1941504646, "learning_rate": 1.8426825446568473e-07, "logits/chosen": -0.35043638944625854, "logits/rejected": -1.1868419647216797, "logps/chosen": -59.269996643066406, "logps/rejected": -581.2828369140625, "loss": 17352.5563, "rewards/accuracies": 1.0, "rewards/chosen": 0.16442957520484924, "rewards/margins": 0.5158518552780151, "rewards/rejected": -0.3514222800731659, "step": 2370 }, { "epoch": 12.050632911392405, "grad_norm": 924736.9233899026, "learning_rate": 1.827013475399561e-07, "logits/chosen": 0.09384210407733917, "logits/rejected": 0.38976824283599854, "logps/chosen": -60.1981315612793, "logps/rejected": -569.2012329101562, "loss": 16551.6906, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1783694624900818, "rewards/margins": 0.5092591047286987, "rewards/rejected": -0.33088964223861694, "step": 2380 }, { "epoch": 12.10126582278481, "grad_norm": 453683.3343967912, "learning_rate": 1.811344406142275e-07, "logits/chosen": -0.1967567503452301, "logits/rejected": 0.26000285148620605, "logps/chosen": -51.80207443237305, "logps/rejected": -586.1417846679688, "loss": 16650.6516, "rewards/accuracies": 1.0, "rewards/chosen": 0.19160635769367218, "rewards/margins": 0.5359978079795837, "rewards/rejected": -0.34439152479171753, "step": 2390 }, { "epoch": 12.151898734177216, "grad_norm": 760637.6347084254, "learning_rate": 1.7956753368849888e-07, "logits/chosen": -2.4950621128082275, "logits/rejected": -1.7182337045669556, "logps/chosen": -54.441162109375, "logps/rejected": -569.5804443359375, "loss": 16525.3187, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1771778166294098, "rewards/margins": 0.5123227834701538, "rewards/rejected": -0.335144966840744, "step": 2400 }, { "epoch": 12.20253164556962, "grad_norm": 760695.8247001156, "learning_rate": 1.7800062676277027e-07, "logits/chosen": 2.4408202171325684, "logits/rejected": 1.941209077835083, "logps/chosen": -50.47087097167969, "logps/rejected": -550.1649169921875, "loss": 16281.4594, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1683485209941864, "rewards/margins": 0.5019410848617554, "rewards/rejected": -0.33359256386756897, "step": 2410 }, { "epoch": 12.253164556962025, "grad_norm": 501646.8806860111, "learning_rate": 1.7643371983704165e-07, "logits/chosen": -1.7683095932006836, "logits/rejected": -1.838817834854126, "logps/chosen": -53.41362762451172, "logps/rejected": -574.3419799804688, "loss": 16772.675, "rewards/accuracies": 1.0, "rewards/chosen": 0.18047122657299042, "rewards/margins": 0.5231555700302124, "rewards/rejected": -0.342684268951416, "step": 2420 }, { "epoch": 12.30379746835443, "grad_norm": 705638.6344046313, "learning_rate": 1.7486681291131307e-07, "logits/chosen": 0.6870694756507874, "logits/rejected": 0.9879606366157532, "logps/chosen": -60.645713806152344, "logps/rejected": -565.5677490234375, "loss": 16990.1125, "rewards/accuracies": 1.0, "rewards/chosen": 0.180276021361351, "rewards/margins": 0.5076194405555725, "rewards/rejected": -0.3273434340953827, "step": 2430 }, { "epoch": 12.354430379746836, "grad_norm": 583239.6869039454, "learning_rate": 1.7329990598558445e-07, "logits/chosen": -0.015002572908997536, "logits/rejected": 0.6669713258743286, "logps/chosen": -59.69384765625, "logps/rejected": -595.3045654296875, "loss": 16570.7625, "rewards/accuracies": 1.0, "rewards/chosen": 0.19047938287258148, "rewards/margins": 0.5352143049240112, "rewards/rejected": -0.34473496675491333, "step": 2440 }, { "epoch": 12.405063291139241, "grad_norm": 717458.0522613698, "learning_rate": 1.7173299905985584e-07, "logits/chosen": -1.5561044216156006, "logits/rejected": -1.511528730392456, "logps/chosen": -48.24024200439453, "logps/rejected": -585.71484375, "loss": 16296.25, "rewards/accuracies": 1.0, "rewards/chosen": 0.18336063623428345, "rewards/margins": 0.5371404886245728, "rewards/rejected": -0.3537798523902893, "step": 2450 }, { "epoch": 12.455696202531646, "grad_norm": 1561201.446100151, "learning_rate": 1.7016609213412722e-07, "logits/chosen": -0.5445646047592163, "logits/rejected": 0.5015290379524231, "logps/chosen": -57.12273025512695, "logps/rejected": -596.54248046875, "loss": 17012.2562, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1852089911699295, "rewards/margins": 0.5424550771713257, "rewards/rejected": -0.35724616050720215, "step": 2460 }, { "epoch": 12.50632911392405, "grad_norm": 576931.8180998629, "learning_rate": 1.685991852083986e-07, "logits/chosen": 0.7103387713432312, "logits/rejected": 0.5729061365127563, "logps/chosen": -45.429290771484375, "logps/rejected": -540.9015502929688, "loss": 17545.0859, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17228493094444275, "rewards/margins": 0.49700021743774414, "rewards/rejected": -0.3247153162956238, "step": 2470 }, { "epoch": 12.556962025316455, "grad_norm": 790199.4841189157, "learning_rate": 1.6703227828267e-07, "logits/chosen": 0.757542610168457, "logits/rejected": 1.3497235774993896, "logps/chosen": -60.74102020263672, "logps/rejected": -570.23583984375, "loss": 17645.0094, "rewards/accuracies": 1.0, "rewards/chosen": 0.17620857059955597, "rewards/margins": 0.5084448456764221, "rewards/rejected": -0.33223623037338257, "step": 2480 }, { "epoch": 12.60759493670886, "grad_norm": 1168730.408088866, "learning_rate": 1.6546537135694138e-07, "logits/chosen": 1.1095263957977295, "logits/rejected": 1.6450704336166382, "logps/chosen": -55.1762580871582, "logps/rejected": -562.0362548828125, "loss": 17481.3469, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1728857308626175, "rewards/margins": 0.5043104887008667, "rewards/rejected": -0.3314247727394104, "step": 2490 }, { "epoch": 12.658227848101266, "grad_norm": 492108.78941813926, "learning_rate": 1.6389846443121277e-07, "logits/chosen": 0.4340684413909912, "logits/rejected": 0.34048348665237427, "logps/chosen": -56.212928771972656, "logps/rejected": -578.192138671875, "loss": 16462.5594, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17624449729919434, "rewards/margins": 0.5216260552406311, "rewards/rejected": -0.3453815281391144, "step": 2500 }, { "epoch": 12.708860759493671, "grad_norm": 513189.7522025148, "learning_rate": 1.6233155750548415e-07, "logits/chosen": -0.21513333916664124, "logits/rejected": -0.05444493144750595, "logps/chosen": -60.96831512451172, "logps/rejected": -583.4918823242188, "loss": 16903.7125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1871432662010193, "rewards/margins": 0.5204809904098511, "rewards/rejected": -0.3333377242088318, "step": 2510 }, { "epoch": 12.759493670886076, "grad_norm": 527855.7040773877, "learning_rate": 1.6076465057975556e-07, "logits/chosen": -1.166076421737671, "logits/rejected": -0.5938941240310669, "logps/chosen": -66.41789245605469, "logps/rejected": -565.521728515625, "loss": 16873.3, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.18609380722045898, "rewards/margins": 0.5067971348762512, "rewards/rejected": -0.32070332765579224, "step": 2520 }, { "epoch": 12.810126582278482, "grad_norm": 454333.8693268159, "learning_rate": 1.5919774365402695e-07, "logits/chosen": -3.2188408374786377, "logits/rejected": -2.827929735183716, "logps/chosen": -64.64167785644531, "logps/rejected": -578.556396484375, "loss": 17413.3594, "rewards/accuracies": 1.0, "rewards/chosen": 0.1842392235994339, "rewards/margins": 0.5160521268844604, "rewards/rejected": -0.33181288838386536, "step": 2530 }, { "epoch": 12.860759493670885, "grad_norm": 613283.375359761, "learning_rate": 1.5763083672829833e-07, "logits/chosen": -2.0415351390838623, "logits/rejected": -1.1543810367584229, "logps/chosen": -56.55009841918945, "logps/rejected": -565.3232421875, "loss": 16952.7828, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17874039709568024, "rewards/margins": 0.5064669847488403, "rewards/rejected": -0.3277265429496765, "step": 2540 }, { "epoch": 12.91139240506329, "grad_norm": 973991.6151861927, "learning_rate": 1.5606392980256972e-07, "logits/chosen": -1.9052120447158813, "logits/rejected": -1.2125427722930908, "logps/chosen": -56.37163162231445, "logps/rejected": -575.3190307617188, "loss": 17272.6656, "rewards/accuracies": 1.0, "rewards/chosen": 0.18349668383598328, "rewards/margins": 0.5194507837295532, "rewards/rejected": -0.33595409989356995, "step": 2550 }, { "epoch": 12.962025316455696, "grad_norm": 1049016.1677939103, "learning_rate": 1.544970228768411e-07, "logits/chosen": -0.479561984539032, "logits/rejected": -0.6837025284767151, "logps/chosen": -56.96269989013672, "logps/rejected": -579.6213989257812, "loss": 17023.0859, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1867980808019638, "rewards/margins": 0.5234028100967407, "rewards/rejected": -0.3366047739982605, "step": 2560 }, { "epoch": 13.012658227848101, "grad_norm": 335161.21326055715, "learning_rate": 1.529301159511125e-07, "logits/chosen": 0.09210095554590225, "logits/rejected": 0.2885093688964844, "logps/chosen": -52.608367919921875, "logps/rejected": -558.9227294921875, "loss": 15959.725, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1778368204832077, "rewards/margins": 0.5030940175056458, "rewards/rejected": -0.32525718212127686, "step": 2570 }, { "epoch": 13.063291139240507, "grad_norm": 771775.1017807113, "learning_rate": 1.5136320902538388e-07, "logits/chosen": -1.3265520334243774, "logits/rejected": -0.9296306371688843, "logps/chosen": -62.875038146972656, "logps/rejected": -560.3228759765625, "loss": 15567.6344, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18662917613983154, "rewards/margins": 0.49883994460105896, "rewards/rejected": -0.31221073865890503, "step": 2580 }, { "epoch": 13.113924050632912, "grad_norm": 446168.3148918395, "learning_rate": 1.4979630209965526e-07, "logits/chosen": -0.11115183681249619, "logits/rejected": 0.8431870341300964, "logps/chosen": -46.82927703857422, "logps/rejected": -552.5628051757812, "loss": 16255.3438, "rewards/accuracies": 1.0, "rewards/chosen": 0.17291709780693054, "rewards/margins": 0.5057471990585327, "rewards/rejected": -0.3328301012516022, "step": 2590 }, { "epoch": 13.164556962025316, "grad_norm": 586122.4453174556, "learning_rate": 1.4822939517392665e-07, "logits/chosen": -0.757349967956543, "logits/rejected": 0.037270687520504, "logps/chosen": -55.21142578125, "logps/rejected": -557.4276123046875, "loss": 16720.8172, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1850253939628601, "rewards/margins": 0.5111584663391113, "rewards/rejected": -0.32613304257392883, "step": 2600 }, { "epoch": 13.215189873417721, "grad_norm": 420628.2693101698, "learning_rate": 1.4666248824819803e-07, "logits/chosen": -0.11379202455282211, "logits/rejected": -0.11788152158260345, "logps/chosen": -49.00257110595703, "logps/rejected": -576.3326416015625, "loss": 16306.0688, "rewards/accuracies": 1.0, "rewards/chosen": 0.18590961396694183, "rewards/margins": 0.5257736444473267, "rewards/rejected": -0.33986401557922363, "step": 2610 }, { "epoch": 13.265822784810126, "grad_norm": 436219.2086299041, "learning_rate": 1.4509558132246945e-07, "logits/chosen": -0.7918820977210999, "logits/rejected": -0.14419230818748474, "logps/chosen": -56.56486892700195, "logps/rejected": -584.7669677734375, "loss": 16369.2719, "rewards/accuracies": 1.0, "rewards/chosen": 0.18918678164482117, "rewards/margins": 0.5305701494216919, "rewards/rejected": -0.3413834273815155, "step": 2620 }, { "epoch": 13.316455696202532, "grad_norm": 596793.3073449759, "learning_rate": 1.4352867439674083e-07, "logits/chosen": 1.9564087390899658, "logits/rejected": 2.246692180633545, "logps/chosen": -51.851722717285156, "logps/rejected": -548.3530883789062, "loss": 16796.1063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18290123343467712, "rewards/margins": 0.4980129599571228, "rewards/rejected": -0.3151116371154785, "step": 2630 }, { "epoch": 13.367088607594937, "grad_norm": 474733.1664905385, "learning_rate": 1.4196176747101222e-07, "logits/chosen": 0.530455470085144, "logits/rejected": 0.14751790463924408, "logps/chosen": -48.55830001831055, "logps/rejected": -558.3150024414062, "loss": 16144.2906, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17938682436943054, "rewards/margins": 0.5066471695899963, "rewards/rejected": -0.3272603154182434, "step": 2640 }, { "epoch": 13.417721518987342, "grad_norm": 1649837.8712191964, "learning_rate": 1.403948605452836e-07, "logits/chosen": -0.03671743720769882, "logits/rejected": 0.7579118013381958, "logps/chosen": -42.065242767333984, "logps/rejected": -554.230224609375, "loss": 16118.8047, "rewards/accuracies": 1.0, "rewards/chosen": 0.18058671057224274, "rewards/margins": 0.5129930377006531, "rewards/rejected": -0.3324064016342163, "step": 2650 }, { "epoch": 13.468354430379748, "grad_norm": 594890.10809389, "learning_rate": 1.38827953619555e-07, "logits/chosen": 0.288557231426239, "logits/rejected": 0.2958771288394928, "logps/chosen": -52.33495330810547, "logps/rejected": -561.2686157226562, "loss": 15733.7453, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1808079034090042, "rewards/margins": 0.5136345028877258, "rewards/rejected": -0.3328266143798828, "step": 2660 }, { "epoch": 13.518987341772151, "grad_norm": 467820.0894028926, "learning_rate": 1.3726104669382637e-07, "logits/chosen": -0.39889806509017944, "logits/rejected": 0.02098376676440239, "logps/chosen": -53.63391876220703, "logps/rejected": -556.4556884765625, "loss": 15584.0406, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18383657932281494, "rewards/margins": 0.5030336976051331, "rewards/rejected": -0.3191971182823181, "step": 2670 }, { "epoch": 13.569620253164556, "grad_norm": 349641.6736805019, "learning_rate": 1.3569413976809776e-07, "logits/chosen": -1.0416258573532104, "logits/rejected": -0.687407374382019, "logps/chosen": -40.50030517578125, "logps/rejected": -560.5548706054688, "loss": 15275.5312, "rewards/accuracies": 1.0, "rewards/chosen": 0.18312379717826843, "rewards/margins": 0.5221952199935913, "rewards/rejected": -0.33907145261764526, "step": 2680 }, { "epoch": 13.620253164556962, "grad_norm": 769040.8085386351, "learning_rate": 1.3412723284236915e-07, "logits/chosen": 1.7483727931976318, "logits/rejected": 2.3238413333892822, "logps/chosen": -49.73235321044922, "logps/rejected": -559.8514404296875, "loss": 16850.175, "rewards/accuracies": 1.0, "rewards/chosen": 0.18260039389133453, "rewards/margins": 0.5106431245803833, "rewards/rejected": -0.3280427157878876, "step": 2690 }, { "epoch": 13.670886075949367, "grad_norm": 459226.17158416886, "learning_rate": 1.3256032591664053e-07, "logits/chosen": -0.2809019684791565, "logits/rejected": 0.43121522665023804, "logps/chosen": -58.69781494140625, "logps/rejected": -588.9169921875, "loss": 15404.6109, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19193768501281738, "rewards/margins": 0.5343278646469116, "rewards/rejected": -0.34239014983177185, "step": 2700 }, { "epoch": 13.721518987341772, "grad_norm": 339517.3364374988, "learning_rate": 1.3099341899091192e-07, "logits/chosen": 0.3717317283153534, "logits/rejected": 0.5634896159172058, "logps/chosen": -60.52980422973633, "logps/rejected": -555.2349243164062, "loss": 15341.8219, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17079493403434753, "rewards/margins": 0.489946186542511, "rewards/rejected": -0.31915122270584106, "step": 2710 }, { "epoch": 13.772151898734178, "grad_norm": 1157921.1375110236, "learning_rate": 1.2942651206518333e-07, "logits/chosen": -1.758825659751892, "logits/rejected": -1.0223956108093262, "logps/chosen": -48.61360549926758, "logps/rejected": -562.5768432617188, "loss": 16196.7625, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.180302232503891, "rewards/margins": 0.5197224020957947, "rewards/rejected": -0.3394201397895813, "step": 2720 }, { "epoch": 13.822784810126583, "grad_norm": 434777.104877517, "learning_rate": 1.2785960513945471e-07, "logits/chosen": -0.3282082676887512, "logits/rejected": 0.4013535976409912, "logps/chosen": -50.629215240478516, "logps/rejected": -582.4617309570312, "loss": 15710.8641, "rewards/accuracies": 1.0, "rewards/chosen": 0.18200094997882843, "rewards/margins": 0.5299168825149536, "rewards/rejected": -0.3479159474372864, "step": 2730 }, { "epoch": 13.873417721518987, "grad_norm": 677123.1021845904, "learning_rate": 1.262926982137261e-07, "logits/chosen": -0.9533359408378601, "logits/rejected": -0.11374642699956894, "logps/chosen": -50.710845947265625, "logps/rejected": -568.776611328125, "loss": 16490.0469, "rewards/accuracies": 1.0, "rewards/chosen": 0.18456825613975525, "rewards/margins": 0.5208636522293091, "rewards/rejected": -0.3362954258918762, "step": 2740 }, { "epoch": 13.924050632911392, "grad_norm": 608241.5399016802, "learning_rate": 1.2472579128799749e-07, "logits/chosen": -0.009487760253250599, "logits/rejected": 0.5674014091491699, "logps/chosen": -47.34721755981445, "logps/rejected": -558.3707275390625, "loss": 16114.125, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18486423790454865, "rewards/margins": 0.5096093416213989, "rewards/rejected": -0.3247450888156891, "step": 2750 }, { "epoch": 13.974683544303797, "grad_norm": 510265.43069577636, "learning_rate": 1.2315888436226887e-07, "logits/chosen": -1.1760886907577515, "logits/rejected": -0.8848980665206909, "logps/chosen": -50.471961975097656, "logps/rejected": -569.0016479492188, "loss": 15240.5234, "rewards/accuracies": 1.0, "rewards/chosen": 0.1889052391052246, "rewards/margins": 0.5153056383132935, "rewards/rejected": -0.3264002799987793, "step": 2760 }, { "epoch": 14.025316455696203, "grad_norm": 454762.9481647176, "learning_rate": 1.2159197743654026e-07, "logits/chosen": 2.4223504066467285, "logits/rejected": 3.487738847732544, "logps/chosen": -44.93278503417969, "logps/rejected": -561.3870849609375, "loss": 16557.4125, "rewards/accuracies": 1.0, "rewards/chosen": 0.1806286722421646, "rewards/margins": 0.5195534229278564, "rewards/rejected": -0.33892473578453064, "step": 2770 }, { "epoch": 14.075949367088608, "grad_norm": 487680.4985531969, "learning_rate": 1.2002507051081164e-07, "logits/chosen": 1.9585473537445068, "logits/rejected": 2.446890354156494, "logps/chosen": -39.52117919921875, "logps/rejected": -561.9512939453125, "loss": 15203.5906, "rewards/accuracies": 1.0, "rewards/chosen": 0.1834731251001358, "rewards/margins": 0.5265246629714966, "rewards/rejected": -0.343051552772522, "step": 2780 }, { "epoch": 14.126582278481013, "grad_norm": 335633.29006652284, "learning_rate": 1.1845816358508304e-07, "logits/chosen": -0.2361418753862381, "logits/rejected": 0.4229121804237366, "logps/chosen": -56.944580078125, "logps/rejected": -581.4995727539062, "loss": 14980.4906, "rewards/accuracies": 1.0, "rewards/chosen": 0.18962779641151428, "rewards/margins": 0.5268105268478394, "rewards/rejected": -0.3371827304363251, "step": 2790 }, { "epoch": 14.177215189873417, "grad_norm": 433336.52566667914, "learning_rate": 1.1689125665935443e-07, "logits/chosen": -0.8853734135627747, "logits/rejected": 0.24162235856056213, "logps/chosen": -49.96304702758789, "logps/rejected": -587.9956665039062, "loss": 15952.2594, "rewards/accuracies": 1.0, "rewards/chosen": 0.1883043497800827, "rewards/margins": 0.5334208607673645, "rewards/rejected": -0.345116525888443, "step": 2800 }, { "epoch": 14.227848101265822, "grad_norm": 352832.2810093542, "learning_rate": 1.1532434973362581e-07, "logits/chosen": -0.9270970225334167, "logits/rejected": -0.8106321096420288, "logps/chosen": -50.61150360107422, "logps/rejected": -579.4258422851562, "loss": 15482.3031, "rewards/accuracies": 1.0, "rewards/chosen": 0.18148374557495117, "rewards/margins": 0.5241626501083374, "rewards/rejected": -0.34267887473106384, "step": 2810 }, { "epoch": 14.278481012658228, "grad_norm": 518734.4787371263, "learning_rate": 1.137574428078972e-07, "logits/chosen": 2.115744113922119, "logits/rejected": 2.9750027656555176, "logps/chosen": -41.601097106933594, "logps/rejected": -573.6159057617188, "loss": 15787.4719, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18657180666923523, "rewards/margins": 0.5376033186912537, "rewards/rejected": -0.35103151202201843, "step": 2820 }, { "epoch": 14.329113924050633, "grad_norm": 637771.2756103254, "learning_rate": 1.1219053588216858e-07, "logits/chosen": -0.09557388722896576, "logits/rejected": -0.5708149671554565, "logps/chosen": -44.071807861328125, "logps/rejected": -585.6417236328125, "loss": 15660.4813, "rewards/accuracies": 1.0, "rewards/chosen": 0.18580812215805054, "rewards/margins": 0.5347784757614136, "rewards/rejected": -0.34897032380104065, "step": 2830 }, { "epoch": 14.379746835443038, "grad_norm": 469592.5817335632, "learning_rate": 1.1062362895643998e-07, "logits/chosen": 0.14405778050422668, "logits/rejected": 0.6720622181892395, "logps/chosen": -45.77620315551758, "logps/rejected": -562.7021484375, "loss": 15265.0797, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18696969747543335, "rewards/margins": 0.5200961828231812, "rewards/rejected": -0.3331265151500702, "step": 2840 }, { "epoch": 14.430379746835444, "grad_norm": 381405.89470487926, "learning_rate": 1.0905672203071137e-07, "logits/chosen": -0.46474942564964294, "logits/rejected": -0.6803582906723022, "logps/chosen": -43.475257873535156, "logps/rejected": -578.9302978515625, "loss": 15502.7, "rewards/accuracies": 1.0, "rewards/chosen": 0.18612739443778992, "rewards/margins": 0.5332227945327759, "rewards/rejected": -0.34709542989730835, "step": 2850 }, { "epoch": 14.481012658227849, "grad_norm": 389034.05049605225, "learning_rate": 1.0748981510498275e-07, "logits/chosen": 0.192867711186409, "logits/rejected": 0.04235720634460449, "logps/chosen": -45.57283020019531, "logps/rejected": -573.7398071289062, "loss": 16059.1625, "rewards/accuracies": 1.0, "rewards/chosen": 0.18987932801246643, "rewards/margins": 0.5239830613136292, "rewards/rejected": -0.33410370349884033, "step": 2860 }, { "epoch": 14.531645569620252, "grad_norm": 1027736.0673764712, "learning_rate": 1.0592290817925414e-07, "logits/chosen": -0.14229407906532288, "logits/rejected": 0.4352554380893707, "logps/chosen": -52.69159698486328, "logps/rejected": -584.4544067382812, "loss": 15405.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.19550864398479462, "rewards/margins": 0.5430904626846313, "rewards/rejected": -0.34758180379867554, "step": 2870 }, { "epoch": 14.582278481012658, "grad_norm": 384385.74028987245, "learning_rate": 1.0435600125352554e-07, "logits/chosen": -2.178337335586548, "logits/rejected": -0.7508569955825806, "logps/chosen": -59.098426818847656, "logps/rejected": -576.6027221679688, "loss": 14664.3844, "rewards/accuracies": 1.0, "rewards/chosen": 0.18826426565647125, "rewards/margins": 0.5217211842536926, "rewards/rejected": -0.33345693349838257, "step": 2880 }, { "epoch": 14.632911392405063, "grad_norm": 329341.72262227273, "learning_rate": 1.0278909432779692e-07, "logits/chosen": -0.5238679647445679, "logits/rejected": 0.5422592163085938, "logps/chosen": -45.037288665771484, "logps/rejected": -568.7276000976562, "loss": 15557.125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18480226397514343, "rewards/margins": 0.5315712094306946, "rewards/rejected": -0.34676894545555115, "step": 2890 }, { "epoch": 14.683544303797468, "grad_norm": 543441.6169659087, "learning_rate": 1.0122218740206831e-07, "logits/chosen": -1.954636812210083, "logits/rejected": -1.2880172729492188, "logps/chosen": -42.44208908081055, "logps/rejected": -553.3023681640625, "loss": 15342.95, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17853178083896637, "rewards/margins": 0.5080317258834839, "rewards/rejected": -0.32949990034103394, "step": 2900 }, { "epoch": 14.734177215189874, "grad_norm": 485286.8133606422, "learning_rate": 9.96552804763397e-08, "logits/chosen": -0.10534539073705673, "logits/rejected": -0.22817449271678925, "logps/chosen": -58.41508102416992, "logps/rejected": -589.82861328125, "loss": 14829.6719, "rewards/accuracies": 1.0, "rewards/chosen": 0.19155274331569672, "rewards/margins": 0.5371214747428894, "rewards/rejected": -0.3455687165260315, "step": 2910 }, { "epoch": 14.784810126582279, "grad_norm": 443260.47292018944, "learning_rate": 9.808837355061108e-08, "logits/chosen": 0.06932596862316132, "logits/rejected": -0.2167021781206131, "logps/chosen": -47.265785217285156, "logps/rejected": -564.3973388671875, "loss": 15330.0641, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1784828007221222, "rewards/margins": 0.5103118419647217, "rewards/rejected": -0.3318290710449219, "step": 2920 }, { "epoch": 14.835443037974684, "grad_norm": 483368.1079372665, "learning_rate": 9.652146662488248e-08, "logits/chosen": -0.06790392100811005, "logits/rejected": 0.29011401534080505, "logps/chosen": -54.78580856323242, "logps/rejected": -574.620361328125, "loss": 15093.3531, "rewards/accuracies": 1.0, "rewards/chosen": 0.18937523663043976, "rewards/margins": 0.5257763862609863, "rewards/rejected": -0.33640116453170776, "step": 2930 }, { "epoch": 14.886075949367088, "grad_norm": 955906.0887958824, "learning_rate": 9.495455969915387e-08, "logits/chosen": 1.4835760593414307, "logits/rejected": 1.6735947132110596, "logps/chosen": -46.26830291748047, "logps/rejected": -551.6137084960938, "loss": 15061.7437, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1782112419605255, "rewards/margins": 0.5047247409820557, "rewards/rejected": -0.32651349902153015, "step": 2940 }, { "epoch": 14.936708860759493, "grad_norm": 389874.4777367002, "learning_rate": 9.338765277342525e-08, "logits/chosen": -0.45253458619117737, "logits/rejected": 0.04955162853002548, "logps/chosen": -44.50522994995117, "logps/rejected": -556.4650268554688, "loss": 15850.6094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1812363862991333, "rewards/margins": 0.5129731893539429, "rewards/rejected": -0.33173683285713196, "step": 2950 }, { "epoch": 14.987341772151899, "grad_norm": 880494.2982969056, "learning_rate": 9.182074584769664e-08, "logits/chosen": -1.3114904165267944, "logits/rejected": -0.3469497859477997, "logps/chosen": -48.75851821899414, "logps/rejected": -542.8458862304688, "loss": 14465.8125, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.1728508621454239, "rewards/margins": 0.49641647934913635, "rewards/rejected": -0.32356563210487366, "step": 2960 }, { "epoch": 15.037974683544304, "grad_norm": 518700.7292410764, "learning_rate": 9.025383892196802e-08, "logits/chosen": 1.0516235828399658, "logits/rejected": 1.4486608505249023, "logps/chosen": -50.19924545288086, "logps/rejected": -568.3731689453125, "loss": 15371.2547, "rewards/accuracies": 1.0, "rewards/chosen": 0.1881760060787201, "rewards/margins": 0.5164635181427002, "rewards/rejected": -0.3282875716686249, "step": 2970 }, { "epoch": 15.08860759493671, "grad_norm": 331391.7564792058, "learning_rate": 8.868693199623942e-08, "logits/chosen": 2.2234063148498535, "logits/rejected": 2.0345654487609863, "logps/chosen": -52.14508819580078, "logps/rejected": -595.8091430664062, "loss": 14717.8656, "rewards/accuracies": 1.0, "rewards/chosen": 0.1904282122850418, "rewards/margins": 0.5425348877906799, "rewards/rejected": -0.3521067202091217, "step": 2980 }, { "epoch": 15.139240506329115, "grad_norm": 245591.75428222032, "learning_rate": 8.712002507051081e-08, "logits/chosen": -0.6622523069381714, "logits/rejected": -0.06956877559423447, "logps/chosen": -52.00910186767578, "logps/rejected": -563.0474853515625, "loss": 15161.7313, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1884680539369583, "rewards/margins": 0.5108307003974915, "rewards/rejected": -0.32236260175704956, "step": 2990 }, { "epoch": 15.189873417721518, "grad_norm": 310549.6440256543, "learning_rate": 8.555311814478219e-08, "logits/chosen": 0.2366395890712738, "logits/rejected": 0.44344860315322876, "logps/chosen": -41.386192321777344, "logps/rejected": -572.7687377929688, "loss": 14740.5063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1788499653339386, "rewards/margins": 0.5284099578857422, "rewards/rejected": -0.3495599925518036, "step": 3000 }, { "epoch": 15.240506329113924, "grad_norm": 306008.0109626414, "learning_rate": 8.398621121905358e-08, "logits/chosen": 0.007425785064697266, "logits/rejected": 0.6882709264755249, "logps/chosen": -61.54619598388672, "logps/rejected": -565.9954833984375, "loss": 14890.1531, "rewards/accuracies": 0.9375, "rewards/chosen": 0.18634898960590363, "rewards/margins": 0.5029118061065674, "rewards/rejected": -0.31656283140182495, "step": 3010 }, { "epoch": 15.291139240506329, "grad_norm": 542292.2583731171, "learning_rate": 8.241930429332496e-08, "logits/chosen": -1.8317344188690186, "logits/rejected": -1.2810354232788086, "logps/chosen": -55.94157791137695, "logps/rejected": -610.6949462890625, "loss": 14922.1328, "rewards/accuracies": 1.0, "rewards/chosen": 0.20198726654052734, "rewards/margins": 0.5547267198562622, "rewards/rejected": -0.3527393639087677, "step": 3020 }, { "epoch": 15.341772151898734, "grad_norm": 246111.44147055785, "learning_rate": 8.085239736759636e-08, "logits/chosen": 0.38201937079429626, "logits/rejected": 0.48218441009521484, "logps/chosen": -49.771148681640625, "logps/rejected": -579.5675048828125, "loss": 14315.8422, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18888349831104279, "rewards/margins": 0.52850741147995, "rewards/rejected": -0.33962392807006836, "step": 3030 }, { "epoch": 15.39240506329114, "grad_norm": 365392.8501035466, "learning_rate": 7.928549044186775e-08, "logits/chosen": 0.2196371853351593, "logits/rejected": 0.5740281939506531, "logps/chosen": -37.870933532714844, "logps/rejected": -532.795166015625, "loss": 14228.8297, "rewards/accuracies": 1.0, "rewards/chosen": 0.17593248188495636, "rewards/margins": 0.4975932538509369, "rewards/rejected": -0.3216607868671417, "step": 3040 }, { "epoch": 15.443037974683545, "grad_norm": 601622.5104727764, "learning_rate": 7.771858351613913e-08, "logits/chosen": -0.6718970537185669, "logits/rejected": -0.666345477104187, "logps/chosen": -44.54059600830078, "logps/rejected": -578.719482421875, "loss": 15052.1406, "rewards/accuracies": 1.0, "rewards/chosen": 0.19072814285755157, "rewards/margins": 0.5325638055801392, "rewards/rejected": -0.3418356776237488, "step": 3050 }, { "epoch": 15.49367088607595, "grad_norm": 343253.0399713909, "learning_rate": 7.615167659041052e-08, "logits/chosen": -1.7298717498779297, "logits/rejected": -1.107236385345459, "logps/chosen": -48.916072845458984, "logps/rejected": -581.4259643554688, "loss": 15088.4312, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18928301334381104, "rewards/margins": 0.5350446701049805, "rewards/rejected": -0.34576165676116943, "step": 3060 }, { "epoch": 15.544303797468354, "grad_norm": 228770.67672990158, "learning_rate": 7.45847696646819e-08, "logits/chosen": 1.368043303489685, "logits/rejected": 2.1229677200317383, "logps/chosen": -49.823055267333984, "logps/rejected": -576.06103515625, "loss": 13555.7672, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18895366787910461, "rewards/margins": 0.5293976664543152, "rewards/rejected": -0.34044402837753296, "step": 3070 }, { "epoch": 15.594936708860759, "grad_norm": 292818.2312129945, "learning_rate": 7.30178627389533e-08, "logits/chosen": -0.7066992521286011, "logits/rejected": 0.058099888265132904, "logps/chosen": -52.58687210083008, "logps/rejected": -577.005859375, "loss": 14893.6594, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19148708879947662, "rewards/margins": 0.5295326113700867, "rewards/rejected": -0.33804553747177124, "step": 3080 }, { "epoch": 15.645569620253164, "grad_norm": 275063.1192623706, "learning_rate": 7.145095581322469e-08, "logits/chosen": 0.057862140238285065, "logits/rejected": -0.10827471315860748, "logps/chosen": -51.52691650390625, "logps/rejected": -598.4918212890625, "loss": 14740.6531, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1917671114206314, "rewards/margins": 0.5416404008865356, "rewards/rejected": -0.34987324476242065, "step": 3090 }, { "epoch": 15.69620253164557, "grad_norm": 270643.231235499, "learning_rate": 6.988404888749608e-08, "logits/chosen": 0.49672946333885193, "logits/rejected": 0.9934390187263489, "logps/chosen": -53.964393615722656, "logps/rejected": -592.7462158203125, "loss": 14747.2812, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19860555231571198, "rewards/margins": 0.5442546010017395, "rewards/rejected": -0.3456490635871887, "step": 3100 }, { "epoch": 15.746835443037975, "grad_norm": 366703.97931916115, "learning_rate": 6.831714196176746e-08, "logits/chosen": -1.272958517074585, "logits/rejected": -1.2677191495895386, "logps/chosen": -46.67731475830078, "logps/rejected": -578.444091796875, "loss": 14561.6719, "rewards/accuracies": 1.0, "rewards/chosen": 0.19132201373577118, "rewards/margins": 0.5392004251480103, "rewards/rejected": -0.3478783965110779, "step": 3110 }, { "epoch": 15.79746835443038, "grad_norm": 363431.4061189904, "learning_rate": 6.675023503603886e-08, "logits/chosen": -0.16689462959766388, "logits/rejected": 0.6665533781051636, "logps/chosen": -49.408546447753906, "logps/rejected": -587.0728759765625, "loss": 14602.2328, "rewards/accuracies": 1.0, "rewards/chosen": 0.1951448619365692, "rewards/margins": 0.538873553276062, "rewards/rejected": -0.3437287211418152, "step": 3120 }, { "epoch": 15.848101265822784, "grad_norm": 1925815.481070705, "learning_rate": 6.518332811031025e-08, "logits/chosen": -0.1888163536787033, "logits/rejected": -0.3901883661746979, "logps/chosen": -37.012611389160156, "logps/rejected": -553.5242919921875, "loss": 15093.5328, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18000957369804382, "rewards/margins": 0.5157765746116638, "rewards/rejected": -0.3357670307159424, "step": 3130 }, { "epoch": 15.89873417721519, "grad_norm": 406865.81368112064, "learning_rate": 6.361642118458163e-08, "logits/chosen": -1.0143232345581055, "logits/rejected": -1.1421440839767456, "logps/chosen": -39.294063568115234, "logps/rejected": -572.8070068359375, "loss": 15857.7219, "rewards/accuracies": 1.0, "rewards/chosen": 0.18329963088035583, "rewards/margins": 0.5344266891479492, "rewards/rejected": -0.351127028465271, "step": 3140 }, { "epoch": 15.949367088607595, "grad_norm": 283773.4922827141, "learning_rate": 6.204951425885302e-08, "logits/chosen": 0.45898357033729553, "logits/rejected": 1.1897245645523071, "logps/chosen": -47.45745086669922, "logps/rejected": -564.1045532226562, "loss": 15274.2656, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17995783686637878, "rewards/margins": 0.516915500164032, "rewards/rejected": -0.3369576930999756, "step": 3150 }, { "epoch": 16.0, "grad_norm": 338639.8303682123, "learning_rate": 6.04826073331244e-08, "logits/chosen": -1.1235512495040894, "logits/rejected": 0.0012889147037640214, "logps/chosen": -41.902889251708984, "logps/rejected": -569.4451293945312, "loss": 15055.2062, "rewards/accuracies": 1.0, "rewards/chosen": 0.18416796624660492, "rewards/margins": 0.5268322825431824, "rewards/rejected": -0.34266436100006104, "step": 3160 }, { "epoch": 16.050632911392405, "grad_norm": 256869.56003810524, "learning_rate": 5.8915700407395795e-08, "logits/chosen": -1.1983295679092407, "logits/rejected": -0.22695603966712952, "logps/chosen": -41.12403106689453, "logps/rejected": -573.8383178710938, "loss": 14636.0719, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1912733018398285, "rewards/margins": 0.5368129014968872, "rewards/rejected": -0.3455396294593811, "step": 3170 }, { "epoch": 16.10126582278481, "grad_norm": 251620.82775792846, "learning_rate": 5.734879348166719e-08, "logits/chosen": -0.662868082523346, "logits/rejected": 0.3795197606086731, "logps/chosen": -38.75691604614258, "logps/rejected": -555.0902709960938, "loss": 14758.6562, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.1840089112520218, "rewards/margins": 0.5191300511360168, "rewards/rejected": -0.335121214389801, "step": 3180 }, { "epoch": 16.151898734177216, "grad_norm": 386320.34193101624, "learning_rate": 5.5781886555938573e-08, "logits/chosen": 0.9088973999023438, "logits/rejected": 1.0200951099395752, "logps/chosen": -37.841434478759766, "logps/rejected": -549.9398193359375, "loss": 14645.3125, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18419453501701355, "rewards/margins": 0.5178717374801636, "rewards/rejected": -0.3336772620677948, "step": 3190 }, { "epoch": 16.20253164556962, "grad_norm": 323738.56127307797, "learning_rate": 5.421497963020996e-08, "logits/chosen": 1.6748106479644775, "logits/rejected": 1.7903064489364624, "logps/chosen": -43.683780670166016, "logps/rejected": -559.7962036132812, "loss": 14378.5187, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1856391578912735, "rewards/margins": 0.5182951092720032, "rewards/rejected": -0.3326559364795685, "step": 3200 }, { "epoch": 16.253164556962027, "grad_norm": 254204.27494940045, "learning_rate": 5.264807270448135e-08, "logits/chosen": -0.028285836800932884, "logits/rejected": 0.47511911392211914, "logps/chosen": -46.74934005737305, "logps/rejected": -582.1607666015625, "loss": 14203.1469, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19257526099681854, "rewards/margins": 0.5342021584510803, "rewards/rejected": -0.3416268825531006, "step": 3210 }, { "epoch": 16.303797468354432, "grad_norm": 295536.9430947363, "learning_rate": 5.108116577875274e-08, "logits/chosen": 0.9740939140319824, "logits/rejected": 0.8530548810958862, "logps/chosen": -43.95893478393555, "logps/rejected": -566.3425903320312, "loss": 14617.1531, "rewards/accuracies": 1.0, "rewards/chosen": 0.18452490866184235, "rewards/margins": 0.5231844782829285, "rewards/rejected": -0.3386596143245697, "step": 3220 }, { "epoch": 16.354430379746834, "grad_norm": 228442.89270088554, "learning_rate": 4.951425885302413e-08, "logits/chosen": -0.6641544699668884, "logits/rejected": -0.42437514662742615, "logps/chosen": -42.97655487060547, "logps/rejected": -572.6472778320312, "loss": 14575.375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19143202900886536, "rewards/margins": 0.5323026776313782, "rewards/rejected": -0.34087061882019043, "step": 3230 }, { "epoch": 16.40506329113924, "grad_norm": 280822.1227003712, "learning_rate": 4.7947351927295515e-08, "logits/chosen": 1.1500619649887085, "logits/rejected": 1.5377223491668701, "logps/chosen": -40.756866455078125, "logps/rejected": -555.7669067382812, "loss": 14355.8438, "rewards/accuracies": 1.0, "rewards/chosen": 0.18818344175815582, "rewards/margins": 0.5185222029685974, "rewards/rejected": -0.3303387761116028, "step": 3240 }, { "epoch": 16.455696202531644, "grad_norm": 211726.7404787661, "learning_rate": 4.63804450015669e-08, "logits/chosen": -0.1092449203133583, "logits/rejected": 0.2951999306678772, "logps/chosen": -42.441200256347656, "logps/rejected": -545.1079711914062, "loss": 14375.5266, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18679597973823547, "rewards/margins": 0.5060458779335022, "rewards/rejected": -0.31924980878829956, "step": 3250 }, { "epoch": 16.50632911392405, "grad_norm": 356888.551437776, "learning_rate": 4.481353807583829e-08, "logits/chosen": -1.3785438537597656, "logits/rejected": -1.0880242586135864, "logps/chosen": -54.5753288269043, "logps/rejected": -585.0982666015625, "loss": 13676.1484, "rewards/accuracies": 1.0, "rewards/chosen": 0.19741004705429077, "rewards/margins": 0.5335227251052856, "rewards/rejected": -0.3361126780509949, "step": 3260 }, { "epoch": 16.556962025316455, "grad_norm": 364581.3025715214, "learning_rate": 4.324663115010968e-08, "logits/chosen": -0.7049742341041565, "logits/rejected": -0.23324167728424072, "logps/chosen": -51.56848907470703, "logps/rejected": -578.4015502929688, "loss": 14484.6266, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18512576818466187, "rewards/margins": 0.5236076712608337, "rewards/rejected": -0.3384818732738495, "step": 3270 }, { "epoch": 16.60759493670886, "grad_norm": 336864.8330615521, "learning_rate": 4.167972422438107e-08, "logits/chosen": -0.9721381068229675, "logits/rejected": -1.1028145551681519, "logps/chosen": -55.94579315185547, "logps/rejected": -583.2372436523438, "loss": 14945.2641, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19380484521389008, "rewards/margins": 0.5321142673492432, "rewards/rejected": -0.3383094370365143, "step": 3280 }, { "epoch": 16.658227848101266, "grad_norm": 310564.956837095, "learning_rate": 4.0112817298652456e-08, "logits/chosen": -0.6065518260002136, "logits/rejected": -0.21473164856433868, "logps/chosen": -46.307228088378906, "logps/rejected": -586.7664184570312, "loss": 14667.4531, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19551894068717957, "rewards/margins": 0.5414855480194092, "rewards/rejected": -0.345966637134552, "step": 3290 }, { "epoch": 16.70886075949367, "grad_norm": 329301.5108160766, "learning_rate": 3.854591037292385e-08, "logits/chosen": 0.40292587876319885, "logits/rejected": 1.5396214723587036, "logps/chosen": -40.793739318847656, "logps/rejected": -570.8857421875, "loss": 14524.3094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18828611075878143, "rewards/margins": 0.5341116189956665, "rewards/rejected": -0.3458254337310791, "step": 3300 }, { "epoch": 16.759493670886076, "grad_norm": 389871.220870713, "learning_rate": 3.6979003447195234e-08, "logits/chosen": -0.2180454283952713, "logits/rejected": 0.63756263256073, "logps/chosen": -48.842628479003906, "logps/rejected": -596.3530883789062, "loss": 15026.0328, "rewards/accuracies": 1.0, "rewards/chosen": 0.19478780031204224, "rewards/margins": 0.5423206090927124, "rewards/rejected": -0.34753280878067017, "step": 3310 }, { "epoch": 16.810126582278482, "grad_norm": 297091.2945334893, "learning_rate": 3.541209652146662e-08, "logits/chosen": -0.4556306302547455, "logits/rejected": 0.1757240742444992, "logps/chosen": -52.64439010620117, "logps/rejected": -598.89990234375, "loss": 14405.2531, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19450917840003967, "rewards/margins": 0.5458864569664001, "rewards/rejected": -0.3513772487640381, "step": 3320 }, { "epoch": 16.860759493670887, "grad_norm": 1094427.122685082, "learning_rate": 3.384518959573801e-08, "logits/chosen": -0.09430136531591415, "logits/rejected": 0.669711709022522, "logps/chosen": -48.170013427734375, "logps/rejected": -584.9744873046875, "loss": 15005.1063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1912693828344345, "rewards/margins": 0.5353102087974548, "rewards/rejected": -0.34404081106185913, "step": 3330 }, { "epoch": 16.911392405063292, "grad_norm": 266675.6307359935, "learning_rate": 3.22782826700094e-08, "logits/chosen": -0.09551366418600082, "logits/rejected": -0.07008041441440582, "logps/chosen": -36.88441848754883, "logps/rejected": -568.5509033203125, "loss": 13823.6516, "rewards/accuracies": 1.0, "rewards/chosen": 0.18999743461608887, "rewards/margins": 0.5339778661727905, "rewards/rejected": -0.34398046135902405, "step": 3340 }, { "epoch": 16.962025316455698, "grad_norm": 562034.347414135, "learning_rate": 3.071137574428079e-08, "logits/chosen": 0.6763383746147156, "logits/rejected": 0.4948856830596924, "logps/chosen": -46.25956726074219, "logps/rejected": -565.7184448242188, "loss": 14414.3859, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18556642532348633, "rewards/margins": 0.5159622430801392, "rewards/rejected": -0.33039581775665283, "step": 3350 }, { "epoch": 17.0126582278481, "grad_norm": 218651.56901322177, "learning_rate": 2.9144468818552176e-08, "logits/chosen": 0.41573429107666016, "logits/rejected": 1.103547215461731, "logps/chosen": -37.6799201965332, "logps/rejected": -569.5391235351562, "loss": 14029.3563, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18778078258037567, "rewards/margins": 0.5316546559333801, "rewards/rejected": -0.34387388825416565, "step": 3360 }, { "epoch": 17.063291139240505, "grad_norm": 236719.0916690887, "learning_rate": 2.7577561892823564e-08, "logits/chosen": -0.09267449378967285, "logits/rejected": 0.3535307049751282, "logps/chosen": -43.02147674560547, "logps/rejected": -571.3306884765625, "loss": 14216.225, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18815621733665466, "rewards/margins": 0.52230304479599, "rewards/rejected": -0.33414679765701294, "step": 3370 }, { "epoch": 17.11392405063291, "grad_norm": 151995.97062770248, "learning_rate": 2.6010654967094953e-08, "logits/chosen": 1.3600900173187256, "logits/rejected": 0.45606088638305664, "logps/chosen": -33.07421112060547, "logps/rejected": -574.2869262695312, "loss": 14569.3875, "rewards/accuracies": 1.0, "rewards/chosen": 0.1843741536140442, "rewards/margins": 0.533474862575531, "rewards/rejected": -0.3491007089614868, "step": 3380 }, { "epoch": 17.164556962025316, "grad_norm": 229039.39535517112, "learning_rate": 2.4443748041366342e-08, "logits/chosen": 0.012326288037002087, "logits/rejected": -0.24337856471538544, "logps/chosen": -48.85834503173828, "logps/rejected": -591.9976806640625, "loss": 15141.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.19536466896533966, "rewards/margins": 0.5443064570426941, "rewards/rejected": -0.34894177317619324, "step": 3390 }, { "epoch": 17.21518987341772, "grad_norm": 224579.22425486994, "learning_rate": 2.2876841115637728e-08, "logits/chosen": -0.07731113582849503, "logits/rejected": 0.8038260340690613, "logps/chosen": -42.96089172363281, "logps/rejected": -587.9930419921875, "loss": 13962.9406, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18969421088695526, "rewards/margins": 0.5471119284629822, "rewards/rejected": -0.3574177622795105, "step": 3400 }, { "epoch": 17.265822784810126, "grad_norm": 194108.45632178357, "learning_rate": 2.1309934189909117e-08, "logits/chosen": -1.735790491104126, "logits/rejected": -0.8417277336120605, "logps/chosen": -40.28795623779297, "logps/rejected": -577.9163208007812, "loss": 14457.5328, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19212636351585388, "rewards/margins": 0.54021155834198, "rewards/rejected": -0.3480851650238037, "step": 3410 }, { "epoch": 17.31645569620253, "grad_norm": 323871.0912725565, "learning_rate": 1.9743027264180506e-08, "logits/chosen": 1.0423898696899414, "logits/rejected": 1.1823880672454834, "logps/chosen": -50.077327728271484, "logps/rejected": -565.8704223632812, "loss": 14191.8531, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18371161818504333, "rewards/margins": 0.5181502103805542, "rewards/rejected": -0.3344385623931885, "step": 3420 }, { "epoch": 17.367088607594937, "grad_norm": 207973.13380554292, "learning_rate": 1.8176120338451895e-08, "logits/chosen": -0.8037737011909485, "logits/rejected": -0.8005819320678711, "logps/chosen": -45.626670837402344, "logps/rejected": -544.7116088867188, "loss": 14114.0906, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1760983169078827, "rewards/margins": 0.5017568469047546, "rewards/rejected": -0.32565850019454956, "step": 3430 }, { "epoch": 17.417721518987342, "grad_norm": 191156.31750064602, "learning_rate": 1.6609213412723284e-08, "logits/chosen": 1.2277637720108032, "logits/rejected": 0.573845386505127, "logps/chosen": -50.492279052734375, "logps/rejected": -586.3282470703125, "loss": 13957.7594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19206462800502777, "rewards/margins": 0.5328875184059143, "rewards/rejected": -0.34082287549972534, "step": 3440 }, { "epoch": 17.468354430379748, "grad_norm": 262791.84599779843, "learning_rate": 1.5042306486994673e-08, "logits/chosen": 0.29228338599205017, "logits/rejected": 0.9747223854064941, "logps/chosen": -37.640201568603516, "logps/rejected": -557.47119140625, "loss": 14478.4906, "rewards/accuracies": 1.0, "rewards/chosen": 0.18818514049053192, "rewards/margins": 0.5214470624923706, "rewards/rejected": -0.3332619369029999, "step": 3450 }, { "epoch": 17.518987341772153, "grad_norm": 227441.5548714142, "learning_rate": 1.347539956126606e-08, "logits/chosen": -0.060483645647764206, "logits/rejected": 0.41309136152267456, "logps/chosen": -46.32054138183594, "logps/rejected": -588.6563720703125, "loss": 14804.9047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19712397456169128, "rewards/margins": 0.5437620878219604, "rewards/rejected": -0.34663814306259155, "step": 3460 }, { "epoch": 17.569620253164558, "grad_norm": 378558.8588589865, "learning_rate": 1.1908492635537449e-08, "logits/chosen": 2.0075535774230957, "logits/rejected": 2.772726058959961, "logps/chosen": -46.09113693237305, "logps/rejected": -582.0597534179688, "loss": 14645.9562, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18593838810920715, "rewards/margins": 0.5331605076789856, "rewards/rejected": -0.34722214937210083, "step": 3470 }, { "epoch": 17.620253164556964, "grad_norm": 263891.6462573049, "learning_rate": 1.0341585709808836e-08, "logits/chosen": 0.273967444896698, "logits/rejected": 1.9021276235580444, "logps/chosen": -34.29851531982422, "logps/rejected": -567.2389526367188, "loss": 15085.2313, "rewards/accuracies": 1.0, "rewards/chosen": 0.18956169486045837, "rewards/margins": 0.5355597734451294, "rewards/rejected": -0.3459981083869934, "step": 3480 }, { "epoch": 17.67088607594937, "grad_norm": 276267.9285814808, "learning_rate": 8.774678784080225e-09, "logits/chosen": -0.02632077969610691, "logits/rejected": 0.4594387114048004, "logps/chosen": -45.098960876464844, "logps/rejected": -568.720947265625, "loss": 13750.4469, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19129987061023712, "rewards/margins": 0.5272942781448364, "rewards/rejected": -0.33599433302879333, "step": 3490 }, { "epoch": 17.72151898734177, "grad_norm": 156087.69298121333, "learning_rate": 7.207771858351613e-09, "logits/chosen": 0.04748225212097168, "logits/rejected": 0.4610685408115387, "logps/chosen": -49.872169494628906, "logps/rejected": -603.3367919921875, "loss": 13778.1469, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19287212193012238, "rewards/margins": 0.5504390001296997, "rewards/rejected": -0.3575669229030609, "step": 3500 }, { "epoch": 17.772151898734176, "grad_norm": 209667.9634643516, "learning_rate": 5.6408649326230014e-09, "logits/chosen": 1.4883615970611572, "logits/rejected": 2.2038960456848145, "logps/chosen": -46.18961715698242, "logps/rejected": -575.4703369140625, "loss": 13653.9672, "rewards/accuracies": 1.0, "rewards/chosen": 0.18806029856204987, "rewards/margins": 0.5310976505279541, "rewards/rejected": -0.34303733706474304, "step": 3510 }, { "epoch": 17.82278481012658, "grad_norm": 222056.5820151951, "learning_rate": 4.07395800689439e-09, "logits/chosen": -0.582931637763977, "logits/rejected": -0.23906604945659637, "logps/chosen": -60.795921325683594, "logps/rejected": -590.2525024414062, "loss": 14149.5938, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.19504059851169586, "rewards/margins": 0.5360020399093628, "rewards/rejected": -0.34096142649650574, "step": 3520 }, { "epoch": 17.873417721518987, "grad_norm": 213324.38139465638, "learning_rate": 2.5070510811657785e-09, "logits/chosen": -0.3791787028312683, "logits/rejected": 0.26259681582450867, "logps/chosen": -48.315147399902344, "logps/rejected": -579.376220703125, "loss": 14028.4, "rewards/accuracies": 1.0, "rewards/chosen": 0.19789119064807892, "rewards/margins": 0.5311988592147827, "rewards/rejected": -0.333307683467865, "step": 3530 }, { "epoch": 17.924050632911392, "grad_norm": 207695.40556695752, "learning_rate": 9.40144155437167e-10, "logits/chosen": 2.0871522426605225, "logits/rejected": 2.378633975982666, "logps/chosen": -36.07915115356445, "logps/rejected": -560.524169921875, "loss": 13942.7234, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18208447098731995, "rewards/margins": 0.5249064564704895, "rewards/rejected": -0.34282201528549194, "step": 3540 }, { "epoch": 18.020253164556962, "grad_norm": 633377.3531549113, "learning_rate": 2.774992165465371e-07, "logits/chosen": 0.778042197227478, "logits/rejected": 0.4570779800415039, "logps/chosen": -30.87795639038086, "logps/rejected": -562.3123168945312, "loss": 14823.5117, "rewards/accuracies": 1.0, "rewards/chosen": 0.19000156223773956, "rewards/margins": 0.5307614803314209, "rewards/rejected": -0.34075987339019775, "step": 3550 }, { "epoch": 18.070886075949367, "grad_norm": 536501.4949459385, "learning_rate": 2.767157630836728e-07, "logits/chosen": -1.453107476234436, "logits/rejected": -1.1603299379348755, "logps/chosen": -48.59767532348633, "logps/rejected": -607.5595703125, "loss": 14268.2156, "rewards/accuracies": 1.0, "rewards/chosen": 0.19779345393180847, "rewards/margins": 0.5535213351249695, "rewards/rejected": -0.3557279109954834, "step": 3560 }, { "epoch": 18.121518987341773, "grad_norm": 613929.4964505757, "learning_rate": 2.7593230962080847e-07, "logits/chosen": -1.0204923152923584, "logits/rejected": -1.006306529045105, "logps/chosen": -40.379703521728516, "logps/rejected": -586.8853759765625, "loss": 14124.1406, "rewards/accuracies": 1.0, "rewards/chosen": 0.18828515708446503, "rewards/margins": 0.5416163206100464, "rewards/rejected": -0.35333114862442017, "step": 3570 }, { "epoch": 18.172151898734178, "grad_norm": 453188.0208924516, "learning_rate": 2.751488561579442e-07, "logits/chosen": 0.978573203086853, "logits/rejected": 1.6422239542007446, "logps/chosen": -40.75902557373047, "logps/rejected": -571.6940307617188, "loss": 14028.8266, "rewards/accuracies": 1.0, "rewards/chosen": 0.19008655846118927, "rewards/margins": 0.5349593758583069, "rewards/rejected": -0.34487277269363403, "step": 3580 }, { "epoch": 18.222784810126583, "grad_norm": 470617.1864493106, "learning_rate": 2.743654026950799e-07, "logits/chosen": 0.612755298614502, "logits/rejected": 1.586531639099121, "logps/chosen": -47.43413162231445, "logps/rejected": -567.2514038085938, "loss": 14305.0953, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18671520054340363, "rewards/margins": 0.5188931226730347, "rewards/rejected": -0.33217787742614746, "step": 3590 }, { "epoch": 18.27341772151899, "grad_norm": 568328.2123455897, "learning_rate": 2.7358194923221564e-07, "logits/chosen": 2.5831315517425537, "logits/rejected": 2.3743977546691895, "logps/chosen": -36.72047805786133, "logps/rejected": -561.4580688476562, "loss": 14931.7812, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18578791618347168, "rewards/margins": 0.5219975113868713, "rewards/rejected": -0.33620959520339966, "step": 3600 }, { "epoch": 18.324050632911394, "grad_norm": 258649.85824251673, "learning_rate": 2.727984957693513e-07, "logits/chosen": -0.6456964612007141, "logits/rejected": 0.10119187831878662, "logps/chosen": -45.66813659667969, "logps/rejected": -584.33984375, "loss": 13962.2891, "rewards/accuracies": 1.0, "rewards/chosen": 0.19075247645378113, "rewards/margins": 0.5430020093917847, "rewards/rejected": -0.35224950313568115, "step": 3610 }, { "epoch": 18.374683544303796, "grad_norm": 523823.39531677734, "learning_rate": 2.72015042306487e-07, "logits/chosen": -0.1337634027004242, "logits/rejected": 0.3194190561771393, "logps/chosen": -43.2452278137207, "logps/rejected": -576.6324462890625, "loss": 14478.6656, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19283099472522736, "rewards/margins": 0.5422399640083313, "rewards/rejected": -0.34940892457962036, "step": 3620 }, { "epoch": 18.4253164556962, "grad_norm": 369527.7483340646, "learning_rate": 2.712315888436227e-07, "logits/chosen": -0.5704905390739441, "logits/rejected": -0.24132680892944336, "logps/chosen": -39.81604766845703, "logps/rejected": -579.3060302734375, "loss": 14853.9188, "rewards/accuracies": 1.0, "rewards/chosen": 0.1893097311258316, "rewards/margins": 0.5385677218437195, "rewards/rejected": -0.3492580056190491, "step": 3630 }, { "epoch": 18.475949367088607, "grad_norm": 487722.91173438437, "learning_rate": 2.704481353807584e-07, "logits/chosen": 0.30203062295913696, "logits/rejected": 1.367623209953308, "logps/chosen": -43.79780578613281, "logps/rejected": -575.3096313476562, "loss": 14337.7125, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18831488490104675, "rewards/margins": 0.5326144099235535, "rewards/rejected": -0.3442995548248291, "step": 3640 }, { "epoch": 18.526582278481012, "grad_norm": 769147.1132735502, "learning_rate": 2.6966468191789406e-07, "logits/chosen": 0.5818338990211487, "logits/rejected": 0.8189504742622375, "logps/chosen": -40.80295944213867, "logps/rejected": -569.6201171875, "loss": 14414.5, "rewards/accuracies": 1.0, "rewards/chosen": 0.19092252850532532, "rewards/margins": 0.5284001231193542, "rewards/rejected": -0.3374776244163513, "step": 3650 }, { "epoch": 18.577215189873417, "grad_norm": 423741.6615039136, "learning_rate": 2.6888122845502977e-07, "logits/chosen": -2.1757419109344482, "logits/rejected": -1.7465986013412476, "logps/chosen": -33.543739318847656, "logps/rejected": -566.9044189453125, "loss": 13990.4406, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18767623603343964, "rewards/margins": 0.5356841683387756, "rewards/rejected": -0.3480078876018524, "step": 3660 }, { "epoch": 18.627848101265823, "grad_norm": 405282.2937016151, "learning_rate": 2.680977749921655e-07, "logits/chosen": -0.054244786500930786, "logits/rejected": 0.9029023051261902, "logps/chosen": -49.31962966918945, "logps/rejected": -585.48779296875, "loss": 14779.0125, "rewards/accuracies": 1.0, "rewards/chosen": 0.19979842007160187, "rewards/margins": 0.5441454648971558, "rewards/rejected": -0.3443470597267151, "step": 3670 }, { "epoch": 18.678481012658228, "grad_norm": 468937.7683958159, "learning_rate": 2.673143215293012e-07, "logits/chosen": -0.046643782407045364, "logits/rejected": -0.1421128809452057, "logps/chosen": -40.85643768310547, "logps/rejected": -577.6583862304688, "loss": 14531.35, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.195206418633461, "rewards/margins": 0.5361508131027222, "rewards/rejected": -0.3409443199634552, "step": 3680 }, { "epoch": 18.729113924050633, "grad_norm": 627917.5959141933, "learning_rate": 2.6653086806643683e-07, "logits/chosen": 1.5284700393676758, "logits/rejected": 1.2886362075805664, "logps/chosen": -48.694664001464844, "logps/rejected": -579.7990112304688, "loss": 15195.4844, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1851346641778946, "rewards/margins": 0.531388521194458, "rewards/rejected": -0.346253901720047, "step": 3690 }, { "epoch": 18.77974683544304, "grad_norm": 511207.857422736, "learning_rate": 2.6574741460357254e-07, "logits/chosen": 0.03928997367620468, "logits/rejected": 0.5418666005134583, "logps/chosen": -50.31745529174805, "logps/rejected": -593.0169677734375, "loss": 14929.1469, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19712677597999573, "rewards/margins": 0.5447811484336853, "rewards/rejected": -0.3476543724536896, "step": 3700 }, { "epoch": 18.830379746835444, "grad_norm": 568133.4282182837, "learning_rate": 2.6496396114070825e-07, "logits/chosen": -0.7848063707351685, "logits/rejected": -0.8312255144119263, "logps/chosen": -39.726234436035156, "logps/rejected": -566.0286254882812, "loss": 14112.2844, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18878208100795746, "rewards/margins": 0.5250921249389648, "rewards/rejected": -0.3363099992275238, "step": 3710 }, { "epoch": 18.88101265822785, "grad_norm": 293062.3175283677, "learning_rate": 2.6418050767784395e-07, "logits/chosen": -0.22776488959789276, "logits/rejected": -0.043119143694639206, "logps/chosen": -47.83971405029297, "logps/rejected": -575.6166381835938, "loss": 14345.3813, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19344884157180786, "rewards/margins": 0.5261351466178894, "rewards/rejected": -0.33268633484840393, "step": 3720 }, { "epoch": 18.931645569620255, "grad_norm": 369584.46121245134, "learning_rate": 2.633970542149796e-07, "logits/chosen": 0.6460098028182983, "logits/rejected": 0.6165057420730591, "logps/chosen": -53.0880126953125, "logps/rejected": -602.0147705078125, "loss": 14143.9609, "rewards/accuracies": 1.0, "rewards/chosen": 0.19915179908275604, "rewards/margins": 0.5483053922653198, "rewards/rejected": -0.3491537272930145, "step": 3730 }, { "epoch": 18.98227848101266, "grad_norm": 328959.5337312854, "learning_rate": 2.626136007521153e-07, "logits/chosen": 0.25958794355392456, "logits/rejected": 0.5823850631713867, "logps/chosen": -49.16436004638672, "logps/rejected": -584.5070190429688, "loss": 14187.4844, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19965310394763947, "rewards/margins": 0.5432143211364746, "rewards/rejected": -0.34356123208999634, "step": 3740 }, { "epoch": 19.03291139240506, "grad_norm": 1626740.8696455131, "learning_rate": 2.61830147289251e-07, "logits/chosen": -0.6601130366325378, "logits/rejected": -0.8405634164810181, "logps/chosen": -46.10778045654297, "logps/rejected": -587.3377685546875, "loss": 14051.6469, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1946493685245514, "rewards/margins": 0.54271399974823, "rewards/rejected": -0.34806469082832336, "step": 3750 }, { "epoch": 19.083544303797467, "grad_norm": 786920.4959477714, "learning_rate": 2.610466938263867e-07, "logits/chosen": 0.280475914478302, "logits/rejected": 1.5355632305145264, "logps/chosen": -40.83550262451172, "logps/rejected": -576.2233276367188, "loss": 14524.6172, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19496676325798035, "rewards/margins": 0.5368971228599548, "rewards/rejected": -0.3419303297996521, "step": 3760 }, { "epoch": 19.134177215189872, "grad_norm": 670222.9584254185, "learning_rate": 2.602632403635224e-07, "logits/chosen": 1.6073856353759766, "logits/rejected": 2.1679255962371826, "logps/chosen": -48.07741928100586, "logps/rejected": -568.386962890625, "loss": 16064.1922, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19028018414974213, "rewards/margins": 0.5232519507408142, "rewards/rejected": -0.3329717516899109, "step": 3770 }, { "epoch": 19.184810126582278, "grad_norm": 779401.4265683588, "learning_rate": 2.594797869006581e-07, "logits/chosen": -1.2690767049789429, "logits/rejected": -0.7741214036941528, "logps/chosen": -35.147666931152344, "logps/rejected": -588.05810546875, "loss": 14594.675, "rewards/accuracies": 1.0, "rewards/chosen": 0.20034465193748474, "rewards/margins": 0.5537833571434021, "rewards/rejected": -0.3534386456012726, "step": 3780 }, { "epoch": 19.235443037974683, "grad_norm": 677896.0436831466, "learning_rate": 2.586963334377938e-07, "logits/chosen": 0.381600558757782, "logits/rejected": 0.3627360761165619, "logps/chosen": -47.129329681396484, "logps/rejected": -583.4297485351562, "loss": 14673.1125, "rewards/accuracies": 1.0, "rewards/chosen": 0.19634023308753967, "rewards/margins": 0.5409786105155945, "rewards/rejected": -0.3446383774280548, "step": 3790 }, { "epoch": 19.28607594936709, "grad_norm": 1708590.8406628803, "learning_rate": 2.579128799749295e-07, "logits/chosen": -0.6463128924369812, "logits/rejected": -0.1966671198606491, "logps/chosen": -51.58148193359375, "logps/rejected": -571.8802490234375, "loss": 14855.8094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1947019398212433, "rewards/margins": 0.5256696343421936, "rewards/rejected": -0.3309677243232727, "step": 3800 }, { "epoch": 19.336708860759494, "grad_norm": 906394.5199246205, "learning_rate": 2.5712942651206515e-07, "logits/chosen": 0.6537224054336548, "logits/rejected": 1.356911301612854, "logps/chosen": -37.791786193847656, "logps/rejected": -541.84912109375, "loss": 14494.675, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18946874141693115, "rewards/margins": 0.5098165273666382, "rewards/rejected": -0.3203478455543518, "step": 3810 }, { "epoch": 19.3873417721519, "grad_norm": 1248788.3894635146, "learning_rate": 2.5634597304920085e-07, "logits/chosen": -1.4148962497711182, "logits/rejected": -0.616938591003418, "logps/chosen": -39.15003204345703, "logps/rejected": -567.9779052734375, "loss": 14511.9828, "rewards/accuracies": 1.0, "rewards/chosen": 0.19468382000923157, "rewards/margins": 0.5306459665298462, "rewards/rejected": -0.33596211671829224, "step": 3820 }, { "epoch": 19.437974683544304, "grad_norm": 699507.4776687805, "learning_rate": 2.5556251958633656e-07, "logits/chosen": -0.786666214466095, "logits/rejected": -0.8524150848388672, "logps/chosen": -37.31165313720703, "logps/rejected": -559.5811767578125, "loss": 15226.8953, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18610945343971252, "rewards/margins": 0.5259476900100708, "rewards/rejected": -0.3398382067680359, "step": 3830 }, { "epoch": 19.48860759493671, "grad_norm": 750946.845865734, "learning_rate": 2.5477906612347227e-07, "logits/chosen": -0.5914249420166016, "logits/rejected": -0.1790940761566162, "logps/chosen": -41.875919342041016, "logps/rejected": -580.2433471679688, "loss": 15077.0078, "rewards/accuracies": 1.0, "rewards/chosen": 0.1914350688457489, "rewards/margins": 0.5402361154556274, "rewards/rejected": -0.34880098700523376, "step": 3840 }, { "epoch": 19.539240506329115, "grad_norm": 1438213.362152031, "learning_rate": 2.539956126606079e-07, "logits/chosen": -1.4764426946640015, "logits/rejected": -1.0852867364883423, "logps/chosen": -45.64619064331055, "logps/rejected": -574.2334594726562, "loss": 15001.6922, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19227740168571472, "rewards/margins": 0.527544379234314, "rewards/rejected": -0.33526697754859924, "step": 3850 }, { "epoch": 19.58987341772152, "grad_norm": 1015656.6585732017, "learning_rate": 2.532121591977436e-07, "logits/chosen": 0.0265532024204731, "logits/rejected": 0.4305901527404785, "logps/chosen": -40.221275329589844, "logps/rejected": -580.7332763671875, "loss": 15005.4062, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1900371015071869, "rewards/margins": 0.5353468656539917, "rewards/rejected": -0.3453097939491272, "step": 3860 }, { "epoch": 19.640506329113926, "grad_norm": 1480021.6334817603, "learning_rate": 2.5242870573487933e-07, "logits/chosen": -2.3115265369415283, "logits/rejected": -1.9450628757476807, "logps/chosen": -43.31880187988281, "logps/rejected": -592.5906372070312, "loss": 14681.3906, "rewards/accuracies": 1.0, "rewards/chosen": 0.20574085414409637, "rewards/margins": 0.5489095449447632, "rewards/rejected": -0.3431686758995056, "step": 3870 }, { "epoch": 19.691139240506327, "grad_norm": 652464.7916313735, "learning_rate": 2.5164525227201504e-07, "logits/chosen": 0.63951176404953, "logits/rejected": 1.3804535865783691, "logps/chosen": -33.52408981323242, "logps/rejected": -556.9231567382812, "loss": 15124.1328, "rewards/accuracies": 1.0, "rewards/chosen": 0.1876874566078186, "rewards/margins": 0.5311328172683716, "rewards/rejected": -0.343445360660553, "step": 3880 }, { "epoch": 19.741772151898733, "grad_norm": 697435.0328174214, "learning_rate": 2.508617988091507e-07, "logits/chosen": -1.7422069311141968, "logits/rejected": -1.3413903713226318, "logps/chosen": -42.224788665771484, "logps/rejected": -584.3877563476562, "loss": 15307.875, "rewards/accuracies": 1.0, "rewards/chosen": 0.2013184279203415, "rewards/margins": 0.5388418436050415, "rewards/rejected": -0.3375234305858612, "step": 3890 }, { "epoch": 19.792405063291138, "grad_norm": 680395.3386900029, "learning_rate": 2.500783453462864e-07, "logits/chosen": 0.6034026741981506, "logits/rejected": 1.1066893339157104, "logps/chosen": -39.37774658203125, "logps/rejected": -585.9308471679688, "loss": 15434.6031, "rewards/accuracies": 1.0, "rewards/chosen": 0.20041151344776154, "rewards/margins": 0.545585036277771, "rewards/rejected": -0.34517353773117065, "step": 3900 }, { "epoch": 19.843037974683543, "grad_norm": 1036480.9072027011, "learning_rate": 2.492948918834221e-07, "logits/chosen": -0.37202078104019165, "logits/rejected": -0.6633853316307068, "logps/chosen": -50.845218658447266, "logps/rejected": -565.1788330078125, "loss": 14732.9813, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.1860923022031784, "rewards/margins": 0.5141801834106445, "rewards/rejected": -0.32808783650398254, "step": 3910 }, { "epoch": 19.89367088607595, "grad_norm": 960769.0916438915, "learning_rate": 2.485114384205578e-07, "logits/chosen": -2.4451048374176025, "logits/rejected": -1.8602224588394165, "logps/chosen": -49.44445037841797, "logps/rejected": -588.972900390625, "loss": 14954.0281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20443923771381378, "rewards/margins": 0.5393208265304565, "rewards/rejected": -0.33488157391548157, "step": 3920 }, { "epoch": 19.944303797468354, "grad_norm": 637831.6626185304, "learning_rate": 2.477279849576935e-07, "logits/chosen": -1.1525195837020874, "logits/rejected": -0.6883751153945923, "logps/chosen": -37.11662673950195, "logps/rejected": -576.5172729492188, "loss": 14910.0938, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20432814955711365, "rewards/margins": 0.5393826961517334, "rewards/rejected": -0.33505457639694214, "step": 3930 }, { "epoch": 19.99493670886076, "grad_norm": 926025.3002487151, "learning_rate": 2.4694453149482917e-07, "logits/chosen": 0.10631950944662094, "logits/rejected": 0.8977824449539185, "logps/chosen": -43.12347412109375, "logps/rejected": -561.4703369140625, "loss": 15041.1219, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.18227019906044006, "rewards/margins": 0.5218333005905151, "rewards/rejected": -0.3395631015300751, "step": 3940 }, { "epoch": 20.045569620253165, "grad_norm": 585485.6374993185, "learning_rate": 2.461610780319649e-07, "logits/chosen": -0.45847588777542114, "logits/rejected": -0.7163432836532593, "logps/chosen": -52.16509246826172, "logps/rejected": -574.7341918945312, "loss": 14472.725, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18465857207775116, "rewards/margins": 0.5204340219497681, "rewards/rejected": -0.3357754647731781, "step": 3950 }, { "epoch": 20.09620253164557, "grad_norm": 732893.4730793714, "learning_rate": 2.453776245691006e-07, "logits/chosen": -0.8532247543334961, "logits/rejected": -1.0177998542785645, "logps/chosen": -34.215248107910156, "logps/rejected": -545.2357788085938, "loss": 14258.1938, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.1821087896823883, "rewards/margins": 0.5106922388076782, "rewards/rejected": -0.3285834789276123, "step": 3960 }, { "epoch": 20.146835443037975, "grad_norm": 566993.8027250646, "learning_rate": 2.445941711062363e-07, "logits/chosen": 0.08969805389642715, "logits/rejected": 0.7759960889816284, "logps/chosen": -44.99492645263672, "logps/rejected": -555.2744140625, "loss": 13412.7156, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1912485808134079, "rewards/margins": 0.5124030113220215, "rewards/rejected": -0.3211544454097748, "step": 3970 }, { "epoch": 20.19746835443038, "grad_norm": 694029.5528637858, "learning_rate": 2.4381071764337194e-07, "logits/chosen": -1.096225380897522, "logits/rejected": -0.07990212738513947, "logps/chosen": -50.788089752197266, "logps/rejected": -593.2210083007812, "loss": 14393.8781, "rewards/accuracies": 1.0, "rewards/chosen": 0.20465774834156036, "rewards/margins": 0.5500935316085815, "rewards/rejected": -0.34543576836586, "step": 3980 }, { "epoch": 20.248101265822786, "grad_norm": 732062.5147915592, "learning_rate": 2.430272641805077e-07, "logits/chosen": 2.356168270111084, "logits/rejected": 2.851551055908203, "logps/chosen": -47.21477508544922, "logps/rejected": -550.4632568359375, "loss": 14544.5953, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.1797908991575241, "rewards/margins": 0.5041374564170837, "rewards/rejected": -0.3243466317653656, "step": 3990 }, { "epoch": 20.29873417721519, "grad_norm": 679068.526799364, "learning_rate": 2.4224381071764335e-07, "logits/chosen": -0.03650767728686333, "logits/rejected": 0.4142078459262848, "logps/chosen": -56.95387649536133, "logps/rejected": -573.4332885742188, "loss": 14252.7656, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19543471932411194, "rewards/margins": 0.5279492139816284, "rewards/rejected": -0.3325144648551941, "step": 4000 }, { "epoch": 20.349367088607593, "grad_norm": 654544.0953281225, "learning_rate": 2.4146035725477906e-07, "logits/chosen": -1.5134330987930298, "logits/rejected": -1.2428243160247803, "logps/chosen": -38.626304626464844, "logps/rejected": -569.9034423828125, "loss": 14105.2687, "rewards/accuracies": 1.0, "rewards/chosen": 0.19262897968292236, "rewards/margins": 0.5324395298957825, "rewards/rejected": -0.3398105204105377, "step": 4010 }, { "epoch": 20.4, "grad_norm": 497489.50957681873, "learning_rate": 2.4067690379191476e-07, "logits/chosen": -0.28921595215797424, "logits/rejected": 0.4628073573112488, "logps/chosen": -35.01650619506836, "logps/rejected": -566.645263671875, "loss": 14669.4656, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19555968046188354, "rewards/margins": 0.5380457639694214, "rewards/rejected": -0.3424859941005707, "step": 4020 }, { "epoch": 20.450632911392404, "grad_norm": 813543.9506414626, "learning_rate": 2.3989345032905047e-07, "logits/chosen": 0.39950722455978394, "logits/rejected": 0.7022187113761902, "logps/chosen": -34.654640197753906, "logps/rejected": -572.8896484375, "loss": 14513.1688, "rewards/accuracies": 1.0, "rewards/chosen": 0.19765284657478333, "rewards/margins": 0.5385235548019409, "rewards/rejected": -0.34087073802948, "step": 4030 }, { "epoch": 20.50126582278481, "grad_norm": 492881.9600409883, "learning_rate": 2.391099968661861e-07, "logits/chosen": 0.8350554704666138, "logits/rejected": 1.3902348279953003, "logps/chosen": -55.87943649291992, "logps/rejected": -570.520751953125, "loss": 14474.9922, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19199691712856293, "rewards/margins": 0.5237919092178345, "rewards/rejected": -0.33179494738578796, "step": 4040 }, { "epoch": 20.551898734177215, "grad_norm": 550156.3808352741, "learning_rate": 2.3832654340332183e-07, "logits/chosen": 0.10426521301269531, "logits/rejected": -0.0343349352478981, "logps/chosen": -41.53984451293945, "logps/rejected": -587.2073974609375, "loss": 14114.8438, "rewards/accuracies": 1.0, "rewards/chosen": 0.20071008801460266, "rewards/margins": 0.5482142567634583, "rewards/rejected": -0.3475041091442108, "step": 4050 }, { "epoch": 20.60253164556962, "grad_norm": 442438.75618485885, "learning_rate": 2.375430899404575e-07, "logits/chosen": 0.6523551344871521, "logits/rejected": 1.2190606594085693, "logps/chosen": -46.92839431762695, "logps/rejected": -584.9151000976562, "loss": 14072.4, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19940249621868134, "rewards/margins": 0.5458526611328125, "rewards/rejected": -0.3464500606060028, "step": 4060 }, { "epoch": 20.653164556962025, "grad_norm": 1289102.185567105, "learning_rate": 2.3675963647759321e-07, "logits/chosen": -0.5604708790779114, "logits/rejected": -0.4172240197658539, "logps/chosen": -41.39154815673828, "logps/rejected": -568.0210571289062, "loss": 14125.9813, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.18824639916419983, "rewards/margins": 0.5325849652290344, "rewards/rejected": -0.3443385362625122, "step": 4070 }, { "epoch": 20.70379746835443, "grad_norm": 796999.4756244586, "learning_rate": 2.3597618301472892e-07, "logits/chosen": -0.1419040858745575, "logits/rejected": 0.26791125535964966, "logps/chosen": -37.67776870727539, "logps/rejected": -578.685302734375, "loss": 14390.0187, "rewards/accuracies": 1.0, "rewards/chosen": 0.1924736052751541, "rewards/margins": 0.5395643711090088, "rewards/rejected": -0.34709078073501587, "step": 4080 }, { "epoch": 20.754430379746836, "grad_norm": 730146.0659684967, "learning_rate": 2.3519272955186463e-07, "logits/chosen": -2.1902015209198, "logits/rejected": -1.9143873453140259, "logps/chosen": -49.27891159057617, "logps/rejected": -583.9379272460938, "loss": 14559.9, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19109120965003967, "rewards/margins": 0.5332453846931458, "rewards/rejected": -0.3421540856361389, "step": 4090 }, { "epoch": 20.80506329113924, "grad_norm": 722476.022933799, "learning_rate": 2.344092760890003e-07, "logits/chosen": -2.4936270713806152, "logits/rejected": -2.54288649559021, "logps/chosen": -38.10564041137695, "logps/rejected": -595.3966064453125, "loss": 14070.8406, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20286759734153748, "rewards/margins": 0.5558962821960449, "rewards/rejected": -0.35302871465682983, "step": 4100 }, { "epoch": 20.855696202531647, "grad_norm": 569046.2654479475, "learning_rate": 2.33625822626136e-07, "logits/chosen": -1.9905935525894165, "logits/rejected": -1.920189619064331, "logps/chosen": -34.30987548828125, "logps/rejected": -561.2472534179688, "loss": 14892.8141, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.18726976215839386, "rewards/margins": 0.5297659039497375, "rewards/rejected": -0.3424961268901825, "step": 4110 }, { "epoch": 20.906329113924052, "grad_norm": 428009.1713524517, "learning_rate": 2.328423691632717e-07, "logits/chosen": -1.5145037174224854, "logits/rejected": -1.5116671323776245, "logps/chosen": -45.871498107910156, "logps/rejected": -565.3566284179688, "loss": 13932.3125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1972968429327011, "rewards/margins": 0.5235085487365723, "rewards/rejected": -0.3262116611003876, "step": 4120 }, { "epoch": 20.956962025316457, "grad_norm": 648441.2224283866, "learning_rate": 2.320589157004074e-07, "logits/chosen": -0.6003355383872986, "logits/rejected": 0.15587857365608215, "logps/chosen": -56.75775146484375, "logps/rejected": -600.1107177734375, "loss": 14185.1125, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20023982226848602, "rewards/margins": 0.5419777631759644, "rewards/rejected": -0.34173792600631714, "step": 4130 }, { "epoch": 21.00759493670886, "grad_norm": 610764.2416650191, "learning_rate": 2.3127546223754308e-07, "logits/chosen": -0.7396095395088196, "logits/rejected": -0.635381281375885, "logps/chosen": -43.1786994934082, "logps/rejected": -567.4010620117188, "loss": 13604.9312, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19861166179180145, "rewards/margins": 0.5279361605644226, "rewards/rejected": -0.32932454347610474, "step": 4140 }, { "epoch": 21.058227848101264, "grad_norm": 475061.8763801183, "learning_rate": 2.3049200877467878e-07, "logits/chosen": -0.8914744257926941, "logits/rejected": -0.421735942363739, "logps/chosen": -35.262672424316406, "logps/rejected": -577.42919921875, "loss": 14494.0125, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1971307098865509, "rewards/margins": 0.5426384210586548, "rewards/rejected": -0.3455076515674591, "step": 4150 }, { "epoch": 21.10886075949367, "grad_norm": 645277.3437759997, "learning_rate": 2.2970855531181446e-07, "logits/chosen": -0.8478671908378601, "logits/rejected": -0.030678223818540573, "logps/chosen": -41.63772964477539, "logps/rejected": -578.0343017578125, "loss": 14695.0688, "rewards/accuracies": 1.0, "rewards/chosen": 0.2051258385181427, "rewards/margins": 0.5436097979545593, "rewards/rejected": -0.338483989238739, "step": 4160 }, { "epoch": 21.159493670886075, "grad_norm": 585994.7230623597, "learning_rate": 2.2892510184895017e-07, "logits/chosen": -0.41463834047317505, "logits/rejected": 0.02628953382372856, "logps/chosen": -42.1683464050293, "logps/rejected": -583.2857055664062, "loss": 14024.5219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19314590096473694, "rewards/margins": 0.542751669883728, "rewards/rejected": -0.3496057987213135, "step": 4170 }, { "epoch": 21.21012658227848, "grad_norm": 484376.19543389586, "learning_rate": 2.2814164838608585e-07, "logits/chosen": -1.1306734085083008, "logits/rejected": -1.8836634159088135, "logps/chosen": -37.82279968261719, "logps/rejected": -602.1395874023438, "loss": 14403.9312, "rewards/accuracies": 1.0, "rewards/chosen": 0.20070350170135498, "rewards/margins": 0.561983048915863, "rewards/rejected": -0.36127954721450806, "step": 4180 }, { "epoch": 21.260759493670886, "grad_norm": 1094104.6320160846, "learning_rate": 2.2735819492322155e-07, "logits/chosen": -1.173640251159668, "logits/rejected": -1.2174631357192993, "logps/chosen": -41.484169006347656, "logps/rejected": -603.4227294921875, "loss": 13658.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.20206312835216522, "rewards/margins": 0.5591098070144653, "rewards/rejected": -0.3570466637611389, "step": 4190 }, { "epoch": 21.31139240506329, "grad_norm": 479054.6529765657, "learning_rate": 2.2657474146035723e-07, "logits/chosen": 0.22174246609210968, "logits/rejected": 0.7805773615837097, "logps/chosen": -36.59177780151367, "logps/rejected": -585.4400634765625, "loss": 13599.2641, "rewards/accuracies": 1.0, "rewards/chosen": 0.20017173886299133, "rewards/margins": 0.5513723492622375, "rewards/rejected": -0.3512006402015686, "step": 4200 }, { "epoch": 21.362025316455696, "grad_norm": 397662.76305916836, "learning_rate": 2.2579128799749294e-07, "logits/chosen": -0.08221355825662613, "logits/rejected": 0.6834055185317993, "logps/chosen": -34.845909118652344, "logps/rejected": -582.8208618164062, "loss": 13710.1328, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19616791605949402, "rewards/margins": 0.548448920249939, "rewards/rejected": -0.35228094458580017, "step": 4210 }, { "epoch": 21.4126582278481, "grad_norm": 558368.8389175668, "learning_rate": 2.2500783453462862e-07, "logits/chosen": -1.6244313716888428, "logits/rejected": -1.1929272413253784, "logps/chosen": -40.19694900512695, "logps/rejected": -582.0238647460938, "loss": 13864.4125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20411738753318787, "rewards/margins": 0.5427777767181396, "rewards/rejected": -0.3386603891849518, "step": 4220 }, { "epoch": 21.463291139240507, "grad_norm": 677734.2581549523, "learning_rate": 2.2422438107176433e-07, "logits/chosen": -1.662096381187439, "logits/rejected": -1.0047109127044678, "logps/chosen": -35.72515106201172, "logps/rejected": -602.2333374023438, "loss": 14032.5453, "rewards/accuracies": 1.0, "rewards/chosen": 0.2048303186893463, "rewards/margins": 0.5686389207839966, "rewards/rejected": -0.36380860209465027, "step": 4230 }, { "epoch": 21.513924050632912, "grad_norm": 1168191.91590756, "learning_rate": 2.234409276089e-07, "logits/chosen": -1.4097559452056885, "logits/rejected": -1.5273138284683228, "logps/chosen": -41.850120544433594, "logps/rejected": -581.159912109375, "loss": 14159.3531, "rewards/accuracies": 1.0, "rewards/chosen": 0.19850656390190125, "rewards/margins": 0.5419961810112, "rewards/rejected": -0.3434896469116211, "step": 4240 }, { "epoch": 21.564556962025318, "grad_norm": 458950.46774975123, "learning_rate": 2.226574741460357e-07, "logits/chosen": 0.21833649277687073, "logits/rejected": 0.5494757294654846, "logps/chosen": -34.96293258666992, "logps/rejected": -556.4089965820312, "loss": 14176.0922, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.189616397023201, "rewards/margins": 0.5252028107643127, "rewards/rejected": -0.33558645844459534, "step": 4250 }, { "epoch": 21.615189873417723, "grad_norm": 421239.1758950125, "learning_rate": 2.218740206831714e-07, "logits/chosen": -1.4874672889709473, "logits/rejected": -0.608989417552948, "logps/chosen": -41.988407135009766, "logps/rejected": -571.3652954101562, "loss": 13889.2, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19781097769737244, "rewards/margins": 0.534324586391449, "rewards/rejected": -0.33651357889175415, "step": 4260 }, { "epoch": 21.665822784810125, "grad_norm": 454253.5294425473, "learning_rate": 2.2109056722030712e-07, "logits/chosen": -2.9821505546569824, "logits/rejected": -2.786928415298462, "logps/chosen": -34.64727020263672, "logps/rejected": -584.9675903320312, "loss": 13704.1344, "rewards/accuracies": 1.0, "rewards/chosen": 0.20551709830760956, "rewards/margins": 0.5548884868621826, "rewards/rejected": -0.3493713140487671, "step": 4270 }, { "epoch": 21.71645569620253, "grad_norm": 452082.6783455646, "learning_rate": 2.203071137574428e-07, "logits/chosen": 0.3382144868373871, "logits/rejected": 0.4318400025367737, "logps/chosen": -45.613182067871094, "logps/rejected": -586.7680053710938, "loss": 13852.6578, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20163491368293762, "rewards/margins": 0.5407856702804565, "rewards/rejected": -0.3391507565975189, "step": 4280 }, { "epoch": 21.767088607594935, "grad_norm": 514094.8538962621, "learning_rate": 2.195236602945785e-07, "logits/chosen": -0.4016874432563782, "logits/rejected": 0.6381920576095581, "logps/chosen": -43.911903381347656, "logps/rejected": -575.604736328125, "loss": 13636.45, "rewards/accuracies": 1.0, "rewards/chosen": 0.19898727536201477, "rewards/margins": 0.5374493598937988, "rewards/rejected": -0.33846214413642883, "step": 4290 }, { "epoch": 21.81772151898734, "grad_norm": 566656.255024603, "learning_rate": 2.187402068317142e-07, "logits/chosen": 1.3056986331939697, "logits/rejected": 1.5420660972595215, "logps/chosen": -26.3211669921875, "logps/rejected": -560.0467529296875, "loss": 13776.3141, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19464771449565887, "rewards/margins": 0.5358431935310364, "rewards/rejected": -0.3411955237388611, "step": 4300 }, { "epoch": 21.868354430379746, "grad_norm": 615988.2584437894, "learning_rate": 2.179567533688499e-07, "logits/chosen": -1.382976770401001, "logits/rejected": -1.09381902217865, "logps/chosen": -39.40836715698242, "logps/rejected": -589.1639404296875, "loss": 14209.4484, "rewards/accuracies": 1.0, "rewards/chosen": 0.20153430104255676, "rewards/margins": 0.5491803884506226, "rewards/rejected": -0.3476461172103882, "step": 4310 }, { "epoch": 21.91898734177215, "grad_norm": 537804.0354341647, "learning_rate": 2.1717329990598557e-07, "logits/chosen": -0.1827346831560135, "logits/rejected": 0.2996447682380676, "logps/chosen": -45.03169250488281, "logps/rejected": -602.3880615234375, "loss": 13524.8656, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20245595276355743, "rewards/margins": 0.5563360452651978, "rewards/rejected": -0.3538800776004791, "step": 4320 }, { "epoch": 21.969620253164557, "grad_norm": 1296539.8068217032, "learning_rate": 2.1638984644312128e-07, "logits/chosen": -1.9990384578704834, "logits/rejected": -2.4059531688690186, "logps/chosen": -43.4345588684082, "logps/rejected": -604.287841796875, "loss": 13463.9719, "rewards/accuracies": 1.0, "rewards/chosen": 0.21004056930541992, "rewards/margins": 0.5605840086936951, "rewards/rejected": -0.35054340958595276, "step": 4330 }, { "epoch": 22.020253164556962, "grad_norm": 844000.7864919893, "learning_rate": 2.1560639298025696e-07, "logits/chosen": -0.2103087455034256, "logits/rejected": 0.07530391216278076, "logps/chosen": -30.565990447998047, "logps/rejected": -547.0203857421875, "loss": 14383.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.1880597323179245, "rewards/margins": 0.5152319073677063, "rewards/rejected": -0.3271721601486206, "step": 4340 }, { "epoch": 22.070886075949367, "grad_norm": 597784.613899612, "learning_rate": 2.1482293951739267e-07, "logits/chosen": -0.721124529838562, "logits/rejected": -0.21510323882102966, "logps/chosen": -37.94996643066406, "logps/rejected": -587.461181640625, "loss": 13822.8656, "rewards/accuracies": 1.0, "rewards/chosen": 0.19957685470581055, "rewards/margins": 0.5496448278427124, "rewards/rejected": -0.3500679135322571, "step": 4350 }, { "epoch": 22.121518987341773, "grad_norm": 468430.91971069, "learning_rate": 2.1403948605452835e-07, "logits/chosen": -1.418505072593689, "logits/rejected": -0.8604210019111633, "logps/chosen": -38.54343795776367, "logps/rejected": -585.6038818359375, "loss": 13499.3969, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2021259367465973, "rewards/margins": 0.5458530187606812, "rewards/rejected": -0.34372708201408386, "step": 4360 }, { "epoch": 22.172151898734178, "grad_norm": 838303.6265575557, "learning_rate": 2.1325603259166405e-07, "logits/chosen": -0.013787698931992054, "logits/rejected": -0.22224357724189758, "logps/chosen": -33.32988357543945, "logps/rejected": -576.55224609375, "loss": 13816.5812, "rewards/accuracies": 1.0, "rewards/chosen": 0.1937212496995926, "rewards/margins": 0.5373150110244751, "rewards/rejected": -0.3435937762260437, "step": 4370 }, { "epoch": 22.222784810126583, "grad_norm": 524213.07765733794, "learning_rate": 2.1247257912879973e-07, "logits/chosen": 0.3687540888786316, "logits/rejected": 0.8078397512435913, "logps/chosen": -38.822872161865234, "logps/rejected": -553.8123168945312, "loss": 12435.0875, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18458959460258484, "rewards/margins": 0.5130779147148132, "rewards/rejected": -0.32848840951919556, "step": 4380 }, { "epoch": 22.27341772151899, "grad_norm": 476932.283051178, "learning_rate": 2.1168912566593544e-07, "logits/chosen": 0.6524232029914856, "logits/rejected": 0.6763177514076233, "logps/chosen": -41.4456901550293, "logps/rejected": -586.055419921875, "loss": 14132.7062, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20029637217521667, "rewards/margins": 0.5451359152793884, "rewards/rejected": -0.34483957290649414, "step": 4390 }, { "epoch": 22.324050632911394, "grad_norm": 568972.1382617814, "learning_rate": 2.1090567220307112e-07, "logits/chosen": -0.3675435781478882, "logits/rejected": 0.2508888840675354, "logps/chosen": -37.127281188964844, "logps/rejected": -571.7310180664062, "loss": 13226.8641, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19992589950561523, "rewards/margins": 0.5378109812736511, "rewards/rejected": -0.33788514137268066, "step": 4400 }, { "epoch": 22.374683544303796, "grad_norm": 549953.3378298564, "learning_rate": 2.1012221874020682e-07, "logits/chosen": -0.3316110372543335, "logits/rejected": 0.12318412959575653, "logps/chosen": -45.176429748535156, "logps/rejected": -601.1099243164062, "loss": 13357.3594, "rewards/accuracies": 1.0, "rewards/chosen": 0.2056044340133667, "rewards/margins": 0.5584502220153809, "rewards/rejected": -0.35284581780433655, "step": 4410 }, { "epoch": 22.4253164556962, "grad_norm": 487398.89046152594, "learning_rate": 2.093387652773425e-07, "logits/chosen": -1.0198824405670166, "logits/rejected": -0.21292218565940857, "logps/chosen": -36.835960388183594, "logps/rejected": -577.4632568359375, "loss": 13915.3031, "rewards/accuracies": 1.0, "rewards/chosen": 0.20356829464435577, "rewards/margins": 0.543707013130188, "rewards/rejected": -0.3401387631893158, "step": 4420 }, { "epoch": 22.475949367088607, "grad_norm": 477361.2573301333, "learning_rate": 2.085553118144782e-07, "logits/chosen": 0.3704206943511963, "logits/rejected": 0.693733811378479, "logps/chosen": -46.64609146118164, "logps/rejected": -594.5731811523438, "loss": 13106.9359, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20661070942878723, "rewards/margins": 0.5531338453292847, "rewards/rejected": -0.34652310609817505, "step": 4430 }, { "epoch": 22.526582278481012, "grad_norm": 597606.9724370906, "learning_rate": 2.077718583516139e-07, "logits/chosen": -0.6012102365493774, "logits/rejected": -0.6212292909622192, "logps/chosen": -36.24720001220703, "logps/rejected": -570.0081787109375, "loss": 13390.8625, "rewards/accuracies": 1.0, "rewards/chosen": 0.20318233966827393, "rewards/margins": 0.5344886779785156, "rewards/rejected": -0.3313063085079193, "step": 4440 }, { "epoch": 22.577215189873417, "grad_norm": 469529.248927815, "learning_rate": 2.069884048887496e-07, "logits/chosen": -0.041382573544979095, "logits/rejected": 0.7878470420837402, "logps/chosen": -43.38654708862305, "logps/rejected": -568.7279052734375, "loss": 13333.4188, "rewards/accuracies": 1.0, "rewards/chosen": 0.19029700756072998, "rewards/margins": 0.5311275124549866, "rewards/rejected": -0.3408304750919342, "step": 4450 }, { "epoch": 22.627848101265823, "grad_norm": 402623.00766789017, "learning_rate": 2.0620495142588527e-07, "logits/chosen": -0.8500850796699524, "logits/rejected": 0.10065221786499023, "logps/chosen": -31.52435302734375, "logps/rejected": -562.478271484375, "loss": 13787.8797, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1997881382703781, "rewards/margins": 0.5369755029678345, "rewards/rejected": -0.33718740940093994, "step": 4460 }, { "epoch": 22.678481012658228, "grad_norm": 373755.1797101064, "learning_rate": 2.05421497963021e-07, "logits/chosen": -1.3291960954666138, "logits/rejected": -1.2023630142211914, "logps/chosen": -34.12505340576172, "logps/rejected": -600.3700561523438, "loss": 13297.9406, "rewards/accuracies": 1.0, "rewards/chosen": 0.20627860724925995, "rewards/margins": 0.5676389336585999, "rewards/rejected": -0.3613602817058563, "step": 4470 }, { "epoch": 22.729113924050633, "grad_norm": 402761.92027776636, "learning_rate": 2.0463804450015669e-07, "logits/chosen": -1.5893421173095703, "logits/rejected": -1.3823096752166748, "logps/chosen": -30.61318588256836, "logps/rejected": -584.5267944335938, "loss": 14326.1875, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20079275965690613, "rewards/margins": 0.5540488958358765, "rewards/rejected": -0.35325610637664795, "step": 4480 }, { "epoch": 22.77974683544304, "grad_norm": 547067.5872175789, "learning_rate": 2.038545910372924e-07, "logits/chosen": 0.4800703525543213, "logits/rejected": 1.4792516231536865, "logps/chosen": -27.049495697021484, "logps/rejected": -563.0673217773438, "loss": 14447.3891, "rewards/accuracies": 1.0, "rewards/chosen": 0.19703736901283264, "rewards/margins": 0.5409034490585327, "rewards/rejected": -0.3438660502433777, "step": 4490 }, { "epoch": 22.830379746835444, "grad_norm": 672757.480231201, "learning_rate": 2.0307113757442807e-07, "logits/chosen": 0.13832028210163116, "logits/rejected": 0.6534411907196045, "logps/chosen": -48.682350158691406, "logps/rejected": -608.1444091796875, "loss": 13146.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.20029571652412415, "rewards/margins": 0.5563368797302246, "rewards/rejected": -0.35604116320610046, "step": 4500 }, { "epoch": 22.88101265822785, "grad_norm": 369986.02432868385, "learning_rate": 2.0228768411156378e-07, "logits/chosen": -1.8307338953018188, "logits/rejected": -1.2095929384231567, "logps/chosen": -45.19769287109375, "logps/rejected": -578.0750122070312, "loss": 14329.1656, "rewards/accuracies": 1.0, "rewards/chosen": 0.1950627863407135, "rewards/margins": 0.5377144813537598, "rewards/rejected": -0.3426516652107239, "step": 4510 }, { "epoch": 22.931645569620255, "grad_norm": 699107.7543808775, "learning_rate": 2.0150423064869946e-07, "logits/chosen": -0.08685462176799774, "logits/rejected": 0.9019424319267273, "logps/chosen": -45.735721588134766, "logps/rejected": -577.2432861328125, "loss": 13698.3734, "rewards/accuracies": 1.0, "rewards/chosen": 0.19573049247264862, "rewards/margins": 0.5354525446891785, "rewards/rejected": -0.33972200751304626, "step": 4520 }, { "epoch": 22.98227848101266, "grad_norm": 406293.788277243, "learning_rate": 2.0072077718583516e-07, "logits/chosen": -0.8599483370780945, "logits/rejected": 0.11351003497838974, "logps/chosen": -26.267419815063477, "logps/rejected": -555.6368408203125, "loss": 13659.9219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19417151808738708, "rewards/margins": 0.5340765714645386, "rewards/rejected": -0.3399050235748291, "step": 4530 }, { "epoch": 23.03291139240506, "grad_norm": 260712.73606555417, "learning_rate": 1.9993732372297084e-07, "logits/chosen": -0.3511297106742859, "logits/rejected": 0.3242552876472473, "logps/chosen": -36.45670700073242, "logps/rejected": -593.9841918945312, "loss": 13041.3578, "rewards/accuracies": 1.0, "rewards/chosen": 0.20808692276477814, "rewards/margins": 0.5588291883468628, "rewards/rejected": -0.3507421910762787, "step": 4540 }, { "epoch": 23.083544303797467, "grad_norm": 814422.569024492, "learning_rate": 1.9915387026010655e-07, "logits/chosen": -1.235445499420166, "logits/rejected": -1.077894926071167, "logps/chosen": -35.420509338378906, "logps/rejected": -552.5203857421875, "loss": 13824.6812, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19372673332691193, "rewards/margins": 0.5206524729728699, "rewards/rejected": -0.32692575454711914, "step": 4550 }, { "epoch": 23.134177215189872, "grad_norm": 403075.73058182886, "learning_rate": 1.9837041679724223e-07, "logits/chosen": -1.2545719146728516, "logits/rejected": -0.1615985631942749, "logps/chosen": -39.13911056518555, "logps/rejected": -572.5064697265625, "loss": 13053.8422, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20484952628612518, "rewards/margins": 0.5411498546600342, "rewards/rejected": -0.3363003432750702, "step": 4560 }, { "epoch": 23.184810126582278, "grad_norm": 410323.6766664692, "learning_rate": 1.9758696333437793e-07, "logits/chosen": 0.23864197731018066, "logits/rejected": 0.84992915391922, "logps/chosen": -27.040796279907227, "logps/rejected": -566.3431396484375, "loss": 13967.5844, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1973811388015747, "rewards/margins": 0.5440403819084167, "rewards/rejected": -0.34665924310684204, "step": 4570 }, { "epoch": 23.235443037974683, "grad_norm": 1678425.4580624043, "learning_rate": 1.9680350987151361e-07, "logits/chosen": -1.6769014596939087, "logits/rejected": -1.3361175060272217, "logps/chosen": -41.234596252441406, "logps/rejected": -571.8701782226562, "loss": 14487.3484, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19792771339416504, "rewards/margins": 0.5364667177200317, "rewards/rejected": -0.3385389745235443, "step": 4580 }, { "epoch": 23.28607594936709, "grad_norm": 449840.24063538713, "learning_rate": 1.9602005640864932e-07, "logits/chosen": 0.17428772151470184, "logits/rejected": 0.36653703451156616, "logps/chosen": -42.2408561706543, "logps/rejected": -559.9259033203125, "loss": 13708.7656, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.18780556321144104, "rewards/margins": 0.5175566673278809, "rewards/rejected": -0.3297511339187622, "step": 4590 }, { "epoch": 23.336708860759494, "grad_norm": 679657.3800551172, "learning_rate": 1.95236602945785e-07, "logits/chosen": -1.7364814281463623, "logits/rejected": -0.4961363673210144, "logps/chosen": -38.69525909423828, "logps/rejected": -572.1868896484375, "loss": 13593.8281, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19368386268615723, "rewards/margins": 0.5372828245162964, "rewards/rejected": -0.3435989320278168, "step": 4600 }, { "epoch": 23.3873417721519, "grad_norm": 494252.98244154564, "learning_rate": 1.944531494829207e-07, "logits/chosen": -1.2254236936569214, "logits/rejected": -0.549937903881073, "logps/chosen": -48.976341247558594, "logps/rejected": -586.7686767578125, "loss": 13090.5094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2046699821949005, "rewards/margins": 0.5475128889083862, "rewards/rejected": -0.34284287691116333, "step": 4610 }, { "epoch": 23.437974683544304, "grad_norm": 465753.7874146901, "learning_rate": 1.9366969602005639e-07, "logits/chosen": -0.32544824481010437, "logits/rejected": 0.21593210101127625, "logps/chosen": -44.010986328125, "logps/rejected": -600.7352905273438, "loss": 13414.7938, "rewards/accuracies": 1.0, "rewards/chosen": 0.19957385957241058, "rewards/margins": 0.5576717257499695, "rewards/rejected": -0.3580978512763977, "step": 4620 }, { "epoch": 23.48860759493671, "grad_norm": 481484.43193068507, "learning_rate": 1.928862425571921e-07, "logits/chosen": 1.6490085124969482, "logits/rejected": 2.4715371131896973, "logps/chosen": -33.72826385498047, "logps/rejected": -570.8497314453125, "loss": 12998.5312, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20028643310070038, "rewards/margins": 0.542258083820343, "rewards/rejected": -0.34197166562080383, "step": 4630 }, { "epoch": 23.539240506329115, "grad_norm": 332996.8100920917, "learning_rate": 1.9210278909432777e-07, "logits/chosen": 0.4085458219051361, "logits/rejected": 0.9639450907707214, "logps/chosen": -34.42941665649414, "logps/rejected": -557.9277954101562, "loss": 13038.3531, "rewards/accuracies": 1.0, "rewards/chosen": 0.19136182963848114, "rewards/margins": 0.5279586911201477, "rewards/rejected": -0.33659690618515015, "step": 4640 }, { "epoch": 23.58987341772152, "grad_norm": 417794.5245281789, "learning_rate": 1.913193356314635e-07, "logits/chosen": 0.9038169980049133, "logits/rejected": 1.2894458770751953, "logps/chosen": -36.74773406982422, "logps/rejected": -562.9734497070312, "loss": 13522.8828, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19096803665161133, "rewards/margins": 0.528801679611206, "rewards/rejected": -0.3378336727619171, "step": 4650 }, { "epoch": 23.640506329113926, "grad_norm": 676256.0388850861, "learning_rate": 1.9053588216859918e-07, "logits/chosen": -1.2860909700393677, "logits/rejected": -0.9226953387260437, "logps/chosen": -38.81789016723633, "logps/rejected": -582.3718872070312, "loss": 12973.8531, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19840100407600403, "rewards/margins": 0.5444675087928772, "rewards/rejected": -0.34606653451919556, "step": 4660 }, { "epoch": 23.691139240506327, "grad_norm": 554334.5417021438, "learning_rate": 1.897524287057349e-07, "logits/chosen": -0.9978511929512024, "logits/rejected": -1.0807132720947266, "logps/chosen": -37.58687210083008, "logps/rejected": -592.8406982421875, "loss": 13047.7078, "rewards/accuracies": 1.0, "rewards/chosen": 0.21029341220855713, "rewards/margins": 0.5588668584823608, "rewards/rejected": -0.34857338666915894, "step": 4670 }, { "epoch": 23.741772151898733, "grad_norm": 369259.1253714009, "learning_rate": 1.8896897524287057e-07, "logits/chosen": -0.2619388997554779, "logits/rejected": 0.5041999816894531, "logps/chosen": -30.875507354736328, "logps/rejected": -593.271484375, "loss": 13440.1, "rewards/accuracies": 1.0, "rewards/chosen": 0.20856580138206482, "rewards/margins": 0.5670603513717651, "rewards/rejected": -0.3584945499897003, "step": 4680 }, { "epoch": 23.792405063291138, "grad_norm": 521963.4531232378, "learning_rate": 1.8818552178000628e-07, "logits/chosen": 1.4849811792373657, "logits/rejected": 2.5439536571502686, "logps/chosen": -48.661781311035156, "logps/rejected": -584.5633544921875, "loss": 12945.0281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2017756700515747, "rewards/margins": 0.5462977886199951, "rewards/rejected": -0.34452205896377563, "step": 4690 }, { "epoch": 23.843037974683543, "grad_norm": 268733.66014746006, "learning_rate": 1.8740206831714195e-07, "logits/chosen": -3.035019636154175, "logits/rejected": -2.2716848850250244, "logps/chosen": -37.333274841308594, "logps/rejected": -573.26806640625, "loss": 12908.0664, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19613531231880188, "rewards/margins": 0.5401136875152588, "rewards/rejected": -0.34397831559181213, "step": 4700 }, { "epoch": 23.89367088607595, "grad_norm": 408014.58113424503, "learning_rate": 1.8661861485427766e-07, "logits/chosen": -1.863054871559143, "logits/rejected": -1.9231150150299072, "logps/chosen": -31.946712493896484, "logps/rejected": -559.3107299804688, "loss": 13674.8922, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19874636828899384, "rewards/margins": 0.5276685357093811, "rewards/rejected": -0.32892221212387085, "step": 4710 }, { "epoch": 23.944303797468354, "grad_norm": 642355.0358722768, "learning_rate": 1.8583516139141334e-07, "logits/chosen": 0.2536182999610901, "logits/rejected": 0.7465096712112427, "logps/chosen": -30.321950912475586, "logps/rejected": -569.6714477539062, "loss": 13267.9422, "rewards/accuracies": 1.0, "rewards/chosen": 0.19373981654644012, "rewards/margins": 0.5428478717803955, "rewards/rejected": -0.3491080403327942, "step": 4720 }, { "epoch": 23.99493670886076, "grad_norm": 342588.7538586878, "learning_rate": 1.8505170792854905e-07, "logits/chosen": 0.4239419400691986, "logits/rejected": 1.2003661394119263, "logps/chosen": -37.75706100463867, "logps/rejected": -594.6668701171875, "loss": 12895.3687, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2065034806728363, "rewards/margins": 0.5577437877655029, "rewards/rejected": -0.3512403070926666, "step": 4730 }, { "epoch": 24.045569620253165, "grad_norm": 627241.9523500776, "learning_rate": 1.8426825446568473e-07, "logits/chosen": -0.28766584396362305, "logits/rejected": -0.6269916296005249, "logps/chosen": -44.65516662597656, "logps/rejected": -577.6954345703125, "loss": 12950.232, "rewards/accuracies": 1.0, "rewards/chosen": 0.20441746711730957, "rewards/margins": 0.5374675393104553, "rewards/rejected": -0.33305004239082336, "step": 4740 }, { "epoch": 24.09620253164557, "grad_norm": 347781.48168387014, "learning_rate": 1.8348480100282043e-07, "logits/chosen": -0.7881828546524048, "logits/rejected": 0.06337795406579971, "logps/chosen": -27.82383155822754, "logps/rejected": -579.8468017578125, "loss": 12986.6266, "rewards/accuracies": 1.0, "rewards/chosen": 0.2079242467880249, "rewards/margins": 0.5530039668083191, "rewards/rejected": -0.3450797498226166, "step": 4750 }, { "epoch": 24.146835443037975, "grad_norm": 263636.0742414822, "learning_rate": 1.827013475399561e-07, "logits/chosen": -1.2877452373504639, "logits/rejected": -0.24622194468975067, "logps/chosen": -30.940113067626953, "logps/rejected": -566.517333984375, "loss": 12716.3281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20519797503948212, "rewards/margins": 0.5418139696121216, "rewards/rejected": -0.33661606907844543, "step": 4760 }, { "epoch": 24.19746835443038, "grad_norm": 343217.0970891512, "learning_rate": 1.8191789407709182e-07, "logits/chosen": -1.0294177532196045, "logits/rejected": -0.4815802574157715, "logps/chosen": -31.3253231048584, "logps/rejected": -560.1129150390625, "loss": 13408.0094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19244466722011566, "rewards/margins": 0.5293484926223755, "rewards/rejected": -0.3369038701057434, "step": 4770 }, { "epoch": 24.248101265822786, "grad_norm": 504214.9301429895, "learning_rate": 1.811344406142275e-07, "logits/chosen": -0.604789137840271, "logits/rejected": 0.22590136528015137, "logps/chosen": -41.899864196777344, "logps/rejected": -555.2015380859375, "loss": 13531.9625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.190764918923378, "rewards/margins": 0.5197011232376099, "rewards/rejected": -0.32893624901771545, "step": 4780 }, { "epoch": 24.29873417721519, "grad_norm": 376293.61129873747, "learning_rate": 1.803509871513632e-07, "logits/chosen": -0.3547658324241638, "logits/rejected": -0.13969659805297852, "logps/chosen": -28.85129737854004, "logps/rejected": -577.5880126953125, "loss": 12898.8125, "rewards/accuracies": 1.0, "rewards/chosen": 0.19991345703601837, "rewards/margins": 0.5475735664367676, "rewards/rejected": -0.3476601243019104, "step": 4790 }, { "epoch": 24.349367088607593, "grad_norm": 414135.66871189536, "learning_rate": 1.7956753368849888e-07, "logits/chosen": 0.09798486530780792, "logits/rejected": 0.8055311441421509, "logps/chosen": -34.26675796508789, "logps/rejected": -584.9435424804688, "loss": 12507.7328, "rewards/accuracies": 1.0, "rewards/chosen": 0.20852184295654297, "rewards/margins": 0.5590152144432068, "rewards/rejected": -0.3504934012889862, "step": 4800 }, { "epoch": 24.4, "grad_norm": 343912.9213203651, "learning_rate": 1.787840802256346e-07, "logits/chosen": -0.12874791026115417, "logits/rejected": 0.15529172122478485, "logps/chosen": -32.1009635925293, "logps/rejected": -574.7623291015625, "loss": 13372.5594, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1985681653022766, "rewards/margins": 0.546955943107605, "rewards/rejected": -0.34838777780532837, "step": 4810 }, { "epoch": 24.450632911392404, "grad_norm": 697134.5088322331, "learning_rate": 1.7800062676277027e-07, "logits/chosen": 1.3246450424194336, "logits/rejected": 1.760595679283142, "logps/chosen": -32.29949188232422, "logps/rejected": -575.516357421875, "loss": 12415.9672, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19931714236736298, "rewards/margins": 0.5466721057891846, "rewards/rejected": -0.3473549485206604, "step": 4820 }, { "epoch": 24.50126582278481, "grad_norm": 389676.90431780973, "learning_rate": 1.7721717329990597e-07, "logits/chosen": -1.3381322622299194, "logits/rejected": -0.6404735445976257, "logps/chosen": -34.635719299316406, "logps/rejected": -587.3080444335938, "loss": 13101.6227, "rewards/accuracies": 1.0, "rewards/chosen": 0.20362886786460876, "rewards/margins": 0.5546956658363342, "rewards/rejected": -0.3510667383670807, "step": 4830 }, { "epoch": 24.551898734177215, "grad_norm": 314600.5637340995, "learning_rate": 1.7643371983704165e-07, "logits/chosen": 0.9953921437263489, "logits/rejected": 0.9643779993057251, "logps/chosen": -30.646331787109375, "logps/rejected": -570.855712890625, "loss": 12974.9633, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19645583629608154, "rewards/margins": 0.5343093872070312, "rewards/rejected": -0.3378535211086273, "step": 4840 }, { "epoch": 24.60253164556962, "grad_norm": 327013.6839426029, "learning_rate": 1.7565026637417739e-07, "logits/chosen": -0.7217426300048828, "logits/rejected": -0.7290517091751099, "logps/chosen": -37.666648864746094, "logps/rejected": -563.5865478515625, "loss": 13273.0266, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.197954460978508, "rewards/margins": 0.5282526612281799, "rewards/rejected": -0.33029812574386597, "step": 4850 }, { "epoch": 24.653164556962025, "grad_norm": 425662.97201424866, "learning_rate": 1.7486681291131307e-07, "logits/chosen": -0.32900291681289673, "logits/rejected": 0.18864622712135315, "logps/chosen": -32.36582565307617, "logps/rejected": -566.5547485351562, "loss": 13350.5594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19610336422920227, "rewards/margins": 0.5321984887123108, "rewards/rejected": -0.3360951244831085, "step": 4860 }, { "epoch": 24.70379746835443, "grad_norm": 402351.7657170625, "learning_rate": 1.7408335944844877e-07, "logits/chosen": -2.112635850906372, "logits/rejected": -1.4337832927703857, "logps/chosen": -36.27765655517578, "logps/rejected": -588.708740234375, "loss": 13690.0375, "rewards/accuracies": 1.0, "rewards/chosen": 0.20721419155597687, "rewards/margins": 0.5532296299934387, "rewards/rejected": -0.34601545333862305, "step": 4870 }, { "epoch": 24.754430379746836, "grad_norm": 357871.7969812476, "learning_rate": 1.7329990598558445e-07, "logits/chosen": -0.34599849581718445, "logits/rejected": 0.0005200624582357705, "logps/chosen": -28.50592041015625, "logps/rejected": -560.6458740234375, "loss": 13133.4, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1971847414970398, "rewards/margins": 0.5326961278915405, "rewards/rejected": -0.33551135659217834, "step": 4880 }, { "epoch": 24.80506329113924, "grad_norm": 610016.4055058825, "learning_rate": 1.7251645252272016e-07, "logits/chosen": -1.225339651107788, "logits/rejected": -0.8243592977523804, "logps/chosen": -34.17732620239258, "logps/rejected": -581.1436767578125, "loss": 12571.1, "rewards/accuracies": 1.0, "rewards/chosen": 0.20419716835021973, "rewards/margins": 0.5520066022872925, "rewards/rejected": -0.347809374332428, "step": 4890 }, { "epoch": 24.855696202531647, "grad_norm": 379732.159808034, "learning_rate": 1.7173299905985584e-07, "logits/chosen": -1.817439317703247, "logits/rejected": -1.4895861148834229, "logps/chosen": -30.775598526000977, "logps/rejected": -570.484130859375, "loss": 13212.3781, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20220620930194855, "rewards/margins": 0.5392990112304688, "rewards/rejected": -0.3370928466320038, "step": 4900 }, { "epoch": 24.906329113924052, "grad_norm": 324902.93726753094, "learning_rate": 1.7094954559699154e-07, "logits/chosen": -0.6947388648986816, "logits/rejected": -0.4560522437095642, "logps/chosen": -40.327796936035156, "logps/rejected": -581.7532348632812, "loss": 12891.9016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20357458293437958, "rewards/margins": 0.5383836030960083, "rewards/rejected": -0.33480900526046753, "step": 4910 }, { "epoch": 24.956962025316457, "grad_norm": 294429.7500390618, "learning_rate": 1.7016609213412722e-07, "logits/chosen": -0.9121583104133606, "logits/rejected": 0.40684938430786133, "logps/chosen": -28.22664451599121, "logps/rejected": -580.9795532226562, "loss": 13184.5812, "rewards/accuracies": 1.0, "rewards/chosen": 0.20078356564044952, "rewards/margins": 0.5547462701797485, "rewards/rejected": -0.3539626896381378, "step": 4920 }, { "epoch": 25.00759493670886, "grad_norm": 308388.08709269366, "learning_rate": 1.6938263867126293e-07, "logits/chosen": -1.6532137393951416, "logits/rejected": -1.572850227355957, "logps/chosen": -41.12345886230469, "logps/rejected": -613.5958862304688, "loss": 12755.7383, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.21259479224681854, "rewards/margins": 0.5684026479721069, "rewards/rejected": -0.3558078408241272, "step": 4930 }, { "epoch": 25.058227848101264, "grad_norm": 320761.03886897856, "learning_rate": 1.685991852083986e-07, "logits/chosen": -0.11034099757671356, "logits/rejected": -0.06293153762817383, "logps/chosen": -34.010704040527344, "logps/rejected": -583.318359375, "loss": 13300.3922, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19742931425571442, "rewards/margins": 0.5516862273216248, "rewards/rejected": -0.35425692796707153, "step": 4940 }, { "epoch": 25.10886075949367, "grad_norm": 282559.397671993, "learning_rate": 1.6781573174553431e-07, "logits/chosen": 0.5274404883384705, "logits/rejected": 1.2507613897323608, "logps/chosen": -29.299930572509766, "logps/rejected": -554.8450927734375, "loss": 12685.2523, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19330081343650818, "rewards/margins": 0.5271843671798706, "rewards/rejected": -0.3338836431503296, "step": 4950 }, { "epoch": 25.159493670886075, "grad_norm": 248533.31024359175, "learning_rate": 1.6703227828267e-07, "logits/chosen": -1.2484452724456787, "logits/rejected": -0.5531445741653442, "logps/chosen": -42.44970703125, "logps/rejected": -591.9672241210938, "loss": 12525.2, "rewards/accuracies": 1.0, "rewards/chosen": 0.20557789504528046, "rewards/margins": 0.5516069531440735, "rewards/rejected": -0.34602901339530945, "step": 4960 }, { "epoch": 25.21012658227848, "grad_norm": 365840.3682606488, "learning_rate": 1.662488248198057e-07, "logits/chosen": -1.5047721862792969, "logits/rejected": -1.5158735513687134, "logps/chosen": -31.838958740234375, "logps/rejected": -581.0045166015625, "loss": 13041.882, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.201541468501091, "rewards/margins": 0.5492666959762573, "rewards/rejected": -0.3477252125740051, "step": 4970 }, { "epoch": 25.260759493670886, "grad_norm": 364119.66442401055, "learning_rate": 1.6546537135694138e-07, "logits/chosen": -2.0333914756774902, "logits/rejected": -2.0420191287994385, "logps/chosen": -33.426788330078125, "logps/rejected": -577.18212890625, "loss": 13218.8875, "rewards/accuracies": 1.0, "rewards/chosen": 0.20231468975543976, "rewards/margins": 0.5456961989402771, "rewards/rejected": -0.3433815836906433, "step": 4980 }, { "epoch": 25.31139240506329, "grad_norm": 434691.5380135347, "learning_rate": 1.6468191789407709e-07, "logits/chosen": -0.23437795042991638, "logits/rejected": -0.03313719108700752, "logps/chosen": -33.025386810302734, "logps/rejected": -587.5833740234375, "loss": 12003.9711, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19747456908226013, "rewards/margins": 0.553167998790741, "rewards/rejected": -0.3556934595108032, "step": 4990 }, { "epoch": 25.362025316455696, "grad_norm": 257881.6224659914, "learning_rate": 1.6389846443121277e-07, "logits/chosen": 1.229998230934143, "logits/rejected": 1.8426265716552734, "logps/chosen": -31.151538848876953, "logps/rejected": -575.4852905273438, "loss": 13412.7078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1954251229763031, "rewards/margins": 0.5429095029830933, "rewards/rejected": -0.34748440980911255, "step": 5000 }, { "epoch": 25.4126582278481, "grad_norm": 425285.73032920854, "learning_rate": 1.6311501096834847e-07, "logits/chosen": -1.241003155708313, "logits/rejected": -0.7176898121833801, "logps/chosen": -31.115795135498047, "logps/rejected": -558.19873046875, "loss": 13301.7094, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19612053036689758, "rewards/margins": 0.5247890949249268, "rewards/rejected": -0.3286685347557068, "step": 5010 }, { "epoch": 25.463291139240507, "grad_norm": 372695.4381119174, "learning_rate": 1.6233155750548415e-07, "logits/chosen": -1.8982555866241455, "logits/rejected": -1.494901180267334, "logps/chosen": -28.403858184814453, "logps/rejected": -562.348388671875, "loss": 13093.6797, "rewards/accuracies": 1.0, "rewards/chosen": 0.203691765666008, "rewards/margins": 0.5354448556900024, "rewards/rejected": -0.33175310492515564, "step": 5020 }, { "epoch": 25.513924050632912, "grad_norm": 291137.30920257524, "learning_rate": 1.6154810404261986e-07, "logits/chosen": -0.2861802577972412, "logits/rejected": -0.4479186534881592, "logps/chosen": -23.825702667236328, "logps/rejected": -559.0096435546875, "loss": 12589.4609, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1913156658411026, "rewards/margins": 0.5378258228302002, "rewards/rejected": -0.346510112285614, "step": 5030 }, { "epoch": 25.564556962025318, "grad_norm": 273297.2570355529, "learning_rate": 1.6076465057975556e-07, "logits/chosen": -2.0077948570251465, "logits/rejected": -1.546903371810913, "logps/chosen": -34.178993225097656, "logps/rejected": -599.1771240234375, "loss": 12277.0906, "rewards/accuracies": 1.0, "rewards/chosen": 0.20620949566364288, "rewards/margins": 0.5666217803955078, "rewards/rejected": -0.36041226983070374, "step": 5040 }, { "epoch": 25.615189873417723, "grad_norm": 287331.7702661688, "learning_rate": 1.5998119711689127e-07, "logits/chosen": -0.9829635620117188, "logits/rejected": -0.3811960220336914, "logps/chosen": -32.14269256591797, "logps/rejected": -580.4415283203125, "loss": 12507.3219, "rewards/accuracies": 1.0, "rewards/chosen": 0.20907440781593323, "rewards/margins": 0.5523373484611511, "rewards/rejected": -0.3432629406452179, "step": 5050 }, { "epoch": 25.665822784810125, "grad_norm": 896554.0294317787, "learning_rate": 1.5919774365402695e-07, "logits/chosen": -1.3259598016738892, "logits/rejected": -0.9525947570800781, "logps/chosen": -25.666656494140625, "logps/rejected": -573.1832885742188, "loss": 12955.9469, "rewards/accuracies": 1.0, "rewards/chosen": 0.19553272426128387, "rewards/margins": 0.5421277284622192, "rewards/rejected": -0.3465949594974518, "step": 5060 }, { "epoch": 25.71645569620253, "grad_norm": 360559.08966435614, "learning_rate": 1.5841429019116266e-07, "logits/chosen": -2.50518536567688, "logits/rejected": -2.6326870918273926, "logps/chosen": -40.73974609375, "logps/rejected": -598.9993896484375, "loss": 13192.7609, "rewards/accuracies": 1.0, "rewards/chosen": 0.2110958993434906, "rewards/margins": 0.559829592704773, "rewards/rejected": -0.34873366355895996, "step": 5070 }, { "epoch": 25.767088607594935, "grad_norm": 354200.8480985467, "learning_rate": 1.5763083672829833e-07, "logits/chosen": 0.24985246360301971, "logits/rejected": 0.11640717834234238, "logps/chosen": -30.384597778320312, "logps/rejected": -595.9378662109375, "loss": 13357.8156, "rewards/accuracies": 1.0, "rewards/chosen": 0.20246371626853943, "rewards/margins": 0.5666370391845703, "rewards/rejected": -0.3641732633113861, "step": 5080 }, { "epoch": 25.81772151898734, "grad_norm": 419630.4907858681, "learning_rate": 1.5684738326543404e-07, "logits/chosen": -2.382422924041748, "logits/rejected": -1.6780860424041748, "logps/chosen": -32.89704132080078, "logps/rejected": -596.2845458984375, "loss": 13075.125, "rewards/accuracies": 1.0, "rewards/chosen": 0.20516617596149445, "rewards/margins": 0.5647061467170715, "rewards/rejected": -0.3595399558544159, "step": 5090 }, { "epoch": 25.868354430379746, "grad_norm": 239893.19190802056, "learning_rate": 1.5606392980256972e-07, "logits/chosen": -1.5904518365859985, "logits/rejected": -1.162544846534729, "logps/chosen": -29.703998565673828, "logps/rejected": -562.6304321289062, "loss": 12907.7898, "rewards/accuracies": 1.0, "rewards/chosen": 0.1967582404613495, "rewards/margins": 0.5360093116760254, "rewards/rejected": -0.3392511010169983, "step": 5100 }, { "epoch": 25.91898734177215, "grad_norm": 2769163.91672907, "learning_rate": 1.5528047633970543e-07, "logits/chosen": -0.4542008936405182, "logits/rejected": 0.3750479519367218, "logps/chosen": -40.263450622558594, "logps/rejected": -569.8021240234375, "loss": 12356.1203, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19747862219810486, "rewards/margins": 0.5331242680549622, "rewards/rejected": -0.3356456160545349, "step": 5110 }, { "epoch": 25.969620253164557, "grad_norm": 414959.45582905615, "learning_rate": 1.544970228768411e-07, "logits/chosen": -2.780273914337158, "logits/rejected": -2.477725028991699, "logps/chosen": -34.733909606933594, "logps/rejected": -598.5794677734375, "loss": 12866.1969, "rewards/accuracies": 1.0, "rewards/chosen": 0.22083155810832977, "rewards/margins": 0.5664650797843933, "rewards/rejected": -0.3456335663795471, "step": 5120 }, { "epoch": 26.020253164556962, "grad_norm": 437459.9001544906, "learning_rate": 1.537135694139768e-07, "logits/chosen": -1.4238073825836182, "logits/rejected": -1.5467934608459473, "logps/chosen": -32.6416015625, "logps/rejected": -585.3292236328125, "loss": 12902.432, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2033694088459015, "rewards/margins": 0.5521097779273987, "rewards/rejected": -0.3487403094768524, "step": 5130 }, { "epoch": 26.070886075949367, "grad_norm": 461726.55326627713, "learning_rate": 1.529301159511125e-07, "logits/chosen": -1.0017569065093994, "logits/rejected": -0.677699089050293, "logps/chosen": -33.68021011352539, "logps/rejected": -586.3854370117188, "loss": 12206.9266, "rewards/accuracies": 1.0, "rewards/chosen": 0.20325596630573273, "rewards/margins": 0.5559948682785034, "rewards/rejected": -0.35273900628089905, "step": 5140 }, { "epoch": 26.121518987341773, "grad_norm": 223445.63437535468, "learning_rate": 1.521466624882482e-07, "logits/chosen": -1.4141124486923218, "logits/rejected": -0.6017986536026001, "logps/chosen": -29.84651756286621, "logps/rejected": -580.4603271484375, "loss": 12104.9586, "rewards/accuracies": 1.0, "rewards/chosen": 0.20142440497875214, "rewards/margins": 0.5523154139518738, "rewards/rejected": -0.35089102387428284, "step": 5150 }, { "epoch": 26.172151898734178, "grad_norm": 232119.6879833388, "learning_rate": 1.5136320902538388e-07, "logits/chosen": -0.7647647857666016, "logits/rejected": -0.6229702830314636, "logps/chosen": -34.456138610839844, "logps/rejected": -574.6653442382812, "loss": 12524.6094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2094695270061493, "rewards/margins": 0.539394199848175, "rewards/rejected": -0.32992464303970337, "step": 5160 }, { "epoch": 26.222784810126583, "grad_norm": 478076.71264027077, "learning_rate": 1.5057975556251958e-07, "logits/chosen": -2.230821132659912, "logits/rejected": -2.297372579574585, "logps/chosen": -29.98971176147461, "logps/rejected": -588.6803588867188, "loss": 12188.2758, "rewards/accuracies": 1.0, "rewards/chosen": 0.21123230457305908, "rewards/margins": 0.5585904121398926, "rewards/rejected": -0.3473580479621887, "step": 5170 }, { "epoch": 26.27341772151899, "grad_norm": 287477.38205394626, "learning_rate": 1.4979630209965526e-07, "logits/chosen": 0.2648393511772156, "logits/rejected": 1.2140284776687622, "logps/chosen": -26.058353424072266, "logps/rejected": -575.137939453125, "loss": 13004.7297, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20219556987285614, "rewards/margins": 0.5560811758041382, "rewards/rejected": -0.35388559103012085, "step": 5180 }, { "epoch": 26.324050632911394, "grad_norm": 448400.95809014083, "learning_rate": 1.4901284863679097e-07, "logits/chosen": -0.47244685888290405, "logits/rejected": 0.34987983107566833, "logps/chosen": -47.41560745239258, "logps/rejected": -588.7420654296875, "loss": 12302.9594, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20305314660072327, "rewards/margins": 0.5488015413284302, "rewards/rejected": -0.3457483947277069, "step": 5190 }, { "epoch": 26.374683544303796, "grad_norm": 290914.6200870196, "learning_rate": 1.4822939517392665e-07, "logits/chosen": -1.5243618488311768, "logits/rejected": -0.6017967462539673, "logps/chosen": -33.99588394165039, "logps/rejected": -590.6222534179688, "loss": 12878.768, "rewards/accuracies": 1.0, "rewards/chosen": 0.20388083159923553, "rewards/margins": 0.5592610836029053, "rewards/rejected": -0.35538023710250854, "step": 5200 }, { "epoch": 26.4253164556962, "grad_norm": 715122.6528862711, "learning_rate": 1.4744594171106235e-07, "logits/chosen": -1.3308216333389282, "logits/rejected": -1.0356947183609009, "logps/chosen": -29.031147003173828, "logps/rejected": -595.821533203125, "loss": 12466.6016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.212965726852417, "rewards/margins": 0.5688080191612244, "rewards/rejected": -0.3558422923088074, "step": 5210 }, { "epoch": 26.475949367088607, "grad_norm": 266006.16874610144, "learning_rate": 1.4666248824819803e-07, "logits/chosen": -0.2248738706111908, "logits/rejected": 0.37806427478790283, "logps/chosen": -35.31621551513672, "logps/rejected": -578.3462524414062, "loss": 12517.1375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20571331679821014, "rewards/margins": 0.5493656396865845, "rewards/rejected": -0.3436523675918579, "step": 5220 }, { "epoch": 26.526582278481012, "grad_norm": 296131.0633758982, "learning_rate": 1.4587903478533377e-07, "logits/chosen": -3.198024272918701, "logits/rejected": -2.1562371253967285, "logps/chosen": -24.365009307861328, "logps/rejected": -589.4319458007812, "loss": 12258.343, "rewards/accuracies": 1.0, "rewards/chosen": 0.2150738686323166, "rewards/margins": 0.5671111345291138, "rewards/rejected": -0.352037250995636, "step": 5230 }, { "epoch": 26.577215189873417, "grad_norm": 310894.2430575026, "learning_rate": 1.4509558132246945e-07, "logits/chosen": 1.5686824321746826, "logits/rejected": 1.7765287160873413, "logps/chosen": -25.171403884887695, "logps/rejected": -559.4937744140625, "loss": 13451.6969, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18740372359752655, "rewards/margins": 0.5345771312713623, "rewards/rejected": -0.3471735119819641, "step": 5240 }, { "epoch": 26.627848101265823, "grad_norm": 273385.34239455353, "learning_rate": 1.4431212785960515e-07, "logits/chosen": 0.779743492603302, "logits/rejected": 0.5761479139328003, "logps/chosen": -24.774072647094727, "logps/rejected": -552.44775390625, "loss": 13444.4844, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1935535967350006, "rewards/margins": 0.5272840857505798, "rewards/rejected": -0.3337305188179016, "step": 5250 }, { "epoch": 26.678481012658228, "grad_norm": 292701.1225306038, "learning_rate": 1.4352867439674083e-07, "logits/chosen": -2.055417060852051, "logits/rejected": -1.5558016300201416, "logps/chosen": -34.77043151855469, "logps/rejected": -578.9781494140625, "loss": 12698.0, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20298035442829132, "rewards/margins": 0.543838381767273, "rewards/rejected": -0.3408580422401428, "step": 5260 }, { "epoch": 26.729113924050633, "grad_norm": 274251.20733361214, "learning_rate": 1.4274522093387654e-07, "logits/chosen": -0.7560523152351379, "logits/rejected": -0.4179345667362213, "logps/chosen": -35.23884201049805, "logps/rejected": -578.6085815429688, "loss": 12311.3711, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19854024052619934, "rewards/margins": 0.5389177799224854, "rewards/rejected": -0.3403775095939636, "step": 5270 }, { "epoch": 26.77974683544304, "grad_norm": 540941.0207588519, "learning_rate": 1.4196176747101222e-07, "logits/chosen": -2.318772792816162, "logits/rejected": -2.123133420944214, "logps/chosen": -32.09846878051758, "logps/rejected": -575.356201171875, "loss": 12401.8453, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2020426243543625, "rewards/margins": 0.5411572456359863, "rewards/rejected": -0.33911454677581787, "step": 5280 }, { "epoch": 26.830379746835444, "grad_norm": 441696.9404493494, "learning_rate": 1.4117831400814792e-07, "logits/chosen": -2.1685147285461426, "logits/rejected": -1.5242393016815186, "logps/chosen": -22.024688720703125, "logps/rejected": -543.53076171875, "loss": 13786.8516, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1891135424375534, "rewards/margins": 0.523938775062561, "rewards/rejected": -0.33482515811920166, "step": 5290 }, { "epoch": 26.88101265822785, "grad_norm": 328168.9416709712, "learning_rate": 1.403948605452836e-07, "logits/chosen": -2.390831708908081, "logits/rejected": -1.6773532629013062, "logps/chosen": -37.75607681274414, "logps/rejected": -572.0828247070312, "loss": 13110.5859, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20247995853424072, "rewards/margins": 0.539789617061615, "rewards/rejected": -0.33730968832969666, "step": 5300 }, { "epoch": 26.931645569620255, "grad_norm": 342604.3694047161, "learning_rate": 1.396114070824193e-07, "logits/chosen": -0.8812211751937866, "logits/rejected": -0.7407415509223938, "logps/chosen": -31.870285034179688, "logps/rejected": -576.1883544921875, "loss": 12753.4875, "rewards/accuracies": 1.0, "rewards/chosen": 0.2047419548034668, "rewards/margins": 0.5459688901901245, "rewards/rejected": -0.3412269353866577, "step": 5310 }, { "epoch": 26.98227848101266, "grad_norm": 327636.2077886267, "learning_rate": 1.38827953619555e-07, "logits/chosen": -1.1729528903961182, "logits/rejected": -0.7522214651107788, "logps/chosen": -41.73408508300781, "logps/rejected": -603.9337158203125, "loss": 11920.0102, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.22109094262123108, "rewards/margins": 0.5666243433952332, "rewards/rejected": -0.3455334007740021, "step": 5320 }, { "epoch": 27.03291139240506, "grad_norm": 306486.1159229183, "learning_rate": 1.380445001566907e-07, "logits/chosen": -0.2942148447036743, "logits/rejected": 0.29008275270462036, "logps/chosen": -28.0673770904541, "logps/rejected": -591.0224609375, "loss": 12393.8094, "rewards/accuracies": 1.0, "rewards/chosen": 0.21129322052001953, "rewards/margins": 0.562667965888977, "rewards/rejected": -0.35137468576431274, "step": 5330 }, { "epoch": 27.083544303797467, "grad_norm": 291301.0379049935, "learning_rate": 1.3726104669382637e-07, "logits/chosen": -0.04897233098745346, "logits/rejected": 0.2625051736831665, "logps/chosen": -28.288782119750977, "logps/rejected": -600.4498291015625, "loss": 12295.5109, "rewards/accuracies": 1.0, "rewards/chosen": 0.20883643627166748, "rewards/margins": 0.5699074864387512, "rewards/rejected": -0.36107105016708374, "step": 5340 }, { "epoch": 27.134177215189872, "grad_norm": 336826.5711799587, "learning_rate": 1.3647759323096208e-07, "logits/chosen": -3.574831008911133, "logits/rejected": -3.1615543365478516, "logps/chosen": -28.667476654052734, "logps/rejected": -610.046630859375, "loss": 12205.2984, "rewards/accuracies": 1.0, "rewards/chosen": 0.21462281048297882, "rewards/margins": 0.5811195373535156, "rewards/rejected": -0.3664968013763428, "step": 5350 }, { "epoch": 27.184810126582278, "grad_norm": 253108.22870561373, "learning_rate": 1.3569413976809776e-07, "logits/chosen": -1.275773048400879, "logits/rejected": -0.2816539406776428, "logps/chosen": -27.488027572631836, "logps/rejected": -576.04443359375, "loss": 12752.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.20755627751350403, "rewards/margins": 0.5584858059883118, "rewards/rejected": -0.35092949867248535, "step": 5360 }, { "epoch": 27.235443037974683, "grad_norm": 378986.1297500305, "learning_rate": 1.3491068630523347e-07, "logits/chosen": -0.7276864051818848, "logits/rejected": -0.2372014820575714, "logps/chosen": -27.652713775634766, "logps/rejected": -574.1866455078125, "loss": 12491.7875, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19472074508666992, "rewards/margins": 0.5458452701568604, "rewards/rejected": -0.35112449526786804, "step": 5370 }, { "epoch": 27.28607594936709, "grad_norm": 355029.2404666128, "learning_rate": 1.3412723284236915e-07, "logits/chosen": 0.03503293916583061, "logits/rejected": 0.09463844448328018, "logps/chosen": -20.01060676574707, "logps/rejected": -571.03369140625, "loss": 12906.6961, "rewards/accuracies": 1.0, "rewards/chosen": 0.1995813399553299, "rewards/margins": 0.5492128133773804, "rewards/rejected": -0.3496314287185669, "step": 5380 }, { "epoch": 27.336708860759494, "grad_norm": 174005.9141855672, "learning_rate": 1.3334377937950485e-07, "logits/chosen": -1.0307856798171997, "logits/rejected": -0.8787088394165039, "logps/chosen": -27.538768768310547, "logps/rejected": -584.1838989257812, "loss": 12431.2086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20768491923809052, "rewards/margins": 0.558754026889801, "rewards/rejected": -0.3510691225528717, "step": 5390 }, { "epoch": 27.3873417721519, "grad_norm": 333107.0988957162, "learning_rate": 1.3256032591664053e-07, "logits/chosen": 0.49966010451316833, "logits/rejected": 1.4367059469223022, "logps/chosen": -22.20120620727539, "logps/rejected": -573.7286987304688, "loss": 12624.7586, "rewards/accuracies": 1.0, "rewards/chosen": 0.19961531460285187, "rewards/margins": 0.5520228743553162, "rewards/rejected": -0.35240763425827026, "step": 5400 }, { "epoch": 27.437974683544304, "grad_norm": 189125.20245582235, "learning_rate": 1.3177687245377624e-07, "logits/chosen": -0.491058886051178, "logits/rejected": -0.4180983603000641, "logps/chosen": -24.668697357177734, "logps/rejected": -574.7880249023438, "loss": 12818.4906, "rewards/accuracies": 1.0, "rewards/chosen": 0.203078955411911, "rewards/margins": 0.5475345849990845, "rewards/rejected": -0.34445568919181824, "step": 5410 }, { "epoch": 27.48860759493671, "grad_norm": 255453.1741505276, "learning_rate": 1.3099341899091192e-07, "logits/chosen": -1.3983430862426758, "logits/rejected": -1.0761035680770874, "logps/chosen": -28.14908790588379, "logps/rejected": -567.4022216796875, "loss": 12266.7156, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19684790074825287, "rewards/margins": 0.5404728055000305, "rewards/rejected": -0.34362491965293884, "step": 5420 }, { "epoch": 27.539240506329115, "grad_norm": 199249.17490991156, "learning_rate": 1.3020996552804765e-07, "logits/chosen": -1.3831968307495117, "logits/rejected": -0.9957733154296875, "logps/chosen": -34.38856887817383, "logps/rejected": -583.5364379882812, "loss": 12353.943, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20319747924804688, "rewards/margins": 0.5525364875793457, "rewards/rejected": -0.34933900833129883, "step": 5430 }, { "epoch": 27.58987341772152, "grad_norm": 372801.7448533588, "learning_rate": 1.2942651206518333e-07, "logits/chosen": 0.7253493070602417, "logits/rejected": 0.6416251063346863, "logps/chosen": -36.44821548461914, "logps/rejected": -557.3819580078125, "loss": 12762.9742, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19811172783374786, "rewards/margins": 0.5241624116897583, "rewards/rejected": -0.32605066895484924, "step": 5440 }, { "epoch": 27.640506329113926, "grad_norm": 250437.30987597498, "learning_rate": 1.2864305860231904e-07, "logits/chosen": -1.6367158889770508, "logits/rejected": -0.9662375450134277, "logps/chosen": -32.858455657958984, "logps/rejected": -566.0185546875, "loss": 13013.8914, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20383331179618835, "rewards/margins": 0.5383815169334412, "rewards/rejected": -0.3345482349395752, "step": 5450 }, { "epoch": 27.691139240506327, "grad_norm": 395640.3468149828, "learning_rate": 1.2785960513945471e-07, "logits/chosen": -1.0696049928665161, "logits/rejected": -0.7029746770858765, "logps/chosen": -27.334697723388672, "logps/rejected": -572.7042846679688, "loss": 12608.9328, "rewards/accuracies": 1.0, "rewards/chosen": 0.2039167582988739, "rewards/margins": 0.5491318106651306, "rewards/rejected": -0.3452150225639343, "step": 5460 }, { "epoch": 27.741772151898733, "grad_norm": 737045.8738711793, "learning_rate": 1.2707615167659042e-07, "logits/chosen": -1.4398880004882812, "logits/rejected": -0.3085852265357971, "logps/chosen": -20.763835906982422, "logps/rejected": -557.7874755859375, "loss": 12662.9484, "rewards/accuracies": 1.0, "rewards/chosen": 0.20201142132282257, "rewards/margins": 0.5384231209754944, "rewards/rejected": -0.336411714553833, "step": 5470 }, { "epoch": 27.792405063291138, "grad_norm": 286929.61277431983, "learning_rate": 1.262926982137261e-07, "logits/chosen": -0.6262455582618713, "logits/rejected": -0.4802684783935547, "logps/chosen": -23.99846076965332, "logps/rejected": -575.8038940429688, "loss": 12141.5641, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19609448313713074, "rewards/margins": 0.551897406578064, "rewards/rejected": -0.355802983045578, "step": 5480 }, { "epoch": 27.843037974683543, "grad_norm": 749583.0814867924, "learning_rate": 1.255092447508618e-07, "logits/chosen": -1.7006927728652954, "logits/rejected": -1.0466101169586182, "logps/chosen": -29.710596084594727, "logps/rejected": -591.6888427734375, "loss": 12764.4375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20634475350379944, "rewards/margins": 0.5570266842842102, "rewards/rejected": -0.3506819009780884, "step": 5490 }, { "epoch": 27.89367088607595, "grad_norm": 380933.42642122327, "learning_rate": 1.2472579128799749e-07, "logits/chosen": -1.4751110076904297, "logits/rejected": -0.9937122464179993, "logps/chosen": -34.70015335083008, "logps/rejected": -571.9727172851562, "loss": 12012.7281, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2022305727005005, "rewards/margins": 0.5379850268363953, "rewards/rejected": -0.3357544541358948, "step": 5500 }, { "epoch": 27.944303797468354, "grad_norm": 258509.47313842815, "learning_rate": 1.2394233782513317e-07, "logits/chosen": -1.372650384902954, "logits/rejected": -1.0075037479400635, "logps/chosen": -35.64197540283203, "logps/rejected": -593.0923461914062, "loss": 11889.8898, "rewards/accuracies": 1.0, "rewards/chosen": 0.2117808610200882, "rewards/margins": 0.5591001510620117, "rewards/rejected": -0.34731921553611755, "step": 5510 }, { "epoch": 27.99493670886076, "grad_norm": 208938.2840249938, "learning_rate": 1.2315888436226887e-07, "logits/chosen": -2.1264805793762207, "logits/rejected": -1.4703245162963867, "logps/chosen": -32.981266021728516, "logps/rejected": -597.6434936523438, "loss": 12257.6922, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21187356114387512, "rewards/margins": 0.5627579689025879, "rewards/rejected": -0.3508843779563904, "step": 5520 }, { "epoch": 28.045569620253165, "grad_norm": 221196.22582529782, "learning_rate": 1.2237543089940458e-07, "logits/chosen": -2.0594754219055176, "logits/rejected": -0.8701013326644897, "logps/chosen": -27.682659149169922, "logps/rejected": -595.9862060546875, "loss": 12458.1609, "rewards/accuracies": 1.0, "rewards/chosen": 0.21054939925670624, "rewards/margins": 0.5732256174087524, "rewards/rejected": -0.362676203250885, "step": 5530 }, { "epoch": 28.09620253164557, "grad_norm": 270569.55775822, "learning_rate": 1.2159197743654026e-07, "logits/chosen": -1.7393264770507812, "logits/rejected": -1.1281194686889648, "logps/chosen": -24.0075626373291, "logps/rejected": -583.3763427734375, "loss": 12198.0859, "rewards/accuracies": 1.0, "rewards/chosen": 0.20713207125663757, "rewards/margins": 0.5608252286911011, "rewards/rejected": -0.35369327664375305, "step": 5540 }, { "epoch": 28.146835443037975, "grad_norm": 294202.9420634267, "learning_rate": 1.2080852397367596e-07, "logits/chosen": -1.1881496906280518, "logits/rejected": -1.9830278158187866, "logps/chosen": -27.52215003967285, "logps/rejected": -571.5521240234375, "loss": 12035.3297, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.198753222823143, "rewards/margins": 0.5402897596359253, "rewards/rejected": -0.34153658151626587, "step": 5550 }, { "epoch": 28.19746835443038, "grad_norm": 250256.82251298483, "learning_rate": 1.2002507051081164e-07, "logits/chosen": -2.069603443145752, "logits/rejected": -1.1806148290634155, "logps/chosen": -32.80065155029297, "logps/rejected": -584.74169921875, "loss": 12394.9164, "rewards/accuracies": 1.0, "rewards/chosen": 0.21380552649497986, "rewards/margins": 0.5591468811035156, "rewards/rejected": -0.345341295003891, "step": 5560 }, { "epoch": 28.248101265822786, "grad_norm": 295332.47087175207, "learning_rate": 1.1924161704794735e-07, "logits/chosen": -1.0471255779266357, "logits/rejected": -0.4857943654060364, "logps/chosen": -21.058979034423828, "logps/rejected": -572.4304809570312, "loss": 12514.1922, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1976846158504486, "rewards/margins": 0.548615574836731, "rewards/rejected": -0.35093095898628235, "step": 5570 }, { "epoch": 28.29873417721519, "grad_norm": 206143.0104728106, "learning_rate": 1.1845816358508304e-07, "logits/chosen": -2.556028366088867, "logits/rejected": -1.9551620483398438, "logps/chosen": -35.4798698425293, "logps/rejected": -586.085693359375, "loss": 12969.0906, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20620682835578918, "rewards/margins": 0.5480517148971558, "rewards/rejected": -0.34184494614601135, "step": 5580 }, { "epoch": 28.349367088607593, "grad_norm": 264961.57508088043, "learning_rate": 1.1767471012221873e-07, "logits/chosen": -1.3203023672103882, "logits/rejected": -0.41819173097610474, "logps/chosen": -35.34379959106445, "logps/rejected": -582.7780151367188, "loss": 11659.4484, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21271836757659912, "rewards/margins": 0.5540838837623596, "rewards/rejected": -0.3413654863834381, "step": 5590 }, { "epoch": 28.4, "grad_norm": 475056.8558156038, "learning_rate": 1.1689125665935443e-07, "logits/chosen": 0.2806483507156372, "logits/rejected": 0.8025129437446594, "logps/chosen": -32.23934555053711, "logps/rejected": -571.2871704101562, "loss": 12715.6078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20215098559856415, "rewards/margins": 0.5371149778366089, "rewards/rejected": -0.33496397733688354, "step": 5600 }, { "epoch": 28.450632911392404, "grad_norm": 197134.08463352287, "learning_rate": 1.1610780319649012e-07, "logits/chosen": -0.42449599504470825, "logits/rejected": 0.20755800604820251, "logps/chosen": -30.612218856811523, "logps/rejected": -578.2881469726562, "loss": 12352.6281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2100987732410431, "rewards/margins": 0.5525861978530884, "rewards/rejected": -0.34248748421669006, "step": 5610 }, { "epoch": 28.50126582278481, "grad_norm": 345771.22083863505, "learning_rate": 1.1532434973362581e-07, "logits/chosen": -1.5660457611083984, "logits/rejected": -0.7327693700790405, "logps/chosen": -22.58321762084961, "logps/rejected": -566.7658081054688, "loss": 11851.4547, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19859129190444946, "rewards/margins": 0.5416545271873474, "rewards/rejected": -0.3430632948875427, "step": 5620 }, { "epoch": 28.551898734177215, "grad_norm": 168585.12783774585, "learning_rate": 1.145408962707615e-07, "logits/chosen": -0.2972283363342285, "logits/rejected": -0.3674158453941345, "logps/chosen": -29.131546020507812, "logps/rejected": -597.9761962890625, "loss": 11512.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.21312180161476135, "rewards/margins": 0.5691269040107727, "rewards/rejected": -0.35600510239601135, "step": 5630 }, { "epoch": 28.60253164556962, "grad_norm": 208909.02036407648, "learning_rate": 1.137574428078972e-07, "logits/chosen": -1.0681560039520264, "logits/rejected": -0.39094457030296326, "logps/chosen": -36.012596130371094, "logps/rejected": -586.0516357421875, "loss": 12808.1297, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20493356883525848, "rewards/margins": 0.5560083389282227, "rewards/rejected": -0.3510746955871582, "step": 5640 }, { "epoch": 28.653164556962025, "grad_norm": 311846.42372199957, "learning_rate": 1.1297398934503289e-07, "logits/chosen": 0.5105953216552734, "logits/rejected": 1.009169101715088, "logps/chosen": -27.007156372070312, "logps/rejected": -599.7515258789062, "loss": 11991.1812, "rewards/accuracies": 1.0, "rewards/chosen": 0.21386167407035828, "rewards/margins": 0.572094202041626, "rewards/rejected": -0.3582325577735901, "step": 5650 }, { "epoch": 28.70379746835443, "grad_norm": 268717.3291778612, "learning_rate": 1.1219053588216858e-07, "logits/chosen": -0.8255645036697388, "logits/rejected": -0.8527682423591614, "logps/chosen": -21.579692840576172, "logps/rejected": -585.7736206054688, "loss": 12543.1672, "rewards/accuracies": 1.0, "rewards/chosen": 0.2072766274213791, "rewards/margins": 0.5643941760063171, "rewards/rejected": -0.35711759328842163, "step": 5660 }, { "epoch": 28.754430379746836, "grad_norm": 251846.43966430755, "learning_rate": 1.1140708241930429e-07, "logits/chosen": -1.6031732559204102, "logits/rejected": -0.6178330779075623, "logps/chosen": -30.777385711669922, "logps/rejected": -578.4786376953125, "loss": 12538.3711, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20804066956043243, "rewards/margins": 0.5539838075637817, "rewards/rejected": -0.3459431827068329, "step": 5670 }, { "epoch": 28.80506329113924, "grad_norm": 362284.87054599967, "learning_rate": 1.1062362895643998e-07, "logits/chosen": -0.6538245677947998, "logits/rejected": -0.3702305555343628, "logps/chosen": -26.261932373046875, "logps/rejected": -577.5454711914062, "loss": 13020.9297, "rewards/accuracies": 1.0, "rewards/chosen": 0.20354709029197693, "rewards/margins": 0.5529359579086304, "rewards/rejected": -0.34938886761665344, "step": 5680 }, { "epoch": 28.855696202531647, "grad_norm": 205761.31564795828, "learning_rate": 1.0984017549357568e-07, "logits/chosen": -2.4821999073028564, "logits/rejected": -2.7491514682769775, "logps/chosen": -33.985538482666016, "logps/rejected": -588.4347534179688, "loss": 12498.5609, "rewards/accuracies": 1.0, "rewards/chosen": 0.2070481777191162, "rewards/margins": 0.5542899370193481, "rewards/rejected": -0.34724172949790955, "step": 5690 }, { "epoch": 28.906329113924052, "grad_norm": 269249.39255954395, "learning_rate": 1.0905672203071137e-07, "logits/chosen": 0.24643035233020782, "logits/rejected": 0.39688020944595337, "logps/chosen": -22.365093231201172, "logps/rejected": -572.4009399414062, "loss": 12145.2008, "rewards/accuracies": 1.0, "rewards/chosen": 0.20018497109413147, "rewards/margins": 0.5481060147285461, "rewards/rejected": -0.3479210138320923, "step": 5700 }, { "epoch": 28.956962025316457, "grad_norm": 274939.7399900873, "learning_rate": 1.0827326856784706e-07, "logits/chosen": 0.24328431487083435, "logits/rejected": 0.056040357798337936, "logps/chosen": -27.859899520874023, "logps/rejected": -582.8697509765625, "loss": 12038.6203, "rewards/accuracies": 1.0, "rewards/chosen": 0.20646706223487854, "rewards/margins": 0.5581387281417847, "rewards/rejected": -0.35167163610458374, "step": 5710 }, { "epoch": 29.00759493670886, "grad_norm": 177867.60188083298, "learning_rate": 1.0748981510498275e-07, "logits/chosen": -2.0902795791625977, "logits/rejected": -1.2426658868789673, "logps/chosen": -25.984241485595703, "logps/rejected": -595.5320434570312, "loss": 12101.3188, "rewards/accuracies": 1.0, "rewards/chosen": 0.21571488678455353, "rewards/margins": 0.5755189061164856, "rewards/rejected": -0.3598039150238037, "step": 5720 }, { "epoch": 29.058227848101264, "grad_norm": 175055.77768040166, "learning_rate": 1.0670636164211845e-07, "logits/chosen": -3.0874876976013184, "logits/rejected": -1.9259151220321655, "logps/chosen": -30.317163467407227, "logps/rejected": -582.4378662109375, "loss": 12058.957, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20789256691932678, "rewards/margins": 0.5540691018104553, "rewards/rejected": -0.3461765944957733, "step": 5730 }, { "epoch": 29.10886075949367, "grad_norm": 330095.71026448795, "learning_rate": 1.0592290817925414e-07, "logits/chosen": -0.40818461775779724, "logits/rejected": -0.17450471222400665, "logps/chosen": -37.967308044433594, "logps/rejected": -574.5567626953125, "loss": 12163.9234, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.199508398771286, "rewards/margins": 0.5361432433128357, "rewards/rejected": -0.3366348147392273, "step": 5740 }, { "epoch": 29.159493670886075, "grad_norm": 207868.2185307626, "learning_rate": 1.0513945471638983e-07, "logits/chosen": -1.1228978633880615, "logits/rejected": -0.8512986302375793, "logps/chosen": -36.19347381591797, "logps/rejected": -572.5546264648438, "loss": 12217.475, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20709916949272156, "rewards/margins": 0.5452824234962463, "rewards/rejected": -0.3381832540035248, "step": 5750 }, { "epoch": 29.21012658227848, "grad_norm": 180300.955366917, "learning_rate": 1.0435600125352554e-07, "logits/chosen": -2.1935715675354004, "logits/rejected": -1.450584888458252, "logps/chosen": -41.38114547729492, "logps/rejected": -551.9308471679688, "loss": 11531.2219, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.19797861576080322, "rewards/margins": 0.5134168267250061, "rewards/rejected": -0.3154382109642029, "step": 5760 }, { "epoch": 29.260759493670886, "grad_norm": 230065.76491246693, "learning_rate": 1.0357254779066123e-07, "logits/chosen": -2.1162705421447754, "logits/rejected": -1.343379020690918, "logps/chosen": -26.30475425720215, "logps/rejected": -584.0765380859375, "loss": 12178.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.21349939703941345, "rewards/margins": 0.5615987181663513, "rewards/rejected": -0.34809932112693787, "step": 5770 }, { "epoch": 29.31139240506329, "grad_norm": 150891.5620522627, "learning_rate": 1.0278909432779692e-07, "logits/chosen": -0.6437171101570129, "logits/rejected": -0.06186608225107193, "logps/chosen": -32.27136993408203, "logps/rejected": -575.0911865234375, "loss": 12350.1367, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.20223280787467957, "rewards/margins": 0.5462868213653564, "rewards/rejected": -0.3440539240837097, "step": 5780 }, { "epoch": 29.362025316455696, "grad_norm": 268215.91577526846, "learning_rate": 1.0200564086493262e-07, "logits/chosen": -2.4000306129455566, "logits/rejected": -1.5239673852920532, "logps/chosen": -44.228759765625, "logps/rejected": -603.037109375, "loss": 11602.7789, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21766121685504913, "rewards/margins": 0.5693429112434387, "rewards/rejected": -0.3516816794872284, "step": 5790 }, { "epoch": 29.4126582278481, "grad_norm": 153754.6030127712, "learning_rate": 1.0122218740206831e-07, "logits/chosen": 1.1010842323303223, "logits/rejected": 1.6098358631134033, "logps/chosen": -25.794830322265625, "logps/rejected": -580.6827392578125, "loss": 12135.457, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20671968162059784, "rewards/margins": 0.5531316995620728, "rewards/rejected": -0.3464120328426361, "step": 5800 }, { "epoch": 29.463291139240507, "grad_norm": 237857.15032498536, "learning_rate": 1.00438733939204e-07, "logits/chosen": -2.2038140296936035, "logits/rejected": -1.9258426427841187, "logps/chosen": -24.270652770996094, "logps/rejected": -592.76806640625, "loss": 12368.1, "rewards/accuracies": 1.0, "rewards/chosen": 0.20859424769878387, "rewards/margins": 0.5708917379379272, "rewards/rejected": -0.3622974455356598, "step": 5810 }, { "epoch": 29.513924050632912, "grad_norm": 229363.27347544604, "learning_rate": 9.96552804763397e-08, "logits/chosen": -1.733412742614746, "logits/rejected": -1.8426891565322876, "logps/chosen": -27.749902725219727, "logps/rejected": -591.9719848632812, "loss": 12434.6094, "rewards/accuracies": 1.0, "rewards/chosen": 0.20853643119335175, "rewards/margins": 0.559594452381134, "rewards/rejected": -0.3510579764842987, "step": 5820 }, { "epoch": 29.564556962025318, "grad_norm": 204423.82729459935, "learning_rate": 9.887182701347539e-08, "logits/chosen": -0.8372312784194946, "logits/rejected": -0.9436752200126648, "logps/chosen": -23.713529586791992, "logps/rejected": -551.91748046875, "loss": 12191.0797, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1975078582763672, "rewards/margins": 0.5233575105667114, "rewards/rejected": -0.32584962248802185, "step": 5830 }, { "epoch": 29.615189873417723, "grad_norm": 196500.42803475718, "learning_rate": 9.808837355061108e-08, "logits/chosen": -0.07084647566080093, "logits/rejected": 0.9050701856613159, "logps/chosen": -29.59817886352539, "logps/rejected": -567.6174926757812, "loss": 12194.2234, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20393919944763184, "rewards/margins": 0.5439929366111755, "rewards/rejected": -0.3400537371635437, "step": 5840 }, { "epoch": 29.665822784810125, "grad_norm": 226455.28104673527, "learning_rate": 9.730492008774677e-08, "logits/chosen": -3.320272445678711, "logits/rejected": -3.3560733795166016, "logps/chosen": -28.402095794677734, "logps/rejected": -602.0023193359375, "loss": 12657.2406, "rewards/accuracies": 1.0, "rewards/chosen": 0.2187713086605072, "rewards/margins": 0.5724385976791382, "rewards/rejected": -0.35366731882095337, "step": 5850 }, { "epoch": 29.71645569620253, "grad_norm": 162035.60177504522, "learning_rate": 9.652146662488248e-08, "logits/chosen": -1.8201286792755127, "logits/rejected": -1.7938740253448486, "logps/chosen": -35.96394348144531, "logps/rejected": -611.4141845703125, "loss": 12011.9406, "rewards/accuracies": 1.0, "rewards/chosen": 0.21413405239582062, "rewards/margins": 0.5712839365005493, "rewards/rejected": -0.3571499288082123, "step": 5860 }, { "epoch": 29.767088607594935, "grad_norm": 162090.09030278528, "learning_rate": 9.573801316201817e-08, "logits/chosen": -0.6652274131774902, "logits/rejected": -0.600281834602356, "logps/chosen": -24.422576904296875, "logps/rejected": -566.0366821289062, "loss": 12593.6359, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1961481273174286, "rewards/margins": 0.5393214821815491, "rewards/rejected": -0.3431733250617981, "step": 5870 }, { "epoch": 29.81772151898734, "grad_norm": 365229.93961962714, "learning_rate": 9.495455969915387e-08, "logits/chosen": -2.613847017288208, "logits/rejected": -2.108478546142578, "logps/chosen": -29.573253631591797, "logps/rejected": -577.60546875, "loss": 12424.4891, "rewards/accuracies": 1.0, "rewards/chosen": 0.20539173483848572, "rewards/margins": 0.5470829010009766, "rewards/rejected": -0.34169113636016846, "step": 5880 }, { "epoch": 29.868354430379746, "grad_norm": 173325.82955161307, "learning_rate": 9.417110623628956e-08, "logits/chosen": -1.4006824493408203, "logits/rejected": -0.5856371521949768, "logps/chosen": -27.345510482788086, "logps/rejected": -584.8424072265625, "loss": 12358.3133, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2079104632139206, "rewards/margins": 0.5603929758071899, "rewards/rejected": -0.35248249769210815, "step": 5890 }, { "epoch": 29.91898734177215, "grad_norm": 287432.0969704827, "learning_rate": 9.338765277342525e-08, "logits/chosen": -0.21508927643299103, "logits/rejected": -0.1394989937543869, "logps/chosen": -30.839313507080078, "logps/rejected": -594.2600708007812, "loss": 11980.4219, "rewards/accuracies": 1.0, "rewards/chosen": 0.21064691245555878, "rewards/margins": 0.5655493140220642, "rewards/rejected": -0.354902446269989, "step": 5900 }, { "epoch": 29.969620253164557, "grad_norm": 365207.2969153869, "learning_rate": 9.260419931056094e-08, "logits/chosen": -0.40759915113449097, "logits/rejected": 0.3133270740509033, "logps/chosen": -25.633676528930664, "logps/rejected": -578.2957763671875, "loss": 12223.2844, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20843228697776794, "rewards/margins": 0.5534237027168274, "rewards/rejected": -0.34499144554138184, "step": 5910 }, { "epoch": 30.020253164556962, "grad_norm": 218071.18905642498, "learning_rate": 9.182074584769664e-08, "logits/chosen": -0.20139971375465393, "logits/rejected": 0.6374796628952026, "logps/chosen": -36.04420852661133, "logps/rejected": -585.3655395507812, "loss": 12139.8164, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20962996780872345, "rewards/margins": 0.5547569394111633, "rewards/rejected": -0.3451269865036011, "step": 5920 }, { "epoch": 30.070886075949367, "grad_norm": 199502.22634833233, "learning_rate": 9.103729238483233e-08, "logits/chosen": -0.5093935132026672, "logits/rejected": -0.9036226272583008, "logps/chosen": -32.292423248291016, "logps/rejected": -584.6748046875, "loss": 11463.5555, "rewards/accuracies": 1.0, "rewards/chosen": 0.20751234889030457, "rewards/margins": 0.5532687902450562, "rewards/rejected": -0.345756471157074, "step": 5930 }, { "epoch": 30.121518987341773, "grad_norm": 164683.94241544002, "learning_rate": 9.025383892196802e-08, "logits/chosen": -1.2027417421340942, "logits/rejected": -0.21418258547782898, "logps/chosen": -38.00572967529297, "logps/rejected": -542.5321044921875, "loss": 12248.6938, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.1922551691532135, "rewards/margins": 0.5097079277038574, "rewards/rejected": -0.3174527585506439, "step": 5940 }, { "epoch": 30.172151898734178, "grad_norm": 209885.7696817789, "learning_rate": 8.947038545910373e-08, "logits/chosen": -0.5836046934127808, "logits/rejected": -0.049278389662504196, "logps/chosen": -26.44875144958496, "logps/rejected": -577.2633056640625, "loss": 11882.8156, "rewards/accuracies": 1.0, "rewards/chosen": 0.2028985321521759, "rewards/margins": 0.548152506351471, "rewards/rejected": -0.34525397419929504, "step": 5950 }, { "epoch": 30.222784810126583, "grad_norm": 116064.20956709805, "learning_rate": 8.868693199623942e-08, "logits/chosen": -0.3441212773323059, "logits/rejected": 0.3469446897506714, "logps/chosen": -29.866031646728516, "logps/rejected": -576.8469848632812, "loss": 11899.9906, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20657262206077576, "rewards/margins": 0.5527979731559753, "rewards/rejected": -0.3462253212928772, "step": 5960 }, { "epoch": 30.27341772151899, "grad_norm": 213446.8577722312, "learning_rate": 8.790347853337511e-08, "logits/chosen": -1.195245623588562, "logits/rejected": -1.5595389604568481, "logps/chosen": -26.48971939086914, "logps/rejected": -562.9793090820312, "loss": 12288.4188, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19797027111053467, "rewards/margins": 0.5365854501724243, "rewards/rejected": -0.3386152386665344, "step": 5970 }, { "epoch": 30.324050632911394, "grad_norm": 150392.6550831942, "learning_rate": 8.712002507051081e-08, "logits/chosen": -0.636971116065979, "logits/rejected": -0.8326961398124695, "logps/chosen": -31.82355308532715, "logps/rejected": -572.6309814453125, "loss": 11735.9594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20741339027881622, "rewards/margins": 0.5429075360298157, "rewards/rejected": -0.33549413084983826, "step": 5980 }, { "epoch": 30.374683544303796, "grad_norm": 248873.00017903763, "learning_rate": 8.63365716076465e-08, "logits/chosen": -0.8763412237167358, "logits/rejected": -0.38471752405166626, "logps/chosen": -33.753509521484375, "logps/rejected": -577.4928588867188, "loss": 11981.0336, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20298103988170624, "rewards/margins": 0.5445905923843384, "rewards/rejected": -0.34160953760147095, "step": 5990 }, { "epoch": 30.4253164556962, "grad_norm": 247123.70966936232, "learning_rate": 8.555311814478219e-08, "logits/chosen": -1.4638581275939941, "logits/rejected": -1.6560137271881104, "logps/chosen": -27.02420425415039, "logps/rejected": -579.2672119140625, "loss": 12743.5711, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20767991244792938, "rewards/margins": 0.5485936403274536, "rewards/rejected": -0.3409137427806854, "step": 6000 }, { "epoch": 30.475949367088607, "grad_norm": 152247.200364489, "learning_rate": 8.476966468191789e-08, "logits/chosen": -1.2626516819000244, "logits/rejected": -1.3656198978424072, "logps/chosen": -30.586597442626953, "logps/rejected": -564.9951782226562, "loss": 12138.7328, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19354796409606934, "rewards/margins": 0.5365390181541443, "rewards/rejected": -0.34299105405807495, "step": 6010 }, { "epoch": 30.526582278481012, "grad_norm": 153551.3953399981, "learning_rate": 8.398621121905358e-08, "logits/chosen": -0.8625293970108032, "logits/rejected": -1.6173267364501953, "logps/chosen": -23.908416748046875, "logps/rejected": -591.5709228515625, "loss": 12247.5672, "rewards/accuracies": 1.0, "rewards/chosen": 0.2063622921705246, "rewards/margins": 0.5622067451477051, "rewards/rejected": -0.3558444678783417, "step": 6020 }, { "epoch": 30.577215189873417, "grad_norm": 247558.34356145174, "learning_rate": 8.320275775618927e-08, "logits/chosen": -0.6456829309463501, "logits/rejected": -0.25254157185554504, "logps/chosen": -30.65958023071289, "logps/rejected": -572.8914794921875, "loss": 11907.7406, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20363232493400574, "rewards/margins": 0.5450500845909119, "rewards/rejected": -0.34141772985458374, "step": 6030 }, { "epoch": 30.627848101265823, "grad_norm": 162045.80468301394, "learning_rate": 8.241930429332496e-08, "logits/chosen": 0.5193571448326111, "logits/rejected": 1.0150249004364014, "logps/chosen": -21.961605072021484, "logps/rejected": -586.28369140625, "loss": 11870.2594, "rewards/accuracies": 1.0, "rewards/chosen": 0.20082764327526093, "rewards/margins": 0.5631116032600403, "rewards/rejected": -0.36228394508361816, "step": 6040 }, { "epoch": 30.678481012658228, "grad_norm": 183677.73161043233, "learning_rate": 8.163585083046067e-08, "logits/chosen": -2.2690348625183105, "logits/rejected": -1.8725353479385376, "logps/chosen": -34.20100021362305, "logps/rejected": -571.948486328125, "loss": 11952.7477, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2089938223361969, "rewards/margins": 0.542193591594696, "rewards/rejected": -0.33319979906082153, "step": 6050 }, { "epoch": 30.729113924050633, "grad_norm": 206299.51509471133, "learning_rate": 8.085239736759636e-08, "logits/chosen": -1.8886245489120483, "logits/rejected": -1.428289532661438, "logps/chosen": -33.442771911621094, "logps/rejected": -577.7710571289062, "loss": 12094.3477, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20627331733703613, "rewards/margins": 0.5446707606315613, "rewards/rejected": -0.3383975028991699, "step": 6060 }, { "epoch": 30.77974683544304, "grad_norm": 178455.47052363763, "learning_rate": 8.006894390473206e-08, "logits/chosen": -0.9159374237060547, "logits/rejected": -0.5700797438621521, "logps/chosen": -22.003402709960938, "logps/rejected": -588.1246948242188, "loss": 12967.7109, "rewards/accuracies": 1.0, "rewards/chosen": 0.20624502003192902, "rewards/margins": 0.5602100491523743, "rewards/rejected": -0.35396507382392883, "step": 6070 }, { "epoch": 30.830379746835444, "grad_norm": 188994.549896938, "learning_rate": 7.928549044186775e-08, "logits/chosen": -2.218046188354492, "logits/rejected": -2.298725128173828, "logps/chosen": -36.601036071777344, "logps/rejected": -578.4909057617188, "loss": 11942.907, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20413891971111298, "rewards/margins": 0.5424162149429321, "rewards/rejected": -0.33827728033065796, "step": 6080 }, { "epoch": 30.88101265822785, "grad_norm": 226618.0916543629, "learning_rate": 7.850203697900344e-08, "logits/chosen": -0.8958581686019897, "logits/rejected": -0.3350396454334259, "logps/chosen": -27.914409637451172, "logps/rejected": -584.2742919921875, "loss": 12020.4094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2059321403503418, "rewards/margins": 0.5559757947921753, "rewards/rejected": -0.3500436246395111, "step": 6090 }, { "epoch": 30.931645569620255, "grad_norm": 193720.76624447017, "learning_rate": 7.771858351613913e-08, "logits/chosen": -0.13203875720500946, "logits/rejected": -0.22968029975891113, "logps/chosen": -25.164508819580078, "logps/rejected": -573.2855224609375, "loss": 12096.6344, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1958034336566925, "rewards/margins": 0.5465744137763977, "rewards/rejected": -0.3507709503173828, "step": 6100 }, { "epoch": 30.98227848101266, "grad_norm": 177238.22636435836, "learning_rate": 7.693513005327483e-08, "logits/chosen": -2.0759382247924805, "logits/rejected": -1.4708411693572998, "logps/chosen": -28.522485733032227, "logps/rejected": -573.73681640625, "loss": 12017.775, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20944443345069885, "rewards/margins": 0.5498504042625427, "rewards/rejected": -0.34040600061416626, "step": 6110 }, { "epoch": 31.03291139240506, "grad_norm": 111295.73044950665, "learning_rate": 7.615167659041052e-08, "logits/chosen": -0.7748550772666931, "logits/rejected": -0.973538875579834, "logps/chosen": -31.6827335357666, "logps/rejected": -594.8641357421875, "loss": 11721.4203, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20987281203269958, "rewards/margins": 0.5614258050918579, "rewards/rejected": -0.3515530228614807, "step": 6120 }, { "epoch": 31.083544303797467, "grad_norm": 132943.3056647964, "learning_rate": 7.536822312754621e-08, "logits/chosen": -2.017181396484375, "logits/rejected": -1.8383163213729858, "logps/chosen": -32.51802062988281, "logps/rejected": -609.6942138671875, "loss": 12392.7875, "rewards/accuracies": 1.0, "rewards/chosen": 0.22407253086566925, "rewards/margins": 0.582473874092102, "rewards/rejected": -0.3584012985229492, "step": 6130 }, { "epoch": 31.134177215189872, "grad_norm": 174931.96319021285, "learning_rate": 7.45847696646819e-08, "logits/chosen": -0.5535727143287659, "logits/rejected": 0.6218046545982361, "logps/chosen": -26.1910457611084, "logps/rejected": -551.5840454101562, "loss": 11699.3109, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20296287536621094, "rewards/margins": 0.5298973917961121, "rewards/rejected": -0.3269345760345459, "step": 6140 }, { "epoch": 31.184810126582278, "grad_norm": 168688.32644125135, "learning_rate": 7.380131620181761e-08, "logits/chosen": -1.008988618850708, "logits/rejected": -0.2778696119785309, "logps/chosen": -33.33096694946289, "logps/rejected": -607.976806640625, "loss": 11916.4016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21690383553504944, "rewards/margins": 0.5754967331886292, "rewards/rejected": -0.3585929274559021, "step": 6150 }, { "epoch": 31.235443037974683, "grad_norm": 94661.132576451, "learning_rate": 7.30178627389533e-08, "logits/chosen": -3.0997250080108643, "logits/rejected": -2.1219401359558105, "logps/chosen": -27.209686279296875, "logps/rejected": -589.7662353515625, "loss": 12111.9188, "rewards/accuracies": 1.0, "rewards/chosen": 0.21759450435638428, "rewards/margins": 0.5674911737442017, "rewards/rejected": -0.3498966693878174, "step": 6160 }, { "epoch": 31.28607594936709, "grad_norm": 129537.98682999605, "learning_rate": 7.2234409276089e-08, "logits/chosen": -2.1777210235595703, "logits/rejected": -2.1664652824401855, "logps/chosen": -29.21515464782715, "logps/rejected": -575.5145263671875, "loss": 12396.4562, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20168697834014893, "rewards/margins": 0.5468615293502808, "rewards/rejected": -0.3451746106147766, "step": 6170 }, { "epoch": 31.336708860759494, "grad_norm": 146320.37748909468, "learning_rate": 7.145095581322469e-08, "logits/chosen": -0.37119048833847046, "logits/rejected": -0.12678974866867065, "logps/chosen": -27.464313507080078, "logps/rejected": -583.199462890625, "loss": 12035.1789, "rewards/accuracies": 1.0, "rewards/chosen": 0.20687448978424072, "rewards/margins": 0.5559764504432678, "rewards/rejected": -0.3491020202636719, "step": 6180 }, { "epoch": 31.3873417721519, "grad_norm": 123464.43072965978, "learning_rate": 7.066750235036038e-08, "logits/chosen": -1.114485740661621, "logits/rejected": -0.36546590924263, "logps/chosen": -24.96463394165039, "logps/rejected": -573.1627197265625, "loss": 12102.0078, "rewards/accuracies": 1.0, "rewards/chosen": 0.2025957852602005, "rewards/margins": 0.5483575463294983, "rewards/rejected": -0.3457617163658142, "step": 6190 }, { "epoch": 31.437974683544304, "grad_norm": 182155.23164206932, "learning_rate": 6.988404888749608e-08, "logits/chosen": -1.7520939111709595, "logits/rejected": -1.4854246377944946, "logps/chosen": -29.002777099609375, "logps/rejected": -592.4381713867188, "loss": 11423.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.2187313735485077, "rewards/margins": 0.5657260417938232, "rewards/rejected": -0.34699463844299316, "step": 6200 }, { "epoch": 31.48860759493671, "grad_norm": 148737.16455364344, "learning_rate": 6.910059542463177e-08, "logits/chosen": 0.025389552116394043, "logits/rejected": -0.27969443798065186, "logps/chosen": -17.67035675048828, "logps/rejected": -546.9998168945312, "loss": 11498.325, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19227565824985504, "rewards/margins": 0.5237180590629578, "rewards/rejected": -0.33144229650497437, "step": 6210 }, { "epoch": 31.539240506329115, "grad_norm": 186784.06647045226, "learning_rate": 6.831714196176746e-08, "logits/chosen": -3.0769848823547363, "logits/rejected": -2.87144136428833, "logps/chosen": -25.640066146850586, "logps/rejected": -605.6832885742188, "loss": 11701.2086, "rewards/accuracies": 1.0, "rewards/chosen": 0.21926145255565643, "rewards/margins": 0.5798953771591187, "rewards/rejected": -0.3606340289115906, "step": 6220 }, { "epoch": 31.58987341772152, "grad_norm": 108314.28535819704, "learning_rate": 6.753368849890315e-08, "logits/chosen": -0.5384847521781921, "logits/rejected": -0.6974294781684875, "logps/chosen": -26.830814361572266, "logps/rejected": -587.5255126953125, "loss": 11231.8016, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20746394991874695, "rewards/margins": 0.557998776435852, "rewards/rejected": -0.3505348265171051, "step": 6230 }, { "epoch": 31.640506329113926, "grad_norm": 197387.20948770002, "learning_rate": 6.675023503603886e-08, "logits/chosen": -0.6654781103134155, "logits/rejected": -1.1572941541671753, "logps/chosen": -27.918231964111328, "logps/rejected": -592.8441162109375, "loss": 11850.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.21056847274303436, "rewards/margins": 0.5671868920326233, "rewards/rejected": -0.3566184341907501, "step": 6240 }, { "epoch": 31.691139240506327, "grad_norm": 178129.00858003844, "learning_rate": 6.596678157317455e-08, "logits/chosen": 0.17990253865718842, "logits/rejected": 0.15132752060890198, "logps/chosen": -26.486125946044922, "logps/rejected": -577.5296020507812, "loss": 12025.9992, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1961621642112732, "rewards/margins": 0.5460348725318909, "rewards/rejected": -0.34987273812294006, "step": 6250 }, { "epoch": 31.741772151898733, "grad_norm": 113204.1607298857, "learning_rate": 6.518332811031025e-08, "logits/chosen": -0.7701491117477417, "logits/rejected": -0.5652084946632385, "logps/chosen": -30.580230712890625, "logps/rejected": -575.9344482421875, "loss": 12611.7422, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20266905426979065, "rewards/margins": 0.5443531274795532, "rewards/rejected": -0.34168410301208496, "step": 6260 }, { "epoch": 31.792405063291138, "grad_norm": 170084.77349090017, "learning_rate": 6.439987464744594e-08, "logits/chosen": 0.8593052625656128, "logits/rejected": 1.1197197437286377, "logps/chosen": -26.577016830444336, "logps/rejected": -555.6820068359375, "loss": 12234.5422, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19716337323188782, "rewards/margins": 0.5328342318534851, "rewards/rejected": -0.33567091822624207, "step": 6270 }, { "epoch": 31.843037974683543, "grad_norm": 235274.58346107465, "learning_rate": 6.361642118458163e-08, "logits/chosen": -1.7307960987091064, "logits/rejected": -1.3535115718841553, "logps/chosen": -23.92806625366211, "logps/rejected": -565.2352294921875, "loss": 12517.5156, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2007029801607132, "rewards/margins": 0.5428507924079895, "rewards/rejected": -0.3421478271484375, "step": 6280 }, { "epoch": 31.89367088607595, "grad_norm": 190203.888446938, "learning_rate": 6.283296772171732e-08, "logits/chosen": -0.9662951231002808, "logits/rejected": -0.45983943343162537, "logps/chosen": -26.488794326782227, "logps/rejected": -565.1602783203125, "loss": 12050.4156, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20456723868846893, "rewards/margins": 0.5392366051673889, "rewards/rejected": -0.3346693515777588, "step": 6290 }, { "epoch": 31.944303797468354, "grad_norm": 169175.47682307824, "learning_rate": 6.204951425885302e-08, "logits/chosen": -1.9982364177703857, "logits/rejected": -1.282958745956421, "logps/chosen": -25.263113021850586, "logps/rejected": -584.7576293945312, "loss": 11806.3297, "rewards/accuracies": 1.0, "rewards/chosen": 0.20914848148822784, "rewards/margins": 0.5613822937011719, "rewards/rejected": -0.35223376750946045, "step": 6300 }, { "epoch": 31.99493670886076, "grad_norm": 142938.702725119, "learning_rate": 6.126606079598871e-08, "logits/chosen": -2.084618091583252, "logits/rejected": -1.6745023727416992, "logps/chosen": -24.918956756591797, "logps/rejected": -603.4859619140625, "loss": 12022.5133, "rewards/accuracies": 1.0, "rewards/chosen": 0.21575181186199188, "rewards/margins": 0.5819977521896362, "rewards/rejected": -0.36624595522880554, "step": 6310 }, { "epoch": 32.04556962025316, "grad_norm": 146925.77007874168, "learning_rate": 6.04826073331244e-08, "logits/chosen": -1.0771139860153198, "logits/rejected": -0.38963261246681213, "logps/chosen": -25.353687286376953, "logps/rejected": -599.3104248046875, "loss": 11649.7609, "rewards/accuracies": 1.0, "rewards/chosen": 0.21451549232006073, "rewards/margins": 0.5766840577125549, "rewards/rejected": -0.3621685206890106, "step": 6320 }, { "epoch": 32.09620253164557, "grad_norm": 94333.82344683389, "learning_rate": 5.96991538702601e-08, "logits/chosen": -2.162341356277466, "logits/rejected": -1.5530678033828735, "logps/chosen": -36.120880126953125, "logps/rejected": -594.9260864257812, "loss": 11919.4, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2106127291917801, "rewards/margins": 0.558625340461731, "rewards/rejected": -0.34801262617111206, "step": 6330 }, { "epoch": 32.14683544303797, "grad_norm": 144438.33677050017, "learning_rate": 5.8915700407395795e-08, "logits/chosen": -0.8229999542236328, "logits/rejected": -0.037537313997745514, "logps/chosen": -25.43358612060547, "logps/rejected": -557.636474609375, "loss": 11297.6063, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1968574970960617, "rewards/margins": 0.532370388507843, "rewards/rejected": -0.33551284670829773, "step": 6340 }, { "epoch": 32.19746835443038, "grad_norm": 109693.94525690017, "learning_rate": 5.813224694453149e-08, "logits/chosen": -3.077913761138916, "logits/rejected": -2.4543375968933105, "logps/chosen": -26.92588233947754, "logps/rejected": -583.3746337890625, "loss": 12147.5016, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.21152964234352112, "rewards/margins": 0.5570891499519348, "rewards/rejected": -0.3455595374107361, "step": 6350 }, { "epoch": 32.24810126582278, "grad_norm": 94464.04824246689, "learning_rate": 5.734879348166719e-08, "logits/chosen": -0.08146251738071442, "logits/rejected": -0.1943734884262085, "logps/chosen": -38.933929443359375, "logps/rejected": -599.4444580078125, "loss": 11706.7859, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21242408454418182, "rewards/margins": 0.5596734881401062, "rewards/rejected": -0.34724941849708557, "step": 6360 }, { "epoch": 32.29873417721519, "grad_norm": 93779.41167523999, "learning_rate": 5.656534001880288e-08, "logits/chosen": 0.4058389663696289, "logits/rejected": 0.994676947593689, "logps/chosen": -21.240737915039062, "logps/rejected": -573.2392578125, "loss": 12153.6359, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19539888203144073, "rewards/margins": 0.550510048866272, "rewards/rejected": -0.35511118173599243, "step": 6370 }, { "epoch": 32.34936708860759, "grad_norm": 215459.26677533987, "learning_rate": 5.5781886555938573e-08, "logits/chosen": -1.0755536556243896, "logits/rejected": -0.2684146761894226, "logps/chosen": -25.781116485595703, "logps/rejected": -580.9659423828125, "loss": 11508.8133, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21089033782482147, "rewards/margins": 0.5592586994171143, "rewards/rejected": -0.34836840629577637, "step": 6380 }, { "epoch": 32.4, "grad_norm": 164612.93717131627, "learning_rate": 5.4998433093074266e-08, "logits/chosen": -2.730407238006592, "logits/rejected": -2.2623066902160645, "logps/chosen": -38.27416229248047, "logps/rejected": -612.3323364257812, "loss": 10969.9328, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.22319836914539337, "rewards/margins": 0.573035478591919, "rewards/rejected": -0.34983712434768677, "step": 6390 }, { "epoch": 32.450632911392404, "grad_norm": 140032.81053392185, "learning_rate": 5.421497963020996e-08, "logits/chosen": -0.6492301821708679, "logits/rejected": -0.778862476348877, "logps/chosen": -28.754650115966797, "logps/rejected": -591.8221435546875, "loss": 12521.7703, "rewards/accuracies": 1.0, "rewards/chosen": 0.21057042479515076, "rewards/margins": 0.5634862780570984, "rewards/rejected": -0.35291582345962524, "step": 6400 }, { "epoch": 32.50126582278481, "grad_norm": 102205.70485715618, "learning_rate": 5.343152616734566e-08, "logits/chosen": -0.9864907264709473, "logits/rejected": -0.19051684439182281, "logps/chosen": -29.4318904876709, "logps/rejected": -605.131103515625, "loss": 11591.8508, "rewards/accuracies": 1.0, "rewards/chosen": 0.2185964584350586, "rewards/margins": 0.579878032207489, "rewards/rejected": -0.3612816333770752, "step": 6410 }, { "epoch": 32.551898734177215, "grad_norm": 103047.13529668628, "learning_rate": 5.264807270448135e-08, "logits/chosen": -2.3946361541748047, "logits/rejected": -1.8663170337677002, "logps/chosen": -22.362850189208984, "logps/rejected": -582.4278564453125, "loss": 11901.1398, "rewards/accuracies": 1.0, "rewards/chosen": 0.21427400410175323, "rewards/margins": 0.5642385482788086, "rewards/rejected": -0.34996455907821655, "step": 6420 }, { "epoch": 32.60253164556962, "grad_norm": 86074.947460872, "learning_rate": 5.1864619241617044e-08, "logits/chosen": 0.2598368227481842, "logits/rejected": 0.16884984076023102, "logps/chosen": -22.76316261291504, "logps/rejected": -594.866455078125, "loss": 12333.5344, "rewards/accuracies": 1.0, "rewards/chosen": 0.2121623456478119, "rewards/margins": 0.5697360038757324, "rewards/rejected": -0.35757365822792053, "step": 6430 }, { "epoch": 32.653164556962025, "grad_norm": 137970.73954909868, "learning_rate": 5.108116577875274e-08, "logits/chosen": -0.11699090898036957, "logits/rejected": 0.11212899535894394, "logps/chosen": -29.464065551757812, "logps/rejected": -573.3801879882812, "loss": 11953.9641, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21537606418132782, "rewards/margins": 0.5438817739486694, "rewards/rejected": -0.3285056948661804, "step": 6440 }, { "epoch": 32.70379746835443, "grad_norm": 460796.64629538235, "learning_rate": 5.029771231588843e-08, "logits/chosen": -1.4031693935394287, "logits/rejected": -2.1060502529144287, "logps/chosen": -23.794132232666016, "logps/rejected": -581.7036743164062, "loss": 12159.9719, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20509609580039978, "rewards/margins": 0.5560418367385864, "rewards/rejected": -0.35094568133354187, "step": 6450 }, { "epoch": 32.754430379746836, "grad_norm": 88571.49642806537, "learning_rate": 4.951425885302413e-08, "logits/chosen": -0.29163846373558044, "logits/rejected": 0.15456560254096985, "logps/chosen": -19.800487518310547, "logps/rejected": -562.6231689453125, "loss": 11758.9578, "rewards/accuracies": 1.0, "rewards/chosen": 0.2048061192035675, "rewards/margins": 0.5433157682418823, "rewards/rejected": -0.33850961923599243, "step": 6460 }, { "epoch": 32.80506329113924, "grad_norm": 166818.40028028333, "learning_rate": 4.873080539015982e-08, "logits/chosen": 0.3278934061527252, "logits/rejected": 0.6011670827865601, "logps/chosen": -33.445350646972656, "logps/rejected": -590.470703125, "loss": 11395.1164, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2123481035232544, "rewards/margins": 0.5555016994476318, "rewards/rejected": -0.34315359592437744, "step": 6470 }, { "epoch": 32.85569620253165, "grad_norm": 80619.8591659213, "learning_rate": 4.7947351927295515e-08, "logits/chosen": -1.3291213512420654, "logits/rejected": -1.6056814193725586, "logps/chosen": -29.16250228881836, "logps/rejected": -598.3140869140625, "loss": 11908.6562, "rewards/accuracies": 1.0, "rewards/chosen": 0.21245749294757843, "rewards/margins": 0.5684391856193542, "rewards/rejected": -0.3559816777706146, "step": 6480 }, { "epoch": 32.90632911392405, "grad_norm": 109452.38261580766, "learning_rate": 4.716389846443121e-08, "logits/chosen": -2.2227654457092285, "logits/rejected": -2.1318516731262207, "logps/chosen": -27.57879638671875, "logps/rejected": -593.1817626953125, "loss": 11900.8148, "rewards/accuracies": 1.0, "rewards/chosen": 0.2101704627275467, "rewards/margins": 0.565523624420166, "rewards/rejected": -0.35535311698913574, "step": 6490 }, { "epoch": 32.95696202531646, "grad_norm": 146037.74057243837, "learning_rate": 4.63804450015669e-08, "logits/chosen": -0.4855597913265228, "logits/rejected": -0.07905157655477524, "logps/chosen": -32.26173782348633, "logps/rejected": -582.983154296875, "loss": 12785.9484, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2102789580821991, "rewards/margins": 0.5541440844535828, "rewards/rejected": -0.34386518597602844, "step": 6500 }, { "epoch": 33.00759493670886, "grad_norm": 80554.44381289573, "learning_rate": 4.55969915387026e-08, "logits/chosen": -1.16013503074646, "logits/rejected": -1.237755537033081, "logps/chosen": -22.434879302978516, "logps/rejected": -572.4281005859375, "loss": 11892.3344, "rewards/accuracies": 1.0, "rewards/chosen": 0.2072155922651291, "rewards/margins": 0.54491126537323, "rewards/rejected": -0.3376956880092621, "step": 6510 }, { "epoch": 33.05822784810127, "grad_norm": 128557.62032643631, "learning_rate": 4.481353807583829e-08, "logits/chosen": -0.2354935109615326, "logits/rejected": 0.728766143321991, "logps/chosen": -29.432445526123047, "logps/rejected": -585.3494262695312, "loss": 11835.0961, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2072407454252243, "rewards/margins": 0.5606441497802734, "rewards/rejected": -0.35340338945388794, "step": 6520 }, { "epoch": 33.10886075949367, "grad_norm": 91776.99508964189, "learning_rate": 4.4030084612973985e-08, "logits/chosen": -1.175462007522583, "logits/rejected": -1.1933832168579102, "logps/chosen": -21.900630950927734, "logps/rejected": -574.4762573242188, "loss": 12157.9109, "rewards/accuracies": 1.0, "rewards/chosen": 0.20543113350868225, "rewards/margins": 0.5517674684524536, "rewards/rejected": -0.346336305141449, "step": 6530 }, { "epoch": 33.15949367088608, "grad_norm": 89893.29258028018, "learning_rate": 4.324663115010968e-08, "logits/chosen": -0.7350924015045166, "logits/rejected": -0.16997528076171875, "logps/chosen": -23.83113670349121, "logps/rejected": -575.5424194335938, "loss": 11686.9375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20410069823265076, "rewards/margins": 0.5512816309928894, "rewards/rejected": -0.34718090295791626, "step": 6540 }, { "epoch": 33.210126582278484, "grad_norm": 120975.35903766478, "learning_rate": 4.246317768724538e-08, "logits/chosen": -0.08163319528102875, "logits/rejected": 0.07650710642337799, "logps/chosen": -27.332035064697266, "logps/rejected": -579.8117065429688, "loss": 11339.9297, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2052970826625824, "rewards/margins": 0.5518554449081421, "rewards/rejected": -0.3465583324432373, "step": 6550 }, { "epoch": 33.26075949367089, "grad_norm": 180391.18731890293, "learning_rate": 4.167972422438107e-08, "logits/chosen": -0.8266963958740234, "logits/rejected": 1.0672438144683838, "logps/chosen": -23.287370681762695, "logps/rejected": -572.2568969726562, "loss": 11743.5586, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19874341785907745, "rewards/margins": 0.5572081804275513, "rewards/rejected": -0.35846468806266785, "step": 6560 }, { "epoch": 33.311392405063295, "grad_norm": 84282.72341083131, "learning_rate": 4.0896270761516763e-08, "logits/chosen": -1.91861093044281, "logits/rejected": -1.3766604661941528, "logps/chosen": -24.914443969726562, "logps/rejected": -581.4729614257812, "loss": 11078.6969, "rewards/accuracies": 1.0, "rewards/chosen": 0.21130716800689697, "rewards/margins": 0.5577182769775391, "rewards/rejected": -0.34641116857528687, "step": 6570 }, { "epoch": 33.36202531645569, "grad_norm": 199903.347381946, "learning_rate": 4.0112817298652456e-08, "logits/chosen": -1.2995800971984863, "logits/rejected": -1.6440702676773071, "logps/chosen": -22.356828689575195, "logps/rejected": -591.6265869140625, "loss": 11937.0477, "rewards/accuracies": 1.0, "rewards/chosen": 0.2072306126356125, "rewards/margins": 0.5668342709541321, "rewards/rejected": -0.3596035838127136, "step": 6580 }, { "epoch": 33.4126582278481, "grad_norm": 138603.96487037002, "learning_rate": 3.932936383578815e-08, "logits/chosen": 0.8098524212837219, "logits/rejected": 1.2947828769683838, "logps/chosen": -26.31606674194336, "logps/rejected": -584.9072265625, "loss": 11177.5336, "rewards/accuracies": 1.0, "rewards/chosen": 0.20386937260627747, "rewards/margins": 0.5589767694473267, "rewards/rejected": -0.3551073968410492, "step": 6590 }, { "epoch": 33.4632911392405, "grad_norm": 123948.78500072335, "learning_rate": 3.854591037292385e-08, "logits/chosen": -2.16947603225708, "logits/rejected": -1.0904394388198853, "logps/chosen": -42.8673095703125, "logps/rejected": -585.2350463867188, "loss": 11894.6641, "rewards/accuracies": 1.0, "rewards/chosen": 0.2195717990398407, "rewards/margins": 0.5601873397827148, "rewards/rejected": -0.34061557054519653, "step": 6600 }, { "epoch": 33.51392405063291, "grad_norm": 113327.62874252205, "learning_rate": 3.776245691005954e-08, "logits/chosen": -1.375249981880188, "logits/rejected": -0.7785667181015015, "logps/chosen": -29.649211883544922, "logps/rejected": -602.9840698242188, "loss": 12210.0344, "rewards/accuracies": 1.0, "rewards/chosen": 0.22011515498161316, "rewards/margins": 0.5793704390525818, "rewards/rejected": -0.35925528407096863, "step": 6610 }, { "epoch": 33.564556962025314, "grad_norm": 79524.96422723045, "learning_rate": 3.6979003447195234e-08, "logits/chosen": -0.7508550882339478, "logits/rejected": -0.23799777030944824, "logps/chosen": -17.09669303894043, "logps/rejected": -572.3134155273438, "loss": 12138.4203, "rewards/accuracies": 1.0, "rewards/chosen": 0.2047223150730133, "rewards/margins": 0.5538768768310547, "rewards/rejected": -0.34915462136268616, "step": 6620 }, { "epoch": 33.61518987341772, "grad_norm": 80597.64263401506, "learning_rate": 3.619554998433093e-08, "logits/chosen": -1.7500404119491577, "logits/rejected": -1.4937622547149658, "logps/chosen": -24.847320556640625, "logps/rejected": -594.1591796875, "loss": 12270.6344, "rewards/accuracies": 1.0, "rewards/chosen": 0.21394848823547363, "rewards/margins": 0.5700836181640625, "rewards/rejected": -0.35613518953323364, "step": 6630 }, { "epoch": 33.665822784810125, "grad_norm": 100669.75725024722, "learning_rate": 3.541209652146662e-08, "logits/chosen": -0.4524414539337158, "logits/rejected": -0.5694657564163208, "logps/chosen": -25.72067642211914, "logps/rejected": -572.9901123046875, "loss": 11448.4047, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20151250064373016, "rewards/margins": 0.5470980405807495, "rewards/rejected": -0.345585435628891, "step": 6640 }, { "epoch": 33.71645569620253, "grad_norm": 136734.1372891588, "learning_rate": 3.462864305860232e-08, "logits/chosen": -0.10392338037490845, "logits/rejected": 0.025324154645204544, "logps/chosen": -23.138744354248047, "logps/rejected": -578.2369995117188, "loss": 11719.0234, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2065146416425705, "rewards/margins": 0.5536417365074158, "rewards/rejected": -0.3471270501613617, "step": 6650 }, { "epoch": 33.767088607594935, "grad_norm": 96060.1935775592, "learning_rate": 3.384518959573801e-08, "logits/chosen": -1.5298357009887695, "logits/rejected": -1.111659049987793, "logps/chosen": -36.602691650390625, "logps/rejected": -594.2269287109375, "loss": 11903.4828, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21466748416423798, "rewards/margins": 0.5613253116607666, "rewards/rejected": -0.34665781259536743, "step": 6660 }, { "epoch": 33.81772151898734, "grad_norm": 82308.39144839271, "learning_rate": 3.3061736132873705e-08, "logits/chosen": -1.9629747867584229, "logits/rejected": -1.8584734201431274, "logps/chosen": -17.865947723388672, "logps/rejected": -566.314453125, "loss": 12147.5891, "rewards/accuracies": 1.0, "rewards/chosen": 0.2041165828704834, "rewards/margins": 0.5491331219673157, "rewards/rejected": -0.3450164496898651, "step": 6670 }, { "epoch": 33.868354430379746, "grad_norm": 132433.76933098322, "learning_rate": 3.22782826700094e-08, "logits/chosen": -0.10643855482339859, "logits/rejected": 0.1565506011247635, "logps/chosen": -23.206607818603516, "logps/rejected": -565.3855590820312, "loss": 11928.0656, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19701281189918518, "rewards/margins": 0.5372076630592346, "rewards/rejected": -0.34019485116004944, "step": 6680 }, { "epoch": 33.91898734177215, "grad_norm": 99524.21425394616, "learning_rate": 3.149482920714509e-08, "logits/chosen": 0.7746875286102295, "logits/rejected": 1.4906342029571533, "logps/chosen": -28.62857437133789, "logps/rejected": -569.8626708984375, "loss": 11616.475, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20620207488536835, "rewards/margins": 0.548004686832428, "rewards/rejected": -0.3418026268482208, "step": 6690 }, { "epoch": 33.96962025316456, "grad_norm": 72753.16066899289, "learning_rate": 3.071137574428079e-08, "logits/chosen": 0.6492331624031067, "logits/rejected": 0.7617141604423523, "logps/chosen": -25.677988052368164, "logps/rejected": -560.1131591796875, "loss": 12074.9086, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19937190413475037, "rewards/margins": 0.5361818075180054, "rewards/rejected": -0.33680984377861023, "step": 6700 }, { "epoch": 34.02025316455696, "grad_norm": 69602.29112012005, "learning_rate": 2.992792228141648e-08, "logits/chosen": -1.09294855594635, "logits/rejected": -0.46724218130111694, "logps/chosen": -27.281131744384766, "logps/rejected": -564.8081665039062, "loss": 11690.2227, "rewards/accuracies": 1.0, "rewards/chosen": 0.20867964625358582, "rewards/margins": 0.5397676825523376, "rewards/rejected": -0.3310880661010742, "step": 6710 }, { "epoch": 34.07088607594937, "grad_norm": 124920.949317699, "learning_rate": 2.9144468818552176e-08, "logits/chosen": -0.5232747197151184, "logits/rejected": 0.22049197554588318, "logps/chosen": -28.443017959594727, "logps/rejected": -575.3606567382812, "loss": 11532.3461, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20501036942005157, "rewards/margins": 0.5482991933822632, "rewards/rejected": -0.34328892827033997, "step": 6720 }, { "epoch": 34.12151898734177, "grad_norm": 83544.836801972, "learning_rate": 2.836101535568787e-08, "logits/chosen": -1.3320372104644775, "logits/rejected": -1.3442357778549194, "logps/chosen": -22.64333724975586, "logps/rejected": -572.712890625, "loss": 12274.693, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20690365135669708, "rewards/margins": 0.550982654094696, "rewards/rejected": -0.34407907724380493, "step": 6730 }, { "epoch": 34.17215189873418, "grad_norm": 59570.622166337576, "learning_rate": 2.7577561892823564e-08, "logits/chosen": -0.21290139853954315, "logits/rejected": -0.059629153460264206, "logps/chosen": -25.701587677001953, "logps/rejected": -593.871826171875, "loss": 11275.0086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21512384712696075, "rewards/margins": 0.5740803480148315, "rewards/rejected": -0.358956515789032, "step": 6740 }, { "epoch": 34.22278481012658, "grad_norm": 65440.143207260466, "learning_rate": 2.6794108429959257e-08, "logits/chosen": 0.7750476598739624, "logits/rejected": 0.7844541668891907, "logps/chosen": -20.633939743041992, "logps/rejected": -552.0474853515625, "loss": 11428.4266, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19471852481365204, "rewards/margins": 0.5300661325454712, "rewards/rejected": -0.33534759283065796, "step": 6750 }, { "epoch": 34.27341772151899, "grad_norm": 68525.87135636444, "learning_rate": 2.6010654967094953e-08, "logits/chosen": -0.38150349259376526, "logits/rejected": -0.23816132545471191, "logps/chosen": -22.66727066040039, "logps/rejected": -573.2194213867188, "loss": 11908.0906, "rewards/accuracies": 1.0, "rewards/chosen": 0.21106655895709991, "rewards/margins": 0.551810622215271, "rewards/rejected": -0.3407440483570099, "step": 6760 }, { "epoch": 34.324050632911394, "grad_norm": 106626.98105878729, "learning_rate": 2.5227201504230646e-08, "logits/chosen": -1.4471355676651, "logits/rejected": -1.1485611200332642, "logps/chosen": -30.057971954345703, "logps/rejected": -575.90185546875, "loss": 12232.9586, "rewards/accuracies": 1.0, "rewards/chosen": 0.21069788932800293, "rewards/margins": 0.5553726553916931, "rewards/rejected": -0.3446747958660126, "step": 6770 }, { "epoch": 34.3746835443038, "grad_norm": 82201.6406165968, "learning_rate": 2.4443748041366342e-08, "logits/chosen": -0.32743334770202637, "logits/rejected": -0.1376263052225113, "logps/chosen": -21.2067928314209, "logps/rejected": -579.6965942382812, "loss": 12257.0297, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1995442807674408, "rewards/margins": 0.5562165975570679, "rewards/rejected": -0.35667237639427185, "step": 6780 }, { "epoch": 34.425316455696205, "grad_norm": 101176.94346633952, "learning_rate": 2.3660294578502035e-08, "logits/chosen": -0.027527982369065285, "logits/rejected": 0.533041775226593, "logps/chosen": -26.760761260986328, "logps/rejected": -598.9354248046875, "loss": 11819.9969, "rewards/accuracies": 1.0, "rewards/chosen": 0.21746928989887238, "rewards/margins": 0.5736481547355652, "rewards/rejected": -0.356178879737854, "step": 6790 }, { "epoch": 34.47594936708861, "grad_norm": 121876.45413951192, "learning_rate": 2.2876841115637728e-08, "logits/chosen": 0.13078555464744568, "logits/rejected": 0.333305299282074, "logps/chosen": -21.788782119750977, "logps/rejected": -576.6356201171875, "loss": 11798.6219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20838093757629395, "rewards/margins": 0.5560372471809387, "rewards/rejected": -0.34765633940696716, "step": 6800 }, { "epoch": 34.526582278481015, "grad_norm": 82394.08815026373, "learning_rate": 2.2093387652773424e-08, "logits/chosen": -1.4721665382385254, "logits/rejected": -0.7360283732414246, "logps/chosen": -25.797271728515625, "logps/rejected": -575.2802734375, "loss": 11867.5641, "rewards/accuracies": 1.0, "rewards/chosen": 0.21113534271717072, "rewards/margins": 0.5519530177116394, "rewards/rejected": -0.34081774950027466, "step": 6810 }, { "epoch": 34.57721518987342, "grad_norm": 95810.78866413939, "learning_rate": 2.1309934189909117e-08, "logits/chosen": -1.161169409751892, "logits/rejected": -0.91253662109375, "logps/chosen": -27.88307762145996, "logps/rejected": -587.9513549804688, "loss": 12214.8375, "rewards/accuracies": 1.0, "rewards/chosen": 0.20574505627155304, "rewards/margins": 0.5562767386436462, "rewards/rejected": -0.3505316376686096, "step": 6820 }, { "epoch": 34.627848101265826, "grad_norm": 172219.54727864428, "learning_rate": 2.0526480727044813e-08, "logits/chosen": -0.3230019509792328, "logits/rejected": 0.021614838391542435, "logps/chosen": -25.339590072631836, "logps/rejected": -562.7479858398438, "loss": 12054.6734, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19978457689285278, "rewards/margins": 0.539312481880188, "rewards/rejected": -0.3395279347896576, "step": 6830 }, { "epoch": 34.678481012658224, "grad_norm": 97553.39405702737, "learning_rate": 1.9743027264180506e-08, "logits/chosen": 0.4619535505771637, "logits/rejected": 1.0029349327087402, "logps/chosen": -25.021116256713867, "logps/rejected": -548.2736206054688, "loss": 11655.4656, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.19646362960338593, "rewards/margins": 0.5229059457778931, "rewards/rejected": -0.32644230127334595, "step": 6840 }, { "epoch": 34.72911392405063, "grad_norm": 85932.39441572773, "learning_rate": 1.8959573801316202e-08, "logits/chosen": -1.6316754817962646, "logits/rejected": -1.3359291553497314, "logps/chosen": -29.495471954345703, "logps/rejected": -592.9409790039062, "loss": 11807.9469, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21343278884887695, "rewards/margins": 0.5635305643081665, "rewards/rejected": -0.3500978350639343, "step": 6850 }, { "epoch": 34.779746835443035, "grad_norm": 111046.4046992094, "learning_rate": 1.8176120338451895e-08, "logits/chosen": -1.9672811031341553, "logits/rejected": -1.9657671451568604, "logps/chosen": -28.83864402770996, "logps/rejected": -585.9581298828125, "loss": 11170.3703, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20914232730865479, "rewards/margins": 0.5588939189910889, "rewards/rejected": -0.3497515618801117, "step": 6860 }, { "epoch": 34.83037974683544, "grad_norm": 85983.71784945966, "learning_rate": 1.7392666875587588e-08, "logits/chosen": -0.45147451758384705, "logits/rejected": -0.5098736882209778, "logps/chosen": -27.473459243774414, "logps/rejected": -575.5574340820312, "loss": 10970.9664, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.20568005740642548, "rewards/margins": 0.5446674823760986, "rewards/rejected": -0.33898741006851196, "step": 6870 }, { "epoch": 34.881012658227846, "grad_norm": 65511.20912205873, "learning_rate": 1.6609213412723284e-08, "logits/chosen": -0.6330152750015259, "logits/rejected": 0.2800363004207611, "logps/chosen": -35.236671447753906, "logps/rejected": -596.6030883789062, "loss": 11516.7508, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21417097747325897, "rewards/margins": 0.5598101019859314, "rewards/rejected": -0.3456391394138336, "step": 6880 }, { "epoch": 34.93164556962025, "grad_norm": 92014.0392742902, "learning_rate": 1.5825759949858977e-08, "logits/chosen": 0.384327232837677, "logits/rejected": 0.5035692453384399, "logps/chosen": -30.62520408630371, "logps/rejected": -565.2235107421875, "loss": 11703.7977, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2038690596818924, "rewards/margins": 0.5329573750495911, "rewards/rejected": -0.3290882706642151, "step": 6890 }, { "epoch": 34.982278481012656, "grad_norm": 66423.21573889554, "learning_rate": 1.5042306486994673e-08, "logits/chosen": 0.34197521209716797, "logits/rejected": 0.8052785992622375, "logps/chosen": -28.992361068725586, "logps/rejected": -575.634033203125, "loss": 11720.9023, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20474262535572052, "rewards/margins": 0.5458477735519409, "rewards/rejected": -0.3411051332950592, "step": 6900 }, { "epoch": 35.03291139240506, "grad_norm": 131437.33515286856, "learning_rate": 1.4258853024130366e-08, "logits/chosen": -1.0964847803115845, "logits/rejected": -0.780900239944458, "logps/chosen": -32.114646911621094, "logps/rejected": -596.6611328125, "loss": 11782.3414, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2081715166568756, "rewards/margins": 0.5639813542366028, "rewards/rejected": -0.3558098375797272, "step": 6910 }, { "epoch": 35.08354430379747, "grad_norm": 56330.643408705866, "learning_rate": 1.347539956126606e-08, "logits/chosen": -0.7748197913169861, "logits/rejected": -0.39359089732170105, "logps/chosen": -25.1397705078125, "logps/rejected": -592.2284545898438, "loss": 12438.4016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21363647282123566, "rewards/margins": 0.5692847371101379, "rewards/rejected": -0.35564830899238586, "step": 6920 }, { "epoch": 35.13417721518987, "grad_norm": 61670.67939940539, "learning_rate": 1.2691946098401754e-08, "logits/chosen": -0.4548005163669586, "logits/rejected": -0.3663405776023865, "logps/chosen": -30.274677276611328, "logps/rejected": -566.1094970703125, "loss": 12029.2031, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19851429760456085, "rewards/margins": 0.5346571803092957, "rewards/rejected": -0.3361428678035736, "step": 6930 }, { "epoch": 35.18481012658228, "grad_norm": 58623.275250495586, "learning_rate": 1.1908492635537449e-08, "logits/chosen": -0.15851683914661407, "logits/rejected": 0.04081523418426514, "logps/chosen": -29.98187828063965, "logps/rejected": -592.0548095703125, "loss": 11041.75, "rewards/accuracies": 1.0, "rewards/chosen": 0.21129722893238068, "rewards/margins": 0.5635300874710083, "rewards/rejected": -0.3522329032421112, "step": 6940 }, { "epoch": 35.23544303797468, "grad_norm": 64412.181909102495, "learning_rate": 1.1125039172673142e-08, "logits/chosen": -1.656226396560669, "logits/rejected": -0.20408448576927185, "logps/chosen": -33.12029266357422, "logps/rejected": -594.759033203125, "loss": 11519.8148, "rewards/accuracies": 1.0, "rewards/chosen": 0.21017961204051971, "rewards/margins": 0.5670366883277893, "rewards/rejected": -0.356857031583786, "step": 6950 }, { "epoch": 35.28607594936709, "grad_norm": 56297.660589582665, "learning_rate": 1.0341585709808836e-08, "logits/chosen": -1.547500491142273, "logits/rejected": -0.6769775152206421, "logps/chosen": -24.704757690429688, "logps/rejected": -576.3121337890625, "loss": 11605.0844, "rewards/accuracies": 1.0, "rewards/chosen": 0.21090254187583923, "rewards/margins": 0.5502737760543823, "rewards/rejected": -0.3393712043762207, "step": 6960 }, { "epoch": 35.336708860759494, "grad_norm": 56909.53482731003, "learning_rate": 9.55813224694453e-09, "logits/chosen": -1.0313692092895508, "logits/rejected": -0.7449108362197876, "logps/chosen": -26.623498916625977, "logps/rejected": -587.3609008789062, "loss": 11753.7047, "rewards/accuracies": 1.0, "rewards/chosen": 0.21617145836353302, "rewards/margins": 0.5669609308242798, "rewards/rejected": -0.35078948736190796, "step": 6970 }, { "epoch": 35.3873417721519, "grad_norm": 178835.84410749955, "learning_rate": 8.774678784080225e-09, "logits/chosen": -1.1763582229614258, "logits/rejected": -1.1659915447235107, "logps/chosen": -31.361690521240234, "logps/rejected": -579.3533325195312, "loss": 11790.1078, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.20984821021556854, "rewards/margins": 0.5475795269012451, "rewards/rejected": -0.337731271982193, "step": 6980 }, { "epoch": 35.437974683544304, "grad_norm": 71703.60906538504, "learning_rate": 7.99122532121592e-09, "logits/chosen": -0.38636288046836853, "logits/rejected": -0.6006811857223511, "logps/chosen": -34.191776275634766, "logps/rejected": -590.3701171875, "loss": 11606.5477, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21669046580791473, "rewards/margins": 0.5592674016952515, "rewards/rejected": -0.34257692098617554, "step": 6990 }, { "epoch": 35.48860759493671, "grad_norm": 63272.25835527106, "learning_rate": 7.207771858351613e-09, "logits/chosen": 2.0330252647399902, "logits/rejected": 1.6279929876327515, "logps/chosen": -23.355932235717773, "logps/rejected": -561.3161010742188, "loss": 11705.225, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20284771919250488, "rewards/margins": 0.5360475778579712, "rewards/rejected": -0.3331999182701111, "step": 7000 }, { "epoch": 35.539240506329115, "grad_norm": 61249.27413263123, "learning_rate": 6.424318395487308e-09, "logits/chosen": -0.4422365128993988, "logits/rejected": 0.3416946828365326, "logps/chosen": -23.512128829956055, "logps/rejected": -589.8380126953125, "loss": 12052.432, "rewards/accuracies": 1.0, "rewards/chosen": 0.21374277770519257, "rewards/margins": 0.5721359252929688, "rewards/rejected": -0.358393132686615, "step": 7010 }, { "epoch": 35.58987341772152, "grad_norm": 71404.84613231423, "learning_rate": 5.6408649326230014e-09, "logits/chosen": -0.8322502374649048, "logits/rejected": -0.6665211319923401, "logps/chosen": -27.32610511779785, "logps/rejected": -572.2705078125, "loss": 11414.2812, "rewards/accuracies": 1.0, "rewards/chosen": 0.20042335987091064, "rewards/margins": 0.5454776287078857, "rewards/rejected": -0.3450542390346527, "step": 7020 }, { "epoch": 35.640506329113926, "grad_norm": 59338.91271881947, "learning_rate": 4.857411469758696e-09, "logits/chosen": -0.6001558303833008, "logits/rejected": -0.34640446305274963, "logps/chosen": -27.738061904907227, "logps/rejected": -566.6168212890625, "loss": 11256.2984, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20105385780334473, "rewards/margins": 0.5371016263961792, "rewards/rejected": -0.33604779839515686, "step": 7030 }, { "epoch": 35.69113924050633, "grad_norm": 58656.86338579773, "learning_rate": 4.07395800689439e-09, "logits/chosen": -1.311295509338379, "logits/rejected": -0.9886859655380249, "logps/chosen": -34.54024124145508, "logps/rejected": -594.0494384765625, "loss": 11898.4797, "rewards/accuracies": 1.0, "rewards/chosen": 0.20786690711975098, "rewards/margins": 0.5536772608757019, "rewards/rejected": -0.3458103537559509, "step": 7040 }, { "epoch": 35.741772151898736, "grad_norm": 63433.605932250306, "learning_rate": 3.2905045440300845e-09, "logits/chosen": -2.9923501014709473, "logits/rejected": -2.3592472076416016, "logps/chosen": -27.545886993408203, "logps/rejected": -589.269775390625, "loss": 11235.3266, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21022014319896698, "rewards/margins": 0.5570772886276245, "rewards/rejected": -0.34685713052749634, "step": 7050 }, { "epoch": 35.79240506329114, "grad_norm": 134377.54958033512, "learning_rate": 2.5070510811657785e-09, "logits/chosen": -1.720534324645996, "logits/rejected": -1.3578100204467773, "logps/chosen": -29.213916778564453, "logps/rejected": -567.0501708984375, "loss": 11589.5594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20267781615257263, "rewards/margins": 0.537537693977356, "rewards/rejected": -0.3348599672317505, "step": 7060 }, { "epoch": 35.84303797468355, "grad_norm": 67914.02766915208, "learning_rate": 1.7235976183014728e-09, "logits/chosen": -0.6159377098083496, "logits/rejected": 0.34991899132728577, "logps/chosen": -21.085241317749023, "logps/rejected": -550.9933471679688, "loss": 11967.5938, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.199566051363945, "rewards/margins": 0.5368520021438599, "rewards/rejected": -0.33728593587875366, "step": 7070 }, { "epoch": 35.89367088607595, "grad_norm": 59297.02701129836, "learning_rate": 9.40144155437167e-10, "logits/chosen": 0.5505496859550476, "logits/rejected": 1.5590342283248901, "logps/chosen": -19.077686309814453, "logps/rejected": -584.4923095703125, "loss": 11927.0484, "rewards/accuracies": 1.0, "rewards/chosen": 0.20645365118980408, "rewards/margins": 0.5670816898345947, "rewards/rejected": -0.36062803864479065, "step": 7080 }, { "epoch": 35.94430379746836, "grad_norm": 57292.383516847614, "learning_rate": 1.5669069257286116e-10, "logits/chosen": -1.5784406661987305, "logits/rejected": -1.423514485359192, "logps/chosen": -30.030658721923828, "logps/rejected": -591.372314453125, "loss": 11925.8789, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2148711234331131, "rewards/margins": 0.5598932504653931, "rewards/rejected": -0.3450221121311188, "step": 7090 }, { "epoch": 35.95443037974684, "step": 7092, "total_flos": 0.0, "train_loss": 6426.156043451248, "train_runtime": 5724.5218, "train_samples_per_second": 79.389, "train_steps_per_second": 1.239 } ], "logging_steps": 10, "max_steps": 7092, "num_input_tokens_seen": 0, "num_train_epochs": 36, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }