{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.3054830287206268e-08, "logits/chosen": 0.9550814628601074, "logits/rejected": 1.0664727687835693, "logps/chosen": -190.47879028320312, "logps/rejected": -177.6958770751953, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.3054830287206266e-07, "logits/chosen": 1.0215020179748535, "logits/rejected": 1.073843240737915, "logps/chosen": -277.8812561035156, "logps/rejected": -268.32220458984375, "loss": 0.6931, "rewards/accuracies": 0.4097222089767456, "rewards/chosen": 0.00012883776798844337, "rewards/margins": 0.0005512596690095961, "rewards/rejected": -0.00042242190102115273, "step": 10 }, { "epoch": 0.01, "learning_rate": 2.610966057441253e-07, "logits/chosen": 1.054603099822998, "logits/rejected": 1.0355112552642822, "logps/chosen": -258.0608215332031, "logps/rejected": -219.5281219482422, "loss": 0.6932, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0007742593297734857, "rewards/margins": -0.0006406827596947551, "rewards/rejected": -0.00013357654097490013, "step": 20 }, { "epoch": 0.01, "learning_rate": 3.9164490861618804e-07, "logits/chosen": 0.9782761335372925, "logits/rejected": 0.9957435727119446, "logps/chosen": -234.4627685546875, "logps/rejected": -216.38687133789062, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0007474010926671326, "rewards/margins": 0.00048109880299307406, "rewards/rejected": -0.0012284999247640371, "step": 30 }, { "epoch": 0.01, "learning_rate": 5.221932114882506e-07, "logits/chosen": 1.060667634010315, "logits/rejected": 1.0616825819015503, "logps/chosen": -269.3723449707031, "logps/rejected": -236.52392578125, "loss": 0.6931, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.0005697375163435936, "rewards/margins": -0.00030056110699661076, "rewards/rejected": -0.0002691763802431524, "step": 40 }, { "epoch": 0.01, "learning_rate": 6.527415143603135e-07, "logits/chosen": 1.0123419761657715, "logits/rejected": 1.0498076677322388, "logps/chosen": -245.165771484375, "logps/rejected": -241.9037322998047, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0007764647016301751, "rewards/margins": -0.00020735012367367744, "rewards/rejected": -0.0005691145197488368, "step": 50 }, { "epoch": 0.02, "learning_rate": 7.832898172323761e-07, "logits/chosen": 0.9758418202400208, "logits/rejected": 1.0930532217025757, "logps/chosen": -283.73480224609375, "logps/rejected": -234.12576293945312, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": -4.083226667717099e-05, "rewards/margins": -0.00016547185077797621, "rewards/rejected": 0.00012463955499697477, "step": 60 }, { "epoch": 0.02, "learning_rate": 9.138381201044387e-07, "logits/chosen": 1.005788803100586, "logits/rejected": 1.0820422172546387, "logps/chosen": -271.99468994140625, "logps/rejected": -231.10446166992188, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 4.1856186726363376e-05, "rewards/margins": 0.0007076783804222941, "rewards/rejected": -0.0006658221827819943, "step": 70 }, { "epoch": 0.02, "learning_rate": 1.0443864229765013e-06, "logits/chosen": 1.0230119228363037, "logits/rejected": 1.0622894763946533, "logps/chosen": -283.88946533203125, "logps/rejected": -261.6778869628906, "loss": 0.6932, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.000257449341006577, "rewards/margins": -8.570156205678359e-05, "rewards/rejected": -0.00017174780077766627, "step": 80 }, { "epoch": 0.02, "learning_rate": 1.1749347258485642e-06, "logits/chosen": 1.043121337890625, "logits/rejected": 1.0927269458770752, "logps/chosen": -278.48974609375, "logps/rejected": -235.72775268554688, "loss": 0.6932, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.0013169237645342946, "rewards/margins": -0.0013213430065661669, "rewards/rejected": 4.419172000780236e-06, "step": 90 }, { "epoch": 0.03, "learning_rate": 1.305483028720627e-06, "logits/chosen": 0.991162121295929, "logits/rejected": 1.0666497945785522, "logps/chosen": -237.25302124023438, "logps/rejected": -218.53671264648438, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00023656387929804623, "rewards/margins": 0.0011323514627292752, "rewards/rejected": -0.0008957876125350595, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": 0.9709606766700745, "eval_logits/rejected": 1.0631499290466309, "eval_logps/chosen": -277.5757751464844, "eval_logps/rejected": -243.87449645996094, "eval_loss": 0.6931320428848267, "eval_rewards/accuracies": 0.4514999985694885, "eval_rewards/chosen": -0.00028967749676667154, "eval_rewards/margins": 0.00029621709836646914, "eval_rewards/rejected": -0.0005858945660293102, "eval_runtime": 539.9229, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 100 }, { "epoch": 0.03, "learning_rate": 1.4360313315926894e-06, "logits/chosen": 0.9960800409317017, "logits/rejected": 1.081016182899475, "logps/chosen": -283.5791015625, "logps/rejected": -250.1773681640625, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0004133354814257473, "rewards/margins": 0.0019074681913480163, "rewards/rejected": -0.0014941326808184385, "step": 110 }, { "epoch": 0.03, "learning_rate": 1.5665796344647521e-06, "logits/chosen": 1.0297129154205322, "logits/rejected": 1.075050950050354, "logps/chosen": -227.7956085205078, "logps/rejected": -234.15097045898438, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00013152281462680548, "rewards/margins": 0.0006020874716341496, "rewards/rejected": -0.00047056470066308975, "step": 120 }, { "epoch": 0.03, "learning_rate": 1.6971279373368146e-06, "logits/chosen": 1.0477956533432007, "logits/rejected": 1.093898892402649, "logps/chosen": -282.6871643066406, "logps/rejected": -239.27749633789062, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.00015316384087782353, "rewards/margins": 0.0016984669491648674, "rewards/rejected": -0.0015453032683581114, "step": 130 }, { "epoch": 0.04, "learning_rate": 1.8276762402088774e-06, "logits/chosen": 1.026609182357788, "logits/rejected": 1.0265687704086304, "logps/chosen": -263.9731750488281, "logps/rejected": -237.11538696289062, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0007792095420882106, "rewards/margins": 0.0024368534795939922, "rewards/rejected": -0.0016576439375057817, "step": 140 }, { "epoch": 0.04, "learning_rate": 1.9582245430809403e-06, "logits/chosen": 1.007411241531372, "logits/rejected": 1.0271480083465576, "logps/chosen": -262.67718505859375, "logps/rejected": -235.0521240234375, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00037160428473725915, "rewards/margins": 0.002256640698760748, "rewards/rejected": -0.0018850360065698624, "step": 150 }, { "epoch": 0.04, "learning_rate": 2.0887728459530026e-06, "logits/chosen": 0.9555414915084839, "logits/rejected": 1.0848571062088013, "logps/chosen": -258.272705078125, "logps/rejected": -240.0525360107422, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.00037167343543842435, "rewards/margins": 0.0017876753117889166, "rewards/rejected": -0.0014160019345581532, "step": 160 }, { "epoch": 0.04, "learning_rate": 2.2193211488250653e-06, "logits/chosen": 0.9961720705032349, "logits/rejected": 1.0719153881072998, "logps/chosen": -268.48541259765625, "logps/rejected": -218.2855987548828, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0007700158166699111, "rewards/margins": 0.0027975619304925203, "rewards/rejected": -0.0020275460556149483, "step": 170 }, { "epoch": 0.05, "learning_rate": 2.3498694516971284e-06, "logits/chosen": 0.9853116869926453, "logits/rejected": 1.033042073249817, "logps/chosen": -272.58697509765625, "logps/rejected": -237.8570556640625, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.000967078551184386, "rewards/margins": 0.0026892449241131544, "rewards/rejected": -0.0017221663147211075, "step": 180 }, { "epoch": 0.05, "learning_rate": 2.4804177545691907e-06, "logits/chosen": 0.9925041198730469, "logits/rejected": 1.0097901821136475, "logps/chosen": -269.4550476074219, "logps/rejected": -235.5220489501953, "loss": 0.693, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0016971088480204344, "rewards/margins": 0.0036631212569773197, "rewards/rejected": -0.0019660124089568853, "step": 190 }, { "epoch": 0.05, "learning_rate": 2.610966057441254e-06, "logits/chosen": 1.0215815305709839, "logits/rejected": 1.1321773529052734, "logps/chosen": -278.18402099609375, "logps/rejected": -249.7351531982422, "loss": 0.693, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.003087539691478014, "rewards/margins": 0.004473397042602301, "rewards/rejected": -0.0013858575839549303, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": 0.9704993367195129, "eval_logits/rejected": 1.0632352828979492, "eval_logps/chosen": -277.26611328125, "eval_logps/rejected": -243.9904022216797, "eval_loss": 0.6929376721382141, "eval_rewards/accuracies": 0.5885000228881836, "eval_rewards/chosen": 0.0028066388331353664, "eval_rewards/margins": 0.004551402758806944, "eval_rewards/rejected": -0.0017447640420868993, "eval_runtime": 540.2748, "eval_samples_per_second": 3.702, "eval_steps_per_second": 0.925, "step": 200 }, { "epoch": 0.05, "learning_rate": 2.741514360313316e-06, "logits/chosen": 1.0122153759002686, "logits/rejected": 1.091338038444519, "logps/chosen": -260.8949279785156, "logps/rejected": -233.3127899169922, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0018235554452985525, "rewards/margins": 0.004228769801557064, "rewards/rejected": -0.0024052143562585115, "step": 210 }, { "epoch": 0.06, "learning_rate": 2.872062663185379e-06, "logits/chosen": 1.041244387626648, "logits/rejected": 1.1119762659072876, "logps/chosen": -277.6720886230469, "logps/rejected": -243.11624145507812, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00286296010017395, "rewards/margins": 0.004079398699104786, "rewards/rejected": -0.0012164388317614794, "step": 220 }, { "epoch": 0.06, "learning_rate": 3.0026109660574416e-06, "logits/chosen": 1.0305936336517334, "logits/rejected": 1.0384341478347778, "logps/chosen": -268.91888427734375, "logps/rejected": -275.10919189453125, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.004722142592072487, "rewards/margins": 0.005156674422323704, "rewards/rejected": -0.0004345317429397255, "step": 230 }, { "epoch": 0.06, "learning_rate": 3.1331592689295043e-06, "logits/chosen": 1.0291810035705566, "logits/rejected": 1.046808123588562, "logps/chosen": -272.0259704589844, "logps/rejected": -231.6946258544922, "loss": 0.6928, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.007824023254215717, "rewards/margins": 0.009780755266547203, "rewards/rejected": -0.001956732477992773, "step": 240 }, { "epoch": 0.07, "learning_rate": 3.263707571801567e-06, "logits/chosen": 0.9261485934257507, "logits/rejected": 1.0824263095855713, "logps/chosen": -262.42010498046875, "logps/rejected": -207.1998291015625, "loss": 0.6928, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.004594790283590555, "rewards/margins": 0.006828789599239826, "rewards/rejected": -0.0022339997813105583, "step": 250 }, { "epoch": 0.07, "learning_rate": 3.3942558746736293e-06, "logits/chosen": 1.0177654027938843, "logits/rejected": 1.0079872608184814, "logps/chosen": -255.89602661132812, "logps/rejected": -249.2772979736328, "loss": 0.6927, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.004748777486383915, "rewards/margins": 0.007689561694860458, "rewards/rejected": -0.0029407842084765434, "step": 260 }, { "epoch": 0.07, "learning_rate": 3.524804177545692e-06, "logits/chosen": 0.9201077222824097, "logits/rejected": 1.0668845176696777, "logps/chosen": -250.4193115234375, "logps/rejected": -225.25808715820312, "loss": 0.6926, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00446278415620327, "rewards/margins": 0.009258858859539032, "rewards/rejected": -0.004796075168997049, "step": 270 }, { "epoch": 0.07, "learning_rate": 3.6553524804177547e-06, "logits/chosen": 0.9566753506660461, "logits/rejected": 0.9955002069473267, "logps/chosen": -262.57159423828125, "logps/rejected": -244.9207763671875, "loss": 0.6926, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.007930249907076359, "rewards/margins": 0.012794476933777332, "rewards/rejected": -0.0048642284236848354, "step": 280 }, { "epoch": 0.08, "learning_rate": 3.7859007832898174e-06, "logits/chosen": 0.9608441591262817, "logits/rejected": 1.0510733127593994, "logps/chosen": -258.68780517578125, "logps/rejected": -229.01083374023438, "loss": 0.6925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.008841058239340782, "rewards/margins": 0.01509904582053423, "rewards/rejected": -0.006257989443838596, "step": 290 }, { "epoch": 0.08, "learning_rate": 3.9164490861618806e-06, "logits/chosen": 0.927672266960144, "logits/rejected": 1.0490531921386719, "logps/chosen": -257.1780090332031, "logps/rejected": -227.1814422607422, "loss": 0.6926, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.008853832259774208, "rewards/margins": 0.014651511795818806, "rewards/rejected": -0.00579767907038331, "step": 300 }, { "epoch": 0.08, "eval_logits/chosen": 0.9545422196388245, "eval_logits/rejected": 1.0488145351409912, "eval_logps/chosen": -276.5484924316406, "eval_logps/rejected": -244.36415100097656, "eval_loss": 0.6924864649772644, "eval_rewards/accuracies": 0.6259999871253967, "eval_rewards/chosen": 0.009982902556657791, "eval_rewards/margins": 0.015465173870325089, "eval_rewards/rejected": -0.005482269451022148, "eval_runtime": 540.3904, "eval_samples_per_second": 3.701, "eval_steps_per_second": 0.925, "step": 300 }, { "epoch": 0.08, "learning_rate": 4.046997389033943e-06, "logits/chosen": 0.9173529744148254, "logits/rejected": 1.0776686668395996, "logps/chosen": -256.60821533203125, "logps/rejected": -226.8739013671875, "loss": 0.6924, "rewards/accuracies": 0.65625, "rewards/chosen": 0.012835139408707619, "rewards/margins": 0.01847982034087181, "rewards/rejected": -0.005644683726131916, "step": 310 }, { "epoch": 0.08, "learning_rate": 4.177545691906005e-06, "logits/chosen": 0.8929821252822876, "logits/rejected": 1.021347165107727, "logps/chosen": -282.7056579589844, "logps/rejected": -256.8482360839844, "loss": 0.6927, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.010254921391606331, "rewards/margins": 0.0137600377202034, "rewards/rejected": -0.003505116328597069, "step": 320 }, { "epoch": 0.09, "learning_rate": 4.308093994778068e-06, "logits/chosen": 0.9728350639343262, "logits/rejected": 1.0812106132507324, "logps/chosen": -278.6542663574219, "logps/rejected": -243.5943603515625, "loss": 0.6924, "rewards/accuracies": 0.59375, "rewards/chosen": 0.005831174552440643, "rewards/margins": 0.016604231670498848, "rewards/rejected": -0.01077305804938078, "step": 330 }, { "epoch": 0.09, "learning_rate": 4.4386422976501306e-06, "logits/chosen": 0.9642572402954102, "logits/rejected": 0.9961212873458862, "logps/chosen": -273.8861389160156, "logps/rejected": -268.4089050292969, "loss": 0.6924, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.007403290830552578, "rewards/margins": 0.017763126641511917, "rewards/rejected": -0.010359834879636765, "step": 340 }, { "epoch": 0.09, "learning_rate": 4.569190600522193e-06, "logits/chosen": 0.9777683019638062, "logits/rejected": 0.9659522771835327, "logps/chosen": -284.44354248046875, "logps/rejected": -250.377685546875, "loss": 0.6922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.009272046387195587, "rewards/margins": 0.019789326936006546, "rewards/rejected": -0.010517279617488384, "step": 350 }, { "epoch": 0.09, "learning_rate": 4.699738903394257e-06, "logits/chosen": 0.926777184009552, "logits/rejected": 1.0163743495941162, "logps/chosen": -301.6736755371094, "logps/rejected": -258.0189514160156, "loss": 0.6925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.009693610481917858, "rewards/margins": 0.02282491698861122, "rewards/rejected": -0.013131308369338512, "step": 360 }, { "epoch": 0.1, "learning_rate": 4.8302872062663196e-06, "logits/chosen": 0.9656028747558594, "logits/rejected": 0.953553318977356, "logps/chosen": -304.98760986328125, "logps/rejected": -260.2730407714844, "loss": 0.6923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00983491726219654, "rewards/margins": 0.022675124928355217, "rewards/rejected": -0.012840206734836102, "step": 370 }, { "epoch": 0.1, "learning_rate": 4.9608355091383814e-06, "logits/chosen": 0.942939281463623, "logits/rejected": 1.044382929801941, "logps/chosen": -255.0367431640625, "logps/rejected": -223.09664916992188, "loss": 0.6917, "rewards/accuracies": 0.65625, "rewards/chosen": 0.012080366723239422, "rewards/margins": 0.02917185053229332, "rewards/rejected": -0.017091484740376472, "step": 380 }, { "epoch": 0.1, "learning_rate": 4.9999488562447675e-06, "logits/chosen": 0.9582707285881042, "logits/rejected": 0.9803364872932434, "logps/chosen": -298.5652770996094, "logps/rejected": -255.8463592529297, "loss": 0.6923, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.01266313623636961, "rewards/margins": 0.030872434377670288, "rewards/rejected": -0.018209297209978104, "step": 390 }, { "epoch": 0.1, "learning_rate": 4.999698361256577e-06, "logits/chosen": 0.9647480249404907, "logits/rejected": 0.9665771722793579, "logps/chosen": -278.61871337890625, "logps/rejected": -262.8523254394531, "loss": 0.6916, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0069937086664140224, "rewards/margins": 0.03327130153775215, "rewards/rejected": -0.02627759613096714, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": 0.8978338837623596, "eval_logits/rejected": 0.992965817451477, "eval_logps/chosen": -276.977783203125, "eval_logps/rejected": -246.2157440185547, "eval_loss": 0.6920285820960999, "eval_rewards/accuracies": 0.6340000033378601, "eval_rewards/chosen": 0.005690301302820444, "eval_rewards/margins": 0.029688764363527298, "eval_rewards/rejected": -0.02399846352636814, "eval_runtime": 540.0105, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 400 }, { "epoch": 0.11, "learning_rate": 4.999239142174581e-06, "logits/chosen": 0.8835350871086121, "logits/rejected": 0.8935839533805847, "logps/chosen": -293.0201721191406, "logps/rejected": -244.9386444091797, "loss": 0.6921, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.005041834898293018, "rewards/margins": 0.02935781143605709, "rewards/rejected": -0.024315981194376945, "step": 410 }, { "epoch": 0.11, "learning_rate": 4.99857123734344e-06, "logits/chosen": 0.9205878376960754, "logits/rejected": 1.0264567136764526, "logps/chosen": -278.33697509765625, "logps/rejected": -245.725341796875, "loss": 0.6923, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0019535294268280268, "rewards/margins": 0.029373669996857643, "rewards/rejected": -0.02742014452815056, "step": 420 }, { "epoch": 0.11, "learning_rate": 4.997694702533016e-06, "logits/chosen": 0.8263881802558899, "logits/rejected": 0.9619684219360352, "logps/chosen": -264.7477111816406, "logps/rejected": -259.3697509765625, "loss": 0.6921, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.009586657397449017, "rewards/margins": 0.028161650523543358, "rewards/rejected": -0.0377482995390892, "step": 430 }, { "epoch": 0.12, "learning_rate": 4.996609610933713e-06, "logits/chosen": 0.9048721194267273, "logits/rejected": 0.8793436288833618, "logps/chosen": -283.28668212890625, "logps/rejected": -261.3004150390625, "loss": 0.6921, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.013652992434799671, "rewards/margins": 0.024908630177378654, "rewards/rejected": -0.0385616198182106, "step": 440 }, { "epoch": 0.12, "learning_rate": 4.995316053150366e-06, "logits/chosen": 0.8724738359451294, "logits/rejected": 0.9701471328735352, "logps/chosen": -262.9942321777344, "logps/rejected": -236.6386260986328, "loss": 0.6925, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.015233024954795837, "rewards/margins": 0.0235325675457716, "rewards/rejected": -0.038765594363212585, "step": 450 }, { "epoch": 0.12, "learning_rate": 4.9938141371946815e-06, "logits/chosen": 0.910510241985321, "logits/rejected": 1.033053994178772, "logps/chosen": -252.5099334716797, "logps/rejected": -252.48379516601562, "loss": 0.6917, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.006133816204965115, "rewards/margins": 0.03705804795026779, "rewards/rejected": -0.04319187253713608, "step": 460 }, { "epoch": 0.12, "learning_rate": 4.992103988476206e-06, "logits/chosen": 0.8955548405647278, "logits/rejected": 0.9851361513137817, "logps/chosen": -296.1971435546875, "logps/rejected": -249.4320526123047, "loss": 0.6917, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00848749466240406, "rewards/margins": 0.03199051320552826, "rewards/rejected": -0.04047800973057747, "step": 470 }, { "epoch": 0.13, "learning_rate": 4.990185749791866e-06, "logits/chosen": 0.8434032201766968, "logits/rejected": 0.9481671452522278, "logps/chosen": -243.55819702148438, "logps/rejected": -213.0118865966797, "loss": 0.6919, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.011666789650917053, "rewards/margins": 0.036795832216739655, "rewards/rejected": -0.04846261814236641, "step": 480 }, { "epoch": 0.13, "learning_rate": 4.9880595813140395e-06, "logits/chosen": 0.8387807011604309, "logits/rejected": 0.9744217991828918, "logps/chosen": -276.4521484375, "logps/rejected": -238.837890625, "loss": 0.6918, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.016785232350230217, "rewards/margins": 0.03324298933148384, "rewards/rejected": -0.05002821609377861, "step": 490 }, { "epoch": 0.13, "learning_rate": 4.985725660577184e-06, "logits/chosen": 0.9428791999816895, "logits/rejected": 0.9261376261711121, "logps/chosen": -245.7830047607422, "logps/rejected": -235.5664825439453, "loss": 0.6913, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.022965628653764725, "rewards/margins": 0.034700777381658554, "rewards/rejected": -0.05766640976071358, "step": 500 }, { "epoch": 0.13, "eval_logits/chosen": 0.8238934874534607, "eval_logits/rejected": 0.9187954068183899, "eval_logps/chosen": -280.7515869140625, "eval_logps/rejected": -250.6851348876953, "eval_loss": 0.6917084455490112, "eval_rewards/accuracies": 0.6309999823570251, "eval_rewards/chosen": -0.03204774856567383, "eval_rewards/margins": 0.03664441406726837, "eval_rewards/rejected": -0.0686921626329422, "eval_runtime": 540.183, "eval_samples_per_second": 3.702, "eval_steps_per_second": 0.926, "step": 500 }, { "epoch": 0.13, "learning_rate": 4.983184182463009e-06, "logits/chosen": 0.8044353723526001, "logits/rejected": 0.970533549785614, "logps/chosen": -285.99761962890625, "logps/rejected": -224.1236114501953, "loss": 0.6915, "rewards/accuracies": 0.625, "rewards/chosen": -0.026267999783158302, "rewards/margins": 0.038414839655160904, "rewards/rejected": -0.06468284130096436, "step": 510 }, { "epoch": 0.14, "learning_rate": 4.980435359184203e-06, "logits/chosen": 0.8804060220718384, "logits/rejected": 0.9580115079879761, "logps/chosen": -290.8780822753906, "logps/rejected": -256.63543701171875, "loss": 0.6914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.026573490351438522, "rewards/margins": 0.04471993073821068, "rewards/rejected": -0.0712934285402298, "step": 520 }, { "epoch": 0.14, "learning_rate": 4.9774794202667236e-06, "logits/chosen": 0.863343358039856, "logits/rejected": 0.9443706274032593, "logps/chosen": -268.26751708984375, "logps/rejected": -238.9235076904297, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": -0.022545838728547096, "rewards/margins": 0.04438090696930885, "rewards/rejected": -0.0669267401099205, "step": 530 }, { "epoch": 0.14, "learning_rate": 4.974316612530615e-06, "logits/chosen": 0.8318734169006348, "logits/rejected": 0.8924848437309265, "logps/chosen": -244.3855438232422, "logps/rejected": -226.19100952148438, "loss": 0.6914, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.028137261047959328, "rewards/margins": 0.04323118180036545, "rewards/rejected": -0.07136844098567963, "step": 540 }, { "epoch": 0.14, "learning_rate": 4.970947200069416e-06, "logits/chosen": 0.8746344447135925, "logits/rejected": 0.9145771861076355, "logps/chosen": -266.9718322753906, "logps/rejected": -231.31741333007812, "loss": 0.6915, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.04218477010726929, "rewards/margins": 0.05346935987472534, "rewards/rejected": -0.09565412998199463, "step": 550 }, { "epoch": 0.15, "learning_rate": 4.967371464228096e-06, "logits/chosen": 0.9077315330505371, "logits/rejected": 0.9803347587585449, "logps/chosen": -288.12908935546875, "logps/rejected": -266.33343505859375, "loss": 0.6922, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.044233765453100204, "rewards/margins": 0.0387597382068634, "rewards/rejected": -0.0829935073852539, "step": 560 }, { "epoch": 0.15, "learning_rate": 4.963589703579569e-06, "logits/chosen": 0.8836727142333984, "logits/rejected": 1.0670572519302368, "logps/chosen": -271.2341003417969, "logps/rejected": -226.1123046875, "loss": 0.6903, "rewards/accuracies": 0.71875, "rewards/chosen": -0.03294859081506729, "rewards/margins": 0.06488404422998428, "rewards/rejected": -0.09783263504505157, "step": 570 }, { "epoch": 0.15, "learning_rate": 4.9596022338997615e-06, "logits/chosen": 0.9106415510177612, "logits/rejected": 0.8857673406600952, "logps/chosen": -260.3927307128906, "logps/rejected": -232.7233428955078, "loss": 0.6919, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0389188714325428, "rewards/margins": 0.03832856938242912, "rewards/rejected": -0.07724744826555252, "step": 580 }, { "epoch": 0.15, "learning_rate": 4.955409388141243e-06, "logits/chosen": 0.9676392674446106, "logits/rejected": 0.9093373417854309, "logps/chosen": -271.92193603515625, "logps/rejected": -235.1454620361328, "loss": 0.6912, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.04100045561790466, "rewards/margins": 0.03780464082956314, "rewards/rejected": -0.0788050964474678, "step": 590 }, { "epoch": 0.16, "learning_rate": 4.951011516405429e-06, "logits/chosen": 0.9376009702682495, "logits/rejected": 0.9335840344429016, "logps/chosen": -242.3965301513672, "logps/rejected": -265.5517578125, "loss": 0.6916, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0576152577996254, "rewards/margins": 0.03630609065294266, "rewards/rejected": -0.09392134845256805, "step": 600 }, { "epoch": 0.16, "eval_logits/chosen": 0.8586252927780151, "eval_logits/rejected": 0.9506573677062988, "eval_logps/chosen": -283.59686279296875, "eval_logps/rejected": -254.26144409179688, "eval_loss": 0.6914932131767273, "eval_rewards/accuracies": 0.6215000152587891, "eval_rewards/chosen": -0.06050081178545952, "eval_rewards/margins": 0.0439542680978775, "eval_rewards/rejected": -0.10445508360862732, "eval_runtime": 539.8865, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 600 }, { "epoch": 0.16, "learning_rate": 4.946408985913344e-06, "logits/chosen": 0.9270865321159363, "logits/rejected": 0.946628212928772, "logps/chosen": -250.1840057373047, "logps/rejected": -228.7745361328125, "loss": 0.6922, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06987909972667694, "rewards/margins": 0.03838449344038963, "rewards/rejected": -0.10826359689235687, "step": 610 }, { "epoch": 0.16, "learning_rate": 4.941602180974958e-06, "logits/chosen": 0.9000824093818665, "logits/rejected": 0.9378656148910522, "logps/chosen": -292.17010498046875, "logps/rejected": -237.016357421875, "loss": 0.6917, "rewards/accuracies": 0.625, "rewards/chosen": -0.06269699335098267, "rewards/margins": 0.043883226811885834, "rewards/rejected": -0.1065802201628685, "step": 620 }, { "epoch": 0.16, "learning_rate": 4.936591502957101e-06, "logits/chosen": 0.7903264760971069, "logits/rejected": 0.9245864152908325, "logps/chosen": -301.8384704589844, "logps/rejected": -256.8984069824219, "loss": 0.6926, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04840216785669327, "rewards/margins": 0.0328446589410305, "rewards/rejected": -0.08124681562185287, "step": 630 }, { "epoch": 0.17, "learning_rate": 4.931377370249946e-06, "logits/chosen": 0.8983148336410522, "logits/rejected": 0.9722617268562317, "logps/chosen": -278.6772155761719, "logps/rejected": -235.2743377685547, "loss": 0.6919, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04875911772251129, "rewards/margins": 0.03910985589027405, "rewards/rejected": -0.08786897361278534, "step": 640 }, { "epoch": 0.17, "learning_rate": 4.925960218232073e-06, "logits/chosen": 0.9069989323616028, "logits/rejected": 1.027419924736023, "logps/chosen": -308.1993103027344, "logps/rejected": -285.44903564453125, "loss": 0.6914, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05254577472805977, "rewards/margins": 0.04926187917590141, "rewards/rejected": -0.10180766880512238, "step": 650 }, { "epoch": 0.17, "learning_rate": 4.920340499234116e-06, "logits/chosen": 0.8621240854263306, "logits/rejected": 0.9766268730163574, "logps/chosen": -288.5992431640625, "logps/rejected": -216.592041015625, "loss": 0.692, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0491311252117157, "rewards/margins": 0.03728429228067398, "rewards/rejected": -0.08641541749238968, "step": 660 }, { "epoch": 0.18, "learning_rate": 4.914518682500995e-06, "logits/chosen": 0.9439069032669067, "logits/rejected": 0.9264806509017944, "logps/chosen": -282.1919250488281, "logps/rejected": -278.38037109375, "loss": 0.6915, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05234139412641525, "rewards/margins": 0.04387987032532692, "rewards/rejected": -0.09622126072645187, "step": 670 }, { "epoch": 0.18, "learning_rate": 4.9084952541527315e-06, "logits/chosen": 0.8950139880180359, "logits/rejected": 0.9192444682121277, "logps/chosen": -258.887939453125, "logps/rejected": -246.53256225585938, "loss": 0.6914, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.03944612294435501, "rewards/margins": 0.04930661618709564, "rewards/rejected": -0.08875273913145065, "step": 680 }, { "epoch": 0.18, "learning_rate": 4.902270717143858e-06, "logits/chosen": 0.8779371380805969, "logits/rejected": 0.9450112581253052, "logps/chosen": -288.85736083984375, "logps/rejected": -251.94943237304688, "loss": 0.6912, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.032757535576820374, "rewards/margins": 0.05075981095433235, "rewards/rejected": -0.08351735770702362, "step": 690 }, { "epoch": 0.18, "learning_rate": 4.895845591221427e-06, "logits/chosen": 0.7992750406265259, "logits/rejected": 0.9617929458618164, "logps/chosen": -251.58767700195312, "logps/rejected": -232.5711669921875, "loss": 0.6911, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04521554335951805, "rewards/margins": 0.0445394404232502, "rewards/rejected": -0.08975498378276825, "step": 700 }, { "epoch": 0.18, "eval_logits/chosen": 0.8818045854568481, "eval_logits/rejected": 0.9764631390571594, "eval_logps/chosen": -281.1485595703125, "eval_logps/rejected": -251.7943878173828, "eval_loss": 0.6914020776748657, "eval_rewards/accuracies": 0.6259999871253967, "eval_rewards/chosen": -0.03601725026965141, "eval_rewards/margins": 0.04376746341586113, "eval_rewards/rejected": -0.07978471368551254, "eval_runtime": 539.8363, "eval_samples_per_second": 3.705, "eval_steps_per_second": 0.926, "step": 700 }, { "epoch": 0.19, "learning_rate": 4.8892204128816e-06, "logits/chosen": 0.9707541465759277, "logits/rejected": 1.0011179447174072, "logps/chosen": -286.44696044921875, "logps/rejected": -283.7610778808594, "loss": 0.6911, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0358760841190815, "rewards/margins": 0.044750213623046875, "rewards/rejected": -0.08062629401683807, "step": 710 }, { "epoch": 0.19, "learning_rate": 4.882395735324864e-06, "logits/chosen": 0.8825947046279907, "logits/rejected": 0.8657267689704895, "logps/chosen": -280.93768310546875, "logps/rejected": -255.46212768554688, "loss": 0.6914, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.026189198717474937, "rewards/margins": 0.043071091175079346, "rewards/rejected": -0.06926029175519943, "step": 720 }, { "epoch": 0.19, "learning_rate": 4.87537212840983e-06, "logits/chosen": 0.9044111371040344, "logits/rejected": 0.9175283312797546, "logps/chosen": -301.9758605957031, "logps/rejected": -256.6850280761719, "loss": 0.6915, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.024754587560892105, "rewards/margins": 0.04776642844080925, "rewards/rejected": -0.07252101600170135, "step": 730 }, { "epoch": 0.19, "learning_rate": 4.8681501786056545e-06, "logits/chosen": 0.8817905187606812, "logits/rejected": 0.9648914337158203, "logps/chosen": -273.3789367675781, "logps/rejected": -221.35986328125, "loss": 0.6916, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.03455664590001106, "rewards/margins": 0.03378739953041077, "rewards/rejected": -0.06834404170513153, "step": 740 }, { "epoch": 0.2, "learning_rate": 4.860730488943068e-06, "logits/chosen": 0.9233236312866211, "logits/rejected": 0.9309977293014526, "logps/chosen": -278.74212646484375, "logps/rejected": -244.9453582763672, "loss": 0.6907, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.013511193916201591, "rewards/margins": 0.05613558739423752, "rewards/rejected": -0.06964678317308426, "step": 750 }, { "epoch": 0.2, "learning_rate": 4.853113678964022e-06, "logits/chosen": 0.9726675152778625, "logits/rejected": 0.9908739924430847, "logps/chosen": -249.2692108154297, "logps/rejected": -234.45663452148438, "loss": 0.6908, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.023977819830179214, "rewards/margins": 0.054052967578172684, "rewards/rejected": -0.0780307799577713, "step": 760 }, { "epoch": 0.2, "learning_rate": 4.845300384669958e-06, "logits/chosen": 0.9504178166389465, "logits/rejected": 0.9623494148254395, "logps/chosen": -265.9614562988281, "logps/rejected": -223.534423828125, "loss": 0.6918, "rewards/accuracies": 0.5625, "rewards/chosen": -0.033301644027233124, "rewards/margins": 0.033827196806669235, "rewards/rejected": -0.06712885200977325, "step": 770 }, { "epoch": 0.2, "learning_rate": 4.837291258468701e-06, "logits/chosen": 0.8907458186149597, "logits/rejected": 0.9131177663803101, "logps/chosen": -279.34661865234375, "logps/rejected": -248.4618377685547, "loss": 0.6911, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.04006670415401459, "rewards/margins": 0.05590524524450302, "rewards/rejected": -0.09597194939851761, "step": 780 }, { "epoch": 0.21, "learning_rate": 4.829086969119984e-06, "logits/chosen": 0.8844520449638367, "logits/rejected": 0.9553192257881165, "logps/chosen": -248.71533203125, "logps/rejected": -241.25350952148438, "loss": 0.6923, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.039020974189043045, "rewards/margins": 0.04393316060304642, "rewards/rejected": -0.08295414596796036, "step": 790 }, { "epoch": 0.21, "learning_rate": 4.820688201679605e-06, "logits/chosen": 0.9063314199447632, "logits/rejected": 0.9573804140090942, "logps/chosen": -261.8778381347656, "logps/rejected": -234.04788208007812, "loss": 0.6915, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04035869985818863, "rewards/margins": 0.04505294933915138, "rewards/rejected": -0.08541164547204971, "step": 800 }, { "epoch": 0.21, "eval_logits/chosen": 0.9022059440612793, "eval_logits/rejected": 0.9965441226959229, "eval_logps/chosen": -281.8776550292969, "eval_logps/rejected": -252.8778533935547, "eval_loss": 0.6913056373596191, "eval_rewards/accuracies": 0.6240000128746033, "eval_rewards/chosen": -0.04330845922231674, "eval_rewards/margins": 0.04731076583266258, "eval_rewards/rejected": -0.09061922132968903, "eval_runtime": 540.1978, "eval_samples_per_second": 3.702, "eval_steps_per_second": 0.926, "step": 800 }, { "epoch": 0.21, "learning_rate": 4.8120956574422315e-06, "logits/chosen": 0.9879693984985352, "logits/rejected": 0.9654221534729004, "logps/chosen": -284.70416259765625, "logps/rejected": -282.3924255371094, "loss": 0.6916, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04387697950005531, "rewards/margins": 0.05213203281164169, "rewards/rejected": -0.0960090160369873, "step": 810 }, { "epoch": 0.21, "learning_rate": 4.803310053882831e-06, "logits/chosen": 0.9151199460029602, "logits/rejected": 0.9490255117416382, "logps/chosen": -214.4105987548828, "logps/rejected": -204.9471435546875, "loss": 0.6918, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04438359662890434, "rewards/margins": 0.04097510129213333, "rewards/rejected": -0.08535870164632797, "step": 820 }, { "epoch": 0.22, "learning_rate": 4.794332124596775e-06, "logits/chosen": 0.910225510597229, "logits/rejected": 0.9843104481697083, "logps/chosen": -279.525634765625, "logps/rejected": -255.632568359375, "loss": 0.6919, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.060014404356479645, "rewards/margins": 0.03720748424530029, "rewards/rejected": -0.09722188115119934, "step": 830 }, { "epoch": 0.22, "learning_rate": 4.785162619238575e-06, "logits/chosen": 0.9961544275283813, "logits/rejected": 1.0286822319030762, "logps/chosen": -268.55023193359375, "logps/rejected": -252.0181121826172, "loss": 0.6915, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.07715798169374466, "rewards/margins": 0.04793631657958031, "rewards/rejected": -0.12509429454803467, "step": 840 }, { "epoch": 0.22, "learning_rate": 4.775802303459288e-06, "logits/chosen": 0.9325528144836426, "logits/rejected": 1.0585540533065796, "logps/chosen": -295.1412658691406, "logps/rejected": -265.9173889160156, "loss": 0.6911, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0818694531917572, "rewards/margins": 0.049801088869571686, "rewards/rejected": -0.13167054951190948, "step": 850 }, { "epoch": 0.23, "learning_rate": 4.766251958842589e-06, "logits/chosen": 0.9676458239555359, "logits/rejected": 1.0127732753753662, "logps/chosen": -307.82403564453125, "logps/rejected": -271.78009033203125, "loss": 0.6909, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0727236270904541, "rewards/margins": 0.0467851385474205, "rewards/rejected": -0.1195087656378746, "step": 860 }, { "epoch": 0.23, "learning_rate": 4.7565123828395066e-06, "logits/chosen": 0.9034944772720337, "logits/rejected": 1.0843673944473267, "logps/chosen": -318.55096435546875, "logps/rejected": -294.28131103515625, "loss": 0.6914, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.050767261534929276, "rewards/margins": 0.045433562248945236, "rewards/rejected": -0.09620082378387451, "step": 870 }, { "epoch": 0.23, "learning_rate": 4.746584388701831e-06, "logits/chosen": 0.990381121635437, "logits/rejected": 1.1196520328521729, "logps/chosen": -261.2209777832031, "logps/rejected": -227.89700317382812, "loss": 0.6888, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.05034313350915909, "rewards/margins": 0.06942330300807953, "rewards/rejected": -0.11976643651723862, "step": 880 }, { "epoch": 0.23, "learning_rate": 4.736468805414218e-06, "logits/chosen": 0.9858494997024536, "logits/rejected": 1.1457973718643188, "logps/chosen": -296.84906005859375, "logps/rejected": -270.0372009277344, "loss": 0.6899, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04343515634536743, "rewards/margins": 0.07933736592531204, "rewards/rejected": -0.12277251482009888, "step": 890 }, { "epoch": 0.24, "learning_rate": 4.7261664776249595e-06, "logits/chosen": 0.9974620938301086, "logits/rejected": 0.9908777475357056, "logps/chosen": -273.75848388671875, "logps/rejected": -256.10833740234375, "loss": 0.691, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05218503996729851, "rewards/margins": 0.057306624948978424, "rewards/rejected": -0.10949166119098663, "step": 900 }, { "epoch": 0.24, "eval_logits/chosen": 0.9266276955604553, "eval_logits/rejected": 1.0206482410430908, "eval_logps/chosen": -282.83209228515625, "eval_logps/rejected": -254.36526489257812, "eval_loss": 0.6912428140640259, "eval_rewards/accuracies": 0.6244999766349792, "eval_rewards/chosen": -0.052852813154459, "eval_rewards/margins": 0.052640702575445175, "eval_rewards/rejected": -0.10549352318048477, "eval_runtime": 539.8724, "eval_samples_per_second": 3.705, "eval_steps_per_second": 0.926, "step": 900 }, { "epoch": 0.24, "learning_rate": 4.715678265575463e-06, "logits/chosen": 1.0043308734893799, "logits/rejected": 1.0745335817337036, "logps/chosen": -312.68194580078125, "logps/rejected": -295.7378845214844, "loss": 0.6918, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.05200989916920662, "rewards/margins": 0.03657294064760208, "rewards/rejected": -0.088582843542099, "step": 910 }, { "epoch": 0.24, "learning_rate": 4.705005045028415e-06, "logits/chosen": 0.9822233319282532, "logits/rejected": 1.0417166948318481, "logps/chosen": -336.60919189453125, "logps/rejected": -289.87689208984375, "loss": 0.6908, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04931178689002991, "rewards/margins": 0.06655998528003693, "rewards/rejected": -0.11587176471948624, "step": 920 }, { "epoch": 0.24, "learning_rate": 4.694147707194659e-06, "logits/chosen": 0.9825722575187683, "logits/rejected": 1.0669372081756592, "logps/chosen": -291.48394775390625, "logps/rejected": -246.3392333984375, "loss": 0.6914, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.06181565672159195, "rewards/margins": 0.05421454459428787, "rewards/rejected": -0.11603017896413803, "step": 930 }, { "epoch": 0.25, "learning_rate": 4.683107158658782e-06, "logits/chosen": 0.9451677203178406, "logits/rejected": 0.9427906274795532, "logps/chosen": -267.2729797363281, "logps/rejected": -239.00460815429688, "loss": 0.6903, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.062487781047821045, "rewards/margins": 0.05825795605778694, "rewards/rejected": -0.12074574083089828, "step": 940 }, { "epoch": 0.25, "learning_rate": 4.671884321303407e-06, "logits/chosen": 0.9309576749801636, "logits/rejected": 0.9632508158683777, "logps/chosen": -292.92864990234375, "logps/rejected": -279.8366394042969, "loss": 0.6911, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.08061058819293976, "rewards/margins": 0.05397840216755867, "rewards/rejected": -0.13458898663520813, "step": 950 }, { "epoch": 0.25, "learning_rate": 4.660480132232224e-06, "logits/chosen": 0.8813568353652954, "logits/rejected": 0.9471953511238098, "logps/chosen": -284.23333740234375, "logps/rejected": -256.72662353515625, "loss": 0.6908, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08392615616321564, "rewards/margins": 0.04842451959848404, "rewards/rejected": -0.13235066831111908, "step": 960 }, { "epoch": 0.25, "learning_rate": 4.6488955436917414e-06, "logits/chosen": 0.9631437063217163, "logits/rejected": 1.0291635990142822, "logps/chosen": -284.9555358886719, "logps/rejected": -221.7299041748047, "loss": 0.6916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07446320354938507, "rewards/margins": 0.03399290144443512, "rewards/rejected": -0.10845611244440079, "step": 970 }, { "epoch": 0.26, "learning_rate": 4.6371315229917644e-06, "logits/chosen": 0.9003992080688477, "logits/rejected": 0.9652150273323059, "logps/chosen": -267.9747619628906, "logps/rejected": -250.9159393310547, "loss": 0.6915, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.05370105430483818, "rewards/margins": 0.05308745428919792, "rewards/rejected": -0.1067885160446167, "step": 980 }, { "epoch": 0.26, "learning_rate": 4.625189052424638e-06, "logits/chosen": 0.9419862627983093, "logits/rejected": 1.0019476413726807, "logps/chosen": -249.2467803955078, "logps/rejected": -216.28857421875, "loss": 0.6914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04485908895730972, "rewards/margins": 0.03853844106197357, "rewards/rejected": -0.0833975300192833, "step": 990 }, { "epoch": 0.26, "learning_rate": 4.613069129183218e-06, "logits/chosen": 0.9215824007987976, "logits/rejected": 0.9359531402587891, "logps/chosen": -265.0127868652344, "logps/rejected": -257.2760314941406, "loss": 0.6913, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.047946006059646606, "rewards/margins": 0.042474087327718735, "rewards/rejected": -0.09042008966207504, "step": 1000 }, { "epoch": 0.26, "eval_logits/chosen": 0.9215968251228333, "eval_logits/rejected": 1.01704740524292, "eval_logps/chosen": -281.5216064453125, "eval_logps/rejected": -252.86395263671875, "eval_loss": 0.6912136673927307, "eval_rewards/accuracies": 0.6290000081062317, "eval_rewards/chosen": -0.03974788263440132, "eval_rewards/margins": 0.0507323183119297, "eval_rewards/rejected": -0.09048020094633102, "eval_runtime": 539.8968, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 1000 }, { "epoch": 0.26, "learning_rate": 4.600772765277607e-06, "logits/chosen": 0.9124513864517212, "logits/rejected": 1.0127254724502563, "logps/chosen": -300.19659423828125, "logps/rejected": -253.8302764892578, "loss": 0.6915, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.04729009419679642, "rewards/margins": 0.055154770612716675, "rewards/rejected": -0.1024448499083519, "step": 1010 }, { "epoch": 0.27, "learning_rate": 4.588300987450652e-06, "logits/chosen": 0.9808125495910645, "logits/rejected": 1.0599421262741089, "logps/chosen": -282.46038818359375, "logps/rejected": -267.24188232421875, "loss": 0.6912, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.05007319524884224, "rewards/margins": 0.05987462401390076, "rewards/rejected": -0.1099478155374527, "step": 1020 }, { "epoch": 0.27, "learning_rate": 4.5756548370922136e-06, "logits/chosen": 0.9697202444076538, "logits/rejected": 1.0168477296829224, "logps/chosen": -282.2762145996094, "logps/rejected": -255.68844604492188, "loss": 0.6921, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.044876083731651306, "rewards/margins": 0.03766594082117081, "rewards/rejected": -0.08254201710224152, "step": 1030 }, { "epoch": 0.27, "learning_rate": 4.562835370152206e-06, "logits/chosen": 0.9049477577209473, "logits/rejected": 1.021236538887024, "logps/chosen": -253.36471557617188, "logps/rejected": -239.316650390625, "loss": 0.692, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.038831353187561035, "rewards/margins": 0.035190094262361526, "rewards/rejected": -0.07402144372463226, "step": 1040 }, { "epoch": 0.27, "learning_rate": 4.54984365705243e-06, "logits/chosen": 1.0183782577514648, "logits/rejected": 1.024595022201538, "logps/chosen": -254.61691284179688, "logps/rejected": -231.1280975341797, "loss": 0.6925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.027654901146888733, "rewards/margins": 0.042562730610370636, "rewards/rejected": -0.07021763920783997, "step": 1050 }, { "epoch": 0.28, "learning_rate": 4.536680782597191e-06, "logits/chosen": 0.9354788661003113, "logits/rejected": 1.0331499576568604, "logps/chosen": -297.9024963378906, "logps/rejected": -269.0444030761719, "loss": 0.6908, "rewards/accuracies": 0.65625, "rewards/chosen": -0.016384651884436607, "rewards/margins": 0.05180732160806656, "rewards/rejected": -0.06819198280572891, "step": 1060 }, { "epoch": 0.28, "learning_rate": 4.523347845882718e-06, "logits/chosen": 1.014983057975769, "logits/rejected": 1.0802810192108154, "logps/chosen": -258.166015625, "logps/rejected": -207.4542999267578, "loss": 0.6907, "rewards/accuracies": 0.65625, "rewards/chosen": -0.028195038437843323, "rewards/margins": 0.05233887955546379, "rewards/rejected": -0.08053391426801682, "step": 1070 }, { "epoch": 0.28, "learning_rate": 4.50984596020539e-06, "logits/chosen": 0.9456683397293091, "logits/rejected": 1.018128752708435, "logps/chosen": -300.35211181640625, "logps/rejected": -264.9364318847656, "loss": 0.6921, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.030218075960874557, "rewards/margins": 0.046960409730672836, "rewards/rejected": -0.0771784856915474, "step": 1080 }, { "epoch": 0.29, "learning_rate": 4.4961762529687745e-06, "logits/chosen": 0.9062995910644531, "logits/rejected": 1.0201383829116821, "logps/chosen": -305.39776611328125, "logps/rejected": -233.7451171875, "loss": 0.6919, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04302235692739487, "rewards/margins": 0.04250651225447655, "rewards/rejected": -0.08552887290716171, "step": 1090 }, { "epoch": 0.29, "learning_rate": 4.482339865589492e-06, "logits/chosen": 0.8466746211051941, "logits/rejected": 1.021051049232483, "logps/chosen": -266.606689453125, "logps/rejected": -235.42123413085938, "loss": 0.6912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04704426974058151, "rewards/margins": 0.05677234008908272, "rewards/rejected": -0.10381660610437393, "step": 1100 }, { "epoch": 0.29, "eval_logits/chosen": 0.9243915677070618, "eval_logits/rejected": 1.018973708152771, "eval_logps/chosen": -283.05096435546875, "eval_logps/rejected": -253.97817993164062, "eval_loss": 0.6911666989326477, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -0.05504164099693298, "eval_rewards/margins": 0.046581096947193146, "eval_rewards/rejected": -0.10162272304296494, "eval_runtime": 539.981, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 1100 }, { "epoch": 0.29, "learning_rate": 4.468337953401909e-06, "logits/chosen": 0.9825204610824585, "logits/rejected": 1.0412126779556274, "logps/chosen": -299.93157958984375, "logps/rejected": -247.61044311523438, "loss": 0.6906, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.04361564666032791, "rewards/margins": 0.049379851669073105, "rewards/rejected": -0.09299550205469131, "step": 1110 }, { "epoch": 0.29, "learning_rate": 4.45417168556166e-06, "logits/chosen": 0.9352057576179504, "logits/rejected": 1.0108729600906372, "logps/chosen": -261.39959716796875, "logps/rejected": -262.29864501953125, "loss": 0.6908, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03935537114739418, "rewards/margins": 0.043966446071863174, "rewards/rejected": -0.08332181721925735, "step": 1120 }, { "epoch": 0.3, "learning_rate": 4.439842244948036e-06, "logits/chosen": 0.9064655303955078, "logits/rejected": 0.9808236956596375, "logps/chosen": -262.05462646484375, "logps/rejected": -245.01651000976562, "loss": 0.6909, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.042961589992046356, "rewards/margins": 0.04068412259221077, "rewards/rejected": -0.08364571630954742, "step": 1130 }, { "epoch": 0.3, "learning_rate": 4.425350828065204e-06, "logits/chosen": 0.9776175618171692, "logits/rejected": 0.9820224642753601, "logps/chosen": -282.891357421875, "logps/rejected": -266.565673828125, "loss": 0.6908, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.034526508301496506, "rewards/margins": 0.04868306592106819, "rewards/rejected": -0.0832095816731453, "step": 1140 }, { "epoch": 0.3, "learning_rate": 4.410698644942303e-06, "logits/chosen": 0.9672806859016418, "logits/rejected": 1.0234858989715576, "logps/chosen": -281.4849548339844, "logps/rejected": -285.44293212890625, "loss": 0.6915, "rewards/accuracies": 0.5625, "rewards/chosen": -0.032487623393535614, "rewards/margins": 0.037405431270599365, "rewards/rejected": -0.06989306956529617, "step": 1150 }, { "epoch": 0.3, "learning_rate": 4.395886919032406e-06, "logits/chosen": 0.9714674949645996, "logits/rejected": 1.0344150066375732, "logps/chosen": -274.4117126464844, "logps/rejected": -232.0000762939453, "loss": 0.6913, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.031604982912540436, "rewards/margins": 0.046002503484487534, "rewards/rejected": -0.07760748267173767, "step": 1160 }, { "epoch": 0.31, "learning_rate": 4.380916887110366e-06, "logits/chosen": 0.9162635803222656, "logits/rejected": 1.0360467433929443, "logps/chosen": -251.49404907226562, "logps/rejected": -223.4456329345703, "loss": 0.6902, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.031815849244594574, "rewards/margins": 0.059083469212055206, "rewards/rejected": -0.09089931845664978, "step": 1170 }, { "epoch": 0.31, "learning_rate": 4.365789799169539e-06, "logits/chosen": 0.977371096611023, "logits/rejected": 0.9612863659858704, "logps/chosen": -313.4932556152344, "logps/rejected": -262.97113037109375, "loss": 0.6915, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.05341174453496933, "rewards/margins": 0.05502920225262642, "rewards/rejected": -0.10844095051288605, "step": 1180 }, { "epoch": 0.31, "learning_rate": 4.350506918317416e-06, "logits/chosen": 0.9038007855415344, "logits/rejected": 0.8652862310409546, "logps/chosen": -306.91912841796875, "logps/rejected": -275.84429931640625, "loss": 0.6925, "rewards/accuracies": 0.5625, "rewards/chosen": -0.041279759258031845, "rewards/margins": 0.039026908576488495, "rewards/rejected": -0.08030666410923004, "step": 1190 }, { "epoch": 0.31, "learning_rate": 4.335069520670149e-06, "logits/chosen": 0.9440900683403015, "logits/rejected": 1.0484023094177246, "logps/chosen": -281.1288757324219, "logps/rejected": -216.98269653320312, "loss": 0.6902, "rewards/accuracies": 0.65625, "rewards/chosen": -0.047745514661073685, "rewards/margins": 0.06614203751087189, "rewards/rejected": -0.11388754844665527, "step": 1200 }, { "epoch": 0.31, "eval_logits/chosen": 0.9163612127304077, "eval_logits/rejected": 1.0101075172424316, "eval_logps/chosen": -283.2487487792969, "eval_logps/rejected": -254.8289031982422, "eval_loss": 0.6911502480506897, "eval_rewards/accuracies": 0.6230000257492065, "eval_rewards/chosen": -0.057019300758838654, "eval_rewards/margins": 0.0531105101108551, "eval_rewards/rejected": -0.11012981832027435, "eval_runtime": 539.827, "eval_samples_per_second": 3.705, "eval_steps_per_second": 0.926, "step": 1200 }, { "epoch": 0.32, "learning_rate": 4.319478895246e-06, "logits/chosen": 0.9531642198562622, "logits/rejected": 0.9094236493110657, "logps/chosen": -278.7569274902344, "logps/rejected": -242.00259399414062, "loss": 0.6912, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04623215273022652, "rewards/margins": 0.04663626849651337, "rewards/rejected": -0.09286841005086899, "step": 1210 }, { "epoch": 0.32, "learning_rate": 4.303736343857704e-06, "logits/chosen": 1.053362488746643, "logits/rejected": 1.00510573387146, "logps/chosen": -268.8019714355469, "logps/rejected": -252.3562774658203, "loss": 0.6912, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.047048769891262054, "rewards/margins": 0.0653461441397667, "rewards/rejected": -0.11239492893218994, "step": 1220 }, { "epoch": 0.32, "learning_rate": 4.287843181003772e-06, "logits/chosen": 0.9648410677909851, "logits/rejected": 1.080945611000061, "logps/chosen": -280.63470458984375, "logps/rejected": -238.6513214111328, "loss": 0.6903, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.029626190662384033, "rewards/margins": 0.06667675822973251, "rewards/rejected": -0.09630295634269714, "step": 1230 }, { "epoch": 0.32, "learning_rate": 4.27180073375873e-06, "logits/chosen": 0.9461795687675476, "logits/rejected": 0.9696234464645386, "logps/chosen": -280.750244140625, "logps/rejected": -253.75149536132812, "loss": 0.6907, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.02992023155093193, "rewards/margins": 0.06105700135231018, "rewards/rejected": -0.09097723662853241, "step": 1240 }, { "epoch": 0.33, "learning_rate": 4.255610341662304e-06, "logits/chosen": 0.9404398798942566, "logits/rejected": 1.006503701210022, "logps/chosen": -306.0481262207031, "logps/rejected": -267.99591064453125, "loss": 0.6916, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.015811875462532043, "rewards/margins": 0.061067551374435425, "rewards/rejected": -0.07687942683696747, "step": 1250 }, { "epoch": 0.33, "learning_rate": 4.2392733566075764e-06, "logits/chosen": 0.9584512710571289, "logits/rejected": 1.0292441844940186, "logps/chosen": -243.0128173828125, "logps/rejected": -224.4785614013672, "loss": 0.6905, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.021735265851020813, "rewards/margins": 0.06055579334497452, "rewards/rejected": -0.08229105174541473, "step": 1260 }, { "epoch": 0.33, "learning_rate": 4.2227911427280975e-06, "logits/chosen": 0.9586073160171509, "logits/rejected": 1.008643388748169, "logps/chosen": -286.1580505371094, "logps/rejected": -270.8837585449219, "loss": 0.6918, "rewards/accuracies": 0.59375, "rewards/chosen": -0.023701880127191544, "rewards/margins": 0.04824981838464737, "rewards/rejected": -0.07195170223712921, "step": 1270 }, { "epoch": 0.33, "learning_rate": 4.206165076283983e-06, "logits/chosen": 0.9629222750663757, "logits/rejected": 1.0080350637435913, "logps/chosen": -238.8936309814453, "logps/rejected": -228.6975860595703, "loss": 0.6913, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.022201109677553177, "rewards/margins": 0.04362647980451584, "rewards/rejected": -0.06582758575677872, "step": 1280 }, { "epoch": 0.34, "learning_rate": 4.189396545546995e-06, "logits/chosen": 0.9939113855361938, "logits/rejected": 1.0935704708099365, "logps/chosen": -289.91900634765625, "logps/rejected": -280.5440673828125, "loss": 0.6917, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.020857712253928185, "rewards/margins": 0.0328826978802681, "rewards/rejected": -0.05374041199684143, "step": 1290 }, { "epoch": 0.34, "learning_rate": 4.172486950684627e-06, "logits/chosen": 1.0020853281021118, "logits/rejected": 1.0316828489303589, "logps/chosen": -279.2798767089844, "logps/rejected": -262.09442138671875, "loss": 0.6912, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.027787720784544945, "rewards/margins": 0.048239342868328094, "rewards/rejected": -0.07602706551551819, "step": 1300 }, { "epoch": 0.34, "eval_logits/chosen": 0.9400979280471802, "eval_logits/rejected": 1.0356999635696411, "eval_logps/chosen": -279.8863830566406, "eval_logps/rejected": -251.13418579101562, "eval_loss": 0.6911138296127319, "eval_rewards/accuracies": 0.6129999756813049, "eval_rewards/chosen": -0.023395679891109467, "eval_rewards/margins": 0.04978705570101738, "eval_rewards/rejected": -0.07318273931741714, "eval_runtime": 540.0054, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 1300 }, { "epoch": 0.34, "learning_rate": 4.155437703643182e-06, "logits/chosen": 0.8979179263114929, "logits/rejected": 0.9356774091720581, "logps/chosen": -284.64422607421875, "logps/rejected": -251.070556640625, "loss": 0.6909, "rewards/accuracies": 0.65625, "rewards/chosen": -0.017777901142835617, "rewards/margins": 0.06165579706430435, "rewards/rejected": -0.07943369448184967, "step": 1310 }, { "epoch": 0.35, "learning_rate": 4.138250228029882e-06, "logits/chosen": 0.9831737279891968, "logits/rejected": 1.103745460510254, "logps/chosen": -233.90310668945312, "logps/rejected": -216.5464324951172, "loss": 0.6901, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.01819593645632267, "rewards/margins": 0.0664977878332138, "rewards/rejected": -0.08469371497631073, "step": 1320 }, { "epoch": 0.35, "learning_rate": 4.120925958993994e-06, "logits/chosen": 0.948932945728302, "logits/rejected": 1.0110113620758057, "logps/chosen": -296.0926818847656, "logps/rejected": -249.8265380859375, "loss": 0.6918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02898983657360077, "rewards/margins": 0.047562919557094574, "rewards/rejected": -0.07655275613069534, "step": 1330 }, { "epoch": 0.35, "learning_rate": 4.103466343106999e-06, "logits/chosen": 0.9645043611526489, "logits/rejected": 0.9807823896408081, "logps/chosen": -274.97869873046875, "logps/rejected": -243.57064819335938, "loss": 0.6912, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.024762103334069252, "rewards/margins": 0.04700572043657303, "rewards/rejected": -0.07176782190799713, "step": 1340 }, { "epoch": 0.35, "learning_rate": 4.085872838241797e-06, "logits/chosen": 0.9856332540512085, "logits/rejected": 1.0186711549758911, "logps/chosen": -286.65740966796875, "logps/rejected": -285.44451904296875, "loss": 0.691, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.015045429579913616, "rewards/margins": 0.0600103922188282, "rewards/rejected": -0.0750558152794838, "step": 1350 }, { "epoch": 0.36, "learning_rate": 4.06814691345098e-06, "logits/chosen": 0.9584442973136902, "logits/rejected": 1.0773932933807373, "logps/chosen": -303.4087829589844, "logps/rejected": -257.34381103515625, "loss": 0.692, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.03773612901568413, "rewards/margins": 0.043736010789871216, "rewards/rejected": -0.08147214353084564, "step": 1360 }, { "epoch": 0.36, "learning_rate": 4.050290048844171e-06, "logits/chosen": 0.9791196584701538, "logits/rejected": 1.037467360496521, "logps/chosen": -256.17120361328125, "logps/rejected": -225.8929443359375, "loss": 0.6917, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.015562976710498333, "rewards/margins": 0.03945142775774002, "rewards/rejected": -0.05501440912485123, "step": 1370 }, { "epoch": 0.36, "learning_rate": 4.032303735464422e-06, "logits/chosen": 0.9337407946586609, "logits/rejected": 1.0309889316558838, "logps/chosen": -289.6443176269531, "logps/rejected": -257.22784423828125, "loss": 0.6908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.005114209838211536, "rewards/margins": 0.05096329376101494, "rewards/rejected": -0.0560775101184845, "step": 1380 }, { "epoch": 0.36, "learning_rate": 4.014189475163727e-06, "logits/chosen": 0.9751637578010559, "logits/rejected": 1.0514873266220093, "logps/chosen": -266.8908386230469, "logps/rejected": -240.3486328125, "loss": 0.6921, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006348415277898312, "rewards/margins": 0.04528724402189255, "rewards/rejected": -0.05163566395640373, "step": 1390 }, { "epoch": 0.37, "learning_rate": 3.995948780477605e-06, "logits/chosen": 1.0069153308868408, "logits/rejected": 1.0419646501541138, "logps/chosen": -287.21966552734375, "logps/rejected": -246.9211883544922, "loss": 0.6914, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.008892977610230446, "rewards/margins": 0.038406871259212494, "rewards/rejected": -0.04729985073208809, "step": 1400 }, { "epoch": 0.37, "eval_logits/chosen": 0.9341767430305481, "eval_logits/rejected": 1.0311079025268555, "eval_logps/chosen": -279.1179504394531, "eval_logps/rejected": -250.15402221679688, "eval_loss": 0.6911120414733887, "eval_rewards/accuracies": 0.6294999718666077, "eval_rewards/chosen": -0.015711043030023575, "eval_rewards/margins": 0.04766979068517685, "eval_rewards/rejected": -0.06338082998991013, "eval_runtime": 540.2273, "eval_samples_per_second": 3.702, "eval_steps_per_second": 0.926, "step": 1400 }, { "epoch": 0.37, "learning_rate": 3.977583174498816e-06, "logits/chosen": 0.9394723773002625, "logits/rejected": 0.9873741269111633, "logps/chosen": -274.50396728515625, "logps/rejected": -242.7906951904297, "loss": 0.6919, "rewards/accuracies": 0.59375, "rewards/chosen": -0.014722615480422974, "rewards/margins": 0.039573125541210175, "rewards/rejected": -0.05429573729634285, "step": 1410 }, { "epoch": 0.37, "learning_rate": 3.959094190750172e-06, "logits/chosen": 0.9101311564445496, "logits/rejected": 1.0188195705413818, "logps/chosen": -288.55426025390625, "logps/rejected": -259.5224304199219, "loss": 0.6919, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029792049899697304, "rewards/margins": 0.036666542291641235, "rewards/rejected": -0.06645859032869339, "step": 1420 }, { "epoch": 0.37, "learning_rate": 3.9404833730564975e-06, "logits/chosen": 0.9995678663253784, "logits/rejected": 1.0337575674057007, "logps/chosen": -249.91671752929688, "logps/rejected": -249.8981475830078, "loss": 0.6915, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0360335037112236, "rewards/margins": 0.040712870657444, "rewards/rejected": -0.076746366918087, "step": 1430 }, { "epoch": 0.38, "learning_rate": 3.921752275415712e-06, "logits/chosen": 0.9858711361885071, "logits/rejected": 1.0380220413208008, "logps/chosen": -242.677490234375, "logps/rejected": -215.031494140625, "loss": 0.6907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04031269997358322, "rewards/margins": 0.04829539731144905, "rewards/rejected": -0.08860810101032257, "step": 1440 }, { "epoch": 0.38, "learning_rate": 3.902902461869079e-06, "logits/chosen": 0.9379655122756958, "logits/rejected": 1.0792181491851807, "logps/chosen": -273.37957763671875, "logps/rejected": -246.2140350341797, "loss": 0.6903, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.047497231513261795, "rewards/margins": 0.0587293803691864, "rewards/rejected": -0.1062266081571579, "step": 1450 }, { "epoch": 0.38, "learning_rate": 3.883935506370605e-06, "logits/chosen": 0.916599452495575, "logits/rejected": 0.9806827306747437, "logps/chosen": -281.9441833496094, "logps/rejected": -244.51806640625, "loss": 0.6919, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.051647938787937164, "rewards/margins": 0.05136079713702202, "rewards/rejected": -0.10300873219966888, "step": 1460 }, { "epoch": 0.38, "learning_rate": 3.864852992655617e-06, "logits/chosen": 0.8767641186714172, "logits/rejected": 0.9864808320999146, "logps/chosen": -270.2128601074219, "logps/rejected": -259.9897766113281, "loss": 0.691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.042821671813726425, "rewards/margins": 0.05382692068815231, "rewards/rejected": -0.09664861112833023, "step": 1470 }, { "epoch": 0.39, "learning_rate": 3.845656514108516e-06, "logits/chosen": 0.8953625559806824, "logits/rejected": 0.9520618319511414, "logps/chosen": -297.1680603027344, "logps/rejected": -249.801025390625, "loss": 0.69, "rewards/accuracies": 0.6875, "rewards/chosen": -0.038018837571144104, "rewards/margins": 0.06626948714256287, "rewards/rejected": -0.10428832471370697, "step": 1480 }, { "epoch": 0.39, "learning_rate": 3.826347673629738e-06, "logits/chosen": 0.9500657320022583, "logits/rejected": 0.9770669937133789, "logps/chosen": -247.0959930419922, "logps/rejected": -231.38626098632812, "loss": 0.6908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04545672982931137, "rewards/margins": 0.05727598816156387, "rewards/rejected": -0.10273271799087524, "step": 1490 }, { "epoch": 0.39, "learning_rate": 3.8069280835019062e-06, "logits/chosen": 0.9221125841140747, "logits/rejected": 1.0069925785064697, "logps/chosen": -325.3990783691406, "logps/rejected": -256.74163818359375, "loss": 0.6919, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03280999884009361, "rewards/margins": 0.04576558619737625, "rewards/rejected": -0.07857557386159897, "step": 1500 }, { "epoch": 0.39, "eval_logits/chosen": 0.9161292314529419, "eval_logits/rejected": 1.013729453086853, "eval_logps/chosen": -282.56494140625, "eval_logps/rejected": -254.04412841796875, "eval_loss": 0.6910136938095093, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": -0.05018109828233719, "eval_rewards/margins": 0.052100956439971924, "eval_rewards/rejected": -0.10228205472230911, "eval_runtime": 540.0139, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 1500 }, { "epoch": 0.4, "learning_rate": 3.7873993652552077e-06, "logits/chosen": 0.9163872003555298, "logits/rejected": 1.0111920833587646, "logps/chosen": -263.4795227050781, "logps/rejected": -241.57888793945312, "loss": 0.6923, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.05680041387677193, "rewards/margins": 0.021093839779496193, "rewards/rejected": -0.07789425551891327, "step": 1510 }, { "epoch": 0.4, "learning_rate": 3.7677631495319953e-06, "logits/chosen": 0.9174981117248535, "logits/rejected": 0.9849978685379028, "logps/chosen": -263.1864929199219, "logps/rejected": -217.2829132080078, "loss": 0.692, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.049250274896621704, "rewards/margins": 0.0348433181643486, "rewards/rejected": -0.0840936005115509, "step": 1520 }, { "epoch": 0.4, "learning_rate": 3.748021075950633e-06, "logits/chosen": 1.0062494277954102, "logits/rejected": 0.9941271543502808, "logps/chosen": -307.7798767089844, "logps/rejected": -280.7508239746094, "loss": 0.6914, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.05701036378741264, "rewards/margins": 0.05211354047060013, "rewards/rejected": -0.10912390798330307, "step": 1530 }, { "epoch": 0.4, "learning_rate": 3.7281747929685824e-06, "logits/chosen": 0.926976203918457, "logits/rejected": 1.0702403783798218, "logps/chosen": -292.91351318359375, "logps/rejected": -256.81536865234375, "loss": 0.6912, "rewards/accuracies": 0.53125, "rewards/chosen": -0.04857570677995682, "rewards/margins": 0.041417308151721954, "rewards/rejected": -0.08999301493167877, "step": 1540 }, { "epoch": 0.41, "learning_rate": 3.7082259577447604e-06, "logits/chosen": 0.9064332842826843, "logits/rejected": 1.022146224975586, "logps/chosen": -324.94122314453125, "logps/rejected": -255.82431030273438, "loss": 0.691, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0545024499297142, "rewards/margins": 0.05221433565020561, "rewards/rejected": -0.10671677440404892, "step": 1550 }, { "epoch": 0.41, "learning_rate": 3.6881762360011688e-06, "logits/chosen": 0.9837790727615356, "logits/rejected": 1.0159826278686523, "logps/chosen": -292.13531494140625, "logps/rejected": -249.9795684814453, "loss": 0.6908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.061875708401203156, "rewards/margins": 0.05055435746908188, "rewards/rejected": -0.11243007332086563, "step": 1560 }, { "epoch": 0.41, "learning_rate": 3.668027301883802e-06, "logits/chosen": 0.9538768529891968, "logits/rejected": 1.0130136013031006, "logps/chosen": -266.67266845703125, "logps/rejected": -224.70468139648438, "loss": 0.6909, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.052473802119493484, "rewards/margins": 0.05191361904144287, "rewards/rejected": -0.10438741743564606, "step": 1570 }, { "epoch": 0.41, "learning_rate": 3.64778083782286e-06, "logits/chosen": 0.9861133694648743, "logits/rejected": 1.0575048923492432, "logps/chosen": -270.86236572265625, "logps/rejected": -278.5152893066406, "loss": 0.691, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04235236719250679, "rewards/margins": 0.05154258757829666, "rewards/rejected": -0.09389495849609375, "step": 1580 }, { "epoch": 0.42, "learning_rate": 3.627438534392268e-06, "logits/chosen": 0.9495855569839478, "logits/rejected": 0.9809403419494629, "logps/chosen": -264.71380615234375, "logps/rejected": -214.28939819335938, "loss": 0.6923, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03805701807141304, "rewards/margins": 0.04229050129652023, "rewards/rejected": -0.08034752309322357, "step": 1590 }, { "epoch": 0.42, "learning_rate": 3.607002090168506e-06, "logits/chosen": 0.9524203538894653, "logits/rejected": 1.011755108833313, "logps/chosen": -262.1990661621094, "logps/rejected": -238.88052368164062, "loss": 0.6912, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.04073198884725571, "rewards/margins": 0.039406824856996536, "rewards/rejected": -0.08013881742954254, "step": 1600 }, { "epoch": 0.42, "eval_logits/chosen": 0.9319766759872437, "eval_logits/rejected": 1.0314892530441284, "eval_logps/chosen": -281.04010009765625, "eval_logps/rejected": -252.43983459472656, "eval_loss": 0.6909967660903931, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": -0.03493276238441467, "eval_rewards/margins": 0.051306504756212234, "eval_rewards/rejected": -0.08623925596475601, "eval_runtime": 539.9382, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 1600 }, { "epoch": 0.42, "learning_rate": 3.586473211588787e-06, "logits/chosen": 0.9360347986221313, "logits/rejected": 1.0636814832687378, "logps/chosen": -301.8039245605469, "logps/rejected": -245.0256805419922, "loss": 0.6904, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02840288355946541, "rewards/margins": 0.06062694638967514, "rewards/rejected": -0.08902983367443085, "step": 1610 }, { "epoch": 0.42, "learning_rate": 3.5658536128085623e-06, "logits/chosen": 0.9372593760490417, "logits/rejected": 1.0011626482009888, "logps/chosen": -253.13583374023438, "logps/rejected": -247.22116088867188, "loss": 0.691, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.03672366216778755, "rewards/margins": 0.055417824536561966, "rewards/rejected": -0.09214149415493011, "step": 1620 }, { "epoch": 0.43, "learning_rate": 3.545145015558399e-06, "logits/chosen": 0.9559675455093384, "logits/rejected": 1.0077977180480957, "logps/chosen": -301.99700927734375, "logps/rejected": -291.1852111816406, "loss": 0.6921, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03201790526509285, "rewards/margins": 0.047422006726264954, "rewards/rejected": -0.0794399082660675, "step": 1630 }, { "epoch": 0.43, "learning_rate": 3.5243491490002056e-06, "logits/chosen": 0.9405361413955688, "logits/rejected": 0.972905158996582, "logps/chosen": -277.42578125, "logps/rejected": -244.67495727539062, "loss": 0.6913, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.035732053220272064, "rewards/margins": 0.041307561099529266, "rewards/rejected": -0.07703961431980133, "step": 1640 }, { "epoch": 0.43, "learning_rate": 3.503467749582857e-06, "logits/chosen": 0.9268352389335632, "logits/rejected": 0.9519475102424622, "logps/chosen": -251.6887664794922, "logps/rejected": -207.09823608398438, "loss": 0.6907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03763822466135025, "rewards/margins": 0.05269993096590042, "rewards/rejected": -0.09033815562725067, "step": 1650 }, { "epoch": 0.43, "learning_rate": 3.4825025608971947e-06, "logits/chosen": 0.8857590556144714, "logits/rejected": 1.0210973024368286, "logps/chosen": -318.9454650878906, "logps/rejected": -253.1227569580078, "loss": 0.6905, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.041044797748327255, "rewards/margins": 0.05614280700683594, "rewards/rejected": -0.0971876010298729, "step": 1660 }, { "epoch": 0.44, "learning_rate": 3.4614553335304407e-06, "logits/chosen": 0.9426645040512085, "logits/rejected": 1.06788969039917, "logps/chosen": -280.87506103515625, "logps/rejected": -242.68856811523438, "loss": 0.6899, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.04844700172543526, "rewards/margins": 0.06784703582525253, "rewards/rejected": -0.11629404127597809, "step": 1670 }, { "epoch": 0.44, "learning_rate": 3.4403278249200222e-06, "logits/chosen": 0.8899833559989929, "logits/rejected": 0.981528639793396, "logps/chosen": -261.93939208984375, "logps/rejected": -244.01416015625, "loss": 0.6908, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.048227887600660324, "rewards/margins": 0.04917442053556442, "rewards/rejected": -0.09740231186151505, "step": 1680 }, { "epoch": 0.44, "learning_rate": 3.4191217992068293e-06, "logits/chosen": 0.9601287841796875, "logits/rejected": 0.9461712837219238, "logps/chosen": -262.20184326171875, "logps/rejected": -260.2802429199219, "loss": 0.6913, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06563547253608704, "rewards/margins": 0.05082215741276741, "rewards/rejected": -0.11645762622356415, "step": 1690 }, { "epoch": 0.44, "learning_rate": 3.3978390270879056e-06, "logits/chosen": 0.9125478863716125, "logits/rejected": 1.015061616897583, "logps/chosen": -299.32171630859375, "logps/rejected": -256.58953857421875, "loss": 0.6905, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04695742204785347, "rewards/margins": 0.05506708472967148, "rewards/rejected": -0.10202451795339584, "step": 1700 }, { "epoch": 0.44, "eval_logits/chosen": 0.9100068211555481, "eval_logits/rejected": 1.0087957382202148, "eval_logps/chosen": -282.8432922363281, "eval_logps/rejected": -254.7029571533203, "eval_loss": 0.69096839427948, "eval_rewards/accuracies": 0.6324999928474426, "eval_rewards/chosen": -0.052964530885219574, "eval_rewards/margins": 0.05590558797121048, "eval_rewards/rejected": -0.10887012630701065, "eval_runtime": 540.5566, "eval_samples_per_second": 3.7, "eval_steps_per_second": 0.925, "step": 1700 }, { "epoch": 0.45, "learning_rate": 3.3764812856685995e-06, "logits/chosen": 0.944310188293457, "logits/rejected": 0.9725696444511414, "logps/chosen": -294.048095703125, "logps/rejected": -265.73284912109375, "loss": 0.6913, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04054544121026993, "rewards/margins": 0.06353868544101715, "rewards/rejected": -0.10408411920070648, "step": 1710 }, { "epoch": 0.45, "learning_rate": 3.3550503583141726e-06, "logits/chosen": 0.9270051717758179, "logits/rejected": 0.9650331735610962, "logps/chosen": -327.187744140625, "logps/rejected": -308.1183166503906, "loss": 0.6904, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.045968521386384964, "rewards/margins": 0.062069911509752274, "rewards/rejected": -0.10803844034671783, "step": 1720 }, { "epoch": 0.45, "learning_rate": 3.3335480345008907e-06, "logits/chosen": 0.9014407992362976, "logits/rejected": 0.9876381754875183, "logps/chosen": -266.403564453125, "logps/rejected": -229.73080444335938, "loss": 0.6902, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04243239760398865, "rewards/margins": 0.06432008743286133, "rewards/rejected": -0.10675249993801117, "step": 1730 }, { "epoch": 0.46, "learning_rate": 3.3119761096666055e-06, "logits/chosen": 1.0035514831542969, "logits/rejected": 0.9860905408859253, "logps/chosen": -317.1201171875, "logps/rejected": -265.30499267578125, "loss": 0.6917, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06333385407924652, "rewards/margins": 0.047281377017498016, "rewards/rejected": -0.11061523109674454, "step": 1740 }, { "epoch": 0.46, "learning_rate": 3.290336385060832e-06, "logits/chosen": 0.9631811380386353, "logits/rejected": 0.9473791122436523, "logps/chosen": -288.9410095214844, "logps/rejected": -267.47454833984375, "loss": 0.6903, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.049429457634687424, "rewards/margins": 0.06482435762882233, "rewards/rejected": -0.11425381898880005, "step": 1750 }, { "epoch": 0.46, "learning_rate": 3.268630667594348e-06, "logits/chosen": 0.9454625844955444, "logits/rejected": 1.029585838317871, "logps/chosen": -284.25531005859375, "logps/rejected": -266.0755920410156, "loss": 0.6917, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05219440534710884, "rewards/margins": 0.05088215321302414, "rewards/rejected": -0.10307655483484268, "step": 1760 }, { "epoch": 0.46, "learning_rate": 3.2468607696883147e-06, "logits/chosen": 0.9423101544380188, "logits/rejected": 1.0101561546325684, "logps/chosen": -302.5003356933594, "logps/rejected": -258.0522155761719, "loss": 0.6902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05451619625091553, "rewards/margins": 0.06200449541211128, "rewards/rejected": -0.11652068793773651, "step": 1770 }, { "epoch": 0.47, "learning_rate": 3.225028509122944e-06, "logits/chosen": 0.9246463775634766, "logits/rejected": 1.057897925376892, "logps/chosen": -307.1044921875, "logps/rejected": -260.0653076171875, "loss": 0.6913, "rewards/accuracies": 0.6875, "rewards/chosen": -0.025903914123773575, "rewards/margins": 0.06586969643831253, "rewards/rejected": -0.0917736142873764, "step": 1780 }, { "epoch": 0.47, "learning_rate": 3.2031357088857083e-06, "logits/chosen": 0.9973335266113281, "logits/rejected": 1.0170247554779053, "logps/chosen": -275.74005126953125, "logps/rejected": -261.8125915527344, "loss": 0.6908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.047602783888578415, "rewards/margins": 0.05812246724963188, "rewards/rejected": -0.1057252511382103, "step": 1790 }, { "epoch": 0.47, "learning_rate": 3.181184197019127e-06, "logits/chosen": 1.0013173818588257, "logits/rejected": 0.9759401082992554, "logps/chosen": -283.0948791503906, "logps/rejected": -236.4706268310547, "loss": 0.6901, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04060991853475571, "rewards/margins": 0.06453616917133331, "rewards/rejected": -0.10514608770608902, "step": 1800 }, { "epoch": 0.47, "eval_logits/chosen": 0.9323698282241821, "eval_logits/rejected": 1.0313962697982788, "eval_logps/chosen": -281.6338195800781, "eval_logps/rejected": -253.65231323242188, "eval_loss": 0.6909690499305725, "eval_rewards/accuracies": 0.6225000023841858, "eval_rewards/chosen": -0.0408700592815876, "eval_rewards/margins": 0.057493917644023895, "eval_rewards/rejected": -0.0983639732003212, "eval_runtime": 539.9632, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 1800 }, { "epoch": 0.47, "learning_rate": 3.159175806468126e-06, "logits/chosen": 0.9391202926635742, "logits/rejected": 1.0263153314590454, "logps/chosen": -284.8543395996094, "logps/rejected": -277.1647644042969, "loss": 0.6914, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03517382591962814, "rewards/margins": 0.05179664492607117, "rewards/rejected": -0.08697047084569931, "step": 1810 }, { "epoch": 0.48, "learning_rate": 3.1371123749269804e-06, "logits/chosen": 0.9998887777328491, "logits/rejected": 1.085648536682129, "logps/chosen": -282.2349548339844, "logps/rejected": -252.2509765625, "loss": 0.6907, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.029559914022684097, "rewards/margins": 0.06976927816867828, "rewards/rejected": -0.09932918846607208, "step": 1820 }, { "epoch": 0.48, "learning_rate": 3.114995744685877e-06, "logits/chosen": 0.9946446418762207, "logits/rejected": 0.9664584398269653, "logps/chosen": -258.7174072265625, "logps/rejected": -240.670654296875, "loss": 0.6902, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.02138775773346424, "rewards/margins": 0.07551835477352142, "rewards/rejected": -0.09690611809492111, "step": 1830 }, { "epoch": 0.48, "learning_rate": 3.0928277624770743e-06, "logits/chosen": 0.9557411074638367, "logits/rejected": 1.0262891054153442, "logps/chosen": -286.01458740234375, "logps/rejected": -232.46139526367188, "loss": 0.6891, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.01390999834984541, "rewards/margins": 0.07762787491083145, "rewards/rejected": -0.09153787791728973, "step": 1840 }, { "epoch": 0.48, "learning_rate": 3.070610279320708e-06, "logits/chosen": 0.9258459806442261, "logits/rejected": 1.0731465816497803, "logps/chosen": -286.8509826660156, "logps/rejected": -233.0972137451172, "loss": 0.6921, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.032274287194013596, "rewards/margins": 0.0481577143073082, "rewards/rejected": -0.0804319903254509, "step": 1850 }, { "epoch": 0.49, "learning_rate": 3.0483451503702264e-06, "logits/chosen": 0.9777040481567383, "logits/rejected": 1.0403920412063599, "logps/chosen": -288.07568359375, "logps/rejected": -264.2755432128906, "loss": 0.6916, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.023307165130972862, "rewards/margins": 0.048550479114055634, "rewards/rejected": -0.07185763865709305, "step": 1860 }, { "epoch": 0.49, "learning_rate": 3.0260342347574916e-06, "logits/chosen": 0.9937931895256042, "logits/rejected": 1.0553154945373535, "logps/chosen": -299.058837890625, "logps/rejected": -248.8879852294922, "loss": 0.6899, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.036436520516872406, "rewards/margins": 0.07167023420333862, "rewards/rejected": -0.10810675472021103, "step": 1870 }, { "epoch": 0.49, "learning_rate": 3.0036793954375358e-06, "logits/chosen": 0.9339237213134766, "logits/rejected": 1.0194135904312134, "logps/chosen": -279.79034423828125, "logps/rejected": -242.7287139892578, "loss": 0.691, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.015874430537223816, "rewards/margins": 0.0718928873538971, "rewards/rejected": -0.08776732534170151, "step": 1880 }, { "epoch": 0.49, "learning_rate": 2.981282499033009e-06, "logits/chosen": 0.9355725049972534, "logits/rejected": 0.9969936609268188, "logps/chosen": -294.20538330078125, "logps/rejected": -244.11160278320312, "loss": 0.6908, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.026097718626260757, "rewards/margins": 0.05047178268432617, "rewards/rejected": -0.07656950503587723, "step": 1890 }, { "epoch": 0.5, "learning_rate": 2.9588454156783163e-06, "logits/chosen": 1.0147713422775269, "logits/rejected": 0.9884480237960815, "logps/chosen": -278.02313232421875, "logps/rejected": -251.70596313476562, "loss": 0.6902, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02444344013929367, "rewards/margins": 0.054150182753801346, "rewards/rejected": -0.07859362661838531, "step": 1900 }, { "epoch": 0.5, "eval_logits/chosen": 0.9226279258728027, "eval_logits/rejected": 1.0212137699127197, "eval_logps/chosen": -280.8078308105469, "eval_logps/rejected": -252.7657470703125, "eval_loss": 0.6909632682800293, "eval_rewards/accuracies": 0.6215000152587891, "eval_rewards/chosen": -0.03261038661003113, "eval_rewards/margins": 0.05688786879181862, "eval_rewards/rejected": -0.08949825167655945, "eval_runtime": 539.8982, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 1900 }, { "epoch": 0.5, "learning_rate": 2.9363700188634597e-06, "logits/chosen": 0.9406150579452515, "logits/rejected": 1.0022201538085938, "logps/chosen": -272.09759521484375, "logps/rejected": -249.77279663085938, "loss": 0.6921, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0340249240398407, "rewards/margins": 0.041387807577848434, "rewards/rejected": -0.07541273534297943, "step": 1910 }, { "epoch": 0.5, "learning_rate": 2.9138581852776053e-06, "logits/chosen": 0.9434909820556641, "logits/rejected": 1.056489109992981, "logps/chosen": -277.2883605957031, "logps/rejected": -262.8563537597656, "loss": 0.6904, "rewards/accuracies": 0.625, "rewards/chosen": -0.031123792752623558, "rewards/margins": 0.06892909109592438, "rewards/rejected": -0.10005287826061249, "step": 1920 }, { "epoch": 0.51, "learning_rate": 2.8913117946523805e-06, "logits/chosen": 1.0239412784576416, "logits/rejected": 0.993172824382782, "logps/chosen": -261.37689208984375, "logps/rejected": -221.52090454101562, "loss": 0.6916, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.042845502495765686, "rewards/margins": 0.056726813316345215, "rewards/rejected": -0.0995723158121109, "step": 1930 }, { "epoch": 0.51, "learning_rate": 2.8687327296049126e-06, "logits/chosen": 0.9209731817245483, "logits/rejected": 1.0005519390106201, "logps/chosen": -281.11822509765625, "logps/rejected": -259.76007080078125, "loss": 0.6912, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.028859639540314674, "rewards/margins": 0.047795943915843964, "rewards/rejected": -0.07665558159351349, "step": 1940 }, { "epoch": 0.51, "learning_rate": 2.8461228754806376e-06, "logits/chosen": 0.958030104637146, "logits/rejected": 1.0400612354278564, "logps/chosen": -250.44058227539062, "logps/rejected": -242.2317352294922, "loss": 0.691, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0336209237575531, "rewards/margins": 0.05756276845932007, "rewards/rejected": -0.09118369966745377, "step": 1950 }, { "epoch": 0.51, "learning_rate": 2.823484120195865e-06, "logits/chosen": 0.9849356412887573, "logits/rejected": 0.9542206525802612, "logps/chosen": -209.1316375732422, "logps/rejected": -223.1927947998047, "loss": 0.6909, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.030019324272871017, "rewards/margins": 0.057300496846437454, "rewards/rejected": -0.08731982111930847, "step": 1960 }, { "epoch": 0.52, "learning_rate": 2.8008183540801486e-06, "logits/chosen": 0.8806187510490417, "logits/rejected": 0.982339084148407, "logps/chosen": -288.1579895019531, "logps/rejected": -268.0417785644531, "loss": 0.6917, "rewards/accuracies": 0.625, "rewards/chosen": -0.03183884546160698, "rewards/margins": 0.042079776525497437, "rewards/rejected": -0.07391862571239471, "step": 1970 }, { "epoch": 0.52, "learning_rate": 2.7781274697184353e-06, "logits/chosen": 0.9701333045959473, "logits/rejected": 1.0392358303070068, "logps/chosen": -274.5201110839844, "logps/rejected": -255.364501953125, "loss": 0.69, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03297718986868858, "rewards/margins": 0.0656973272562027, "rewards/rejected": -0.09867450594902039, "step": 1980 }, { "epoch": 0.52, "learning_rate": 2.7554133617930397e-06, "logits/chosen": 0.9770027995109558, "logits/rejected": 0.9862260818481445, "logps/chosen": -272.98602294921875, "logps/rejected": -268.11175537109375, "loss": 0.692, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.037066780030727386, "rewards/margins": 0.03600457310676575, "rewards/rejected": -0.07307135313749313, "step": 1990 }, { "epoch": 0.52, "learning_rate": 2.7326779269254363e-06, "logits/chosen": 0.9106483459472656, "logits/rejected": 1.0019176006317139, "logps/chosen": -254.740234375, "logps/rejected": -229.75634765625, "loss": 0.6919, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.026663145050406456, "rewards/margins": 0.03192012384533882, "rewards/rejected": -0.05858327075839043, "step": 2000 }, { "epoch": 0.52, "eval_logits/chosen": 0.9258546829223633, "eval_logits/rejected": 1.025207757949829, "eval_logps/chosen": -279.9319763183594, "eval_logps/rejected": -251.49105834960938, "eval_loss": 0.6909723281860352, "eval_rewards/accuracies": 0.6274999976158142, "eval_rewards/chosen": -0.02385157160460949, "eval_rewards/margins": 0.05289952829480171, "eval_rewards/rejected": -0.07675110548734665, "eval_runtime": 540.2889, "eval_samples_per_second": 3.702, "eval_steps_per_second": 0.925, "step": 2000 }, { "epoch": 0.53, "learning_rate": 2.7099230635178954e-06, "logits/chosen": 0.9473625421524048, "logits/rejected": 1.0636024475097656, "logps/chosen": -250.43350219726562, "logps/rejected": -213.73043823242188, "loss": 0.6897, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.016012001782655716, "rewards/margins": 0.0778764933347702, "rewards/rejected": -0.09388849884271622, "step": 2010 }, { "epoch": 0.53, "learning_rate": 2.6871506715949608e-06, "logits/chosen": 1.0584747791290283, "logits/rejected": 1.0787056684494019, "logps/chosen": -277.7005615234375, "logps/rejected": -263.6285400390625, "loss": 0.6897, "rewards/accuracies": 0.625, "rewards/chosen": -0.028144020587205887, "rewards/margins": 0.07585910707712173, "rewards/rejected": -0.10400311648845673, "step": 2020 }, { "epoch": 0.53, "learning_rate": 2.6643626526448063e-06, "logits/chosen": 0.9738826751708984, "logits/rejected": 1.0668303966522217, "logps/chosen": -245.468994140625, "logps/rejected": -252.04458618164062, "loss": 0.6901, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.026749875396490097, "rewards/margins": 0.06751126796007156, "rewards/rejected": -0.09426114708185196, "step": 2030 }, { "epoch": 0.53, "learning_rate": 2.6415609094604562e-06, "logits/chosen": 0.9280556440353394, "logits/rejected": 1.0651742219924927, "logps/chosen": -304.1370849609375, "logps/rejected": -259.7930603027344, "loss": 0.6903, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03546469286084175, "rewards/margins": 0.06755216419696808, "rewards/rejected": -0.10301685333251953, "step": 2040 }, { "epoch": 0.54, "learning_rate": 2.618747345980904e-06, "logits/chosen": 0.908758282661438, "logits/rejected": 1.0054810047149658, "logps/chosen": -289.89154052734375, "logps/rejected": -250.53231811523438, "loss": 0.6905, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.028398990631103516, "rewards/margins": 0.06537959724664688, "rewards/rejected": -0.093778595328331, "step": 2050 }, { "epoch": 0.54, "learning_rate": 2.595923867132136e-06, "logits/chosen": 1.0136375427246094, "logits/rejected": 1.071046233177185, "logps/chosen": -227.84829711914062, "logps/rejected": -231.29788208007812, "loss": 0.6912, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.025034338235855103, "rewards/margins": 0.06069794297218323, "rewards/rejected": -0.08573228865861893, "step": 2060 }, { "epoch": 0.54, "learning_rate": 2.5730923786680672e-06, "logits/chosen": 0.9761725664138794, "logits/rejected": 1.0631458759307861, "logps/chosen": -275.0975341796875, "logps/rejected": -230.28170776367188, "loss": 0.6909, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03186946362257004, "rewards/margins": 0.056202489882707596, "rewards/rejected": -0.08807194232940674, "step": 2070 }, { "epoch": 0.54, "learning_rate": 2.5502547870114137e-06, "logits/chosen": 0.9701007604598999, "logits/rejected": 1.0082242488861084, "logps/chosen": -231.7493133544922, "logps/rejected": -252.67343139648438, "loss": 0.6909, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02439451590180397, "rewards/margins": 0.04928570240736008, "rewards/rejected": -0.07368021458387375, "step": 2080 }, { "epoch": 0.55, "learning_rate": 2.527412999094507e-06, "logits/chosen": 0.9784774780273438, "logits/rejected": 1.0191878080368042, "logps/chosen": -283.6234130859375, "logps/rejected": -256.2926940917969, "loss": 0.6911, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03525594249367714, "rewards/margins": 0.05471722036600113, "rewards/rejected": -0.08997315913438797, "step": 2090 }, { "epoch": 0.55, "learning_rate": 2.504568922200064e-06, "logits/chosen": 0.9242954254150391, "logits/rejected": 0.9784539937973022, "logps/chosen": -265.85198974609375, "logps/rejected": -248.41354370117188, "loss": 0.6919, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03961525112390518, "rewards/margins": 0.04367566108703613, "rewards/rejected": -0.08329091221094131, "step": 2100 }, { "epoch": 0.55, "eval_logits/chosen": 0.9476714730262756, "eval_logits/rejected": 1.0475825071334839, "eval_logps/chosen": -281.3605651855469, "eval_logps/rejected": -253.0793914794922, "eval_loss": 0.690941333770752, "eval_rewards/accuracies": 0.6345000267028809, "eval_rewards/chosen": -0.03813740611076355, "eval_rewards/margins": 0.054497238248586655, "eval_rewards/rejected": -0.0926346406340599, "eval_runtime": 539.933, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 2100 }, { "epoch": 0.55, "learning_rate": 2.4817244638019333e-06, "logits/chosen": 0.9688129425048828, "logits/rejected": 1.0826255083084106, "logps/chosen": -259.4082336425781, "logps/rejected": -252.29434204101562, "loss": 0.6898, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03624454885721207, "rewards/margins": 0.06441988795995712, "rewards/rejected": -0.10066443681716919, "step": 2110 }, { "epoch": 0.55, "learning_rate": 2.4588815314058155e-06, "logits/chosen": 0.9535354375839233, "logits/rejected": 0.9888173937797546, "logps/chosen": -308.61053466796875, "logps/rejected": -290.3708801269531, "loss": 0.692, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.033887576311826706, "rewards/margins": 0.038659535348415375, "rewards/rejected": -0.07254711538553238, "step": 2120 }, { "epoch": 0.56, "learning_rate": 2.4360420323899922e-06, "logits/chosen": 0.9049111604690552, "logits/rejected": 1.0025997161865234, "logps/chosen": -300.1152648925781, "logps/rejected": -246.1627197265625, "loss": 0.69, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03963657096028328, "rewards/margins": 0.07257360219955444, "rewards/rejected": -0.11221016943454742, "step": 2130 }, { "epoch": 0.56, "learning_rate": 2.4132078738460585e-06, "logits/chosen": 0.985696017742157, "logits/rejected": 0.9788693189620972, "logps/chosen": -273.555419921875, "logps/rejected": -260.4004211425781, "loss": 0.6922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04467379301786423, "rewards/margins": 0.04357798025012016, "rewards/rejected": -0.08825178444385529, "step": 2140 }, { "epoch": 0.56, "learning_rate": 2.3903809624196826e-06, "logits/chosen": 0.989804744720459, "logits/rejected": 1.0961401462554932, "logps/chosen": -260.736328125, "logps/rejected": -231.63491821289062, "loss": 0.691, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.051245905458927155, "rewards/margins": 0.05609399080276489, "rewards/rejected": -0.10733989626169205, "step": 2150 }, { "epoch": 0.57, "learning_rate": 2.3675632041513978e-06, "logits/chosen": 1.0626736879348755, "logits/rejected": 1.1053330898284912, "logps/chosen": -243.4201202392578, "logps/rejected": -225.9866180419922, "loss": 0.6922, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.052421581000089645, "rewards/margins": 0.037004150450229645, "rewards/rejected": -0.08942572772502899, "step": 2160 }, { "epoch": 0.57, "learning_rate": 2.3447565043174533e-06, "logits/chosen": 0.9754394292831421, "logits/rejected": 0.9671268463134766, "logps/chosen": -273.8443603515625, "logps/rejected": -249.9600372314453, "loss": 0.6911, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.043098267167806625, "rewards/margins": 0.04836183041334152, "rewards/rejected": -0.09146009385585785, "step": 2170 }, { "epoch": 0.57, "learning_rate": 2.321962767270724e-06, "logits/chosen": 1.0004395246505737, "logits/rejected": 1.0548702478408813, "logps/chosen": -258.23388671875, "logps/rejected": -254.6455841064453, "loss": 0.6909, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.042513586580753326, "rewards/margins": 0.06627936661243439, "rewards/rejected": -0.10879294574260712, "step": 2180 }, { "epoch": 0.57, "learning_rate": 2.299183896281692e-06, "logits/chosen": 0.932357907295227, "logits/rejected": 1.1255356073379517, "logps/chosen": -272.53533935546875, "logps/rejected": -232.40127563476562, "loss": 0.6905, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.040888115763664246, "rewards/margins": 0.07338190823793411, "rewards/rejected": -0.11427001655101776, "step": 2190 }, { "epoch": 0.58, "learning_rate": 2.2764217933795297e-06, "logits/chosen": 0.9959739446640015, "logits/rejected": 0.9297063946723938, "logps/chosen": -290.0731201171875, "logps/rejected": -259.72259521484375, "loss": 0.6917, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.048611197620630264, "rewards/margins": 0.04705143719911575, "rewards/rejected": -0.09566263109445572, "step": 2200 }, { "epoch": 0.58, "eval_logits/chosen": 0.9399436116218567, "eval_logits/rejected": 1.0406934022903442, "eval_logps/chosen": -281.7611389160156, "eval_logps/rejected": -253.66929626464844, "eval_loss": 0.690939724445343, "eval_rewards/accuracies": 0.6324999928474426, "eval_rewards/chosen": -0.042143091559410095, "eval_rewards/margins": 0.05639072135090828, "eval_rewards/rejected": -0.09853381663560867, "eval_runtime": 539.9748, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 2200 }, { "epoch": 0.58, "learning_rate": 2.2536783591932786e-06, "logits/chosen": 1.089950680732727, "logits/rejected": 0.9905519485473633, "logps/chosen": -274.2284851074219, "logps/rejected": -254.03823852539062, "loss": 0.6907, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03807093948125839, "rewards/margins": 0.06290793418884277, "rewards/rejected": -0.10097887367010117, "step": 2210 }, { "epoch": 0.58, "learning_rate": 2.230955492793149e-06, "logits/chosen": 0.9911603927612305, "logits/rejected": 1.117432951927185, "logps/chosen": -274.63616943359375, "logps/rejected": -236.0727996826172, "loss": 0.6904, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.029746342450380325, "rewards/margins": 0.05460263043642044, "rewards/rejected": -0.08434897661209106, "step": 2220 }, { "epoch": 0.58, "learning_rate": 2.208255091531947e-06, "logits/chosen": 0.9844030141830444, "logits/rejected": 1.0057882070541382, "logps/chosen": -267.25860595703125, "logps/rejected": -221.1670684814453, "loss": 0.6913, "rewards/accuracies": 0.625, "rewards/chosen": -0.02886761724948883, "rewards/margins": 0.06318513303995132, "rewards/rejected": -0.09205274283885956, "step": 2230 }, { "epoch": 0.59, "learning_rate": 2.1855790508866435e-06, "logits/chosen": 0.9558299779891968, "logits/rejected": 1.0252538919448853, "logps/chosen": -292.6085510253906, "logps/rejected": -274.9212951660156, "loss": 0.6913, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.02827531099319458, "rewards/margins": 0.041763827204704285, "rewards/rejected": -0.07003913819789886, "step": 2240 }, { "epoch": 0.59, "learning_rate": 2.162929264300107e-06, "logits/chosen": 1.0085655450820923, "logits/rejected": 1.0989354848861694, "logps/chosen": -255.2216796875, "logps/rejected": -226.0516815185547, "loss": 0.691, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.036384742707014084, "rewards/margins": 0.05588601902127266, "rewards/rejected": -0.09227076172828674, "step": 2250 }, { "epoch": 0.59, "learning_rate": 2.1403076230230006e-06, "logits/chosen": 0.9903377294540405, "logits/rejected": 1.0552234649658203, "logps/chosen": -270.6020202636719, "logps/rejected": -208.5309295654297, "loss": 0.6921, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04310330003499985, "rewards/margins": 0.04129331558942795, "rewards/rejected": -0.0843966156244278, "step": 2260 }, { "epoch": 0.59, "learning_rate": 2.11771601595586e-06, "logits/chosen": 0.9834343791007996, "logits/rejected": 1.0014784336090088, "logps/chosen": -259.8085632324219, "logps/rejected": -261.5582580566406, "loss": 0.6916, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.023330455645918846, "rewards/margins": 0.04660937935113907, "rewards/rejected": -0.06993982940912247, "step": 2270 }, { "epoch": 0.6, "learning_rate": 2.0951563294913737e-06, "logits/chosen": 0.9085767865180969, "logits/rejected": 1.0599794387817383, "logps/chosen": -264.39208984375, "logps/rejected": -222.37948608398438, "loss": 0.6916, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03771457076072693, "rewards/margins": 0.04855852574110031, "rewards/rejected": -0.08627309650182724, "step": 2280 }, { "epoch": 0.6, "learning_rate": 2.0726304473568693e-06, "logits/chosen": 0.9768549799919128, "logits/rejected": 1.0272681713104248, "logps/chosen": -250.63037109375, "logps/rejected": -270.17938232421875, "loss": 0.6915, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03126218914985657, "rewards/margins": 0.057791270315647125, "rewards/rejected": -0.08905345946550369, "step": 2290 }, { "epoch": 0.6, "learning_rate": 2.050140250457023e-06, "logits/chosen": 0.9690292477607727, "logits/rejected": 0.9458838701248169, "logps/chosen": -250.7799072265625, "logps/rejected": -240.507568359375, "loss": 0.6909, "rewards/accuracies": 0.625, "rewards/chosen": -0.026477226987481117, "rewards/margins": 0.053064655512571335, "rewards/rejected": -0.079541876912117, "step": 2300 }, { "epoch": 0.6, "eval_logits/chosen": 0.9399006366729736, "eval_logits/rejected": 1.0408267974853516, "eval_logps/chosen": -280.7285461425781, "eval_logps/rejected": -252.42715454101562, "eval_loss": 0.6909221410751343, "eval_rewards/accuracies": 0.6334999799728394, "eval_rewards/chosen": -0.03181701526045799, "eval_rewards/margins": 0.05429535731673241, "eval_rewards/rejected": -0.0861123651266098, "eval_runtime": 540.0931, "eval_samples_per_second": 3.703, "eval_steps_per_second": 0.926, "step": 2300 }, { "epoch": 0.6, "learning_rate": 2.0276876167168042e-06, "logits/chosen": 0.9707075357437134, "logits/rejected": 1.1204993724822998, "logps/chosen": -285.9052734375, "logps/rejected": -265.517333984375, "loss": 0.6905, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02710430696606636, "rewards/margins": 0.0629369467496872, "rewards/rejected": -0.09004124999046326, "step": 2310 }, { "epoch": 0.61, "learning_rate": 2.0052744209246682e-06, "logits/chosen": 0.9497090578079224, "logits/rejected": 1.0253454446792603, "logps/chosen": -259.4183044433594, "logps/rejected": -253.4176483154297, "loss": 0.691, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03496091067790985, "rewards/margins": 0.045159030705690384, "rewards/rejected": -0.08011993765830994, "step": 2320 }, { "epoch": 0.61, "learning_rate": 1.9829025345760127e-06, "logits/chosen": 0.9671627283096313, "logits/rejected": 1.0244706869125366, "logps/chosen": -316.0784912109375, "logps/rejected": -272.86224365234375, "loss": 0.691, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03136713430285454, "rewards/margins": 0.05638208985328674, "rewards/rejected": -0.08774922788143158, "step": 2330 }, { "epoch": 0.61, "learning_rate": 1.9605738257169115e-06, "logits/chosen": 1.038586974143982, "logits/rejected": 1.0072470903396606, "logps/chosen": -250.32870483398438, "logps/rejected": -251.5189971923828, "loss": 0.691, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.04340856149792671, "rewards/margins": 0.0459757074713707, "rewards/rejected": -0.08938425779342651, "step": 2340 }, { "epoch": 0.62, "learning_rate": 1.9382901587881275e-06, "logits/chosen": 0.9293440580368042, "logits/rejected": 1.0626415014266968, "logps/chosen": -260.1195373535156, "logps/rejected": -246.67117309570312, "loss": 0.6912, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03792440518736839, "rewards/margins": 0.0486772395670414, "rewards/rejected": -0.08660164475440979, "step": 2350 }, { "epoch": 0.62, "learning_rate": 1.916053394469437e-06, "logits/chosen": 0.9554189443588257, "logits/rejected": 1.0219160318374634, "logps/chosen": -227.74856567382812, "logps/rejected": -232.154296875, "loss": 0.691, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03027189150452614, "rewards/margins": 0.05248479172587395, "rewards/rejected": -0.08275668323040009, "step": 2360 }, { "epoch": 0.62, "learning_rate": 1.8938653895242604e-06, "logits/chosen": 0.9694328308105469, "logits/rejected": 0.998115062713623, "logps/chosen": -242.61392211914062, "logps/rejected": -225.9537811279297, "loss": 0.691, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.01270161010324955, "rewards/margins": 0.06611243635416031, "rewards/rejected": -0.07881404459476471, "step": 2370 }, { "epoch": 0.62, "learning_rate": 1.8717279966446267e-06, "logits/chosen": 0.939893901348114, "logits/rejected": 1.0379140377044678, "logps/chosen": -271.5564270019531, "logps/rejected": -256.105712890625, "loss": 0.69, "rewards/accuracies": 0.625, "rewards/chosen": -0.024293258786201477, "rewards/margins": 0.05607147887349129, "rewards/rejected": -0.08036474138498306, "step": 2380 }, { "epoch": 0.63, "learning_rate": 1.8496430642964698e-06, "logits/chosen": 0.9116802215576172, "logits/rejected": 1.035742998123169, "logps/chosen": -255.07760620117188, "logps/rejected": -239.1924285888672, "loss": 0.6922, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.019214514642953873, "rewards/margins": 0.05538605526089668, "rewards/rejected": -0.07460056990385056, "step": 2390 }, { "epoch": 0.63, "learning_rate": 1.827612436565286e-06, "logits/chosen": 0.926386833190918, "logits/rejected": 0.9954707026481628, "logps/chosen": -254.49209594726562, "logps/rejected": -233.5257568359375, "loss": 0.6903, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.027828719466924667, "rewards/margins": 0.059567712247371674, "rewards/rejected": -0.08739643543958664, "step": 2400 }, { "epoch": 0.63, "eval_logits/chosen": 0.9198330044746399, "eval_logits/rejected": 1.021889328956604, "eval_logps/chosen": -280.510009765625, "eval_logps/rejected": -252.31210327148438, "eval_loss": 0.6909388303756714, "eval_rewards/accuracies": 0.6359999775886536, "eval_rewards/chosen": -0.029631877318024635, "eval_rewards/margins": 0.05532996356487274, "eval_rewards/rejected": -0.08496184647083282, "eval_runtime": 539.9384, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 2400 }, { "epoch": 0.63, "learning_rate": 1.8056379530021492e-06, "logits/chosen": 0.8990974426269531, "logits/rejected": 1.0346615314483643, "logps/chosen": -256.46343994140625, "logps/rejected": -225.152099609375, "loss": 0.691, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029439201578497887, "rewards/margins": 0.060266874730587006, "rewards/rejected": -0.08970607817173004, "step": 2410 }, { "epoch": 0.63, "learning_rate": 1.7837214484701154e-06, "logits/chosen": 0.9923946261405945, "logits/rejected": 1.0339380502700806, "logps/chosen": -273.4414367675781, "logps/rejected": -264.6556091308594, "loss": 0.6892, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.029256407171487808, "rewards/margins": 0.07676726579666138, "rewards/rejected": -0.10602366924285889, "step": 2420 }, { "epoch": 0.64, "learning_rate": 1.7618647529910043e-06, "logits/chosen": 0.9024986028671265, "logits/rejected": 1.0799829959869385, "logps/chosen": -260.97979736328125, "logps/rejected": -240.7161102294922, "loss": 0.6897, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.025985723361372948, "rewards/margins": 0.06577527523040771, "rewards/rejected": -0.09176099300384521, "step": 2430 }, { "epoch": 0.64, "learning_rate": 1.7400696915925996e-06, "logits/chosen": 0.9433887600898743, "logits/rejected": 1.0706889629364014, "logps/chosen": -278.13543701171875, "logps/rejected": -270.07415771484375, "loss": 0.6916, "rewards/accuracies": 0.59375, "rewards/chosen": -0.01627534069120884, "rewards/margins": 0.07549260556697845, "rewards/rejected": -0.09176793694496155, "step": 2440 }, { "epoch": 0.64, "learning_rate": 1.718338084156254e-06, "logits/chosen": 0.9400017857551575, "logits/rejected": 0.9848964810371399, "logps/chosen": -277.2012634277344, "logps/rejected": -248.524169921875, "loss": 0.6908, "rewards/accuracies": 0.6875, "rewards/chosen": -0.023953277617692947, "rewards/margins": 0.05445709824562073, "rewards/rejected": -0.07841037213802338, "step": 2450 }, { "epoch": 0.64, "learning_rate": 1.6966717452649372e-06, "logits/chosen": 0.9121321439743042, "logits/rejected": 0.9821128845214844, "logps/chosen": -280.4564208984375, "logps/rejected": -272.4418640136719, "loss": 0.6909, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.022658145055174828, "rewards/margins": 0.058780424296855927, "rewards/rejected": -0.0814385712146759, "step": 2460 }, { "epoch": 0.65, "learning_rate": 1.6750724840517103e-06, "logits/chosen": 0.9408855438232422, "logits/rejected": 1.041475534439087, "logps/chosen": -295.4039306640625, "logps/rejected": -253.6959686279297, "loss": 0.6892, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02255399525165558, "rewards/margins": 0.0683935284614563, "rewards/rejected": -0.09094752371311188, "step": 2470 }, { "epoch": 0.65, "learning_rate": 1.6535421040486686e-06, "logits/chosen": 0.9445545077323914, "logits/rejected": 0.9898989796638489, "logps/chosen": -257.03387451171875, "logps/rejected": -232.8082275390625, "loss": 0.6907, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.022778628394007683, "rewards/margins": 0.05287964269518852, "rewards/rejected": -0.07565827667713165, "step": 2480 }, { "epoch": 0.65, "learning_rate": 1.6320824030363458e-06, "logits/chosen": 0.8453947901725769, "logits/rejected": 1.0863592624664307, "logps/chosen": -268.630615234375, "logps/rejected": -246.2313690185547, "loss": 0.691, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.030399400740861893, "rewards/margins": 0.0645337849855423, "rewards/rejected": -0.09493318945169449, "step": 2490 }, { "epoch": 0.65, "learning_rate": 1.6106951728936028e-06, "logits/chosen": 0.9887116551399231, "logits/rejected": 1.091596007347107, "logps/chosen": -232.3385772705078, "logps/rejected": -221.32968139648438, "loss": 0.6908, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03866693750023842, "rewards/margins": 0.05171254277229309, "rewards/rejected": -0.09037948399782181, "step": 2500 }, { "epoch": 0.65, "eval_logits/chosen": 0.9195509552955627, "eval_logits/rejected": 1.0213452577590942, "eval_logps/chosen": -281.2753601074219, "eval_logps/rejected": -253.40109252929688, "eval_loss": 0.6909087896347046, "eval_rewards/accuracies": 0.6330000162124634, "eval_rewards/chosen": -0.037285856902599335, "eval_rewards/margins": 0.05856594070792198, "eval_rewards/rejected": -0.09585181623697281, "eval_runtime": 539.985, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 2500 }, { "epoch": 0.66, "learning_rate": 1.5893821994479996e-06, "logits/chosen": 1.0217519998550415, "logits/rejected": 1.1035264730453491, "logps/chosen": -280.2012939453125, "logps/rejected": -262.41033935546875, "loss": 0.6902, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03267248719930649, "rewards/margins": 0.07917577773332596, "rewards/rejected": -0.11184825003147125, "step": 2510 }, { "epoch": 0.66, "learning_rate": 1.5681452623266868e-06, "logits/chosen": 0.9456014633178711, "logits/rejected": 1.0302622318267822, "logps/chosen": -300.8179931640625, "logps/rejected": -262.069580078125, "loss": 0.6915, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03674257546663284, "rewards/margins": 0.056660883128643036, "rewards/rejected": -0.09340345859527588, "step": 2520 }, { "epoch": 0.66, "learning_rate": 1.5469861348078014e-06, "logits/chosen": 0.9554840326309204, "logits/rejected": 0.9924384951591492, "logps/chosen": -268.2906188964844, "logps/rejected": -242.49649047851562, "loss": 0.6904, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.032142072916030884, "rewards/margins": 0.056136567145586014, "rewards/rejected": -0.0882786363363266, "step": 2530 }, { "epoch": 0.66, "learning_rate": 1.5259065836724035e-06, "logits/chosen": 0.9138981103897095, "logits/rejected": 0.9927932620048523, "logps/chosen": -269.4898986816406, "logps/rejected": -257.8581848144531, "loss": 0.6908, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.026234816759824753, "rewards/margins": 0.050769805908203125, "rewards/rejected": -0.07700462639331818, "step": 2540 }, { "epoch": 0.67, "learning_rate": 1.5049083690569456e-06, "logits/chosen": 0.8760209083557129, "logits/rejected": 0.9947643280029297, "logps/chosen": -277.1868591308594, "logps/rejected": -236.7860870361328, "loss": 0.6896, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.045319218188524246, "rewards/margins": 0.06617378443479538, "rewards/rejected": -0.11149300634860992, "step": 2550 }, { "epoch": 0.67, "learning_rate": 1.4839932443063057e-06, "logits/chosen": 1.0274193286895752, "logits/rejected": 1.0255438089370728, "logps/chosen": -241.68191528320312, "logps/rejected": -215.37057495117188, "loss": 0.6908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.046610768884420395, "rewards/margins": 0.05322499945759773, "rewards/rejected": -0.09983576834201813, "step": 2560 }, { "epoch": 0.67, "learning_rate": 1.4631629558273803e-06, "logits/chosen": 0.9781502485275269, "logits/rejected": 1.0266082286834717, "logps/chosen": -291.5887756347656, "logps/rejected": -256.538818359375, "loss": 0.6903, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.028579484671354294, "rewards/margins": 0.07505873590707779, "rewards/rejected": -0.10363821685314178, "step": 2570 }, { "epoch": 0.68, "learning_rate": 1.4424192429432657e-06, "logits/chosen": 0.9894530177116394, "logits/rejected": 1.038171648979187, "logps/chosen": -308.27691650390625, "logps/rejected": -225.60122680664062, "loss": 0.6904, "rewards/accuracies": 0.625, "rewards/chosen": -0.03209725767374039, "rewards/margins": 0.06519778817892075, "rewards/rejected": -0.09729506075382233, "step": 2580 }, { "epoch": 0.68, "learning_rate": 1.421763837748016e-06, "logits/chosen": 0.9771683812141418, "logits/rejected": 1.058801531791687, "logps/chosen": -268.15179443359375, "logps/rejected": -273.706298828125, "loss": 0.6905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05141326040029526, "rewards/margins": 0.058643460273742676, "rewards/rejected": -0.11005672067403793, "step": 2590 }, { "epoch": 0.68, "learning_rate": 1.401198464962021e-06, "logits/chosen": 1.0130704641342163, "logits/rejected": 0.9349973797798157, "logps/chosen": -273.2847900390625, "logps/rejected": -240.21829223632812, "loss": 0.6907, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0360480472445488, "rewards/margins": 0.05611228942871094, "rewards/rejected": -0.09216034412384033, "step": 2600 }, { "epoch": 0.68, "eval_logits/chosen": 0.9161260724067688, "eval_logits/rejected": 1.017266035079956, "eval_logps/chosen": -281.78839111328125, "eval_logps/rejected": -254.0473175048828, "eval_loss": 0.6909086108207703, "eval_rewards/accuracies": 0.6294999718666077, "eval_rewards/chosen": -0.042415801435709, "eval_rewards/margins": 0.05989806354045868, "eval_rewards/rejected": -0.10231386125087738, "eval_runtime": 540.1872, "eval_samples_per_second": 3.702, "eval_steps_per_second": 0.926, "step": 2600 }, { "epoch": 0.68, "learning_rate": 1.3807248417879896e-06, "logits/chosen": 0.932266354560852, "logits/rejected": 0.9696345329284668, "logps/chosen": -259.15472412109375, "logps/rejected": -220.0342559814453, "loss": 0.6914, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.02964337170124054, "rewards/margins": 0.05693807080388069, "rewards/rejected": -0.08658144623041153, "step": 2610 }, { "epoch": 0.69, "learning_rate": 1.3603446777675665e-06, "logits/chosen": 0.9445293545722961, "logits/rejected": 0.9707993268966675, "logps/chosen": -260.50518798828125, "logps/rejected": -241.0626983642578, "loss": 0.6909, "rewards/accuracies": 0.59375, "rewards/chosen": -0.038475148379802704, "rewards/margins": 0.05124124884605408, "rewards/rejected": -0.08971639722585678, "step": 2620 }, { "epoch": 0.69, "learning_rate": 1.3400596746385817e-06, "logits/chosen": 0.9341602325439453, "logits/rejected": 0.9812172055244446, "logps/chosen": -291.263671875, "logps/rejected": -256.05816650390625, "loss": 0.6898, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03991129249334335, "rewards/margins": 0.06936169415712357, "rewards/rejected": -0.10927299410104752, "step": 2630 }, { "epoch": 0.69, "learning_rate": 1.3198715261929587e-06, "logits/chosen": 0.9518648386001587, "logits/rejected": 1.0701024532318115, "logps/chosen": -322.59869384765625, "logps/rejected": -246.1614532470703, "loss": 0.6916, "rewards/accuracies": 0.625, "rewards/chosen": -0.032660938799381256, "rewards/margins": 0.045336611568927765, "rewards/rejected": -0.07799754291772842, "step": 2640 }, { "epoch": 0.69, "learning_rate": 1.2997819181352823e-06, "logits/chosen": 0.9637743234634399, "logits/rejected": 0.9643661379814148, "logps/chosen": -299.9573669433594, "logps/rejected": -267.3519592285156, "loss": 0.6909, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03847068175673485, "rewards/margins": 0.05295687913894653, "rewards/rejected": -0.09142756462097168, "step": 2650 }, { "epoch": 0.7, "learning_rate": 1.2797925279420454e-06, "logits/chosen": 0.9999778866767883, "logits/rejected": 1.0599424839019775, "logps/chosen": -291.417724609375, "logps/rejected": -258.858642578125, "loss": 0.6911, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04292257875204086, "rewards/margins": 0.045674268156290054, "rewards/rejected": -0.08859684318304062, "step": 2660 }, { "epoch": 0.7, "learning_rate": 1.2599050247215764e-06, "logits/chosen": 0.9801284670829773, "logits/rejected": 1.0555846691131592, "logps/chosen": -286.49017333984375, "logps/rejected": -262.8829650878906, "loss": 0.69, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02869715727865696, "rewards/margins": 0.07526004314422607, "rewards/rejected": -0.10395719856023788, "step": 2670 }, { "epoch": 0.7, "learning_rate": 1.2401210690746705e-06, "logits/chosen": 0.9294122457504272, "logits/rejected": 1.0954596996307373, "logps/chosen": -264.6773986816406, "logps/rejected": -241.242431640625, "loss": 0.6916, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03240332007408142, "rewards/margins": 0.04624287039041519, "rewards/rejected": -0.07864619046449661, "step": 2680 }, { "epoch": 0.7, "learning_rate": 1.2204423129559306e-06, "logits/chosen": 0.8595544695854187, "logits/rejected": 1.0383957624435425, "logps/chosen": -296.6278076171875, "logps/rejected": -270.6478576660156, "loss": 0.6912, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.030149826779961586, "rewards/margins": 0.045799605548381805, "rewards/rejected": -0.07594943791627884, "step": 2690 }, { "epoch": 0.71, "learning_rate": 1.20087039953583e-06, "logits/chosen": 0.9707738757133484, "logits/rejected": 0.9931126832962036, "logps/chosen": -281.4847412109375, "logps/rejected": -259.38934326171875, "loss": 0.6905, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.023116273805499077, "rewards/margins": 0.05793018266558647, "rewards/rejected": -0.0810464546084404, "step": 2700 }, { "epoch": 0.71, "eval_logits/chosen": 0.9119435548782349, "eval_logits/rejected": 1.0139191150665283, "eval_logps/chosen": -281.0736083984375, "eval_logps/rejected": -253.19638061523438, "eval_loss": 0.6908898949623108, "eval_rewards/accuracies": 0.6309999823570251, "eval_rewards/chosen": -0.03526770696043968, "eval_rewards/margins": 0.05853661522269249, "eval_rewards/rejected": -0.09380432963371277, "eval_runtime": 540.0121, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 2700 }, { "epoch": 0.71, "learning_rate": 1.181406963063507e-06, "logits/chosen": 0.9695969820022583, "logits/rejected": 0.9412325024604797, "logps/chosen": -277.9027099609375, "logps/rejected": -233.14767456054688, "loss": 0.6915, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029822606593370438, "rewards/margins": 0.05514361336827278, "rewards/rejected": -0.08496621251106262, "step": 2710 }, { "epoch": 0.71, "learning_rate": 1.1620536287303052e-06, "logits/chosen": 0.9639630317687988, "logits/rejected": 1.0453459024429321, "logps/chosen": -261.4854736328125, "logps/rejected": -243.4965057373047, "loss": 0.6913, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.037622541189193726, "rewards/margins": 0.05851644277572632, "rewards/rejected": -0.09613899141550064, "step": 2720 }, { "epoch": 0.71, "learning_rate": 1.1428120125340717e-06, "logits/chosen": 1.0106990337371826, "logits/rejected": 0.9288986325263977, "logps/chosen": -273.11346435546875, "logps/rejected": -243.2933807373047, "loss": 0.6903, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03193587437272072, "rewards/margins": 0.055615246295928955, "rewards/rejected": -0.08755112439393997, "step": 2730 }, { "epoch": 0.72, "learning_rate": 1.123683721144223e-06, "logits/chosen": 0.9593019485473633, "logits/rejected": 0.9824401140213013, "logps/chosen": -273.22357177734375, "logps/rejected": -241.69149780273438, "loss": 0.6893, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03742024675011635, "rewards/margins": 0.07019098103046417, "rewards/rejected": -0.10761122405529022, "step": 2740 }, { "epoch": 0.72, "learning_rate": 1.1046703517675848e-06, "logits/chosen": 0.9543115496635437, "logits/rejected": 0.9525305032730103, "logps/chosen": -247.79141235351562, "logps/rejected": -250.76815795898438, "loss": 0.6909, "rewards/accuracies": 0.625, "rewards/chosen": -0.03152688220143318, "rewards/margins": 0.05581844598054886, "rewards/rejected": -0.08734532445669174, "step": 2750 }, { "epoch": 0.72, "learning_rate": 1.085773492015028e-06, "logits/chosen": 0.9415119290351868, "logits/rejected": 1.0495812892913818, "logps/chosen": -288.77117919921875, "logps/rejected": -236.2522735595703, "loss": 0.69, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.034685637801885605, "rewards/margins": 0.07457654178142548, "rewards/rejected": -0.10926218330860138, "step": 2760 }, { "epoch": 0.72, "learning_rate": 1.0669947197689034e-06, "logits/chosen": 0.9325952529907227, "logits/rejected": 0.9842671155929565, "logps/chosen": -230.1918487548828, "logps/rejected": -243.35086059570312, "loss": 0.6915, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.04897845536470413, "rewards/margins": 0.04100048914551735, "rewards/rejected": -0.08997894823551178, "step": 2770 }, { "epoch": 0.73, "learning_rate": 1.048335603051291e-06, "logits/chosen": 0.8943045735359192, "logits/rejected": 1.0183279514312744, "logps/chosen": -272.97314453125, "logps/rejected": -255.63125610351562, "loss": 0.6906, "rewards/accuracies": 0.625, "rewards/chosen": -0.03204040229320526, "rewards/margins": 0.0708232969045639, "rewards/rejected": -0.10286370664834976, "step": 2780 }, { "epoch": 0.73, "learning_rate": 1.0297976998930665e-06, "logits/chosen": 0.897107720375061, "logits/rejected": 0.9336503744125366, "logps/chosen": -278.9725341796875, "logps/rejected": -243.04238891601562, "loss": 0.6913, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.027060145512223244, "rewards/margins": 0.05337075516581535, "rewards/rejected": -0.08043090254068375, "step": 2790 }, { "epoch": 0.73, "learning_rate": 1.0113825582038078e-06, "logits/chosen": 0.9530097246170044, "logits/rejected": 1.0400608777999878, "logps/chosen": -271.3522033691406, "logps/rejected": -225.7040557861328, "loss": 0.692, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03952925279736519, "rewards/margins": 0.05476094409823418, "rewards/rejected": -0.09429020434617996, "step": 2800 }, { "epoch": 0.73, "eval_logits/chosen": 0.9141185283660889, "eval_logits/rejected": 1.0163326263427734, "eval_logps/chosen": -280.8155517578125, "eval_logps/rejected": -252.75259399414062, "eval_loss": 0.6908916234970093, "eval_rewards/accuracies": 0.6305000185966492, "eval_rewards/chosen": -0.0326874740421772, "eval_rewards/margins": 0.05667929723858833, "eval_rewards/rejected": -0.08936676383018494, "eval_runtime": 540.3199, "eval_samples_per_second": 3.702, "eval_steps_per_second": 0.925, "step": 2800 }, { "epoch": 0.74, "learning_rate": 9.930917156425477e-07, "logits/chosen": 0.9315062761306763, "logits/rejected": 1.0539835691452026, "logps/chosen": -247.2057342529297, "logps/rejected": -225.90493774414062, "loss": 0.6906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02216990664601326, "rewards/margins": 0.07055743038654327, "rewards/rejected": -0.09272731840610504, "step": 2810 }, { "epoch": 0.74, "learning_rate": 9.749266994893756e-07, "logits/chosen": 0.9194700121879578, "logits/rejected": 1.008080244064331, "logps/chosen": -242.8909454345703, "logps/rejected": -255.8279266357422, "loss": 0.6914, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.033027276396751404, "rewards/margins": 0.043912582099437714, "rewards/rejected": -0.07693986594676971, "step": 2820 }, { "epoch": 0.74, "learning_rate": 9.56889026517913e-07, "logits/chosen": 0.95648193359375, "logits/rejected": 1.0707709789276123, "logps/chosen": -272.9548645019531, "logps/rejected": -241.94235229492188, "loss": 0.6899, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.030579229816794395, "rewards/margins": 0.06354820728302002, "rewards/rejected": -0.09412743896245956, "step": 2830 }, { "epoch": 0.74, "learning_rate": 9.389802028686617e-07, "logits/chosen": 0.9589303135871887, "logits/rejected": 0.9868467450141907, "logps/chosen": -219.3035125732422, "logps/rejected": -192.64849853515625, "loss": 0.6904, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0318605974316597, "rewards/margins": 0.05495452880859375, "rewards/rejected": -0.08681513369083405, "step": 2840 }, { "epoch": 0.75, "learning_rate": 9.212017239232427e-07, "logits/chosen": 0.9364684820175171, "logits/rejected": 1.0579065084457397, "logps/chosen": -272.2196350097656, "logps/rejected": -280.43341064453125, "loss": 0.6894, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.020410938188433647, "rewards/margins": 0.0782955139875412, "rewards/rejected": -0.0987064465880394, "step": 2850 }, { "epoch": 0.75, "learning_rate": 9.03555074179533e-07, "logits/chosen": 0.9313532114028931, "logits/rejected": 1.04340398311615, "logps/chosen": -267.8077087402344, "logps/rejected": -246.677001953125, "loss": 0.6908, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.024608857929706573, "rewards/margins": 0.06871498376131058, "rewards/rejected": -0.09332384169101715, "step": 2860 }, { "epoch": 0.75, "learning_rate": 8.860417271277067e-07, "logits/chosen": 0.9149921536445618, "logits/rejected": 1.0074148178100586, "logps/chosen": -243.07144165039062, "logps/rejected": -241.79171752929688, "loss": 0.6902, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029673978686332703, "rewards/margins": 0.0581536665558815, "rewards/rejected": -0.0878276452422142, "step": 2870 }, { "epoch": 0.75, "learning_rate": 8.686631451272029e-07, "logits/chosen": 0.9447315335273743, "logits/rejected": 0.9529320001602173, "logps/chosen": -269.72869873046875, "logps/rejected": -222.9635467529297, "loss": 0.6911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03748570382595062, "rewards/margins": 0.05406556650996208, "rewards/rejected": -0.0915512815117836, "step": 2880 }, { "epoch": 0.76, "learning_rate": 8.514207792846168e-07, "logits/chosen": 1.0064308643341064, "logits/rejected": 1.0328409671783447, "logps/chosen": -253.03274536132812, "logps/rejected": -257.581298828125, "loss": 0.69, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03226761519908905, "rewards/margins": 0.06126902252435684, "rewards/rejected": -0.09353663772344589, "step": 2890 }, { "epoch": 0.76, "learning_rate": 8.343160693325356e-07, "logits/chosen": 0.9581842422485352, "logits/rejected": 1.0024080276489258, "logps/chosen": -233.0767364501953, "logps/rejected": -227.9841766357422, "loss": 0.6906, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.035527393221855164, "rewards/margins": 0.056871771812438965, "rewards/rejected": -0.09239916503429413, "step": 2900 }, { "epoch": 0.76, "eval_logits/chosen": 0.9098308682441711, "eval_logits/rejected": 1.0122586488723755, "eval_logps/chosen": -280.8845520019531, "eval_logps/rejected": -252.85272216796875, "eval_loss": 0.6908985376358032, "eval_rewards/accuracies": 0.6294999718666077, "eval_rewards/chosen": -0.033377815037965775, "eval_rewards/margins": 0.0569901280105114, "eval_rewards/rejected": -0.09036794304847717, "eval_runtime": 540.038, "eval_samples_per_second": 3.703, "eval_steps_per_second": 0.926, "step": 2900 }, { "epoch": 0.76, "learning_rate": 8.173504435093174e-07, "logits/chosen": 0.9396654963493347, "logits/rejected": 0.9836663007736206, "logps/chosen": -286.25970458984375, "logps/rejected": -269.1986083984375, "loss": 0.6916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03581177443265915, "rewards/margins": 0.04660804197192192, "rewards/rejected": -0.08241982758045197, "step": 2910 }, { "epoch": 0.76, "learning_rate": 8.00525318439836e-07, "logits/chosen": 0.9897071123123169, "logits/rejected": 1.0295822620391846, "logps/chosen": -272.0080871582031, "logps/rejected": -232.94650268554688, "loss": 0.6908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.035867296159267426, "rewards/margins": 0.05984731763601303, "rewards/rejected": -0.09571461379528046, "step": 2920 }, { "epoch": 0.77, "learning_rate": 7.838420990171927e-07, "logits/chosen": 0.9099165797233582, "logits/rejected": 1.017539381980896, "logps/chosen": -313.5694885253906, "logps/rejected": -268.1796569824219, "loss": 0.6903, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.026468079537153244, "rewards/margins": 0.05932525545358658, "rewards/rejected": -0.08579333126544952, "step": 2930 }, { "epoch": 0.77, "learning_rate": 7.673021782854084e-07, "logits/chosen": 0.9644147753715515, "logits/rejected": 0.9656912088394165, "logps/chosen": -250.9320831298828, "logps/rejected": -264.95233154296875, "loss": 0.6905, "rewards/accuracies": 0.625, "rewards/chosen": -0.03290482237935066, "rewards/margins": 0.0584709532558918, "rewards/rejected": -0.09137578308582306, "step": 2940 }, { "epoch": 0.77, "learning_rate": 7.509069373231039e-07, "logits/chosen": 0.9676412343978882, "logits/rejected": 0.9627419710159302, "logps/chosen": -294.781982421875, "logps/rejected": -271.5765380859375, "loss": 0.6905, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.022627348080277443, "rewards/margins": 0.06841419637203217, "rewards/rejected": -0.09104155004024506, "step": 2950 }, { "epoch": 0.77, "learning_rate": 7.346577451281822e-07, "logits/chosen": 0.9224090576171875, "logits/rejected": 1.0375105142593384, "logps/chosen": -251.53451538085938, "logps/rejected": -233.12393188476562, "loss": 0.6909, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.04297469183802605, "rewards/margins": 0.05884693190455437, "rewards/rejected": -0.10182162374258041, "step": 2960 }, { "epoch": 0.78, "learning_rate": 7.185559585035138e-07, "logits/chosen": 0.9037634134292603, "logits/rejected": 0.9682399034500122, "logps/chosen": -286.7387390136719, "logps/rejected": -260.4252014160156, "loss": 0.6901, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.023535221815109253, "rewards/margins": 0.057094089686870575, "rewards/rejected": -0.08062931150197983, "step": 2970 }, { "epoch": 0.78, "learning_rate": 7.026029219436504e-07, "logits/chosen": 0.9559614062309265, "logits/rejected": 1.007653832435608, "logps/chosen": -306.8362121582031, "logps/rejected": -272.37249755859375, "loss": 0.6904, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029121745377779007, "rewards/margins": 0.0643700435757637, "rewards/rejected": -0.09349179267883301, "step": 2980 }, { "epoch": 0.78, "learning_rate": 6.867999675225523e-07, "logits/chosen": 0.9615996479988098, "logits/rejected": 0.9802320599555969, "logps/chosen": -297.98504638671875, "logps/rejected": -250.20980834960938, "loss": 0.6911, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02488793060183525, "rewards/margins": 0.05079926922917366, "rewards/rejected": -0.07568720728158951, "step": 2990 }, { "epoch": 0.79, "learning_rate": 6.711484147823663e-07, "logits/chosen": 0.9520236849784851, "logits/rejected": 0.9852927923202515, "logps/chosen": -270.9884033203125, "logps/rejected": -242.1695098876953, "loss": 0.6904, "rewards/accuracies": 0.65625, "rewards/chosen": -0.029557928442955017, "rewards/margins": 0.06538344919681549, "rewards/rejected": -0.0949413850903511, "step": 3000 }, { "epoch": 0.79, "eval_logits/chosen": 0.9123407602310181, "eval_logits/rejected": 1.0147464275360107, "eval_logps/chosen": -280.66253662109375, "eval_logps/rejected": -252.71670532226562, "eval_loss": 0.6908931732177734, "eval_rewards/accuracies": 0.6294999718666077, "eval_rewards/chosen": -0.031157268211245537, "eval_rewards/margins": 0.05785065144300461, "eval_rewards/rejected": -0.0890079066157341, "eval_runtime": 540.0515, "eval_samples_per_second": 3.703, "eval_steps_per_second": 0.926, "step": 3000 }, { "epoch": 0.79, "learning_rate": 6.556495706232413e-07, "logits/chosen": 0.9562872052192688, "logits/rejected": 1.0361840724945068, "logps/chosen": -274.87127685546875, "logps/rejected": -239.40811157226562, "loss": 0.6906, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.02163364365696907, "rewards/margins": 0.06655408442020416, "rewards/rejected": -0.08818772435188293, "step": 3010 }, { "epoch": 0.79, "learning_rate": 6.403047291942057e-07, "logits/chosen": 0.9342554211616516, "logits/rejected": 0.9498234987258911, "logps/chosen": -278.9698486328125, "logps/rejected": -264.5009765625, "loss": 0.6916, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.016571203246712685, "rewards/margins": 0.055479150265455246, "rewards/rejected": -0.07205035537481308, "step": 3020 }, { "epoch": 0.79, "learning_rate": 6.251151717851023e-07, "logits/chosen": 0.9194384813308716, "logits/rejected": 0.9668914079666138, "logps/chosen": -291.434814453125, "logps/rejected": -275.8680114746094, "loss": 0.6891, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03377396613359451, "rewards/margins": 0.07892084121704102, "rewards/rejected": -0.11269481480121613, "step": 3030 }, { "epoch": 0.8, "learning_rate": 6.100821667196041e-07, "logits/chosen": 0.9412601590156555, "logits/rejected": 1.0217000246047974, "logps/chosen": -304.5139465332031, "logps/rejected": -248.54312133789062, "loss": 0.6907, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.01636696420609951, "rewards/margins": 0.06269043684005737, "rewards/rejected": -0.07905739545822144, "step": 3040 }, { "epoch": 0.8, "learning_rate": 5.952069692493062e-07, "logits/chosen": 0.88300621509552, "logits/rejected": 1.0030475854873657, "logps/chosen": -301.0616149902344, "logps/rejected": -266.23419189453125, "loss": 0.6905, "rewards/accuracies": 0.65625, "rewards/chosen": -0.011049589142203331, "rewards/margins": 0.07019064575433731, "rewards/rejected": -0.0812402293086052, "step": 3050 }, { "epoch": 0.8, "learning_rate": 5.80490821448918e-07, "logits/chosen": 0.9647413492202759, "logits/rejected": 0.9922765493392944, "logps/chosen": -222.5147705078125, "logps/rejected": -206.96414184570312, "loss": 0.6909, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03366454318165779, "rewards/margins": 0.058607954531908035, "rewards/rejected": -0.09227249771356583, "step": 3060 }, { "epoch": 0.8, "learning_rate": 5.659349521125459e-07, "logits/chosen": 0.9332239031791687, "logits/rejected": 1.0312001705169678, "logps/chosen": -238.71188354492188, "logps/rejected": -243.7023468017578, "loss": 0.6903, "rewards/accuracies": 0.65625, "rewards/chosen": -0.033622294664382935, "rewards/margins": 0.06797768920660019, "rewards/rejected": -0.10159997642040253, "step": 3070 }, { "epoch": 0.81, "learning_rate": 5.5154057665109e-07, "logits/chosen": 0.9673851132392883, "logits/rejected": 1.029404878616333, "logps/chosen": -280.47100830078125, "logps/rejected": -243.7905731201172, "loss": 0.6917, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02676049992442131, "rewards/margins": 0.06190108135342598, "rewards/rejected": -0.08866159617900848, "step": 3080 }, { "epoch": 0.81, "learning_rate": 5.373088969907586e-07, "logits/chosen": 0.9254564046859741, "logits/rejected": 1.013877272605896, "logps/chosen": -260.7253723144531, "logps/rejected": -259.11224365234375, "loss": 0.691, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.024502191692590714, "rewards/margins": 0.059731971472501755, "rewards/rejected": -0.08423416316509247, "step": 3090 }, { "epoch": 0.81, "learning_rate": 5.23241101472709e-07, "logits/chosen": 0.9964572191238403, "logits/rejected": 0.9820792078971863, "logps/chosen": -255.88662719726562, "logps/rejected": -244.15097045898438, "loss": 0.6905, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02560483291745186, "rewards/margins": 0.0563865527510643, "rewards/rejected": -0.08199138939380646, "step": 3100 }, { "epoch": 0.81, "eval_logits/chosen": 0.9147237539291382, "eval_logits/rejected": 1.0175365209579468, "eval_logps/chosen": -280.5528869628906, "eval_logps/rejected": -252.58462524414062, "eval_loss": 0.6908957958221436, "eval_rewards/accuracies": 0.6330000162124634, "eval_rewards/chosen": -0.03006073087453842, "eval_rewards/margins": 0.057626351714134216, "eval_rewards/rejected": -0.08768707513809204, "eval_runtime": 540.1337, "eval_samples_per_second": 3.703, "eval_steps_per_second": 0.926, "step": 3100 }, { "epoch": 0.81, "learning_rate": 5.09338364753818e-07, "logits/chosen": 0.901587963104248, "logits/rejected": 1.0238076448440552, "logps/chosen": -293.3055114746094, "logps/rejected": -265.13775634765625, "loss": 0.6899, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.022513773292303085, "rewards/margins": 0.08927594870328903, "rewards/rejected": -0.11178971827030182, "step": 3110 }, { "epoch": 0.82, "learning_rate": 4.956018477086005e-07, "logits/chosen": 1.040078043937683, "logits/rejected": 1.0295056104660034, "logps/chosen": -282.08648681640625, "logps/rejected": -230.9876251220703, "loss": 0.6916, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.026005476713180542, "rewards/margins": 0.054637789726257324, "rewards/rejected": -0.08064327389001846, "step": 3120 }, { "epoch": 0.82, "learning_rate": 4.820326973322764e-07, "logits/chosen": 0.9570829272270203, "logits/rejected": 1.007840871810913, "logps/chosen": -296.0901794433594, "logps/rejected": -222.00198364257812, "loss": 0.6906, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.015858953818678856, "rewards/margins": 0.0702957734465599, "rewards/rejected": -0.08615472912788391, "step": 3130 }, { "epoch": 0.82, "learning_rate": 4.686320466449981e-07, "logits/chosen": 0.9432961344718933, "logits/rejected": 0.9931074976921082, "logps/chosen": -237.6160430908203, "logps/rejected": -222.57870483398438, "loss": 0.6902, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.024945750832557678, "rewards/margins": 0.07617698609828949, "rewards/rejected": -0.10112272202968597, "step": 3140 }, { "epoch": 0.82, "learning_rate": 4.554010145972418e-07, "logits/chosen": 0.9430766105651855, "logits/rejected": 1.0184309482574463, "logps/chosen": -286.0646667480469, "logps/rejected": -229.7809600830078, "loss": 0.6911, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.017250075936317444, "rewards/margins": 0.05181562900543213, "rewards/rejected": -0.06906570494174957, "step": 3150 }, { "epoch": 0.83, "learning_rate": 4.4234070597637455e-07, "logits/chosen": 0.8948880434036255, "logits/rejected": 1.0683404207229614, "logps/chosen": -271.1024169921875, "logps/rejected": -254.7742462158203, "loss": 0.6914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.032865189015865326, "rewards/margins": 0.04334001615643501, "rewards/rejected": -0.07620520889759064, "step": 3160 }, { "epoch": 0.83, "learning_rate": 4.2945221131440783e-07, "logits/chosen": 0.9620285034179688, "logits/rejected": 1.0088149309158325, "logps/chosen": -246.1608123779297, "logps/rejected": -221.75363159179688, "loss": 0.6903, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.027930408716201782, "rewards/margins": 0.05746666342020035, "rewards/rejected": -0.08539707213640213, "step": 3170 }, { "epoch": 0.83, "learning_rate": 4.167366067969381e-07, "logits/chosen": 0.895652174949646, "logits/rejected": 0.9522021412849426, "logps/chosen": -272.8707580566406, "logps/rejected": -259.20855712890625, "loss": 0.6915, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0206812284886837, "rewards/margins": 0.05796490237116814, "rewards/rejected": -0.07864613831043243, "step": 3180 }, { "epoch": 0.83, "learning_rate": 4.041949541732826e-07, "logits/chosen": 0.9851544499397278, "logits/rejected": 1.0554238557815552, "logps/chosen": -296.3244323730469, "logps/rejected": -245.6664581298828, "loss": 0.6915, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02956976927816868, "rewards/margins": 0.04805067181587219, "rewards/rejected": -0.07762044668197632, "step": 3190 }, { "epoch": 0.84, "learning_rate": 3.9182830066782614e-07, "logits/chosen": 0.9625126123428345, "logits/rejected": 0.9740635752677917, "logps/chosen": -282.29425048828125, "logps/rejected": -246.2125244140625, "loss": 0.6919, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03583105653524399, "rewards/margins": 0.048236239701509476, "rewards/rejected": -0.08406729251146317, "step": 3200 }, { "epoch": 0.84, "eval_logits/chosen": 0.9153636693954468, "eval_logits/rejected": 1.0176055431365967, "eval_logps/chosen": -280.5576477050781, "eval_logps/rejected": -252.59996032714844, "eval_loss": 0.6908729076385498, "eval_rewards/accuracies": 0.6305000185966492, "eval_rewards/chosen": -0.03010854683816433, "eval_rewards/margins": 0.057731661945581436, "eval_rewards/rejected": -0.08784020692110062, "eval_runtime": 540.1227, "eval_samples_per_second": 3.703, "eval_steps_per_second": 0.926, "step": 3200 }, { "epoch": 0.84, "learning_rate": 3.796376788925771e-07, "logits/chosen": 0.9004298448562622, "logits/rejected": 1.0678789615631104, "logps/chosen": -247.7801513671875, "logps/rejected": -225.3026580810547, "loss": 0.6903, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03286176174879074, "rewards/margins": 0.062369298189878464, "rewards/rejected": -0.0952310562133789, "step": 3210 }, { "epoch": 0.84, "learning_rate": 3.676241067609465e-07, "logits/chosen": 0.9522799253463745, "logits/rejected": 0.9462326169013977, "logps/chosen": -242.72482299804688, "logps/rejected": -237.6994171142578, "loss": 0.6895, "rewards/accuracies": 0.625, "rewards/chosen": -0.030536144971847534, "rewards/margins": 0.06319095939397812, "rewards/rejected": -0.09372710436582565, "step": 3220 }, { "epoch": 0.85, "learning_rate": 3.5578858740274976e-07, "logits/chosen": 0.9089903831481934, "logits/rejected": 1.0164532661437988, "logps/chosen": -300.99224853515625, "logps/rejected": -260.6708984375, "loss": 0.6911, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.021878289058804512, "rewards/margins": 0.06206582114100456, "rewards/rejected": -0.08394411206245422, "step": 3230 }, { "epoch": 0.85, "learning_rate": 3.44132109080447e-07, "logits/chosen": 0.9136902093887329, "logits/rejected": 1.064955234527588, "logps/chosen": -260.19952392578125, "logps/rejected": -237.27487182617188, "loss": 0.6915, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03714042156934738, "rewards/margins": 0.04816558212041855, "rewards/rejected": -0.08530601114034653, "step": 3240 }, { "epoch": 0.85, "learning_rate": 3.3265564510662344e-07, "logits/chosen": 0.9555643200874329, "logits/rejected": 1.0349557399749756, "logps/chosen": -254.3995361328125, "logps/rejected": -219.08493041992188, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": -0.02656802162528038, "rewards/margins": 0.052377671003341675, "rewards/rejected": -0.07894569635391235, "step": 3250 }, { "epoch": 0.85, "learning_rate": 3.213601537627195e-07, "logits/chosen": 0.9827576875686646, "logits/rejected": 0.9898948669433594, "logps/chosen": -261.27630615234375, "logps/rejected": -252.62423706054688, "loss": 0.6912, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.031087204813957214, "rewards/margins": 0.05801438167691231, "rewards/rejected": -0.08910159021615982, "step": 3260 }, { "epoch": 0.86, "learning_rate": 3.1024657821901063e-07, "logits/chosen": 0.9491473436355591, "logits/rejected": 0.972917914390564, "logps/chosen": -255.64334106445312, "logps/rejected": -237.9709930419922, "loss": 0.6909, "rewards/accuracies": 0.625, "rewards/chosen": -0.025073718279600143, "rewards/margins": 0.06804076582193375, "rewards/rejected": -0.09311448037624359, "step": 3270 }, { "epoch": 0.86, "learning_rate": 2.9931584645585654e-07, "logits/chosen": 1.007514238357544, "logits/rejected": 1.0354502201080322, "logps/chosen": -285.310302734375, "logps/rejected": -281.1996154785156, "loss": 0.6916, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.02509785071015358, "rewards/margins": 0.07314437627792358, "rewards/rejected": -0.09824222326278687, "step": 3280 }, { "epoch": 0.86, "learning_rate": 2.885688711862136e-07, "logits/chosen": 0.9445541501045227, "logits/rejected": 0.9918941259384155, "logps/chosen": -257.47137451171875, "logps/rejected": -222.67733764648438, "loss": 0.6903, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.025721009820699692, "rewards/margins": 0.0427216961979866, "rewards/rejected": -0.068442702293396, "step": 3290 }, { "epoch": 0.86, "learning_rate": 2.7800654977942486e-07, "logits/chosen": 0.9048360586166382, "logits/rejected": 1.0405280590057373, "logps/chosen": -294.37847900390625, "logps/rejected": -225.70297241210938, "loss": 0.69, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02359098568558693, "rewards/margins": 0.06500183045864105, "rewards/rejected": -0.08859282732009888, "step": 3300 }, { "epoch": 0.86, "eval_logits/chosen": 0.9186109900474548, "eval_logits/rejected": 1.0211718082427979, "eval_logps/chosen": -280.2095947265625, "eval_logps/rejected": -252.20504760742188, "eval_loss": 0.6908652782440186, "eval_rewards/accuracies": 0.6284999847412109, "eval_rewards/chosen": -0.02662779949605465, "eval_rewards/margins": 0.057263679802417755, "eval_rewards/rejected": -0.08389147371053696, "eval_runtime": 540.4002, "eval_samples_per_second": 3.701, "eval_steps_per_second": 0.925, "step": 3300 }, { "epoch": 0.87, "learning_rate": 2.6762976418628797e-07, "logits/chosen": 0.9340456128120422, "logits/rejected": 0.9448652267456055, "logps/chosen": -252.797119140625, "logps/rejected": -231.66714477539062, "loss": 0.6914, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.022808540612459183, "rewards/margins": 0.05553862452507019, "rewards/rejected": -0.07834717631340027, "step": 3310 }, { "epoch": 0.87, "learning_rate": 2.5743938086541354e-07, "logits/chosen": 0.9630203247070312, "logits/rejected": 1.0022109746932983, "logps/chosen": -297.3249816894531, "logps/rejected": -251.38656616210938, "loss": 0.6895, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.015477204695343971, "rewards/margins": 0.08188708126544952, "rewards/rejected": -0.09736428409814835, "step": 3320 }, { "epoch": 0.87, "learning_rate": 2.4743625071087574e-07, "logits/chosen": 0.9089611768722534, "logits/rejected": 1.0690263509750366, "logps/chosen": -269.80987548828125, "logps/rejected": -246.6451873779297, "loss": 0.69, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.031479865312576294, "rewards/margins": 0.0678834319114685, "rewards/rejected": -0.0993632897734642, "step": 3330 }, { "epoch": 0.87, "learning_rate": 2.3762120898116498e-07, "logits/chosen": 0.9765059351921082, "logits/rejected": 1.011185884475708, "logps/chosen": -267.643798828125, "logps/rejected": -244.47586059570312, "loss": 0.6913, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.02102813683450222, "rewards/margins": 0.03919995576143265, "rewards/rejected": -0.060228098183870316, "step": 3340 }, { "epoch": 0.88, "learning_rate": 2.2799507522944048e-07, "logits/chosen": 0.9900724291801453, "logits/rejected": 1.0086749792099, "logps/chosen": -296.67889404296875, "logps/rejected": -239.91189575195312, "loss": 0.6925, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.028908709064126015, "rewards/margins": 0.030585547909140587, "rewards/rejected": -0.0594942569732666, "step": 3350 }, { "epoch": 0.88, "learning_rate": 2.1855865323510056e-07, "logits/chosen": 0.9651070833206177, "logits/rejected": 1.0190773010253906, "logps/chosen": -290.01953125, "logps/rejected": -266.74298095703125, "loss": 0.6905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.017114948481321335, "rewards/margins": 0.06442873179912567, "rewards/rejected": -0.08154366910457611, "step": 3360 }, { "epoch": 0.88, "learning_rate": 2.0931273093666575e-07, "logits/chosen": 1.0086729526519775, "logits/rejected": 1.0279282331466675, "logps/chosen": -267.48614501953125, "logps/rejected": -265.8891296386719, "loss": 0.6907, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02099434658885002, "rewards/margins": 0.061572205275297165, "rewards/rejected": -0.08256654441356659, "step": 3370 }, { "epoch": 0.88, "learning_rate": 2.002580803659873e-07, "logits/chosen": 1.0000813007354736, "logits/rejected": 1.0106067657470703, "logps/chosen": -293.6463623046875, "logps/rejected": -262.5312805175781, "loss": 0.6916, "rewards/accuracies": 0.59375, "rewards/chosen": -0.036777563393116, "rewards/margins": 0.04339155554771423, "rewards/rejected": -0.08016912639141083, "step": 3380 }, { "epoch": 0.89, "learning_rate": 1.913954575837826e-07, "logits/chosen": 1.0087939500808716, "logits/rejected": 1.079708456993103, "logps/chosen": -292.1297912597656, "logps/rejected": -263.822509765625, "loss": 0.6904, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.028077151626348495, "rewards/margins": 0.06453180313110352, "rewards/rejected": -0.09260895103216171, "step": 3390 }, { "epoch": 0.89, "learning_rate": 1.827256026165028e-07, "logits/chosen": 1.0100739002227783, "logits/rejected": 1.02089524269104, "logps/chosen": -272.09161376953125, "logps/rejected": -239.74560546875, "loss": 0.689, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02850997820496559, "rewards/margins": 0.07207761704921722, "rewards/rejected": -0.10058760643005371, "step": 3400 }, { "epoch": 0.89, "eval_logits/chosen": 0.9202005863189697, "eval_logits/rejected": 1.0222870111465454, "eval_logps/chosen": -280.4384460449219, "eval_logps/rejected": -252.48487854003906, "eval_loss": 0.6908697485923767, "eval_rewards/accuracies": 0.628000020980835, "eval_rewards/chosen": -0.028916185721755028, "eval_rewards/margins": 0.057773273438215256, "eval_rewards/rejected": -0.08668945729732513, "eval_runtime": 540.2108, "eval_samples_per_second": 3.702, "eval_steps_per_second": 0.926, "step": 3400 }, { "epoch": 0.89, "learning_rate": 1.7424923939454274e-07, "logits/chosen": 1.0115320682525635, "logits/rejected": 1.0266879796981812, "logps/chosen": -282.57763671875, "logps/rejected": -256.1681213378906, "loss": 0.6902, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.031413815915584564, "rewards/margins": 0.056411970406770706, "rewards/rejected": -0.08782579004764557, "step": 3410 }, { "epoch": 0.9, "learning_rate": 1.6596707569179304e-07, "logits/chosen": 0.9267918467521667, "logits/rejected": 0.988551139831543, "logps/chosen": -267.50616455078125, "logps/rejected": -228.9858856201172, "loss": 0.6906, "rewards/accuracies": 0.625, "rewards/chosen": -0.022525835782289505, "rewards/margins": 0.055544476956129074, "rewards/rejected": -0.07807030528783798, "step": 3420 }, { "epoch": 0.9, "learning_rate": 1.578798030665385e-07, "logits/chosen": 0.9413129687309265, "logits/rejected": 0.9737070798873901, "logps/chosen": -287.58221435546875, "logps/rejected": -231.7978973388672, "loss": 0.6916, "rewards/accuracies": 0.625, "rewards/chosen": -0.031874872744083405, "rewards/margins": 0.03842931613326073, "rewards/rejected": -0.07030418515205383, "step": 3430 }, { "epoch": 0.9, "learning_rate": 1.499880968037165e-07, "logits/chosen": 0.9008700251579285, "logits/rejected": 1.0831358432769775, "logps/chosen": -283.8552551269531, "logps/rejected": -254.0209503173828, "loss": 0.6905, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.02647927962243557, "rewards/margins": 0.07812824845314026, "rewards/rejected": -0.10460753738880157, "step": 3440 }, { "epoch": 0.9, "learning_rate": 1.4229261585852805e-07, "logits/chosen": 0.9461116790771484, "logits/rejected": 1.0905543565750122, "logps/chosen": -259.1544494628906, "logps/rejected": -243.8948211669922, "loss": 0.6902, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03435998409986496, "rewards/margins": 0.07146745920181274, "rewards/rejected": -0.1058274507522583, "step": 3450 }, { "epoch": 0.91, "learning_rate": 1.3479400280141886e-07, "logits/chosen": 0.9727323651313782, "logits/rejected": 0.9646995663642883, "logps/chosen": -240.6188507080078, "logps/rejected": -220.088623046875, "loss": 0.6921, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.021371301263570786, "rewards/margins": 0.04031342267990112, "rewards/rejected": -0.06168472766876221, "step": 3460 }, { "epoch": 0.91, "learning_rate": 1.2749288376442044e-07, "logits/chosen": 1.0163447856903076, "logits/rejected": 1.0307289361953735, "logps/chosen": -249.5280303955078, "logps/rejected": -242.83419799804688, "loss": 0.6904, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.033246301114559174, "rewards/margins": 0.05100921913981438, "rewards/rejected": -0.08425550907850266, "step": 3470 }, { "epoch": 0.91, "learning_rate": 1.203898683888713e-07, "logits/chosen": 0.9769922494888306, "logits/rejected": 1.0272125005722046, "logps/chosen": -255.9679718017578, "logps/rejected": -241.6696319580078, "loss": 0.6905, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.036718789488077164, "rewards/margins": 0.06190019100904465, "rewards/rejected": -0.09861898422241211, "step": 3480 }, { "epoch": 0.91, "learning_rate": 1.1348554977451132e-07, "logits/chosen": 1.0243645906448364, "logits/rejected": 1.0316869020462036, "logps/chosen": -243.71621704101562, "logps/rejected": -253.5388946533203, "loss": 0.6902, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03732157126069069, "rewards/margins": 0.06564446538686752, "rewards/rejected": -0.10296603292226791, "step": 3490 }, { "epoch": 0.92, "learning_rate": 1.0678050442995802e-07, "logits/chosen": 0.9439903497695923, "logits/rejected": 0.9749709963798523, "logps/chosen": -262.73333740234375, "logps/rejected": -220.5067138671875, "loss": 0.6901, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.03468143194913864, "rewards/margins": 0.0676378607749939, "rewards/rejected": -0.10231930017471313, "step": 3500 }, { "epoch": 0.92, "eval_logits/chosen": 0.9216287136077881, "eval_logits/rejected": 1.0239145755767822, "eval_logps/chosen": -280.4474792480469, "eval_logps/rejected": -252.50457763671875, "eval_loss": 0.6908650398254395, "eval_rewards/accuracies": 0.6259999871253967, "eval_rewards/chosen": -0.029006626456975937, "eval_rewards/margins": 0.05787980556488037, "eval_rewards/rejected": -0.0868864357471466, "eval_runtime": 539.9895, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 3500 }, { "epoch": 0.92, "learning_rate": 1.0027529222456755e-07, "logits/chosen": 0.9812003374099731, "logits/rejected": 0.9633600115776062, "logps/chosen": -264.53521728515625, "logps/rejected": -254.2129364013672, "loss": 0.6916, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03213541954755783, "rewards/margins": 0.0566454641520977, "rewards/rejected": -0.08878089487552643, "step": 3510 }, { "epoch": 0.92, "learning_rate": 9.397045634168766e-08, "logits/chosen": 0.9166300892829895, "logits/rejected": 1.0722475051879883, "logps/chosen": -275.48828125, "logps/rejected": -235.26626586914062, "loss": 0.69, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.016414564102888107, "rewards/margins": 0.06454362720251083, "rewards/rejected": -0.08095818758010864, "step": 3520 }, { "epoch": 0.92, "learning_rate": 8.78665232332998e-08, "logits/chosen": 0.9330304265022278, "logits/rejected": 1.0522311925888062, "logps/chosen": -260.5693054199219, "logps/rejected": -258.9266052246094, "loss": 0.691, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.025154178962111473, "rewards/margins": 0.05904405564069748, "rewards/rejected": -0.08419822156429291, "step": 3530 }, { "epoch": 0.93, "learning_rate": 8.196400257606208e-08, "logits/chosen": 0.9113238453865051, "logits/rejected": 0.9963987469673157, "logps/chosen": -279.2100524902344, "logps/rejected": -249.4517059326172, "loss": 0.6921, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.020202763378620148, "rewards/margins": 0.04745917767286301, "rewards/rejected": -0.06766194850206375, "step": 3540 }, { "epoch": 0.93, "learning_rate": 7.626338722875076e-08, "logits/chosen": 1.0007197856903076, "logits/rejected": 0.9505215883255005, "logps/chosen": -280.51666259765625, "logps/rejected": -254.579833984375, "loss": 0.6908, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03505931794643402, "rewards/margins": 0.04930321127176285, "rewards/rejected": -0.08436252176761627, "step": 3550 }, { "epoch": 0.93, "learning_rate": 7.076515319110688e-08, "logits/chosen": 1.0191797018051147, "logits/rejected": 1.0009005069732666, "logps/chosen": -292.7381591796875, "logps/rejected": -252.88143920898438, "loss": 0.6903, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.029474809765815735, "rewards/margins": 0.05774076655507088, "rewards/rejected": -0.08721558004617691, "step": 3560 }, { "epoch": 0.93, "learning_rate": 6.54697595640899e-08, "logits/chosen": 1.0553741455078125, "logits/rejected": 0.9938759803771973, "logps/chosen": -280.03265380859375, "logps/rejected": -261.9696044921875, "loss": 0.6904, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.024510715156793594, "rewards/margins": 0.05735497549176216, "rewards/rejected": -0.08186569064855576, "step": 3570 }, { "epoch": 0.94, "learning_rate": 6.037764851154426e-08, "logits/chosen": 0.9185417890548706, "logits/rejected": 1.0824509859085083, "logps/chosen": -279.9969787597656, "logps/rejected": -240.0286865234375, "loss": 0.6912, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029702965170145035, "rewards/margins": 0.05176641792058945, "rewards/rejected": -0.08146937936544418, "step": 3580 }, { "epoch": 0.94, "learning_rate": 5.548924522327748e-08, "logits/chosen": 0.894769012928009, "logits/rejected": 1.0935579538345337, "logps/chosen": -275.4918518066406, "logps/rejected": -258.45953369140625, "loss": 0.6911, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02207362651824951, "rewards/margins": 0.06469397246837616, "rewards/rejected": -0.08676759898662567, "step": 3590 }, { "epoch": 0.94, "learning_rate": 5.0804957879556915e-08, "logits/chosen": 0.9377716183662415, "logits/rejected": 1.0579009056091309, "logps/chosen": -318.527587890625, "logps/rejected": -260.9480285644531, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": -0.03672604635357857, "rewards/margins": 0.05514238029718399, "rewards/rejected": -0.09186841547489166, "step": 3600 }, { "epoch": 0.94, "eval_logits/chosen": 0.9221275448799133, "eval_logits/rejected": 1.0243616104125977, "eval_logps/chosen": -280.42578125, "eval_logps/rejected": -252.4630584716797, "eval_loss": 0.6908705830574036, "eval_rewards/accuracies": 0.6290000081062317, "eval_rewards/chosen": -0.02878967486321926, "eval_rewards/margins": 0.057681918144226074, "eval_rewards/rejected": -0.08647158741950989, "eval_runtime": 539.9286, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 3600 }, { "epoch": 0.94, "learning_rate": 4.632517761702815e-08, "logits/chosen": 0.9917756915092468, "logits/rejected": 1.043176293373108, "logps/chosen": -285.3194274902344, "logps/rejected": -251.86471557617188, "loss": 0.6901, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.020223677158355713, "rewards/margins": 0.0789574682712555, "rewards/rejected": -0.0991811528801918, "step": 3610 }, { "epoch": 0.95, "learning_rate": 4.205027849605359e-08, "logits/chosen": 0.9845743179321289, "logits/rejected": 0.9572548866271973, "logps/chosen": -231.9497833251953, "logps/rejected": -235.6708984375, "loss": 0.6913, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.024526074528694153, "rewards/margins": 0.0476609468460083, "rewards/rejected": -0.07218702882528305, "step": 3620 }, { "epoch": 0.95, "learning_rate": 3.798061746947995e-08, "logits/chosen": 0.9714505076408386, "logits/rejected": 0.9893890619277954, "logps/chosen": -345.9156188964844, "logps/rejected": -263.2041931152344, "loss": 0.6918, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.022486504167318344, "rewards/margins": 0.052516452968120575, "rewards/rejected": -0.07500295341014862, "step": 3630 }, { "epoch": 0.95, "learning_rate": 3.411653435283158e-08, "logits/chosen": 0.9488525390625, "logits/rejected": 1.0411800146102905, "logps/chosen": -294.39727783203125, "logps/rejected": -257.3688659667969, "loss": 0.6915, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03506433218717575, "rewards/margins": 0.0475606694817543, "rewards/rejected": -0.08262500911951065, "step": 3640 }, { "epoch": 0.96, "learning_rate": 3.04583517959367e-08, "logits/chosen": 0.9405366778373718, "logits/rejected": 0.9672233462333679, "logps/chosen": -299.8280334472656, "logps/rejected": -250.7571258544922, "loss": 0.6905, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011996401473879814, "rewards/margins": 0.08034516870975494, "rewards/rejected": -0.09234156459569931, "step": 3650 }, { "epoch": 0.96, "learning_rate": 2.7006375255985984e-08, "logits/chosen": 0.9549415707588196, "logits/rejected": 1.0070356130599976, "logps/chosen": -241.44503784179688, "logps/rejected": -215.5791778564453, "loss": 0.6908, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.029812267050147057, "rewards/margins": 0.04664590209722519, "rewards/rejected": -0.0764581710100174, "step": 3660 }, { "epoch": 0.96, "learning_rate": 2.3760892972027328e-08, "logits/chosen": 0.8861829042434692, "logits/rejected": 1.0260193347930908, "logps/chosen": -290.62060546875, "logps/rejected": -238.04080200195312, "loss": 0.6907, "rewards/accuracies": 0.5625, "rewards/chosen": -0.025474613532423973, "rewards/margins": 0.061755161732435226, "rewards/rejected": -0.08722977340221405, "step": 3670 }, { "epoch": 0.96, "learning_rate": 2.072217594089765e-08, "logits/chosen": 0.9960481524467468, "logits/rejected": 0.9741401672363281, "logps/chosen": -247.71484375, "logps/rejected": -232.7592315673828, "loss": 0.6918, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.025015806779265404, "rewards/margins": 0.04509962350130081, "rewards/rejected": -0.07011543214321136, "step": 3680 }, { "epoch": 0.97, "learning_rate": 1.789047789459375e-08, "logits/chosen": 0.9169954061508179, "logits/rejected": 0.9940811991691589, "logps/chosen": -279.7234191894531, "logps/rejected": -257.5582580566406, "loss": 0.6901, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029348302632570267, "rewards/margins": 0.05934641510248184, "rewards/rejected": -0.08869470655918121, "step": 3690 }, { "epoch": 0.97, "learning_rate": 1.5266035279088708e-08, "logits/chosen": 0.9306381940841675, "logits/rejected": 0.9920527338981628, "logps/chosen": -320.2879333496094, "logps/rejected": -246.09512329101562, "loss": 0.6914, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02664271369576454, "rewards/margins": 0.05591448396444321, "rewards/rejected": -0.08255720138549805, "step": 3700 }, { "epoch": 0.97, "eval_logits/chosen": 0.9215983152389526, "eval_logits/rejected": 1.0239914655685425, "eval_logps/chosen": -280.43499755859375, "eval_logps/rejected": -252.45912170410156, "eval_loss": 0.6908671855926514, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": -0.028881965205073357, "eval_rewards/margins": 0.05755016952753067, "eval_rewards/rejected": -0.08643212914466858, "eval_runtime": 540.2057, "eval_samples_per_second": 3.702, "eval_steps_per_second": 0.926, "step": 3700 }, { "epoch": 0.97, "learning_rate": 1.2849067234584623e-08, "logits/chosen": 0.9069113731384277, "logits/rejected": 0.9997833371162415, "logps/chosen": -258.52178955078125, "logps/rejected": -240.258544921875, "loss": 0.6913, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02280072309076786, "rewards/margins": 0.054703257977962494, "rewards/rejected": -0.0775039792060852, "step": 3710 }, { "epoch": 0.97, "learning_rate": 1.0639775577218625e-08, "logits/chosen": 0.9457674026489258, "logits/rejected": 0.987481415271759, "logps/chosen": -288.9521484375, "logps/rejected": -266.4629821777344, "loss": 0.6911, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.019102510064840317, "rewards/margins": 0.06657516956329346, "rewards/rejected": -0.08567767590284348, "step": 3720 }, { "epoch": 0.98, "learning_rate": 8.638344782207486e-09, "logits/chosen": 0.9037330746650696, "logits/rejected": 1.0098841190338135, "logps/chosen": -303.5415954589844, "logps/rejected": -282.108642578125, "loss": 0.6918, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.038687679916620255, "rewards/margins": 0.04073931276798248, "rewards/rejected": -0.07942698895931244, "step": 3730 }, { "epoch": 0.98, "learning_rate": 6.84494196844715e-09, "logits/chosen": 0.9645431637763977, "logits/rejected": 1.0531866550445557, "logps/chosen": -265.5387268066406, "logps/rejected": -245.14968872070312, "loss": 0.6901, "rewards/accuracies": 0.625, "rewards/chosen": -0.018353218212723732, "rewards/margins": 0.05940115451812744, "rewards/rejected": -0.07775436341762543, "step": 3740 }, { "epoch": 0.98, "learning_rate": 5.259716884556121e-09, "logits/chosen": 0.9109350442886353, "logits/rejected": 1.009334683418274, "logps/chosen": -311.3597412109375, "logps/rejected": -255.84878540039062, "loss": 0.691, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02166152000427246, "rewards/margins": 0.06039486080408096, "rewards/rejected": -0.08205638825893402, "step": 3750 }, { "epoch": 0.98, "learning_rate": 3.882801896372967e-09, "logits/chosen": 0.8763138055801392, "logits/rejected": 1.017446517944336, "logps/chosen": -282.8580627441406, "logps/rejected": -261.3891906738281, "loss": 0.692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.024912741035223007, "rewards/margins": 0.045935411006212234, "rewards/rejected": -0.07084815204143524, "step": 3760 }, { "epoch": 0.99, "learning_rate": 2.7143119759026614e-09, "logits/chosen": 0.9283539056777954, "logits/rejected": 1.0674773454666138, "logps/chosen": -270.36358642578125, "logps/rejected": -274.0054626464844, "loss": 0.6914, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01020726840943098, "rewards/margins": 0.06225059553980827, "rewards/rejected": -0.07245786488056183, "step": 3770 }, { "epoch": 0.99, "learning_rate": 1.754344691717591e-09, "logits/chosen": 0.8904141187667847, "logits/rejected": 0.9620451927185059, "logps/chosen": -264.84930419921875, "logps/rejected": -218.98477172851562, "loss": 0.6906, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.012690825387835503, "rewards/margins": 0.06626447290182114, "rewards/rejected": -0.07895530760288239, "step": 3780 }, { "epoch": 0.99, "learning_rate": 1.0029802008096335e-09, "logits/chosen": 1.0099847316741943, "logits/rejected": 0.9373987913131714, "logps/chosen": -268.06048583984375, "logps/rejected": -228.6332550048828, "loss": 0.6902, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.028048217296600342, "rewards/margins": 0.06466346234083176, "rewards/rejected": -0.0927116721868515, "step": 3790 }, { "epoch": 0.99, "learning_rate": 4.602812418974534e-10, "logits/chosen": 0.8952652812004089, "logits/rejected": 1.017452597618103, "logps/chosen": -254.45571899414062, "logps/rejected": -240.29006958007812, "loss": 0.6917, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.032994892448186874, "rewards/margins": 0.04944116994738579, "rewards/rejected": -0.08243606984615326, "step": 3800 }, { "epoch": 0.99, "eval_logits/chosen": 0.9220536351203918, "eval_logits/rejected": 1.0245850086212158, "eval_logps/chosen": -280.4203796386719, "eval_logps/rejected": -252.4789581298828, "eval_loss": 0.690862774848938, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": -0.028735652565956116, "eval_rewards/margins": 0.057894736528396606, "eval_rewards/rejected": -0.08663039654493332, "eval_runtime": 539.9529, "eval_samples_per_second": 3.704, "eval_steps_per_second": 0.926, "step": 3800 }, { "epoch": 1.0, "learning_rate": 1.2629313018819312e-10, "logits/chosen": 0.895196795463562, "logits/rejected": 1.000282645225525, "logps/chosen": -302.4236145019531, "logps/rejected": -285.1495361328125, "loss": 0.6914, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.016779940575361252, "rewards/margins": 0.05169288069009781, "rewards/rejected": -0.06847281754016876, "step": 3810 }, { "epoch": 1.0, "learning_rate": 1.0437535929996855e-12, "logits/chosen": 0.982556164264679, "logits/rejected": 0.9328534007072449, "logps/chosen": -285.391845703125, "logps/rejected": -266.6493225097656, "loss": 0.6902, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.026478338986635208, "rewards/margins": 0.06039903312921524, "rewards/rejected": -0.08687736093997955, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.691200782103989, "train_runtime": 55857.7878, "train_samples_per_second": 1.094, "train_steps_per_second": 0.068 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }