{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995451099317665, "eval_steps": 100, "global_step": 1648, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.5390625, "learning_rate": 3.03030303030303e-09, "logits/chosen": -3.4050943851470947, "logits/rejected": -3.1368675231933594, "logps/chosen": -118.80651092529297, "logps/rejected": -84.5186767578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 1.65625, "learning_rate": 3.0303030303030305e-08, "logits/chosen": -3.4118552207946777, "logits/rejected": -3.234715700149536, "logps/chosen": -112.32723236083984, "logps/rejected": -153.78240966796875, "loss": 0.693, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.0004033078148495406, "rewards/margins": 0.00041662290459498763, "rewards/margins_max": 0.0021899566054344177, "rewards/margins_min": -0.0013567109126597643, "rewards/margins_std": 0.002507872646674514, "rewards/rejected": -1.3315144315129146e-05, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.8828125, "learning_rate": 6.060606060606061e-08, "logits/chosen": -3.4354801177978516, "logits/rejected": -3.176407814025879, "logps/chosen": -118.2829360961914, "logps/rejected": -184.0032958984375, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00016470803529955447, "rewards/margins": 1.1057045412599109e-05, "rewards/margins_max": 0.00216041412204504, "rewards/margins_min": -0.002138300333172083, "rewards/margins_std": 0.0030396501533687115, "rewards/rejected": 0.0001536509662400931, "step": 20 }, { "epoch": 0.02, "grad_norm": 1.8671875, "learning_rate": 9.09090909090909e-08, "logits/chosen": -3.4078497886657715, "logits/rejected": -3.205293655395508, "logps/chosen": -127.51212310791016, "logps/rejected": -157.24716186523438, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": 0.00028022227343171835, "rewards/margins": 0.0013433375861495733, "rewards/margins_max": 0.0038831476122140884, "rewards/margins_min": -0.0011964720906689763, "rewards/margins_std": 0.003591833170503378, "rewards/rejected": -0.0010631154291331768, "step": 30 }, { "epoch": 0.02, "grad_norm": 1.71875, "learning_rate": 1.2121212121212122e-07, "logits/chosen": -3.4350059032440186, "logits/rejected": -3.2142701148986816, "logps/chosen": -121.0025634765625, "logps/rejected": -145.43264770507812, "loss": 0.6915, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0012690603034570813, "rewards/margins": 0.003167077898979187, "rewards/margins_max": 0.005492820404469967, "rewards/margins_min": 0.0008413357427343726, "rewards/margins_std": 0.003289096523076296, "rewards/rejected": -0.0018980179447680712, "step": 40 }, { "epoch": 0.03, "grad_norm": 1.75, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -3.4711899757385254, "logits/rejected": -3.23637056350708, "logps/chosen": -114.65794372558594, "logps/rejected": -166.53250122070312, "loss": 0.6909, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0023323404602706432, "rewards/margins": 0.004964292515069246, "rewards/margins_max": 0.007555422373116016, "rewards/margins_min": 0.0023731617256999016, "rewards/margins_std": 0.0036644123028963804, "rewards/rejected": -0.0026319522876292467, "step": 50 }, { "epoch": 0.04, "grad_norm": 2.0, "learning_rate": 1.818181818181818e-07, "logits/chosen": -3.4538276195526123, "logits/rejected": -3.1886672973632812, "logps/chosen": -109.4487533569336, "logps/rejected": -172.9461669921875, "loss": 0.6898, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.003310429397970438, "rewards/margins": 0.007032909896224737, "rewards/margins_max": 0.011247309856116772, "rewards/margins_min": 0.0028185099363327026, "rewards/margins_std": 0.005960061680525541, "rewards/rejected": -0.003722480731084943, "step": 60 }, { "epoch": 0.04, "grad_norm": 1.8359375, "learning_rate": 2.121212121212121e-07, "logits/chosen": -3.4295284748077393, "logits/rejected": -3.1960196495056152, "logps/chosen": -125.6326904296875, "logps/rejected": -177.14407348632812, "loss": 0.688, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0038899246137589216, "rewards/margins": 0.009991476312279701, "rewards/margins_max": 0.013599397614598274, "rewards/margins_min": 0.006383554544299841, "rewards/margins_std": 0.005102371331304312, "rewards/rejected": -0.006101551465690136, "step": 70 }, { "epoch": 0.05, "grad_norm": 1.65625, "learning_rate": 2.4242424242424244e-07, "logits/chosen": -3.4546008110046387, "logits/rejected": -3.259620189666748, "logps/chosen": -105.1754150390625, "logps/rejected": -149.17739868164062, "loss": 0.6864, "rewards/accuracies": 0.875, "rewards/chosen": 0.004929172340780497, "rewards/margins": 0.013479220680892467, "rewards/margins_max": 0.019636893644928932, "rewards/margins_min": 0.007321546785533428, "rewards/margins_std": 0.008708265610039234, "rewards/rejected": -0.008550046943128109, "step": 80 }, { "epoch": 0.05, "grad_norm": 1.8828125, "learning_rate": 2.727272727272727e-07, "logits/chosen": -3.4643356800079346, "logits/rejected": -3.227538585662842, "logps/chosen": -122.521240234375, "logps/rejected": -154.5928497314453, "loss": 0.6834, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.008893799968063831, "rewards/margins": 0.021132633090019226, "rewards/margins_max": 0.027842596173286438, "rewards/margins_min": 0.014422670006752014, "rewards/margins_std": 0.00948932021856308, "rewards/rejected": -0.012238833121955395, "step": 90 }, { "epoch": 0.06, "grad_norm": 1.71875, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -3.4493160247802734, "logits/rejected": -3.2681262493133545, "logps/chosen": -111.17362976074219, "logps/rejected": -195.10665893554688, "loss": 0.6813, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.009519929066300392, "rewards/margins": 0.023902228102087975, "rewards/margins_max": 0.0342855267226696, "rewards/margins_min": 0.013518924824893475, "rewards/margins_std": 0.014684207737445831, "rewards/rejected": -0.014382297173142433, "step": 100 }, { "epoch": 0.07, "grad_norm": 1.8125, "learning_rate": 3.333333333333333e-07, "logits/chosen": -3.3599190711975098, "logits/rejected": -3.1862587928771973, "logps/chosen": -117.963134765625, "logps/rejected": -156.41465759277344, "loss": 0.6783, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.008118102326989174, "rewards/margins": 0.030242860317230225, "rewards/margins_max": 0.040411077439785004, "rewards/margins_min": 0.020074646919965744, "rewards/margins_std": 0.014380025677382946, "rewards/rejected": -0.0221247561275959, "step": 110 }, { "epoch": 0.07, "grad_norm": 1.609375, "learning_rate": 3.636363636363636e-07, "logits/chosen": -3.4270176887512207, "logits/rejected": -3.26020884513855, "logps/chosen": -109.92674255371094, "logps/rejected": -144.8209228515625, "loss": 0.6757, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.010196666233241558, "rewards/margins": 0.03683094307780266, "rewards/margins_max": 0.0503312349319458, "rewards/margins_min": 0.023330653086304665, "rewards/margins_std": 0.019092293456196785, "rewards/rejected": -0.026634279638528824, "step": 120 }, { "epoch": 0.08, "grad_norm": 1.6484375, "learning_rate": 3.939393939393939e-07, "logits/chosen": -3.4294886589050293, "logits/rejected": -3.2369658946990967, "logps/chosen": -126.5963363647461, "logps/rejected": -178.168212890625, "loss": 0.6705, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.009229905903339386, "rewards/margins": 0.043803971260786057, "rewards/margins_max": 0.06445904076099396, "rewards/margins_min": 0.023148905485868454, "rewards/margins_std": 0.02921067550778389, "rewards/rejected": -0.03457406908273697, "step": 130 }, { "epoch": 0.08, "grad_norm": 1.7421875, "learning_rate": 4.242424242424242e-07, "logits/chosen": -3.441509246826172, "logits/rejected": -3.2002804279327393, "logps/chosen": -130.19882202148438, "logps/rejected": -178.93299865722656, "loss": 0.6643, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0069100684486329556, "rewards/margins": 0.05727902799844742, "rewards/margins_max": 0.07939378917217255, "rewards/margins_min": 0.03516425937414169, "rewards/margins_std": 0.03127499669790268, "rewards/rejected": -0.050368957221508026, "step": 140 }, { "epoch": 0.09, "grad_norm": 2.125, "learning_rate": 4.545454545454545e-07, "logits/chosen": -3.466477155685425, "logits/rejected": -3.275334119796753, "logps/chosen": -124.30558776855469, "logps/rejected": -181.0602569580078, "loss": 0.6634, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.008361677639186382, "rewards/margins": 0.06292758136987686, "rewards/margins_max": 0.08513649553060532, "rewards/margins_min": 0.04071866348385811, "rewards/margins_std": 0.0314081534743309, "rewards/rejected": -0.054565899074077606, "step": 150 }, { "epoch": 0.1, "grad_norm": 1.9140625, "learning_rate": 4.848484848484849e-07, "logits/chosen": -3.3910937309265137, "logits/rejected": -3.2401318550109863, "logps/chosen": -99.83372497558594, "logps/rejected": -162.27804565429688, "loss": 0.6578, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.006624563131481409, "rewards/margins": 0.06912867724895477, "rewards/margins_max": 0.1061328873038292, "rewards/margins_min": 0.032124463468790054, "rewards/margins_std": 0.05233185365796089, "rewards/rejected": -0.0625041052699089, "step": 160 }, { "epoch": 0.1, "grad_norm": 1.9375, "learning_rate": 4.999859762744229e-07, "logits/chosen": -3.403299331665039, "logits/rejected": -3.2240326404571533, "logps/chosen": -101.01579284667969, "logps/rejected": -159.98538208007812, "loss": 0.6513, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.00876183807849884, "rewards/margins": 0.08230775594711304, "rewards/margins_max": 0.11581530421972275, "rewards/margins_min": 0.048800211399793625, "rewards/margins_std": 0.047386832535266876, "rewards/rejected": -0.0735459253191948, "step": 170 }, { "epoch": 0.11, "grad_norm": 1.953125, "learning_rate": 4.998737959095448e-07, "logits/chosen": -3.4143004417419434, "logits/rejected": -3.1833884716033936, "logps/chosen": -105.91117095947266, "logps/rejected": -135.10708618164062, "loss": 0.6477, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.002522200345993042, "rewards/margins": 0.09561987221240997, "rewards/margins_max": 0.14069953560829163, "rewards/margins_min": 0.05054020881652832, "rewards/margins_std": 0.06375227868556976, "rewards/rejected": -0.09309767186641693, "step": 180 }, { "epoch": 0.12, "grad_norm": 2.046875, "learning_rate": 4.996494855203493e-07, "logits/chosen": -3.47766375541687, "logits/rejected": -3.207594633102417, "logps/chosen": -113.1792221069336, "logps/rejected": -175.5690460205078, "loss": 0.6359, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.0006620381027460098, "rewards/margins": 0.11444854736328125, "rewards/margins_max": 0.15897879004478455, "rewards/margins_min": 0.06991832703351974, "rewards/margins_std": 0.06297525763511658, "rewards/rejected": -0.11378651857376099, "step": 190 }, { "epoch": 0.12, "grad_norm": 2.09375, "learning_rate": 4.993131457653681e-07, "logits/chosen": -3.4641525745391846, "logits/rejected": -3.2808594703674316, "logps/chosen": -100.21434020996094, "logps/rejected": -156.946044921875, "loss": 0.6389, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.01082407496869564, "rewards/margins": 0.11547299474477768, "rewards/margins_max": 0.16020886600017548, "rewards/margins_min": 0.07073714584112167, "rewards/margins_std": 0.06326606869697571, "rewards/rejected": -0.12629708647727966, "step": 200 }, { "epoch": 0.13, "grad_norm": 2.34375, "learning_rate": 4.988649275759334e-07, "logits/chosen": -3.428915500640869, "logits/rejected": -3.1432971954345703, "logps/chosen": -110.36918640136719, "logps/rejected": -165.47640991210938, "loss": 0.6294, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.012739740312099457, "rewards/margins": 0.12643049657344818, "rewards/margins_max": 0.18977002799510956, "rewards/margins_min": 0.0630909651517868, "rewards/margins_std": 0.0895756185054779, "rewards/rejected": -0.13917024433612823, "step": 210 }, { "epoch": 0.13, "grad_norm": 1.875, "learning_rate": 4.983050320884483e-07, "logits/chosen": -3.4887309074401855, "logits/rejected": -3.2058892250061035, "logps/chosen": -126.2535629272461, "logps/rejected": -187.00015258789062, "loss": 0.6223, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01476545911282301, "rewards/margins": 0.15231844782829285, "rewards/margins_max": 0.2008267194032669, "rewards/margins_min": 0.10381016880273819, "rewards/margins_std": 0.06860103458166122, "rewards/rejected": -0.16708388924598694, "step": 220 }, { "epoch": 0.14, "grad_norm": 2.125, "learning_rate": 4.976337105541267e-07, "logits/chosen": -3.403496503829956, "logits/rejected": -3.164135217666626, "logps/chosen": -130.16421508789062, "logps/rejected": -158.1027374267578, "loss": 0.6215, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.04382479190826416, "rewards/margins": 0.14249354600906372, "rewards/margins_max": 0.20711453258991241, "rewards/margins_min": 0.07787257432937622, "rewards/margins_std": 0.09138786792755127, "rewards/rejected": -0.1863183230161667, "step": 230 }, { "epoch": 0.15, "grad_norm": 2.234375, "learning_rate": 4.968512642262464e-07, "logits/chosen": -3.423377513885498, "logits/rejected": -3.2418792247772217, "logps/chosen": -104.84086608886719, "logps/rejected": -180.81430053710938, "loss": 0.5991, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.016986923292279243, "rewards/margins": 0.21991240978240967, "rewards/margins_max": 0.29189637303352356, "rewards/margins_min": 0.14792843163013458, "rewards/margins_std": 0.10180072486400604, "rewards/rejected": -0.23689934611320496, "step": 240 }, { "epoch": 0.15, "grad_norm": 2.03125, "learning_rate": 4.959580442249614e-07, "logits/chosen": -3.5027713775634766, "logits/rejected": -3.174872875213623, "logps/chosen": -121.6041259765625, "logps/rejected": -184.39622497558594, "loss": 0.5971, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.03534800559282303, "rewards/margins": 0.20791885256767273, "rewards/margins_max": 0.3044799268245697, "rewards/margins_min": 0.11135780811309814, "rewards/margins_std": 0.13655796647071838, "rewards/rejected": -0.24326686561107635, "step": 250 }, { "epoch": 0.16, "grad_norm": 2.515625, "learning_rate": 4.94954451379739e-07, "logits/chosen": -3.4629738330841064, "logits/rejected": -3.254920244216919, "logps/chosen": -126.48948669433594, "logps/rejected": -186.8356475830078, "loss": 0.5893, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.0572446808218956, "rewards/margins": 0.242076113820076, "rewards/margins_max": 0.3453850746154785, "rewards/margins_min": 0.13876716792583466, "rewards/margins_std": 0.1461009383201599, "rewards/rejected": -0.2993208169937134, "step": 260 }, { "epoch": 0.16, "grad_norm": 2.296875, "learning_rate": 4.938409360494883e-07, "logits/chosen": -3.4049344062805176, "logits/rejected": -3.1644232273101807, "logps/chosen": -117.92335510253906, "logps/rejected": -183.36587524414062, "loss": 0.5894, "rewards/accuracies": 0.9375, "rewards/chosen": -0.053103696554899216, "rewards/margins": 0.23206424713134766, "rewards/margins_max": 0.3247820734977722, "rewards/margins_min": 0.1393464058637619, "rewards/margins_std": 0.13112285733222961, "rewards/rejected": -0.28516796231269836, "step": 270 }, { "epoch": 0.17, "grad_norm": 2.296875, "learning_rate": 4.926179979204632e-07, "logits/chosen": -3.4576289653778076, "logits/rejected": -3.24690318107605, "logps/chosen": -123.93232727050781, "logps/rejected": -194.07188415527344, "loss": 0.5881, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.0886482372879982, "rewards/margins": 0.2548428773880005, "rewards/margins_max": 0.3780335485935211, "rewards/margins_min": 0.13165222108364105, "rewards/margins_std": 0.17421790957450867, "rewards/rejected": -0.3434911370277405, "step": 280 }, { "epoch": 0.18, "grad_norm": 2.171875, "learning_rate": 4.912861857820302e-07, "logits/chosen": -3.3650596141815186, "logits/rejected": -3.2302684783935547, "logps/chosen": -111.41851806640625, "logps/rejected": -206.7620849609375, "loss": 0.5658, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10818646103143692, "rewards/margins": 0.2999250292778015, "rewards/margins_max": 0.4337504506111145, "rewards/margins_min": 0.1660996377468109, "rewards/margins_std": 0.18925771117210388, "rewards/rejected": -0.40811148285865784, "step": 290 }, { "epoch": 0.18, "grad_norm": 2.8125, "learning_rate": 4.898460972804008e-07, "logits/chosen": -3.420971632003784, "logits/rejected": -3.1563363075256348, "logps/chosen": -122.5914077758789, "logps/rejected": -199.3488006591797, "loss": 0.549, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.08986975252628326, "rewards/margins": 0.31944385170936584, "rewards/margins_max": 0.4357198178768158, "rewards/margins_min": 0.2031678408384323, "rewards/margins_std": 0.16443908214569092, "rewards/rejected": -0.4093135893344879, "step": 300 }, { "epoch": 0.19, "grad_norm": 2.625, "learning_rate": 4.882983786504399e-07, "logits/chosen": -3.4148566722869873, "logits/rejected": -3.1982669830322266, "logps/chosen": -136.39987182617188, "logps/rejected": -236.8019256591797, "loss": 0.544, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.15588413178920746, "rewards/margins": 0.3820186257362366, "rewards/margins_max": 0.5593412518501282, "rewards/margins_min": 0.20469605922698975, "rewards/margins_std": 0.25077205896377563, "rewards/rejected": -0.5379027724266052, "step": 310 }, { "epoch": 0.19, "grad_norm": 2.421875, "learning_rate": 4.866437244256695e-07, "logits/chosen": -3.411226987838745, "logits/rejected": -3.205670118331909, "logps/chosen": -129.0833282470703, "logps/rejected": -206.7379913330078, "loss": 0.5469, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.14963629841804504, "rewards/margins": 0.37762314081192017, "rewards/margins_max": 0.5498967170715332, "rewards/margins_min": 0.20534953474998474, "rewards/margins_std": 0.24363164603710175, "rewards/rejected": -0.5272594690322876, "step": 320 }, { "epoch": 0.2, "grad_norm": 2.625, "learning_rate": 4.848828771266001e-07, "logits/chosen": -3.5033020973205566, "logits/rejected": -3.273409366607666, "logps/chosen": -156.30401611328125, "logps/rejected": -197.61549377441406, "loss": 0.5528, "rewards/accuracies": 0.875, "rewards/chosen": -0.19874437153339386, "rewards/margins": 0.3051304817199707, "rewards/margins_max": 0.47203540802001953, "rewards/margins_min": 0.13822560012340546, "rewards/margins_std": 0.2360391616821289, "rewards/rejected": -0.5038748979568481, "step": 330 }, { "epoch": 0.21, "grad_norm": 2.703125, "learning_rate": 4.830166269275266e-07, "logits/chosen": -3.443110942840576, "logits/rejected": -3.2243683338165283, "logps/chosen": -148.33139038085938, "logps/rejected": -215.1509246826172, "loss": 0.5203, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.20774701237678528, "rewards/margins": 0.386242538690567, "rewards/margins_max": 0.5645895600318909, "rewards/margins_min": 0.20789547264575958, "rewards/margins_std": 0.2522208094596863, "rewards/rejected": -0.5939895510673523, "step": 340 }, { "epoch": 0.21, "grad_norm": 3.234375, "learning_rate": 4.8104581130194e-07, "logits/chosen": -3.4214928150177, "logits/rejected": -3.2243239879608154, "logps/chosen": -139.30186462402344, "logps/rejected": -219.8242645263672, "loss": 0.5001, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22474750876426697, "rewards/margins": 0.41614609956741333, "rewards/margins_max": 0.6199635863304138, "rewards/margins_min": 0.21232867240905762, "rewards/margins_std": 0.2882413864135742, "rewards/rejected": -0.6408936977386475, "step": 350 }, { "epoch": 0.22, "grad_norm": 2.96875, "learning_rate": 4.789713146467143e-07, "logits/chosen": -3.401512861251831, "logits/rejected": -3.190495252609253, "logps/chosen": -146.44760131835938, "logps/rejected": -218.2511444091797, "loss": 0.5132, "rewards/accuracies": 0.875, "rewards/chosen": -0.2993583083152771, "rewards/margins": 0.397621214389801, "rewards/margins_max": 0.5621191263198853, "rewards/margins_min": 0.2331233024597168, "rewards/margins_std": 0.23263517022132874, "rewards/rejected": -0.6969794631004333, "step": 360 }, { "epoch": 0.22, "grad_norm": 2.734375, "learning_rate": 4.767940678852368e-07, "logits/chosen": -3.405986785888672, "logits/rejected": -3.1692707538604736, "logps/chosen": -153.48361206054688, "logps/rejected": -218.69650268554688, "loss": 0.5181, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.3115563988685608, "rewards/margins": 0.4646086096763611, "rewards/margins_max": 0.6958065032958984, "rewards/margins_min": 0.23341062664985657, "rewards/margins_std": 0.32696327567100525, "rewards/rejected": -0.7761648893356323, "step": 370 }, { "epoch": 0.23, "grad_norm": 3.171875, "learning_rate": 4.7451504804965823e-07, "logits/chosen": -3.3532516956329346, "logits/rejected": -3.2124714851379395, "logps/chosen": -138.7807159423828, "logps/rejected": -285.9217834472656, "loss": 0.4719, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.31872931122779846, "rewards/margins": 0.5318517088890076, "rewards/margins_max": 0.7545040249824524, "rewards/margins_min": 0.30919957160949707, "rewards/margins_std": 0.31487777829170227, "rewards/rejected": -0.850581169128418, "step": 380 }, { "epoch": 0.24, "grad_norm": 3.078125, "learning_rate": 4.7213527784245395e-07, "logits/chosen": -3.4123377799987793, "logits/rejected": -3.1568984985351562, "logps/chosen": -145.73069763183594, "logps/rejected": -233.9403533935547, "loss": 0.4702, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.35064345598220825, "rewards/margins": 0.5569049715995789, "rewards/margins_max": 0.7923904657363892, "rewards/margins_min": 0.3214194178581238, "rewards/margins_std": 0.33302679657936096, "rewards/rejected": -0.9075484275817871, "step": 390 }, { "epoch": 0.24, "grad_norm": 3.375, "learning_rate": 4.6965582517748917e-07, "logits/chosen": -3.3509891033172607, "logits/rejected": -3.1365480422973633, "logps/chosen": -157.23922729492188, "logps/rejected": -269.90496826171875, "loss": 0.4719, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.4497006833553314, "rewards/margins": 0.6059035062789917, "rewards/margins_max": 0.8811772465705872, "rewards/margins_min": 0.33062973618507385, "rewards/margins_std": 0.38929590582847595, "rewards/rejected": -1.0556042194366455, "step": 400 }, { "epoch": 0.25, "grad_norm": 3.46875, "learning_rate": 4.6707780270079635e-07, "logits/chosen": -3.4341049194335938, "logits/rejected": -3.1959335803985596, "logps/chosen": -172.27877807617188, "logps/rejected": -291.41204833984375, "loss": 0.4602, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.44683152437210083, "rewards/margins": 0.6504033207893372, "rewards/margins_max": 0.9344033002853394, "rewards/margins_min": 0.36640337109565735, "rewards/margins_std": 0.40163666009902954, "rewards/rejected": -1.0972349643707275, "step": 410 }, { "epoch": 0.25, "grad_norm": 4.0, "learning_rate": 4.6440236729127876e-07, "logits/chosen": -3.3718056678771973, "logits/rejected": -3.1097371578216553, "logps/chosen": -169.48446655273438, "logps/rejected": -302.61016845703125, "loss": 0.4337, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5628658533096313, "rewards/margins": 0.7150717973709106, "rewards/margins_max": 1.017971396446228, "rewards/margins_min": 0.4121721684932709, "rewards/margins_std": 0.42836475372314453, "rewards/rejected": -1.277937650680542, "step": 420 }, { "epoch": 0.26, "grad_norm": 3.6875, "learning_rate": 4.616307195415654e-07, "logits/chosen": -3.3012547492980957, "logits/rejected": -3.127958059310913, "logps/chosen": -167.62088012695312, "logps/rejected": -272.51800537109375, "loss": 0.4286, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5819985270500183, "rewards/margins": 0.7929113507270813, "rewards/margins_max": 1.1863847970962524, "rewards/margins_min": 0.3994379937648773, "rewards/margins_std": 0.5564553737640381, "rewards/rejected": -1.3749098777770996, "step": 430 }, { "epoch": 0.27, "grad_norm": 3.421875, "learning_rate": 4.587641032192488e-07, "logits/chosen": -3.333758592605591, "logits/rejected": -3.151676654815674, "logps/chosen": -181.68984985351562, "logps/rejected": -296.03082275390625, "loss": 0.3825, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.7118282914161682, "rewards/margins": 0.7851115465164185, "rewards/margins_max": 1.1220605373382568, "rewards/margins_min": 0.4481624662876129, "rewards/margins_std": 0.4765179753303528, "rewards/rejected": -1.4969398975372314, "step": 440 }, { "epoch": 0.27, "grad_norm": 4.71875, "learning_rate": 4.558038047087486e-07, "logits/chosen": -3.2601521015167236, "logits/rejected": -3.0268001556396484, "logps/chosen": -185.16275024414062, "logps/rejected": -308.6865234375, "loss": 0.3983, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7758798599243164, "rewards/margins": 0.886875331401825, "rewards/margins_max": 1.3197309970855713, "rewards/margins_min": 0.454019695520401, "rewards/margins_std": 0.6121503114700317, "rewards/rejected": -1.6627552509307861, "step": 450 }, { "epoch": 0.28, "grad_norm": 5.09375, "learning_rate": 4.527511524340508e-07, "logits/chosen": -3.229076862335205, "logits/rejected": -3.024235486984253, "logps/chosen": -197.80221557617188, "logps/rejected": -347.35479736328125, "loss": 0.3766, "rewards/accuracies": 0.875, "rewards/chosen": -0.8850440979003906, "rewards/margins": 1.0945155620574951, "rewards/margins_max": 1.7435153722763062, "rewards/margins_min": 0.4455157220363617, "rewards/margins_std": 0.9178244471549988, "rewards/rejected": -1.9795596599578857, "step": 460 }, { "epoch": 0.29, "grad_norm": 4.65625, "learning_rate": 4.49607516262582e-07, "logits/chosen": -3.2619071006774902, "logits/rejected": -3.051602840423584, "logps/chosen": -231.6276397705078, "logps/rejected": -371.31866455078125, "loss": 0.3899, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.0247620344161987, "rewards/margins": 1.10407555103302, "rewards/margins_max": 1.7106988430023193, "rewards/margins_min": 0.49745243787765503, "rewards/margins_std": 0.8578945994377136, "rewards/rejected": -2.1288375854492188, "step": 470 }, { "epoch": 0.29, "grad_norm": 3.90625, "learning_rate": 4.4637430689048626e-07, "logits/chosen": -3.2792510986328125, "logits/rejected": -3.040688991546631, "logps/chosen": -198.93948364257812, "logps/rejected": -341.1888427734375, "loss": 0.3362, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.007499098777771, "rewards/margins": 1.0881038904190063, "rewards/margins_max": 1.634576439857483, "rewards/margins_min": 0.5416311025619507, "rewards/margins_std": 0.772828996181488, "rewards/rejected": -2.0956027507781982, "step": 480 }, { "epoch": 0.3, "grad_norm": 5.34375, "learning_rate": 4.4305297520957944e-07, "logits/chosen": -3.227466106414795, "logits/rejected": -3.0447263717651367, "logps/chosen": -218.6245574951172, "logps/rejected": -389.6742248535156, "loss": 0.325, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.055724024772644, "rewards/margins": 1.3826014995574951, "rewards/margins_max": 1.864708662033081, "rewards/margins_min": 0.9004942178726196, "rewards/margins_std": 0.6818027496337891, "rewards/rejected": -2.438325881958008, "step": 490 }, { "epoch": 0.3, "grad_norm": 5.125, "learning_rate": 4.396450116562669e-07, "logits/chosen": -3.2189323902130127, "logits/rejected": -3.0369110107421875, "logps/chosen": -239.6689910888672, "logps/rejected": -412.5196838378906, "loss": 0.3732, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3009732961654663, "rewards/margins": 1.1667307615280151, "rewards/margins_max": 1.9371881484985352, "rewards/margins_min": 0.39627307653427124, "rewards/margins_std": 1.0895916223526, "rewards/rejected": -2.4677042961120605, "step": 500 }, { "epoch": 0.31, "grad_norm": 9.625, "learning_rate": 4.3615194554271483e-07, "logits/chosen": -3.2492318153381348, "logits/rejected": -3.042893886566162, "logps/chosen": -261.33001708984375, "logps/rejected": -447.9276428222656, "loss": 0.3188, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3674191236495972, "rewards/margins": 1.4249876737594604, "rewards/margins_max": 2.208099842071533, "rewards/margins_min": 0.6418755054473877, "rewards/margins_std": 1.1074877977371216, "rewards/rejected": -2.7924067974090576, "step": 510 }, { "epoch": 0.32, "grad_norm": 5.8125, "learning_rate": 4.325753443705767e-07, "logits/chosen": -3.217289686203003, "logits/rejected": -3.0037760734558105, "logps/chosen": -238.28817749023438, "logps/rejected": -447.4773864746094, "loss": 0.2919, "rewards/accuracies": 0.9375, "rewards/chosen": -1.272761344909668, "rewards/margins": 1.5220317840576172, "rewards/margins_max": 2.1593213081359863, "rewards/margins_min": 0.8847425580024719, "rewards/margins_std": 0.9012632369995117, "rewards/rejected": -2.794793128967285, "step": 520 }, { "epoch": 0.32, "grad_norm": 5.25, "learning_rate": 4.289168131275822e-07, "logits/chosen": -3.1981008052825928, "logits/rejected": -2.9687576293945312, "logps/chosen": -248.31906127929688, "logps/rejected": -526.4910888671875, "loss": 0.3017, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.4457924365997314, "rewards/margins": 1.7948505878448486, "rewards/margins_max": 2.742126226425171, "rewards/margins_min": 0.8475747108459473, "rewards/margins_std": 1.339650273323059, "rewards/rejected": -3.240643262863159, "step": 530 }, { "epoch": 0.33, "grad_norm": 7.125, "learning_rate": 4.251779935673044e-07, "logits/chosen": -3.1895217895507812, "logits/rejected": -2.982194423675537, "logps/chosen": -287.61956787109375, "logps/rejected": -510.9176330566406, "loss": 0.3085, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.6795003414154053, "rewards/margins": 1.9267494678497314, "rewards/margins_max": 2.750401735305786, "rewards/margins_min": 1.1030967235565186, "rewards/margins_std": 1.1648204326629639, "rewards/rejected": -3.606250047683716, "step": 540 }, { "epoch": 0.33, "grad_norm": 5.21875, "learning_rate": 4.213605634724283e-07, "logits/chosen": -3.2357590198516846, "logits/rejected": -2.958969831466675, "logps/chosen": -260.2586975097656, "logps/rejected": -494.48944091796875, "loss": 0.2896, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.4327127933502197, "rewards/margins": 1.9400370121002197, "rewards/margins_max": 3.0310521125793457, "rewards/margins_min": 0.8490220308303833, "rewards/margins_std": 1.5429283380508423, "rewards/rejected": -3.3727500438690186, "step": 550 }, { "epoch": 0.34, "grad_norm": 8.5625, "learning_rate": 4.174662359018515e-07, "logits/chosen": -3.204619884490967, "logits/rejected": -2.958706855773926, "logps/chosen": -267.38641357421875, "logps/rejected": -474.48919677734375, "loss": 0.3059, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.5095288753509521, "rewards/margins": 1.707524299621582, "rewards/margins_max": 2.5599889755249023, "rewards/margins_min": 0.8550596237182617, "rewards/margins_std": 1.2055673599243164, "rewards/rejected": -3.217053174972534, "step": 560 }, { "epoch": 0.35, "grad_norm": 8.125, "learning_rate": 4.134967584219549e-07, "logits/chosen": -3.152198314666748, "logits/rejected": -2.9612772464752197, "logps/chosen": -279.5748291015625, "logps/rejected": -507.6549377441406, "loss": 0.2771, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.6234207153320312, "rewards/margins": 1.8984496593475342, "rewards/margins_max": 2.928384780883789, "rewards/margins_min": 0.8685151934623718, "rewards/margins_std": 1.456547737121582, "rewards/rejected": -3.5218708515167236, "step": 570 }, { "epoch": 0.35, "grad_norm": 6.21875, "learning_rate": 4.09453912322388e-07, "logits/chosen": -3.1358606815338135, "logits/rejected": -2.9630966186523438, "logps/chosen": -279.37298583984375, "logps/rejected": -529.50732421875, "loss": 0.3097, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.6641263961791992, "rewards/margins": 2.1226673126220703, "rewards/margins_max": 3.121896266937256, "rewards/margins_min": 1.1234381198883057, "rewards/margins_std": 1.413123369216919, "rewards/rejected": -3.7867934703826904, "step": 580 }, { "epoch": 0.36, "grad_norm": 5.0625, "learning_rate": 4.0533951181672137e-07, "logits/chosen": -3.190006971359253, "logits/rejected": -3.0020487308502197, "logps/chosen": -259.03961181640625, "logps/rejected": -524.3358154296875, "loss": 0.2348, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5405361652374268, "rewards/margins": 1.977190613746643, "rewards/margins_max": 2.800302028656006, "rewards/margins_min": 1.1540789604187012, "rewards/margins_std": 1.164055585861206, "rewards/rejected": -3.5177268981933594, "step": 590 }, { "epoch": 0.36, "grad_norm": 13.6875, "learning_rate": 4.011554032283242e-07, "logits/chosen": -3.20314359664917, "logits/rejected": -2.951345682144165, "logps/chosen": -268.9664001464844, "logps/rejected": -507.90948486328125, "loss": 0.2691, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.6380701065063477, "rewards/margins": 2.148407459259033, "rewards/margins_max": 3.0667290687561035, "rewards/margins_min": 1.2300859689712524, "rewards/margins_std": 1.2987029552459717, "rewards/rejected": -3.7864773273468018, "step": 600 }, { "epoch": 0.37, "grad_norm": 11.0, "learning_rate": 3.9690346416183314e-07, "logits/chosen": -3.1131813526153564, "logits/rejected": -2.9457013607025146, "logps/chosen": -289.3182373046875, "logps/rejected": -539.81298828125, "loss": 0.2696, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7127681970596313, "rewards/margins": 2.1862919330596924, "rewards/margins_max": 3.286137104034424, "rewards/margins_min": 1.08644700050354, "rewards/margins_std": 1.5554157495498657, "rewards/rejected": -3.8990604877471924, "step": 610 }, { "epoch": 0.38, "grad_norm": 13.125, "learning_rate": 3.9258560266058334e-07, "logits/chosen": -3.1740329265594482, "logits/rejected": -3.020383834838867, "logps/chosen": -305.628173828125, "logps/rejected": -612.4352416992188, "loss": 0.2626, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.030916690826416, "rewards/margins": 2.430964946746826, "rewards/margins_max": 3.6694366931915283, "rewards/margins_min": 1.1924933195114136, "rewards/margins_std": 1.7514636516571045, "rewards/rejected": -4.461881160736084, "step": 620 }, { "epoch": 0.38, "grad_norm": 7.78125, "learning_rate": 3.882037563503806e-07, "logits/chosen": -3.1754307746887207, "logits/rejected": -2.973268985748291, "logps/chosen": -300.1325988769531, "logps/rejected": -587.9697265625, "loss": 0.2976, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.968949556350708, "rewards/margins": 2.2382559776306152, "rewards/margins_max": 3.457249402999878, "rewards/margins_min": 1.0192627906799316, "rewards/margins_std": 1.7239166498184204, "rewards/rejected": -4.207205295562744, "step": 630 }, { "epoch": 0.39, "grad_norm": 10.0625, "learning_rate": 3.8375989156999803e-07, "logits/chosen": -3.1942696571350098, "logits/rejected": -3.031660556793213, "logps/chosen": -277.3270263671875, "logps/rejected": -628.7125244140625, "loss": 0.2381, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.681290864944458, "rewards/margins": 2.714406967163086, "rewards/margins_max": 3.8859188556671143, "rewards/margins_min": 1.5428953170776367, "rewards/margins_std": 1.6567678451538086, "rewards/rejected": -4.395698070526123, "step": 640 }, { "epoch": 0.39, "grad_norm": 11.0, "learning_rate": 3.7925600248878865e-07, "logits/chosen": -3.0972537994384766, "logits/rejected": -2.915043592453003, "logps/chosen": -309.4454650878906, "logps/rejected": -582.4273071289062, "loss": 0.2835, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.8756641149520874, "rewards/margins": 2.2718684673309326, "rewards/margins_max": 3.4796149730682373, "rewards/margins_min": 1.064121961593628, "rewards/margins_std": 1.7080116271972656, "rewards/rejected": -4.1475324630737305, "step": 650 }, { "epoch": 0.4, "grad_norm": 8.625, "learning_rate": 3.746941102118081e-07, "logits/chosen": -3.1687800884246826, "logits/rejected": -2.932328224182129, "logps/chosen": -321.97003173828125, "logps/rejected": -614.5501708984375, "loss": 0.2362, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.9460325241088867, "rewards/margins": 2.4259374141693115, "rewards/margins_max": 3.647425413131714, "rewards/margins_min": 1.2044496536254883, "rewards/margins_std": 1.7274446487426758, "rewards/rejected": -4.371970176696777, "step": 660 }, { "epoch": 0.41, "grad_norm": 7.0625, "learning_rate": 3.700762618728508e-07, "logits/chosen": -3.105429172515869, "logits/rejected": -2.8866400718688965, "logps/chosen": -302.71575927734375, "logps/rejected": -675.2667236328125, "loss": 0.2554, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.801047682762146, "rewards/margins": 2.991947650909424, "rewards/margins_max": 4.486474990844727, "rewards/margins_min": 1.4974205493927002, "rewards/margins_std": 2.1135807037353516, "rewards/rejected": -4.792995452880859, "step": 670 }, { "epoch": 0.41, "grad_norm": 3.828125, "learning_rate": 3.654045297158057e-07, "logits/chosen": -3.164304256439209, "logits/rejected": -2.9912238121032715, "logps/chosen": -284.3798828125, "logps/rejected": -547.4119873046875, "loss": 0.2154, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.7268387079238892, "rewards/margins": 2.4997963905334473, "rewards/margins_max": 3.5198917388916016, "rewards/margins_min": 1.4797013998031616, "rewards/margins_std": 1.4426321983337402, "rewards/rejected": -4.226634979248047, "step": 680 }, { "epoch": 0.42, "grad_norm": 5.65625, "learning_rate": 3.606810101647431e-07, "logits/chosen": -3.19686222076416, "logits/rejected": -2.9458096027374268, "logps/chosen": -318.38238525390625, "logps/rejected": -583.8440551757812, "loss": 0.2514, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.952331304550171, "rewards/margins": 2.3931100368499756, "rewards/margins_max": 3.5836410522460938, "rewards/margins_min": 1.2025787830352783, "rewards/margins_std": 1.6836650371551514, "rewards/rejected": -4.3454413414001465, "step": 690 }, { "epoch": 0.42, "grad_norm": 8.25, "learning_rate": 3.559078228831526e-07, "logits/chosen": -3.1194119453430176, "logits/rejected": -2.977457046508789, "logps/chosen": -285.14794921875, "logps/rejected": -596.258056640625, "loss": 0.2487, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8175990581512451, "rewards/margins": 2.7383248805999756, "rewards/margins_max": 3.9793529510498047, "rewards/margins_min": 1.4972972869873047, "rewards/margins_std": 1.7550785541534424, "rewards/rejected": -4.555924415588379, "step": 700 }, { "epoch": 0.43, "grad_norm": 8.625, "learning_rate": 3.510871098227503e-07, "logits/chosen": -3.2031445503234863, "logits/rejected": -2.9235167503356934, "logps/chosen": -335.8504638671875, "logps/rejected": -606.4368896484375, "loss": 0.2089, "rewards/accuracies": 0.9375, "rewards/chosen": -2.097665309906006, "rewards/margins": 2.564664125442505, "rewards/margins_max": 3.574242115020752, "rewards/margins_min": 1.5550854206085205, "rewards/margins_std": 1.4277592897415161, "rewards/rejected": -4.662329196929932, "step": 710 }, { "epoch": 0.44, "grad_norm": 12.5625, "learning_rate": 3.462210342622853e-07, "logits/chosen": -3.1175758838653564, "logits/rejected": -2.900524616241455, "logps/chosen": -316.82696533203125, "logps/rejected": -691.4205932617188, "loss": 0.2222, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.02522349357605, "rewards/margins": 3.1580607891082764, "rewards/margins_max": 4.554937839508057, "rewards/margins_min": 1.761183500289917, "rewards/margins_std": 1.9754825830459595, "rewards/rejected": -5.183283805847168, "step": 720 }, { "epoch": 0.44, "grad_norm": 6.53125, "learning_rate": 3.4131177983677614e-07, "logits/chosen": -3.1615021228790283, "logits/rejected": -2.9676241874694824, "logps/chosen": -309.53033447265625, "logps/rejected": -637.4588623046875, "loss": 0.3259, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.038564920425415, "rewards/margins": 2.944957971572876, "rewards/margins_max": 4.9082746505737305, "rewards/margins_min": 0.9816409349441528, "rewards/margins_std": 2.7765493392944336, "rewards/rejected": -4.983522891998291, "step": 730 }, { "epoch": 0.45, "grad_norm": 16.375, "learning_rate": 3.363615495576114e-07, "logits/chosen": -3.172344207763672, "logits/rejected": -2.932992935180664, "logps/chosen": -322.94537353515625, "logps/rejected": -613.6011962890625, "loss": 0.2776, "rewards/accuracies": 0.875, "rewards/chosen": -1.8910267353057861, "rewards/margins": 2.67472243309021, "rewards/margins_max": 4.281624794006348, "rewards/margins_min": 1.0678198337554932, "rewards/margins_std": 2.272503614425659, "rewards/rejected": -4.565749168395996, "step": 740 }, { "epoch": 0.45, "grad_norm": 6.71875, "learning_rate": 3.31372564823956e-07, "logits/chosen": -3.1593432426452637, "logits/rejected": -2.9213168621063232, "logps/chosen": -300.8254089355469, "logps/rejected": -554.28173828125, "loss": 0.2749, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9807758331298828, "rewards/margins": 2.3717269897460938, "rewards/margins_max": 3.653285503387451, "rewards/margins_min": 1.0901682376861572, "rewards/margins_std": 1.8123977184295654, "rewards/rejected": -4.352502346038818, "step": 750 }, { "epoch": 0.46, "grad_norm": 12.9375, "learning_rate": 3.2634706442590585e-07, "logits/chosen": -3.1167142391204834, "logits/rejected": -2.943542718887329, "logps/chosen": -320.70477294921875, "logps/rejected": -626.4170532226562, "loss": 0.2486, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1256468296051025, "rewards/margins": 2.6652743816375732, "rewards/margins_max": 4.235686302185059, "rewards/margins_min": 1.0948628187179565, "rewards/margins_std": 2.220897674560547, "rewards/rejected": -4.790921211242676, "step": 760 }, { "epoch": 0.47, "grad_norm": 6.46875, "learning_rate": 3.2128730353983824e-07, "logits/chosen": -3.12074875831604, "logits/rejected": -2.914388418197632, "logps/chosen": -299.7171936035156, "logps/rejected": -626.5369262695312, "loss": 0.2256, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.9603103399276733, "rewards/margins": 2.961648464202881, "rewards/margins_max": 4.249537467956543, "rewards/margins_min": 1.673760175704956, "rewards/margins_std": 1.8213493824005127, "rewards/rejected": -4.921958923339844, "step": 770 }, { "epoch": 0.47, "grad_norm": 7.1875, "learning_rate": 3.161955527164092e-07, "logits/chosen": -3.1619656085968018, "logits/rejected": -2.9911611080169678, "logps/chosen": -314.0920104980469, "logps/rejected": -611.2723999023438, "loss": 0.2768, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0593760013580322, "rewards/margins": 2.617433547973633, "rewards/margins_max": 3.8496804237365723, "rewards/margins_min": 1.3851864337921143, "rewards/margins_std": 1.7426605224609375, "rewards/rejected": -4.676808834075928, "step": 780 }, { "epoch": 0.48, "grad_norm": 9.8125, "learning_rate": 3.11074096861651e-07, "logits/chosen": -3.1253132820129395, "logits/rejected": -2.948439836502075, "logps/chosen": -307.06842041015625, "logps/rejected": -656.5684814453125, "loss": 0.2793, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.9304516315460205, "rewards/margins": 3.045482873916626, "rewards/margins_max": 4.538393974304199, "rewards/margins_min": 1.5525717735290527, "rewards/margins_std": 2.111295223236084, "rewards/rejected": -4.9759345054626465, "step": 790 }, { "epoch": 0.49, "grad_norm": 7.9375, "learning_rate": 3.0592523421162923e-07, "logits/chosen": -3.14684796333313, "logits/rejected": -2.9417788982391357, "logps/chosen": -311.675048828125, "logps/rejected": -673.3569946289062, "loss": 0.2156, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.02833890914917, "rewards/margins": 3.267068386077881, "rewards/margins_max": 4.807781219482422, "rewards/margins_min": 1.7263562679290771, "rewards/margins_std": 2.178896427154541, "rewards/rejected": -5.295407295227051, "step": 800 }, { "epoch": 0.49, "grad_norm": 6.0, "learning_rate": 3.0075127530111604e-07, "logits/chosen": -3.143428325653076, "logits/rejected": -2.8957009315490723, "logps/chosen": -306.6546936035156, "logps/rejected": -662.9694213867188, "loss": 0.1952, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.7990596294403076, "rewards/margins": 3.0265026092529297, "rewards/margins_max": 4.350580215454102, "rewards/margins_min": 1.7024250030517578, "rewards/margins_std": 1.8725284337997437, "rewards/rejected": -4.825562000274658, "step": 810 }, { "epoch": 0.5, "grad_norm": 6.65625, "learning_rate": 2.9555454192674635e-07, "logits/chosen": -3.1340126991271973, "logits/rejected": -2.944532871246338, "logps/chosen": -296.8179016113281, "logps/rejected": -654.5142822265625, "loss": 0.205, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.8869205713272095, "rewards/margins": 3.0365943908691406, "rewards/margins_max": 4.509632110595703, "rewards/margins_min": 1.563556432723999, "rewards/margins_std": 2.0831899642944336, "rewards/rejected": -4.9235148429870605, "step": 820 }, { "epoch": 0.5, "grad_norm": 5.1875, "learning_rate": 2.903373661051188e-07, "logits/chosen": -3.221536636352539, "logits/rejected": -3.0096938610076904, "logps/chosen": -318.65667724609375, "logps/rejected": -705.4375, "loss": 0.1578, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9611215591430664, "rewards/margins": 3.1713013648986816, "rewards/margins_max": 4.483790874481201, "rewards/margins_min": 1.858811378479004, "rewards/margins_std": 1.8561407327651978, "rewards/rejected": -5.13242244720459, "step": 830 }, { "epoch": 0.51, "grad_norm": 5.40625, "learning_rate": 2.851020890263113e-07, "logits/chosen": -3.156846046447754, "logits/rejected": -2.902345657348633, "logps/chosen": -333.4150085449219, "logps/rejected": -682.9464111328125, "loss": 0.2228, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0641589164733887, "rewards/margins": 3.215531826019287, "rewards/margins_max": 4.6603498458862305, "rewards/margins_min": 1.7707140445709229, "rewards/margins_std": 2.043280839920044, "rewards/rejected": -5.279690742492676, "step": 840 }, { "epoch": 0.52, "grad_norm": 8.75, "learning_rate": 2.798510600032803e-07, "logits/chosen": -3.1748039722442627, "logits/rejected": -2.9051835536956787, "logps/chosen": -336.9716796875, "logps/rejected": -674.93212890625, "loss": 0.2113, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.1552605628967285, "rewards/margins": 3.1116385459899902, "rewards/margins_max": 4.737573623657227, "rewards/margins_min": 1.4857032299041748, "rewards/margins_std": 2.29941987991333, "rewards/rejected": -5.266899108886719, "step": 850 }, { "epoch": 0.52, "grad_norm": 8.5625, "learning_rate": 2.745866354176137e-07, "logits/chosen": -3.108320713043213, "logits/rejected": -2.8731348514556885, "logps/chosen": -337.5401916503906, "logps/rejected": -751.5953369140625, "loss": 0.2018, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.291527509689331, "rewards/margins": 3.7230491638183594, "rewards/margins_max": 5.445635795593262, "rewards/margins_min": 2.000462770462036, "rewards/margins_std": 2.436105251312256, "rewards/rejected": -6.014577388763428, "step": 860 }, { "epoch": 0.53, "grad_norm": 6.84375, "learning_rate": 2.693111776621136e-07, "logits/chosen": -3.124844789505005, "logits/rejected": -2.866428852081299, "logps/chosen": -368.060302734375, "logps/rejected": -777.5119018554688, "loss": 0.2207, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.525402069091797, "rewards/margins": 3.681495189666748, "rewards/margins_max": 5.426387786865234, "rewards/margins_min": 1.936603307723999, "rewards/margins_std": 2.4676499366760254, "rewards/rejected": -6.206897735595703, "step": 870 }, { "epoch": 0.53, "grad_norm": 7.625, "learning_rate": 2.640270540806793e-07, "logits/chosen": -3.0661511421203613, "logits/rejected": -2.8748667240142822, "logps/chosen": -348.7222595214844, "logps/rejected": -695.6060791015625, "loss": 0.2213, "rewards/accuracies": 0.875, "rewards/chosen": -2.3182373046875, "rewards/margins": 3.1713945865631104, "rewards/margins_max": 4.741438388824463, "rewards/margins_min": 1.6013505458831787, "rewards/margins_std": 2.2203774452209473, "rewards/rejected": -5.489631652832031, "step": 880 }, { "epoch": 0.54, "grad_norm": 5.75, "learning_rate": 2.5873663590597063e-07, "logits/chosen": -3.138188362121582, "logits/rejected": -2.8532166481018066, "logps/chosen": -332.19305419921875, "logps/rejected": -696.7973022460938, "loss": 0.2109, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0587775707244873, "rewards/margins": 3.378685474395752, "rewards/margins_max": 4.750607967376709, "rewards/margins_min": 2.0067625045776367, "rewards/margins_std": 1.9401918649673462, "rewards/rejected": -5.43746280670166, "step": 890 }, { "epoch": 0.55, "grad_norm": 21.875, "learning_rate": 2.5344229719532484e-07, "logits/chosen": -3.1494667530059814, "logits/rejected": -2.9058408737182617, "logps/chosen": -326.4560241699219, "logps/rejected": -656.7224731445312, "loss": 0.2174, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.120781421661377, "rewards/margins": 3.120711088180542, "rewards/margins_max": 4.613424777984619, "rewards/margins_min": 1.627997636795044, "rewards/margins_std": 2.111015796661377, "rewards/rejected": -5.24149227142334, "step": 900 }, { "epoch": 0.55, "grad_norm": 12.0, "learning_rate": 2.481464137654068e-07, "logits/chosen": -3.1502187252044678, "logits/rejected": -2.901831865310669, "logps/chosen": -336.9727783203125, "logps/rejected": -758.7743530273438, "loss": 0.2516, "rewards/accuracies": 0.9375, "rewards/chosen": -2.269904375076294, "rewards/margins": 3.7340915203094482, "rewards/margins_max": 5.41564416885376, "rewards/margins_min": 2.052539110183716, "rewards/margins_std": 2.3780744075775146, "rewards/rejected": -6.003995895385742, "step": 910 }, { "epoch": 0.56, "grad_norm": 8.875, "learning_rate": 2.428513621260683e-07, "logits/chosen": -3.167316436767578, "logits/rejected": -2.951566457748413, "logps/chosen": -350.9910583496094, "logps/rejected": -678.45703125, "loss": 0.2035, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.3870468139648438, "rewards/margins": 3.059993267059326, "rewards/margins_max": 4.496026039123535, "rewards/margins_min": 1.6239604949951172, "rewards/margins_std": 2.0308570861816406, "rewards/rejected": -5.44704008102417, "step": 920 }, { "epoch": 0.56, "grad_norm": 15.5625, "learning_rate": 2.375595184138986e-07, "logits/chosen": -3.1135799884796143, "logits/rejected": -2.903838872909546, "logps/chosen": -323.6688232421875, "logps/rejected": -727.5667114257812, "loss": 0.265, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.1313717365264893, "rewards/margins": 3.3087127208709717, "rewards/margins_max": 4.953644752502441, "rewards/margins_min": 1.6637804508209229, "rewards/margins_std": 2.3262856006622314, "rewards/rejected": -5.440084934234619, "step": 930 }, { "epoch": 0.57, "grad_norm": 17.375, "learning_rate": 2.3227325732593993e-07, "logits/chosen": -3.1387646198272705, "logits/rejected": -2.882930278778076, "logps/chosen": -320.67132568359375, "logps/rejected": -736.220458984375, "loss": 0.1934, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9799764156341553, "rewards/margins": 3.8522841930389404, "rewards/margins_max": 5.505575180053711, "rewards/margins_min": 2.198992967605591, "rewards/margins_std": 2.338106870651245, "rewards/rejected": -5.832260608673096, "step": 940 }, { "epoch": 0.58, "grad_norm": 13.1875, "learning_rate": 2.2699495105405114e-07, "logits/chosen": -3.074521541595459, "logits/rejected": -2.8952858448028564, "logps/chosen": -324.089111328125, "logps/rejected": -743.9403076171875, "loss": 0.2139, "rewards/accuracies": 0.9375, "rewards/chosen": -2.197805881500244, "rewards/margins": 3.5057437419891357, "rewards/margins_max": 5.082973957061768, "rewards/margins_min": 1.928513526916504, "rewards/margins_std": 2.2305400371551514, "rewards/rejected": -5.703549385070801, "step": 950 }, { "epoch": 0.58, "grad_norm": 13.5, "learning_rate": 2.217269682203937e-07, "logits/chosen": -3.0950160026550293, "logits/rejected": -2.8411691188812256, "logps/chosen": -309.6170959472656, "logps/rejected": -720.2401123046875, "loss": 0.23, "rewards/accuracies": 0.9375, "rewards/chosen": -1.961046814918518, "rewards/margins": 3.8151397705078125, "rewards/margins_max": 5.517041206359863, "rewards/margins_min": 2.113239049911499, "rewards/margins_std": 2.406851291656494, "rewards/rejected": -5.776186943054199, "step": 960 }, { "epoch": 0.59, "grad_norm": 2.71875, "learning_rate": 2.164716728145213e-07, "logits/chosen": -3.1319289207458496, "logits/rejected": -2.9833462238311768, "logps/chosen": -353.3096618652344, "logps/rejected": -787.9588623046875, "loss": 0.1898, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.2338433265686035, "rewards/margins": 3.6919732093811035, "rewards/margins_max": 5.161503791809082, "rewards/margins_min": 2.222442626953125, "rewards/margins_std": 2.078230381011963, "rewards/rejected": -5.925817012786865, "step": 970 }, { "epoch": 0.59, "grad_norm": 7.09375, "learning_rate": 2.1123142313254704e-07, "logits/chosen": -3.119903087615967, "logits/rejected": -2.9215176105499268, "logps/chosen": -327.38165283203125, "logps/rejected": -698.1976318359375, "loss": 0.2065, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.0507330894470215, "rewards/margins": 3.221263885498047, "rewards/margins_max": 4.741235733032227, "rewards/margins_min": 1.7012920379638672, "rewards/margins_std": 2.1495652198791504, "rewards/rejected": -5.271997451782227, "step": 980 }, { "epoch": 0.6, "grad_norm": 10.9375, "learning_rate": 2.0600857071886596e-07, "logits/chosen": -3.111619234085083, "logits/rejected": -2.886859655380249, "logps/chosen": -348.9073181152344, "logps/rejected": -710.0587768554688, "loss": 0.2172, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.1807503700256348, "rewards/margins": 3.2503695487976074, "rewards/margins_max": 4.767851829528809, "rewards/margins_min": 1.7328875064849854, "rewards/margins_std": 2.1460437774658203, "rewards/rejected": -5.431119441986084, "step": 990 }, { "epoch": 0.61, "grad_norm": 9.625, "learning_rate": 2.0080545931090784e-07, "logits/chosen": -3.1535375118255615, "logits/rejected": -2.965236186981201, "logps/chosen": -344.1661071777344, "logps/rejected": -811.3351440429688, "loss": 0.2185, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.2291502952575684, "rewards/margins": 4.049864768981934, "rewards/margins_max": 5.990394592285156, "rewards/margins_min": 2.1093358993530273, "rewards/margins_std": 2.744323253631592, "rewards/rejected": -6.27901554107666, "step": 1000 }, { "epoch": 0.61, "grad_norm": 17.375, "learning_rate": 1.9562442378739238e-07, "logits/chosen": -3.125776767730713, "logits/rejected": -2.9081075191497803, "logps/chosen": -299.2303161621094, "logps/rejected": -708.6509399414062, "loss": 0.2322, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9368846416473389, "rewards/margins": 3.5816776752471924, "rewards/margins_max": 5.060166835784912, "rewards/margins_min": 2.1031877994537354, "rewards/margins_std": 2.09089994430542, "rewards/rejected": -5.518561840057373, "step": 1010 }, { "epoch": 0.62, "grad_norm": 5.96875, "learning_rate": 1.9046778912056043e-07, "logits/chosen": -3.1317784786224365, "logits/rejected": -2.9367737770080566, "logps/chosen": -297.7860412597656, "logps/rejected": -669.5913696289062, "loss": 0.2341, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.9930238723754883, "rewards/margins": 3.1364190578460693, "rewards/margins_max": 4.621811866760254, "rewards/margins_min": 1.6510257720947266, "rewards/margins_std": 2.10066294670105, "rewards/rejected": -5.129443168640137, "step": 1020 }, { "epoch": 0.62, "grad_norm": 10.0625, "learning_rate": 1.8533786933285106e-07, "logits/chosen": -3.1418776512145996, "logits/rejected": -2.9141292572021484, "logps/chosen": -344.0506286621094, "logps/rejected": -775.5567626953125, "loss": 0.2484, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.0711874961853027, "rewards/margins": 3.7657127380371094, "rewards/margins_max": 5.59161901473999, "rewards/margins_min": 1.9398069381713867, "rewards/margins_std": 2.582221031188965, "rewards/rejected": -5.836900234222412, "step": 1030 }, { "epoch": 0.63, "grad_norm": 13.1875, "learning_rate": 1.8023696645849063e-07, "logits/chosen": -3.1590495109558105, "logits/rejected": -2.9567525386810303, "logps/chosen": -320.74176025390625, "logps/rejected": -711.3518676757812, "loss": 0.1803, "rewards/accuracies": 0.9375, "rewards/chosen": -2.144990921020508, "rewards/margins": 3.530397891998291, "rewards/margins_max": 5.047208309173584, "rewards/margins_min": 2.0135867595672607, "rewards/margins_std": 2.145094394683838, "rewards/rejected": -5.675388336181641, "step": 1040 }, { "epoch": 0.64, "grad_norm": 28.25, "learning_rate": 1.7516736951046394e-07, "logits/chosen": -3.1330792903900146, "logits/rejected": -2.947277545928955, "logps/chosen": -344.958251953125, "logps/rejected": -704.8740234375, "loss": 0.2298, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3996665477752686, "rewards/margins": 3.01820707321167, "rewards/margins_max": 4.695876598358154, "rewards/margins_min": 1.3405380249023438, "rewards/margins_std": 2.3725826740264893, "rewards/rejected": -5.417874336242676, "step": 1050 }, { "epoch": 0.64, "grad_norm": 17.125, "learning_rate": 1.7013135345332651e-07, "logits/chosen": -3.1549530029296875, "logits/rejected": -2.8647735118865967, "logps/chosen": -316.10333251953125, "logps/rejected": -794.4426879882812, "loss": 0.2395, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.046635627746582, "rewards/margins": 3.969254970550537, "rewards/margins_max": 5.835241794586182, "rewards/margins_min": 2.103269577026367, "rewards/margins_std": 2.6389026641845703, "rewards/rejected": -6.0158915519714355, "step": 1060 }, { "epoch": 0.65, "grad_norm": 6.25, "learning_rate": 1.6513117818232216e-07, "logits/chosen": -3.1065542697906494, "logits/rejected": -2.925407648086548, "logps/chosen": -311.7358093261719, "logps/rejected": -697.4700927734375, "loss": 0.2107, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.1502268314361572, "rewards/margins": 3.424119234085083, "rewards/margins_max": 5.143943786621094, "rewards/margins_min": 1.7042953968048096, "rewards/margins_std": 2.4321987628936768, "rewards/rejected": -5.574346542358398, "step": 1070 }, { "epoch": 0.66, "grad_norm": 7.1875, "learning_rate": 1.6016908750926284e-07, "logits/chosen": -3.199125289916992, "logits/rejected": -2.9049952030181885, "logps/chosen": -323.9697265625, "logps/rejected": -733.3807373046875, "loss": 0.2562, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.1301119327545166, "rewards/margins": 3.5133304595947266, "rewards/margins_max": 5.211588382720947, "rewards/margins_min": 1.8150726556777954, "rewards/margins_std": 2.4016995429992676, "rewards/rejected": -5.643442630767822, "step": 1080 }, { "epoch": 0.66, "grad_norm": 2.03125, "learning_rate": 1.5524730815562517e-07, "logits/chosen": -3.093618631362915, "logits/rejected": -2.912240743637085, "logps/chosen": -318.77142333984375, "logps/rejected": -740.55126953125, "loss": 0.1697, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.103801727294922, "rewards/margins": 3.8205482959747314, "rewards/margins_max": 5.348752975463867, "rewards/margins_min": 2.2923431396484375, "rewards/margins_std": 2.161208152770996, "rewards/rejected": -5.924350261688232, "step": 1090 }, { "epoch": 0.67, "grad_norm": 3.8125, "learning_rate": 1.5036804875331733e-07, "logits/chosen": -3.138913869857788, "logits/rejected": -2.938734292984009, "logps/chosen": -347.51422119140625, "logps/rejected": -744.4155883789062, "loss": 0.2236, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.3483636379241943, "rewards/margins": 3.551708936691284, "rewards/margins_max": 5.2547502517700195, "rewards/margins_min": 1.8486677408218384, "rewards/margins_std": 2.408463954925537, "rewards/rejected": -5.900073051452637, "step": 1100 }, { "epoch": 0.67, "grad_norm": 7.65625, "learning_rate": 1.455334988535621e-07, "logits/chosen": -3.1878058910369873, "logits/rejected": -2.8721015453338623, "logps/chosen": -320.9921875, "logps/rejected": -697.1282958984375, "loss": 0.2256, "rewards/accuracies": 0.9375, "rewards/chosen": -2.09895658493042, "rewards/margins": 3.339977264404297, "rewards/margins_max": 4.8780035972595215, "rewards/margins_min": 1.8019511699676514, "rewards/margins_std": 2.1750974655151367, "rewards/rejected": -5.438933849334717, "step": 1110 }, { "epoch": 0.68, "grad_norm": 10.9375, "learning_rate": 1.4074582794434387e-07, "logits/chosen": -3.1153368949890137, "logits/rejected": -2.913144826889038, "logps/chosen": -312.5743103027344, "logps/rejected": -745.4815673828125, "loss": 0.1932, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.063194751739502, "rewards/margins": 3.72094988822937, "rewards/margins_max": 5.494509696960449, "rewards/margins_min": 1.9473907947540283, "rewards/margins_std": 2.5081920623779297, "rewards/rejected": -5.784144878387451, "step": 1120 }, { "epoch": 0.69, "grad_norm": 10.0, "learning_rate": 1.36007184476858e-07, "logits/chosen": -3.132167339324951, "logits/rejected": -2.922752857208252, "logps/chosen": -326.2919921875, "logps/rejected": -773.3209228515625, "loss": 0.1876, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1564865112304688, "rewards/margins": 3.9871773719787598, "rewards/margins_max": 6.1475114822387695, "rewards/margins_min": 1.8268429040908813, "rewards/margins_std": 3.0551745891571045, "rewards/rejected": -6.143664360046387, "step": 1130 }, { "epoch": 0.69, "grad_norm": 17.5, "learning_rate": 1.313196949014001e-07, "logits/chosen": -3.1706137657165527, "logits/rejected": -2.879790782928467, "logps/chosen": -335.986328125, "logps/rejected": -716.2806396484375, "loss": 0.251, "rewards/accuracies": 0.875, "rewards/chosen": -2.1077635288238525, "rewards/margins": 3.5597450733184814, "rewards/margins_max": 5.525472164154053, "rewards/margins_min": 1.5940181016921997, "rewards/margins_std": 2.7799577713012695, "rewards/rejected": -5.667508125305176, "step": 1140 }, { "epoch": 0.7, "grad_norm": 22.0, "learning_rate": 1.266854627131295e-07, "logits/chosen": -3.135575771331787, "logits/rejected": -2.9847023487091064, "logps/chosen": -302.3170471191406, "logps/rejected": -696.525390625, "loss": 0.2497, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.0112688541412354, "rewards/margins": 3.4218602180480957, "rewards/margins_max": 4.963052749633789, "rewards/margins_min": 1.8806670904159546, "rewards/margins_std": 2.1795761585235596, "rewards/rejected": -5.433128833770752, "step": 1150 }, { "epoch": 0.7, "grad_norm": 7.84375, "learning_rate": 1.2210656750813203e-07, "logits/chosen": -3.0703582763671875, "logits/rejected": -2.871122121810913, "logps/chosen": -362.7223205566406, "logps/rejected": -770.84912109375, "loss": 0.2566, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.4078450202941895, "rewards/margins": 3.343618392944336, "rewards/margins_max": 5.4634623527526855, "rewards/margins_min": 1.2237741947174072, "rewards/margins_std": 2.9979124069213867, "rewards/rejected": -5.751462936401367, "step": 1160 }, { "epoch": 0.71, "grad_norm": 8.0, "learning_rate": 1.1758506405020885e-07, "logits/chosen": -3.186342716217041, "logits/rejected": -2.8949544429779053, "logps/chosen": -336.4505920410156, "logps/rejected": -697.22412109375, "loss": 0.1831, "rewards/accuracies": 0.9375, "rewards/chosen": -2.248119592666626, "rewards/margins": 3.425248384475708, "rewards/margins_max": 5.014912128448486, "rewards/margins_min": 1.8355858325958252, "rewards/margins_std": 2.2481231689453125, "rewards/rejected": -5.673368453979492, "step": 1170 }, { "epoch": 0.72, "grad_norm": 7.78125, "learning_rate": 1.1312298134880799e-07, "logits/chosen": -3.218524217605591, "logits/rejected": -2.952272891998291, "logps/chosen": -343.3241882324219, "logps/rejected": -660.6552734375, "loss": 0.2477, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.2785308361053467, "rewards/margins": 2.914747714996338, "rewards/margins_max": 4.317625999450684, "rewards/margins_min": 1.511869192123413, "rewards/margins_std": 1.983970046043396, "rewards/rejected": -5.193279266357422, "step": 1180 }, { "epoch": 0.72, "grad_norm": 5.40625, "learning_rate": 1.0872232174851281e-07, "logits/chosen": -3.1693577766418457, "logits/rejected": -2.916531562805176, "logps/chosen": -357.85003662109375, "logps/rejected": -758.5368041992188, "loss": 0.2052, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.358463764190674, "rewards/margins": 3.578223705291748, "rewards/margins_max": 5.278543472290039, "rewards/margins_min": 1.8779041767120361, "rewards/margins_std": 2.4046151638031006, "rewards/rejected": -5.936688423156738, "step": 1190 }, { "epoch": 0.73, "grad_norm": 4.6875, "learning_rate": 1.0438506003049735e-07, "logits/chosen": -3.1248936653137207, "logits/rejected": -2.8823959827423096, "logps/chosen": -327.49066162109375, "logps/rejected": -708.4923095703125, "loss": 0.2082, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1310365200042725, "rewards/margins": 3.5814547538757324, "rewards/margins_max": 5.441410064697266, "rewards/margins_min": 1.7214996814727783, "rewards/margins_std": 2.630373954772949, "rewards/rejected": -5.712491035461426, "step": 1200 }, { "epoch": 0.73, "grad_norm": 17.125, "learning_rate": 1.0011314252634908e-07, "logits/chosen": -3.1236038208007812, "logits/rejected": -2.9256346225738525, "logps/chosen": -326.3320617675781, "logps/rejected": -626.0516357421875, "loss": 0.2082, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.1023125648498535, "rewards/margins": 2.722761631011963, "rewards/margins_max": 3.6524386405944824, "rewards/margins_min": 1.793083906173706, "rewards/margins_std": 1.3147621154785156, "rewards/rejected": -4.825074195861816, "step": 1210 }, { "epoch": 0.74, "grad_norm": 8.25, "learning_rate": 9.590848624465989e-08, "logits/chosen": -3.152843475341797, "logits/rejected": -2.9420909881591797, "logps/chosen": -330.8764343261719, "logps/rejected": -701.1426391601562, "loss": 0.2014, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0373191833496094, "rewards/margins": 3.338742733001709, "rewards/margins_max": 4.737041473388672, "rewards/margins_min": 1.9404443502426147, "rewards/margins_std": 1.9774929285049438, "rewards/rejected": -5.376061916351318, "step": 1220 }, { "epoch": 0.75, "grad_norm": 6.78125, "learning_rate": 9.17729780107746e-08, "logits/chosen": -3.130746603012085, "logits/rejected": -2.9680044651031494, "logps/chosen": -304.0748596191406, "logps/rejected": -809.8206787109375, "loss": 0.1795, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9301007986068726, "rewards/margins": 4.500051021575928, "rewards/margins_max": 6.72606897354126, "rewards/margins_min": 2.274033308029175, "rewards/margins_std": 3.1480648517608643, "rewards/rejected": -6.430152893066406, "step": 1230 }, { "epoch": 0.75, "grad_norm": 8.8125, "learning_rate": 8.770847362008426e-08, "logits/chosen": -3.1518819332122803, "logits/rejected": -2.929631233215332, "logps/chosen": -314.208251953125, "logps/rejected": -736.5687255859375, "loss": 0.1798, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.0510923862457275, "rewards/margins": 3.5633749961853027, "rewards/margins_max": 5.141615867614746, "rewards/margins_min": 1.9851341247558594, "rewards/margins_std": 2.2319698333740234, "rewards/rejected": -5.614466667175293, "step": 1240 }, { "epoch": 0.76, "grad_norm": 12.0625, "learning_rate": 8.371679700524476e-08, "logits/chosen": -3.0984597206115723, "logits/rejected": -2.8767600059509277, "logps/chosen": -379.36767578125, "logps/rejected": -761.9058837890625, "loss": 0.3184, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.6515004634857178, "rewards/margins": 3.444476366043091, "rewards/margins_max": 5.3620100021362305, "rewards/margins_min": 1.526942491531372, "rewards/margins_std": 2.7118022441864014, "rewards/rejected": -6.095976829528809, "step": 1250 }, { "epoch": 0.76, "grad_norm": 8.5, "learning_rate": 7.979973941769255e-08, "logits/chosen": -3.072702646255493, "logits/rejected": -2.9324452877044678, "logps/chosen": -315.5204162597656, "logps/rejected": -710.9278564453125, "loss": 0.1895, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1958985328674316, "rewards/margins": 3.221574068069458, "rewards/margins_max": 4.918590068817139, "rewards/margins_min": 1.5245568752288818, "rewards/margins_std": 2.399944305419922, "rewards/rejected": -5.417471885681152, "step": 1260 }, { "epoch": 0.77, "grad_norm": 8.9375, "learning_rate": 7.595905862382704e-08, "logits/chosen": -3.1583075523376465, "logits/rejected": -2.929879665374756, "logps/chosen": -333.9653015136719, "logps/rejected": -782.7625732421875, "loss": 0.1715, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.2348623275756836, "rewards/margins": 3.9564566612243652, "rewards/margins_max": 5.616944313049316, "rewards/margins_min": 2.295968532562256, "rewards/margins_std": 2.3482847213745117, "rewards/rejected": -6.191318988800049, "step": 1270 }, { "epoch": 0.78, "grad_norm": 8.5, "learning_rate": 7.219647811621874e-08, "logits/chosen": -3.1165127754211426, "logits/rejected": -3.0133025646209717, "logps/chosen": -289.7486877441406, "logps/rejected": -685.6094970703125, "loss": 0.2154, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.0258007049560547, "rewards/margins": 3.255826234817505, "rewards/margins_max": 4.866274833679199, "rewards/margins_min": 1.6453778743743896, "rewards/margins_std": 2.277517795562744, "rewards/rejected": -5.2816267013549805, "step": 1280 }, { "epoch": 0.78, "grad_norm": 5.8125, "learning_rate": 6.851368634019777e-08, "logits/chosen": -3.133932590484619, "logits/rejected": -2.8580222129821777, "logps/chosen": -356.24627685546875, "logps/rejected": -722.9967041015625, "loss": 0.2697, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.307548999786377, "rewards/margins": 3.4757111072540283, "rewards/margins_max": 5.241189479827881, "rewards/margins_min": 1.7102329730987549, "rewards/margins_std": 2.4967634677886963, "rewards/rejected": -5.783260822296143, "step": 1290 }, { "epoch": 0.79, "grad_norm": 9.6875, "learning_rate": 6.491233593616971e-08, "logits/chosen": -3.173882484436035, "logits/rejected": -2.9467577934265137, "logps/chosen": -351.42913818359375, "logps/rejected": -698.7190551757812, "loss": 0.2373, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.2619590759277344, "rewards/margins": 3.2130227088928223, "rewards/margins_max": 4.786438941955566, "rewards/margins_min": 1.6396061182022095, "rewards/margins_std": 2.225146770477295, "rewards/rejected": -5.474982261657715, "step": 1300 }, { "epoch": 0.79, "grad_norm": 5.59375, "learning_rate": 6.139404299799863e-08, "logits/chosen": -3.14751935005188, "logits/rejected": -2.9100093841552734, "logps/chosen": -293.3255920410156, "logps/rejected": -721.6419677734375, "loss": 0.2033, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9109737873077393, "rewards/margins": 4.052936553955078, "rewards/margins_max": 6.207828044891357, "rewards/margins_min": 1.8980449438095093, "rewards/margins_std": 3.0474772453308105, "rewards/rejected": -5.963910102844238, "step": 1310 }, { "epoch": 0.8, "grad_norm": 6.5625, "learning_rate": 5.796038634779057e-08, "logits/chosen": -3.1226589679718018, "logits/rejected": -2.879516363143921, "logps/chosen": -336.1898498535156, "logps/rejected": -733.499267578125, "loss": 0.2087, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.25373911857605, "rewards/margins": 3.4867053031921387, "rewards/margins_max": 5.092923164367676, "rewards/margins_min": 1.8804876804351807, "rewards/margins_std": 2.2715346813201904, "rewards/rejected": -5.740444660186768, "step": 1320 }, { "epoch": 0.81, "grad_norm": 9.875, "learning_rate": 5.4612906827402466e-08, "logits/chosen": -3.1812329292297363, "logits/rejected": -2.9600119590759277, "logps/chosen": -335.65435791015625, "logps/rejected": -723.2822265625, "loss": 0.2063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1229610443115234, "rewards/margins": 3.5574288368225098, "rewards/margins_max": 4.972989082336426, "rewards/margins_min": 2.141868829727173, "rewards/margins_std": 2.001904249191284, "rewards/rejected": -5.680389404296875, "step": 1330 }, { "epoch": 0.81, "grad_norm": 15.75, "learning_rate": 5.1353106606994514e-08, "logits/chosen": -3.166288375854492, "logits/rejected": -2.9235687255859375, "logps/chosen": -365.612060546875, "logps/rejected": -702.6682739257812, "loss": 0.2482, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.405470371246338, "rewards/margins": 3.1533291339874268, "rewards/margins_max": 5.086661338806152, "rewards/margins_min": 1.2199971675872803, "rewards/margins_std": 2.7341442108154297, "rewards/rejected": -5.558799743652344, "step": 1340 }, { "epoch": 0.82, "grad_norm": 7.75, "learning_rate": 4.818244851093642e-08, "logits/chosen": -3.1529836654663086, "logits/rejected": -2.902099609375, "logps/chosen": -335.6700439453125, "logps/rejected": -789.0332641601562, "loss": 0.183, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0865893363952637, "rewards/margins": 3.7543671131134033, "rewards/margins_max": 5.634464740753174, "rewards/margins_min": 1.874269723892212, "rewards/margins_std": 2.6588597297668457, "rewards/rejected": -5.840956211090088, "step": 1350 }, { "epoch": 0.82, "grad_norm": 7.1875, "learning_rate": 4.5102355361369607e-08, "logits/chosen": -3.128056764602661, "logits/rejected": -2.8609917163848877, "logps/chosen": -304.3298034667969, "logps/rejected": -660.6272583007812, "loss": 0.1713, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.9923508167266846, "rewards/margins": 3.305687665939331, "rewards/margins_max": 4.834566593170166, "rewards/margins_min": 1.7768090963363647, "rewards/margins_std": 2.162160873413086, "rewards/rejected": -5.298038959503174, "step": 1360 }, { "epoch": 0.83, "grad_norm": 14.0, "learning_rate": 4.21142093397209e-08, "logits/chosen": -3.1291439533233643, "logits/rejected": -2.9123919010162354, "logps/chosen": -331.235107421875, "logps/rejected": -673.1749877929688, "loss": 0.2041, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2359235286712646, "rewards/margins": 3.1155121326446533, "rewards/margins_max": 4.511401176452637, "rewards/margins_min": 1.7196223735809326, "rewards/margins_std": 1.974085807800293, "rewards/rejected": -5.351435661315918, "step": 1370 }, { "epoch": 0.84, "grad_norm": 5.25, "learning_rate": 3.921935136645327e-08, "logits/chosen": -3.1134796142578125, "logits/rejected": -2.8933892250061035, "logps/chosen": -319.20086669921875, "logps/rejected": -797.2918701171875, "loss": 0.1232, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.06329607963562, "rewards/margins": 4.198972225189209, "rewards/margins_max": 6.017431259155273, "rewards/margins_min": 2.3805129528045654, "rewards/margins_std": 2.5716898441314697, "rewards/rejected": -6.26226806640625, "step": 1380 }, { "epoch": 0.84, "grad_norm": 4.4375, "learning_rate": 3.6419080499331986e-08, "logits/chosen": -3.1291165351867676, "logits/rejected": -2.9171788692474365, "logps/chosen": -314.25543212890625, "logps/rejected": -668.8634643554688, "loss": 0.2378, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.037966251373291, "rewards/margins": 3.3993752002716064, "rewards/margins_max": 4.708580017089844, "rewards/margins_min": 2.090170383453369, "rewards/margins_std": 1.8514951467514038, "rewards/rejected": -5.437341213226318, "step": 1390 }, { "epoch": 0.85, "grad_norm": 3.8125, "learning_rate": 3.371465335047713e-08, "logits/chosen": -3.1593098640441895, "logits/rejected": -2.941859722137451, "logps/chosen": -320.30194091796875, "logps/rejected": -844.5147705078125, "loss": 0.1994, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0999464988708496, "rewards/margins": 4.604376316070557, "rewards/margins_max": 6.516266822814941, "rewards/margins_min": 2.6924843788146973, "rewards/margins_std": 2.7038230895996094, "rewards/rejected": -6.70432186126709, "step": 1400 }, { "epoch": 0.86, "grad_norm": 8.1875, "learning_rate": 3.110728352246311e-08, "logits/chosen": -3.1501519680023193, "logits/rejected": -2.896111011505127, "logps/chosen": -315.76397705078125, "logps/rejected": -660.5819702148438, "loss": 0.2063, "rewards/accuracies": 0.875, "rewards/chosen": -2.091414213180542, "rewards/margins": 3.229884624481201, "rewards/margins_max": 4.599147796630859, "rewards/margins_min": 1.8606210947036743, "rewards/margins_std": 1.9364306926727295, "rewards/rejected": -5.321299076080322, "step": 1410 }, { "epoch": 0.86, "grad_norm": 4.0, "learning_rate": 2.8598141063718217e-08, "logits/chosen": -3.1871862411499023, "logits/rejected": -2.935353994369507, "logps/chosen": -326.63946533203125, "logps/rejected": -735.0399780273438, "loss": 0.1791, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.211333990097046, "rewards/margins": 3.781127452850342, "rewards/margins_max": 5.652678489685059, "rewards/margins_min": 1.9095767736434937, "rewards/margins_std": 2.646772861480713, "rewards/rejected": -5.992461681365967, "step": 1420 }, { "epoch": 0.87, "grad_norm": 7.8125, "learning_rate": 2.6188351943469966e-08, "logits/chosen": -3.1684048175811768, "logits/rejected": -2.9018070697784424, "logps/chosen": -379.37646484375, "logps/rejected": -710.8772583007812, "loss": 0.2164, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.436944007873535, "rewards/margins": 3.187828540802002, "rewards/margins_max": 4.914183139801025, "rewards/margins_min": 1.461474061012268, "rewards/margins_std": 2.4414334297180176, "rewards/rejected": -5.624772071838379, "step": 1430 }, { "epoch": 0.87, "grad_norm": 3.984375, "learning_rate": 2.3878997546469577e-08, "logits/chosen": -3.173720121383667, "logits/rejected": -2.9183566570281982, "logps/chosen": -352.3770446777344, "logps/rejected": -733.684814453125, "loss": 0.1905, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.2058091163635254, "rewards/margins": 3.5168144702911377, "rewards/margins_max": 4.97069787979126, "rewards/margins_min": 2.062930107116699, "rewards/margins_std": 2.056102752685547, "rewards/rejected": -5.722623348236084, "step": 1440 }, { "epoch": 0.88, "grad_norm": 8.3125, "learning_rate": 2.1671114187724603e-08, "logits/chosen": -3.183567523956299, "logits/rejected": -2.9641623497009277, "logps/chosen": -312.87066650390625, "logps/rejected": -769.3661499023438, "loss": 0.2043, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0123302936553955, "rewards/margins": 4.006140232086182, "rewards/margins_max": 5.548596382141113, "rewards/margins_min": 2.4636826515197754, "rewards/margins_std": 2.181363582611084, "rewards/rejected": -6.018470287322998, "step": 1450 }, { "epoch": 0.89, "grad_norm": 6.65625, "learning_rate": 1.9565692647456e-08, "logits/chosen": -3.114928722381592, "logits/rejected": -2.891458749771118, "logps/chosen": -323.8619384765625, "logps/rejected": -699.9288940429688, "loss": 0.2067, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1312808990478516, "rewards/margins": 3.579425096511841, "rewards/margins_max": 5.033545017242432, "rewards/margins_min": 2.125305414199829, "rewards/margins_std": 2.0564355850219727, "rewards/rejected": -5.710705757141113, "step": 1460 }, { "epoch": 0.89, "grad_norm": 21.625, "learning_rate": 1.7563677726488645e-08, "logits/chosen": -3.192821502685547, "logits/rejected": -2.9360315799713135, "logps/chosen": -325.11724853515625, "logps/rejected": -732.0574340820312, "loss": 0.1636, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.197850465774536, "rewards/margins": 3.762909412384033, "rewards/margins_max": 5.297985553741455, "rewards/margins_min": 2.2278337478637695, "rewards/margins_std": 2.170924663543701, "rewards/rejected": -5.96075963973999, "step": 1470 }, { "epoch": 0.9, "grad_norm": 16.625, "learning_rate": 1.5665967822275417e-08, "logits/chosen": -3.124788999557495, "logits/rejected": -2.9426681995391846, "logps/chosen": -326.2402648925781, "logps/rejected": -790.9951782226562, "loss": 0.2929, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.1549830436706543, "rewards/margins": 3.9306769371032715, "rewards/margins_max": 5.507823944091797, "rewards/margins_min": 2.353529691696167, "rewards/margins_std": 2.2304234504699707, "rewards/rejected": -6.085659980773926, "step": 1480 }, { "epoch": 0.9, "grad_norm": 6.34375, "learning_rate": 1.3873414525744115e-08, "logits/chosen": -3.157076120376587, "logits/rejected": -2.8868565559387207, "logps/chosen": -340.90924072265625, "logps/rejected": -718.08740234375, "loss": 0.1802, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0659420490264893, "rewards/margins": 3.228773593902588, "rewards/margins_max": 4.96280574798584, "rewards/margins_min": 1.4947407245635986, "rewards/margins_std": 2.4522926807403564, "rewards/rejected": -5.29471492767334, "step": 1490 }, { "epoch": 0.91, "grad_norm": 7.59375, "learning_rate": 1.2186822239149158e-08, "logits/chosen": -3.1129629611968994, "logits/rejected": -2.8585550785064697, "logps/chosen": -352.6842346191406, "logps/rejected": -787.0264892578125, "loss": 0.2004, "rewards/accuracies": 0.9375, "rewards/chosen": -2.368502616882324, "rewards/margins": 3.9364609718322754, "rewards/margins_max": 5.6117143630981445, "rewards/margins_min": 2.2612078189849854, "rewards/margins_std": 2.369166135787964, "rewards/rejected": -6.304963111877441, "step": 1500 }, { "epoch": 0.92, "grad_norm": 9.875, "learning_rate": 1.0606947815098467e-08, "logits/chosen": -3.1238672733306885, "logits/rejected": -2.8686344623565674, "logps/chosen": -307.52020263671875, "logps/rejected": -777.89306640625, "loss": 0.217, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9263112545013428, "rewards/margins": 4.194777488708496, "rewards/margins_max": 6.070175647735596, "rewards/margins_min": 2.3193793296813965, "rewards/margins_std": 2.6522135734558105, "rewards/rejected": -6.121088981628418, "step": 1510 }, { "epoch": 0.92, "grad_norm": 37.0, "learning_rate": 9.134500216918722e-09, "logits/chosen": -3.1342949867248535, "logits/rejected": -2.919426441192627, "logps/chosen": -335.2420959472656, "logps/rejected": -697.2754516601562, "loss": 0.2387, "rewards/accuracies": 0.875, "rewards/chosen": -2.338940143585205, "rewards/margins": 3.315152645111084, "rewards/margins_max": 4.826213836669922, "rewards/margins_min": 1.8040918111801147, "rewards/margins_std": 2.136962652206421, "rewards/rejected": -5.654092311859131, "step": 1520 }, { "epoch": 0.93, "grad_norm": 4.0625, "learning_rate": 7.770140200510338e-09, "logits/chosen": -3.099794626235962, "logits/rejected": -2.881441593170166, "logps/chosen": -366.3176574707031, "logps/rejected": -856.1891479492188, "loss": 0.1464, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.4581868648529053, "rewards/margins": 4.456292152404785, "rewards/margins_max": 6.471911430358887, "rewards/margins_min": 2.440671682357788, "rewards/margins_std": 2.8505172729492188, "rewards/rejected": -6.9144792556762695, "step": 1530 }, { "epoch": 0.93, "grad_norm": 18.75, "learning_rate": 6.5144800178352776e-09, "logits/chosen": -3.127145290374756, "logits/rejected": -2.9245965480804443, "logps/chosen": -357.00323486328125, "logps/rejected": -728.6698608398438, "loss": 0.2302, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4451308250427246, "rewards/margins": 3.3258328437805176, "rewards/margins_max": 4.757521152496338, "rewards/margins_min": 1.894144058227539, "rewards/margins_std": 2.0247135162353516, "rewards/rejected": -5.7709641456604, "step": 1540 }, { "epoch": 0.94, "grad_norm": 8.3125, "learning_rate": 5.368083142171409e-09, "logits/chosen": -3.1004772186279297, "logits/rejected": -2.87990403175354, "logps/chosen": -353.71478271484375, "logps/rejected": -771.7055053710938, "loss": 0.2079, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.3986122608184814, "rewards/margins": 3.5793299674987793, "rewards/margins_max": 5.495848178863525, "rewards/margins_min": 1.6628128290176392, "rewards/margins_std": 2.710364580154419, "rewards/rejected": -5.97794246673584, "step": 1550 }, { "epoch": 0.95, "grad_norm": 5.78125, "learning_rate": 4.331464015255526e-09, "logits/chosen": -3.149285078048706, "logits/rejected": -2.8654189109802246, "logps/chosen": -333.75341796875, "logps/rejected": -872.8679809570312, "loss": 0.2164, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.111713409423828, "rewards/margins": 4.595676422119141, "rewards/margins_max": 6.780759334564209, "rewards/margins_min": 2.410592555999756, "rewards/margins_std": 3.090175151824951, "rewards/rejected": -6.707389831542969, "step": 1560 }, { "epoch": 0.95, "grad_norm": 15.375, "learning_rate": 3.4050878164293695e-09, "logits/chosen": -3.1380608081817627, "logits/rejected": -2.8875515460968018, "logps/chosen": -356.3845520019531, "logps/rejected": -684.7333984375, "loss": 0.2406, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.311772108078003, "rewards/margins": 2.945913553237915, "rewards/margins_max": 4.341352462768555, "rewards/margins_min": 1.5504741668701172, "rewards/margins_std": 1.9734489917755127, "rewards/rejected": -5.257685661315918, "step": 1570 }, { "epoch": 0.96, "grad_norm": 7.375, "learning_rate": 2.5893702538920537e-09, "logits/chosen": -3.112511396408081, "logits/rejected": -2.9152560234069824, "logps/chosen": -335.6734924316406, "logps/rejected": -702.3567504882812, "loss": 0.2425, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.2349658012390137, "rewards/margins": 3.2430100440979004, "rewards/margins_max": 4.741458892822266, "rewards/margins_min": 1.7445614337921143, "rewards/margins_std": 2.119126081466675, "rewards/rejected": -5.477975845336914, "step": 1580 }, { "epoch": 0.96, "grad_norm": 7.65625, "learning_rate": 1.884677378152372e-09, "logits/chosen": -3.1265830993652344, "logits/rejected": -2.887659788131714, "logps/chosen": -346.2403869628906, "logps/rejected": -709.3991088867188, "loss": 0.2288, "rewards/accuracies": 0.875, "rewards/chosen": -2.28578782081604, "rewards/margins": 3.251765727996826, "rewards/margins_max": 5.136622428894043, "rewards/margins_min": 1.3669096231460571, "rewards/margins_std": 2.6655895709991455, "rewards/rejected": -5.5375542640686035, "step": 1590 }, { "epoch": 0.97, "grad_norm": 7.9375, "learning_rate": 1.2913254177648325e-09, "logits/chosen": -3.1185574531555176, "logits/rejected": -2.887942314147949, "logps/chosen": -304.56109619140625, "logps/rejected": -669.3530883789062, "loss": 0.2095, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0278687477111816, "rewards/margins": 3.3642592430114746, "rewards/margins_max": 5.217440128326416, "rewards/margins_min": 1.511077880859375, "rewards/margins_std": 2.6207940578460693, "rewards/rejected": -5.39212703704834, "step": 1600 }, { "epoch": 0.98, "grad_norm": 7.28125, "learning_rate": 8.095806374232295e-10, "logits/chosen": -3.1527209281921387, "logits/rejected": -2.837277889251709, "logps/chosen": -360.6504211425781, "logps/rejected": -759.2733154296875, "loss": 0.2634, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.213390827178955, "rewards/margins": 3.6566321849823, "rewards/margins_max": 5.329567909240723, "rewards/margins_min": 1.9836972951889038, "rewards/margins_std": 2.3658881187438965, "rewards/rejected": -5.870023250579834, "step": 1610 }, { "epoch": 0.98, "grad_norm": 15.4375, "learning_rate": 4.3965921847513576e-10, "logits/chosen": -3.155099391937256, "logits/rejected": -2.9255661964416504, "logps/chosen": -323.4555358886719, "logps/rejected": -777.19677734375, "loss": 0.1381, "rewards/accuracies": 0.9375, "rewards/chosen": -2.105621337890625, "rewards/margins": 4.154127597808838, "rewards/margins_max": 5.744351387023926, "rewards/margins_min": 2.563903570175171, "rewards/margins_std": 2.2489163875579834, "rewards/rejected": -6.259748935699463, "step": 1620 }, { "epoch": 0.99, "grad_norm": 5.40625, "learning_rate": 1.8172716191142134e-10, "logits/chosen": -3.1000237464904785, "logits/rejected": -2.900533676147461, "logps/chosen": -336.09130859375, "logps/rejected": -730.9088745117188, "loss": 0.2025, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1460416316986084, "rewards/margins": 3.563811779022217, "rewards/margins_max": 5.688532829284668, "rewards/margins_min": 1.4390910863876343, "rewards/margins_std": 3.0048089027404785, "rewards/rejected": -5.709853172302246, "step": 1630 }, { "epoch": 0.99, "grad_norm": 8.8125, "learning_rate": 3.59002138737019e-11, "logits/chosen": -3.079613208770752, "logits/rejected": -2.889169931411743, "logps/chosen": -328.8927917480469, "logps/rejected": -725.4284057617188, "loss": 0.1653, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0904576778411865, "rewards/margins": 3.3967292308807373, "rewards/margins_max": 5.103400230407715, "rewards/margins_min": 1.6900584697723389, "rewards/margins_std": 2.413597583770752, "rewards/rejected": -5.487187385559082, "step": 1640 }, { "epoch": 1.0, "eval_logits/chosen": -2.2152950763702393, "eval_logits/rejected": -2.1189112663269043, "eval_logps/chosen": -352.6255798339844, "eval_logps/rejected": -347.4267272949219, "eval_loss": 0.6940016150474548, "eval_rewards/accuracies": 0.5379999876022339, "eval_rewards/chosen": -0.7885112166404724, "eval_rewards/margins": 0.058900706470012665, "eval_rewards/margins_max": 0.7288501262664795, "eval_rewards/margins_min": -0.5646029114723206, "eval_rewards/margins_std": 0.42255058884620667, "eval_rewards/rejected": -0.8474118709564209, "eval_runtime": 884.8339, "eval_samples_per_second": 4.521, "eval_steps_per_second": 0.283, "step": 1648 }, { "epoch": 1.0, "step": 1648, "total_flos": 0.0, "train_loss": 0.33046212517520757, "train_runtime": 17297.4072, "train_samples_per_second": 1.525, "train_steps_per_second": 0.095 } ], "logging_steps": 10, "max_steps": 1648, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }