{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 400, "global_step": 17412, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00017229496898690558, "grad_norm": 2.182401180267334, "learning_rate": 1.148105625717566e-10, "logits/chosen": -2.967046022415161, "logits/rejected": -2.9243061542510986, "logps/chosen": -43.99115753173828, "logps/rejected": -41.627906799316406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0017229496898690559, "grad_norm": 2.3828177452087402, "learning_rate": 1.148105625717566e-09, "logits/chosen": -3.0551016330718994, "logits/rejected": -3.025693893432617, "logps/chosen": -50.46271514892578, "logps/rejected": -49.61042785644531, "loss": 0.693, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": 3.583024226827547e-05, "rewards/margins": 0.0002894425706472248, "rewards/rejected": -0.0002536123211029917, "step": 10 }, { "epoch": 0.0034458993797381117, "grad_norm": 2.242837429046631, "learning_rate": 2.296211251435132e-09, "logits/chosen": -3.119117259979248, "logits/rejected": -3.1108996868133545, "logps/chosen": -52.652076721191406, "logps/rejected": -52.98986053466797, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.00011003723921021447, "rewards/margins": -4.4298911234363914e-05, "rewards/rejected": 0.00015433612861670554, "step": 20 }, { "epoch": 0.005168849069607168, "grad_norm": 2.5732250213623047, "learning_rate": 3.4443168771526976e-09, "logits/chosen": -3.0916590690612793, "logits/rejected": -3.067901611328125, "logps/chosen": -56.798622131347656, "logps/rejected": -58.4221076965332, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -2.500688970030751e-05, "rewards/margins": -0.00017597868281882256, "rewards/rejected": 0.00015097178402356803, "step": 30 }, { "epoch": 0.006891798759476223, "grad_norm": 2.0121591091156006, "learning_rate": 4.592422502870264e-09, "logits/chosen": -3.1051547527313232, "logits/rejected": -3.0735793113708496, "logps/chosen": -55.268089294433594, "logps/rejected": -50.67551803588867, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 4.075562537764199e-05, "rewards/margins": -5.7307326642330736e-05, "rewards/rejected": 9.806293383007869e-05, "step": 40 }, { "epoch": 0.00861474844934528, "grad_norm": 2.3856751918792725, "learning_rate": 5.74052812858783e-09, "logits/chosen": -3.100912570953369, "logits/rejected": -3.0844902992248535, "logps/chosen": -53.11623001098633, "logps/rejected": -51.5071907043457, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 3.748986728169257e-06, "rewards/margins": 7.726665353402495e-05, "rewards/rejected": -7.35176945454441e-05, "step": 50 }, { "epoch": 0.010337698139214336, "grad_norm": 2.7930078506469727, "learning_rate": 6.888633754305395e-09, "logits/chosen": -3.1540441513061523, "logits/rejected": -3.124316453933716, "logps/chosen": -57.598358154296875, "logps/rejected": -54.17271041870117, "loss": 0.6933, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00011934386566281319, "rewards/margins": -0.00021098754950799048, "rewards/rejected": 9.164368384517729e-05, "step": 60 }, { "epoch": 0.012060647829083391, "grad_norm": 2.204110860824585, "learning_rate": 8.036739380022962e-09, "logits/chosen": -3.0506539344787598, "logits/rejected": -3.030632495880127, "logps/chosen": -53.753807067871094, "logps/rejected": -53.215538024902344, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.000154320354340598, "rewards/margins": 6.41578808426857e-05, "rewards/rejected": 9.016246622195467e-05, "step": 70 }, { "epoch": 0.013783597518952447, "grad_norm": 2.4381542205810547, "learning_rate": 9.184845005740529e-09, "logits/chosen": -3.1597819328308105, "logits/rejected": -3.1264419555664062, "logps/chosen": -59.09504318237305, "logps/rejected": -54.12665557861328, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.3803789443045389e-05, "rewards/margins": 0.0003244551189709455, "rewards/rejected": -0.0003106513468082994, "step": 80 }, { "epoch": 0.015506547208821502, "grad_norm": 2.4764792919158936, "learning_rate": 1.0332950631458094e-08, "logits/chosen": -2.9934918880462646, "logits/rejected": -2.9788215160369873, "logps/chosen": -53.476707458496094, "logps/rejected": -52.831390380859375, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -5.649983722832985e-05, "rewards/margins": -0.00016349827637895942, "rewards/rejected": 0.00010699845006456599, "step": 90 }, { "epoch": 0.01722949689869056, "grad_norm": 2.485914707183838, "learning_rate": 1.148105625717566e-08, "logits/chosen": -3.1696791648864746, "logits/rejected": -3.107633590698242, "logps/chosen": -55.949684143066406, "logps/rejected": -49.63793182373047, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -4.7669574996689335e-05, "rewards/margins": 3.545046638464555e-05, "rewards/rejected": -8.312005229527131e-05, "step": 100 }, { "epoch": 0.018952446588559616, "grad_norm": 2.541780710220337, "learning_rate": 1.2629161882893224e-08, "logits/chosen": -3.1218652725219727, "logits/rejected": -3.098087787628174, "logps/chosen": -55.605926513671875, "logps/rejected": -52.333740234375, "loss": 0.6933, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.00028869349625892937, "rewards/margins": -0.0002611152012832463, "rewards/rejected": -2.7578294975683093e-05, "step": 110 }, { "epoch": 0.02067539627842867, "grad_norm": 2.5565247535705566, "learning_rate": 1.377726750861079e-08, "logits/chosen": -3.065533399581909, "logits/rejected": -3.0499966144561768, "logps/chosen": -53.1763916015625, "logps/rejected": -55.58396530151367, "loss": 0.6931, "rewards/accuracies": 0.5625, "rewards/chosen": 5.7340563216712326e-05, "rewards/margins": 0.00017559928528498858, "rewards/rejected": -0.00011825871479231864, "step": 120 }, { "epoch": 0.022398345968297727, "grad_norm": 2.139101982116699, "learning_rate": 1.4925373134328357e-08, "logits/chosen": -3.1009914875030518, "logits/rejected": -3.08695650100708, "logps/chosen": -55.18524169921875, "logps/rejected": -53.770782470703125, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 8.389687718590721e-05, "rewards/margins": 0.00018740523955784738, "rewards/rejected": -0.00010350835509598255, "step": 130 }, { "epoch": 0.024121295658166782, "grad_norm": 2.431870460510254, "learning_rate": 1.6073478760045924e-08, "logits/chosen": -3.122816562652588, "logits/rejected": -3.104506015777588, "logps/chosen": -54.189476013183594, "logps/rejected": -53.77021408081055, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.4902594052255154e-05, "rewards/margins": 9.448556374991313e-05, "rewards/rejected": -0.00012938815052621067, "step": 140 }, { "epoch": 0.025844245348035838, "grad_norm": 2.2152774333953857, "learning_rate": 1.722158438576349e-08, "logits/chosen": -3.0275559425354004, "logits/rejected": -3.0098488330841064, "logps/chosen": -52.62406539916992, "logps/rejected": -52.41263961791992, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 2.73335517704254e-05, "rewards/margins": 0.00013241718988865614, "rewards/rejected": -0.00010508365085115656, "step": 150 }, { "epoch": 0.027567195037904894, "grad_norm": 2.1547353267669678, "learning_rate": 1.8369690011481057e-08, "logits/chosen": -3.0887503623962402, "logits/rejected": -3.067896604537964, "logps/chosen": -53.48332595825195, "logps/rejected": -54.71419143676758, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 3.922378527931869e-05, "rewards/margins": 0.00015319202793762088, "rewards/rejected": -0.00011396827176213264, "step": 160 }, { "epoch": 0.02929014472777395, "grad_norm": 2.3667101860046387, "learning_rate": 1.9517795637198624e-08, "logits/chosen": -3.076423168182373, "logits/rejected": -3.0568714141845703, "logps/chosen": -56.27741622924805, "logps/rejected": -51.32807540893555, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 7.188701420091093e-05, "rewards/margins": 0.00013863443746231496, "rewards/rejected": -6.674742326140404e-05, "step": 170 }, { "epoch": 0.031013094417643005, "grad_norm": 2.604196548461914, "learning_rate": 2.0665901262916187e-08, "logits/chosen": -3.0623817443847656, "logits/rejected": -3.043731451034546, "logps/chosen": -56.39360809326172, "logps/rejected": -53.7656364440918, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.00012041317677358165, "rewards/margins": -3.166842361679301e-05, "rewards/rejected": 0.00015208160039037466, "step": 180 }, { "epoch": 0.03273604410751206, "grad_norm": 2.6357362270355225, "learning_rate": 2.1814006888633754e-08, "logits/chosen": -3.1240854263305664, "logits/rejected": -3.080690860748291, "logps/chosen": -58.188934326171875, "logps/rejected": -52.56037521362305, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -5.388389763538726e-05, "rewards/margins": 0.00017177227709908038, "rewards/rejected": -0.00022565617109648883, "step": 190 }, { "epoch": 0.03445899379738112, "grad_norm": 2.5758800506591797, "learning_rate": 2.296211251435132e-08, "logits/chosen": -3.059539556503296, "logits/rejected": -3.0438239574432373, "logps/chosen": -54.10227584838867, "logps/rejected": -54.71996307373047, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.00020430810400284827, "rewards/margins": 0.0001816313888411969, "rewards/rejected": 2.2676715161651373e-05, "step": 200 }, { "epoch": 0.03618194348725017, "grad_norm": 2.2871100902557373, "learning_rate": 2.4110218140068887e-08, "logits/chosen": -3.0141561031341553, "logits/rejected": -3.00553560256958, "logps/chosen": -53.250831604003906, "logps/rejected": -57.2822380065918, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.00013214505452197045, "rewards/margins": 4.95816238981206e-05, "rewards/rejected": 8.256339060608298e-05, "step": 210 }, { "epoch": 0.03790489317711923, "grad_norm": 2.3293843269348145, "learning_rate": 2.5258323765786448e-08, "logits/chosen": -3.0508525371551514, "logits/rejected": -3.01947021484375, "logps/chosen": -52.181129455566406, "logps/rejected": -51.32151412963867, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0001834220893215388, "rewards/margins": 0.00023680910817347467, "rewards/rejected": -5.338704795576632e-05, "step": 220 }, { "epoch": 0.03962784286698828, "grad_norm": 2.391742467880249, "learning_rate": 2.6406429391504014e-08, "logits/chosen": -3.0510756969451904, "logits/rejected": -3.0328288078308105, "logps/chosen": -48.904396057128906, "logps/rejected": -49.960792541503906, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 5.220425009611063e-05, "rewards/margins": 0.00021240771457087249, "rewards/rejected": -0.00016020346083678305, "step": 230 }, { "epoch": 0.04135079255685734, "grad_norm": 2.250674247741699, "learning_rate": 2.755453501722158e-08, "logits/chosen": -3.0246095657348633, "logits/rejected": -2.9822471141815186, "logps/chosen": -55.94166946411133, "logps/rejected": -52.1525993347168, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.00012501263699959964, "rewards/margins": 0.00010458445467520505, "rewards/rejected": 2.0428178686415777e-05, "step": 240 }, { "epoch": 0.043073742246726394, "grad_norm": 2.317246437072754, "learning_rate": 2.8702640642939148e-08, "logits/chosen": -3.1179134845733643, "logits/rejected": -3.097510576248169, "logps/chosen": -52.26668167114258, "logps/rejected": -51.089698791503906, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.0001339766022283584, "rewards/margins": 5.490519106388092e-05, "rewards/rejected": 7.907139661256224e-05, "step": 250 }, { "epoch": 0.044796691936595454, "grad_norm": 2.3133351802825928, "learning_rate": 2.9850746268656714e-08, "logits/chosen": -3.094203472137451, "logits/rejected": -3.0818848609924316, "logps/chosen": -54.8641357421875, "logps/rejected": -56.6263313293457, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 5.03903029311914e-05, "rewards/margins": 0.00014110926713328809, "rewards/rejected": -9.071899694390595e-05, "step": 260 }, { "epoch": 0.046519641626464506, "grad_norm": 2.2149734497070312, "learning_rate": 3.099885189437428e-08, "logits/chosen": -3.033080577850342, "logits/rejected": -3.014913558959961, "logps/chosen": -53.12932586669922, "logps/rejected": -54.3194465637207, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -9.781783592188731e-05, "rewards/margins": 4.450721462490037e-05, "rewards/rejected": -0.00014232503599487245, "step": 270 }, { "epoch": 0.048242591316333565, "grad_norm": 2.4271368980407715, "learning_rate": 3.214695752009185e-08, "logits/chosen": -3.1243245601654053, "logits/rejected": -3.090181589126587, "logps/chosen": -57.60817337036133, "logps/rejected": -53.423431396484375, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 5.003236583434045e-05, "rewards/margins": 0.0001398597814841196, "rewards/rejected": -8.982741564977914e-05, "step": 280 }, { "epoch": 0.04996554100620262, "grad_norm": 2.24564790725708, "learning_rate": 3.3295063145809414e-08, "logits/chosen": -3.047020673751831, "logits/rejected": -3.0328316688537598, "logps/chosen": -55.384986877441406, "logps/rejected": -54.282264709472656, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -8.540546696167439e-05, "rewards/margins": -8.022228576010093e-05, "rewards/rejected": -5.1831566452165134e-06, "step": 290 }, { "epoch": 0.051688490696071676, "grad_norm": 2.359998941421509, "learning_rate": 3.444316877152698e-08, "logits/chosen": -3.0025739669799805, "logits/rejected": -2.9939587116241455, "logps/chosen": -52.850181579589844, "logps/rejected": -53.94788360595703, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -7.816951256245375e-05, "rewards/margins": -4.090732545591891e-05, "rewards/rejected": -3.726218710653484e-05, "step": 300 }, { "epoch": 0.05341144038594073, "grad_norm": 2.47426176071167, "learning_rate": 3.559127439724455e-08, "logits/chosen": -3.0653576850891113, "logits/rejected": -3.05991268157959, "logps/chosen": -53.51900100708008, "logps/rejected": -53.30791473388672, "loss": 0.6931, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 4.028494004160166e-06, "rewards/margins": 0.00012998899910598993, "rewards/rejected": -0.00012596049054991454, "step": 310 }, { "epoch": 0.05513439007580979, "grad_norm": 2.410942554473877, "learning_rate": 3.6739380022962115e-08, "logits/chosen": -3.0229759216308594, "logits/rejected": -2.9965834617614746, "logps/chosen": -54.520713806152344, "logps/rejected": -49.27983093261719, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0001271664659725502, "rewards/margins": 0.00026925824931822717, "rewards/rejected": -0.0003964246716350317, "step": 320 }, { "epoch": 0.05685733976567884, "grad_norm": 2.349635124206543, "learning_rate": 3.788748564867968e-08, "logits/chosen": -3.0833523273468018, "logits/rejected": -3.059699535369873, "logps/chosen": -55.03084182739258, "logps/rejected": -52.24028778076172, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 2.294254045409616e-05, "rewards/margins": 0.00032292449031956494, "rewards/rejected": -0.00029998194077052176, "step": 330 }, { "epoch": 0.0585802894555479, "grad_norm": 2.1602048873901367, "learning_rate": 3.903559127439725e-08, "logits/chosen": -3.0051302909851074, "logits/rejected": -2.983626127243042, "logps/chosen": -52.538604736328125, "logps/rejected": -51.94645309448242, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.00012052619422320276, "rewards/margins": -1.1196988452866208e-05, "rewards/rejected": -0.00010932923760265112, "step": 340 }, { "epoch": 0.06030323914541695, "grad_norm": 2.3185722827911377, "learning_rate": 4.018369690011481e-08, "logits/chosen": -2.9774973392486572, "logits/rejected": -2.9381320476531982, "logps/chosen": -56.23899459838867, "logps/rejected": -53.59648513793945, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0002953378134407103, "rewards/margins": 0.0001039146663970314, "rewards/rejected": -0.00039925254532136023, "step": 350 }, { "epoch": 0.06202618883528601, "grad_norm": 2.4112558364868164, "learning_rate": 4.1331802525832375e-08, "logits/chosen": -3.1283843517303467, "logits/rejected": -3.105407238006592, "logps/chosen": -54.58465576171875, "logps/rejected": -50.559539794921875, "loss": 0.6929, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8427713914425112e-05, "rewards/margins": 0.0004831443657167256, "rewards/rejected": -0.0004547166754491627, "step": 360 }, { "epoch": 0.06374913852515507, "grad_norm": 2.3298943042755127, "learning_rate": 4.247990815154994e-08, "logits/chosen": -3.103458881378174, "logits/rejected": -3.0742361545562744, "logps/chosen": -52.39043426513672, "logps/rejected": -51.40144729614258, "loss": 0.6929, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -5.273760689306073e-05, "rewards/margins": 0.0005517008830793202, "rewards/rejected": -0.0006044383626431227, "step": 370 }, { "epoch": 0.06547208821502412, "grad_norm": 2.0821239948272705, "learning_rate": 4.362801377726751e-08, "logits/chosen": -3.2037720680236816, "logits/rejected": -3.1789143085479736, "logps/chosen": -53.529457092285156, "logps/rejected": -52.24095916748047, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0002573663368821144, "rewards/margins": 0.0002463909622747451, "rewards/rejected": -0.0005037573864683509, "step": 380 }, { "epoch": 0.06719503790489317, "grad_norm": 2.40598464012146, "learning_rate": 4.4776119402985075e-08, "logits/chosen": -3.1002163887023926, "logits/rejected": -3.0747485160827637, "logps/chosen": -56.086753845214844, "logps/rejected": -55.247955322265625, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.00021222772193141282, "rewards/margins": 0.00019325126777403057, "rewards/rejected": -0.0004054790479131043, "step": 390 }, { "epoch": 0.06891798759476224, "grad_norm": 2.0966548919677734, "learning_rate": 4.592422502870264e-08, "logits/chosen": -3.0705161094665527, "logits/rejected": -3.054738998413086, "logps/chosen": -52.68198776245117, "logps/rejected": -52.8104362487793, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0004277491825632751, "rewards/margins": 1.7190934613608988e-06, "rewards/rejected": -0.00042946828762069345, "step": 400 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -3.1630496978759766, "eval_logits/rejected": -3.157418727874756, "eval_logps/chosen": -58.6923942565918, "eval_logps/rejected": -63.1541748046875, "eval_loss": 0.6931801438331604, "eval_rewards/accuracies": 0.46538102626800537, "eval_rewards/chosen": 0.00019500928465276957, "eval_rewards/margins": -6.448826025007293e-05, "eval_rewards/rejected": 0.0002594975521788001, "eval_runtime": 382.9357, "eval_samples_per_second": 11.239, "eval_steps_per_second": 1.405, "step": 400 }, { "epoch": 0.07064093728463129, "grad_norm": 2.1526947021484375, "learning_rate": 4.707233065442021e-08, "logits/chosen": -3.0743823051452637, "logits/rejected": -3.070225238800049, "logps/chosen": -50.76953125, "logps/rejected": -55.5819206237793, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00024037156254053116, "rewards/margins": 0.00029858676134608686, "rewards/rejected": -0.0005389582947827876, "step": 410 }, { "epoch": 0.07236388697450034, "grad_norm": 2.527576208114624, "learning_rate": 4.8220436280137775e-08, "logits/chosen": -3.0596532821655273, "logits/rejected": -3.051694631576538, "logps/chosen": -54.21385955810547, "logps/rejected": -53.94188690185547, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0004788221849594265, "rewards/margins": -0.00019773165695369244, "rewards/rejected": -0.00028109049890190363, "step": 420 }, { "epoch": 0.0740868366643694, "grad_norm": 2.2473433017730713, "learning_rate": 4.9368541905855335e-08, "logits/chosen": -3.0878098011016846, "logits/rejected": -3.0740981101989746, "logps/chosen": -53.11975860595703, "logps/rejected": -54.20466232299805, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0002864287234842777, "rewards/margins": 0.00012910208897665143, "rewards/rejected": -0.00041553081246092916, "step": 430 }, { "epoch": 0.07580978635423846, "grad_norm": 2.511948823928833, "learning_rate": 5.0516647531572895e-08, "logits/chosen": -3.1331732273101807, "logits/rejected": -3.0978844165802, "logps/chosen": -54.333213806152344, "logps/rejected": -53.343994140625, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00028464937349781394, "rewards/margins": 0.0006941338069736958, "rewards/rejected": -0.000978783005848527, "step": 440 }, { "epoch": 0.07753273604410751, "grad_norm": 2.281709909439087, "learning_rate": 5.166475315729046e-08, "logits/chosen": -3.0553770065307617, "logits/rejected": -3.0229177474975586, "logps/chosen": -56.12068557739258, "logps/rejected": -54.6532096862793, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.000248014519456774, "rewards/margins": 0.0002685516665223986, "rewards/rejected": -0.0005165661568753421, "step": 450 }, { "epoch": 0.07925568573397657, "grad_norm": 2.403916597366333, "learning_rate": 5.281285878300803e-08, "logits/chosen": -3.024060010910034, "logits/rejected": -3.0043814182281494, "logps/chosen": -56.222320556640625, "logps/rejected": -53.07564163208008, "loss": 0.6928, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.00022003026970196515, "rewards/margins": 0.0006629715790040791, "rewards/rejected": -0.000883001834154129, "step": 460 }, { "epoch": 0.08097863542384562, "grad_norm": 2.208056688308716, "learning_rate": 5.3960964408725595e-08, "logits/chosen": -3.0470211505889893, "logits/rejected": -3.015340566635132, "logps/chosen": -53.32251739501953, "logps/rejected": -51.46124267578125, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0005692066042684019, "rewards/margins": 4.9208720156457275e-05, "rewards/rejected": -0.0006184153025969863, "step": 470 }, { "epoch": 0.08270158511371468, "grad_norm": 2.447040557861328, "learning_rate": 5.510907003444316e-08, "logits/chosen": -3.044581890106201, "logits/rejected": -3.0393683910369873, "logps/chosen": -54.26829147338867, "logps/rejected": -59.047401428222656, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00029101339168846607, "rewards/margins": 0.0005864130798727274, "rewards/rejected": -0.0008774265879765153, "step": 480 }, { "epoch": 0.08442453480358374, "grad_norm": 2.480241298675537, "learning_rate": 5.625717566016073e-08, "logits/chosen": -2.9544525146484375, "logits/rejected": -2.9047305583953857, "logps/chosen": -60.655059814453125, "logps/rejected": -51.4771842956543, "loss": 0.6927, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0003955605789087713, "rewards/margins": 0.0009884096216410398, "rewards/rejected": -0.001383970258757472, "step": 490 }, { "epoch": 0.08614748449345279, "grad_norm": 2.2628631591796875, "learning_rate": 5.7405281285878295e-08, "logits/chosen": -3.0166900157928467, "logits/rejected": -2.987926721572876, "logps/chosen": -55.03630447387695, "logps/rejected": -51.71287155151367, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0008643764886073768, "rewards/margins": 0.00037768454058095813, "rewards/rejected": -0.0012420611456036568, "step": 500 }, { "epoch": 0.08787043418332184, "grad_norm": 2.239893674850464, "learning_rate": 5.855338691159586e-08, "logits/chosen": -3.008843421936035, "logits/rejected": -2.9875636100769043, "logps/chosen": -58.34012985229492, "logps/rejected": -52.08111572265625, "loss": 0.693, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0006519248127005994, "rewards/margins": 0.00021181194460950792, "rewards/rejected": -0.000863736669998616, "step": 510 }, { "epoch": 0.08959338387319091, "grad_norm": 2.0704708099365234, "learning_rate": 5.970149253731343e-08, "logits/chosen": -3.056964159011841, "logits/rejected": -3.0313286781311035, "logps/chosen": -56.494102478027344, "logps/rejected": -51.76349639892578, "loss": 0.6929, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0008717130986042321, "rewards/margins": 0.00046385274617932737, "rewards/rejected": -0.001335565815679729, "step": 520 }, { "epoch": 0.09131633356305996, "grad_norm": 2.0647060871124268, "learning_rate": 6.084959816303099e-08, "logits/chosen": -3.0535197257995605, "logits/rejected": -3.011583089828491, "logps/chosen": -55.7488899230957, "logps/rejected": -51.35005569458008, "loss": 0.6926, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0004926534602418542, "rewards/margins": 0.001148594543337822, "rewards/rejected": -0.0016412477707490325, "step": 530 }, { "epoch": 0.09303928325292901, "grad_norm": 2.2363359928131104, "learning_rate": 6.199770378874856e-08, "logits/chosen": -3.039377212524414, "logits/rejected": -3.022956132888794, "logps/chosen": -52.94477081298828, "logps/rejected": -53.070411682128906, "loss": 0.6928, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0009251947631128132, "rewards/margins": 0.0006015413091517985, "rewards/rejected": -0.0015267360722646117, "step": 540 }, { "epoch": 0.09476223294279806, "grad_norm": 2.260488510131836, "learning_rate": 6.314580941446614e-08, "logits/chosen": -3.1000843048095703, "logits/rejected": -3.082019329071045, "logps/chosen": -53.64750289916992, "logps/rejected": -52.13825607299805, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0007364696939475834, "rewards/margins": 0.001018259208649397, "rewards/rejected": -0.0017547288443893194, "step": 550 }, { "epoch": 0.09648518263266713, "grad_norm": 2.5739121437072754, "learning_rate": 6.42939150401837e-08, "logits/chosen": -3.0712523460388184, "logits/rejected": -3.063311815261841, "logps/chosen": -52.2054557800293, "logps/rejected": -55.1589469909668, "loss": 0.6927, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0009688477148301899, "rewards/margins": 0.0009185401722788811, "rewards/rejected": -0.00188738782890141, "step": 560 }, { "epoch": 0.09820813232253618, "grad_norm": 2.2335901260375977, "learning_rate": 6.544202066590127e-08, "logits/chosen": -3.04362154006958, "logits/rejected": -3.034945249557495, "logps/chosen": -51.44483184814453, "logps/rejected": -54.00239944458008, "loss": 0.6927, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0008267102530226111, "rewards/margins": 0.0009768879972398281, "rewards/rejected": -0.0018035981338471174, "step": 570 }, { "epoch": 0.09993108201240523, "grad_norm": 1.7748967409133911, "learning_rate": 6.659012629161883e-08, "logits/chosen": -3.050416946411133, "logits/rejected": -3.044624090194702, "logps/chosen": -51.22686767578125, "logps/rejected": -53.419944763183594, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0011879531666636467, "rewards/margins": 0.0005926621379330754, "rewards/rejected": -0.001780615421012044, "step": 580 }, { "epoch": 0.1016540317022743, "grad_norm": 2.0524795055389404, "learning_rate": 6.77382319173364e-08, "logits/chosen": -3.0492122173309326, "logits/rejected": -3.027358293533325, "logps/chosen": -54.976097106933594, "logps/rejected": -54.95697784423828, "loss": 0.6928, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0009717288194224238, "rewards/margins": 0.0007361652096733451, "rewards/rejected": -0.001707894029095769, "step": 590 }, { "epoch": 0.10337698139214335, "grad_norm": 2.3293092250823975, "learning_rate": 6.888633754305396e-08, "logits/chosen": -3.0223147869110107, "logits/rejected": -2.9981188774108887, "logps/chosen": -53.92238235473633, "logps/rejected": -56.933319091796875, "loss": 0.692, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.00038632837822660804, "rewards/margins": 0.0022632996551692486, "rewards/rejected": -0.0026496287900954485, "step": 600 }, { "epoch": 0.1050999310820124, "grad_norm": 2.2301552295684814, "learning_rate": 7.003444316877152e-08, "logits/chosen": -2.9881367683410645, "logits/rejected": -2.985593318939209, "logps/chosen": -52.6776123046875, "logps/rejected": -53.41646194458008, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.001130065182223916, "rewards/margins": 0.0004246587341185659, "rewards/rejected": -0.0015547239454463124, "step": 610 }, { "epoch": 0.10682288077188146, "grad_norm": 2.37221097946167, "learning_rate": 7.11825487944891e-08, "logits/chosen": -3.146998882293701, "logits/rejected": -3.120384931564331, "logps/chosen": -55.118141174316406, "logps/rejected": -53.546974182128906, "loss": 0.6922, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0009913553949445486, "rewards/margins": 0.0018093215767294168, "rewards/rejected": -0.002800677204504609, "step": 620 }, { "epoch": 0.10854583046175052, "grad_norm": 2.475659132003784, "learning_rate": 7.233065442020666e-08, "logits/chosen": -3.130199432373047, "logits/rejected": -3.1033434867858887, "logps/chosen": -54.01044464111328, "logps/rejected": -51.10059356689453, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0014563563745468855, "rewards/margins": 0.0010107369162142277, "rewards/rejected": -0.002467093523591757, "step": 630 }, { "epoch": 0.11026878015161957, "grad_norm": 2.473634958267212, "learning_rate": 7.347876004592423e-08, "logits/chosen": -3.098565101623535, "logits/rejected": -3.0876259803771973, "logps/chosen": -53.07587432861328, "logps/rejected": -54.5706672668457, "loss": 0.6928, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.001637846464291215, "rewards/margins": 0.0008041782421059906, "rewards/rejected": -0.0024420248810201883, "step": 640 }, { "epoch": 0.11199172984148863, "grad_norm": 2.685940980911255, "learning_rate": 7.462686567164179e-08, "logits/chosen": -3.1109936237335205, "logits/rejected": -3.112290859222412, "logps/chosen": -51.69086456298828, "logps/rejected": -55.07471466064453, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0021597391460090876, "rewards/margins": 0.00037304952275007963, "rewards/rejected": -0.0025327885523438454, "step": 650 }, { "epoch": 0.11371467953135768, "grad_norm": 2.2419979572296143, "learning_rate": 7.577497129735936e-08, "logits/chosen": -3.0010995864868164, "logits/rejected": -2.9949069023132324, "logps/chosen": -54.76444625854492, "logps/rejected": -52.52055740356445, "loss": 0.6925, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0013639924582093954, "rewards/margins": 0.0012060193112120032, "rewards/rejected": -0.0025700118858367205, "step": 660 }, { "epoch": 0.11543762922122675, "grad_norm": 2.2122037410736084, "learning_rate": 7.692307692307692e-08, "logits/chosen": -3.023953914642334, "logits/rejected": -3.019078493118286, "logps/chosen": -53.1700553894043, "logps/rejected": -57.7303466796875, "loss": 0.6929, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0016445998335257173, "rewards/margins": 0.0004528749850578606, "rewards/rejected": -0.002097474876791239, "step": 670 }, { "epoch": 0.1171605789110958, "grad_norm": 2.33010196685791, "learning_rate": 7.80711825487945e-08, "logits/chosen": -2.975424289703369, "logits/rejected": -2.950366497039795, "logps/chosen": -54.11586380004883, "logps/rejected": -51.203956604003906, "loss": 0.6924, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.001826353371143341, "rewards/margins": 0.001559465192258358, "rewards/rejected": -0.0033858187962323427, "step": 680 }, { "epoch": 0.11888352860096485, "grad_norm": 2.5904860496520996, "learning_rate": 7.921928817451206e-08, "logits/chosen": -3.1240360736846924, "logits/rejected": -3.094136953353882, "logps/chosen": -59.23029327392578, "logps/rejected": -50.781585693359375, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0014779084594920278, "rewards/margins": 0.0009200021740980446, "rewards/rejected": -0.0023979106917977333, "step": 690 }, { "epoch": 0.1206064782908339, "grad_norm": 2.203641891479492, "learning_rate": 8.036739380022962e-08, "logits/chosen": -3.0816588401794434, "logits/rejected": -3.053264856338501, "logps/chosen": -55.8792839050293, "logps/rejected": -53.30298614501953, "loss": 0.6923, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0015104495687410235, "rewards/margins": 0.001711997203528881, "rewards/rejected": -0.0032224461901932955, "step": 700 }, { "epoch": 0.12232942798070297, "grad_norm": 2.2730891704559326, "learning_rate": 8.151549942594719e-08, "logits/chosen": -3.060011625289917, "logits/rejected": -3.031322956085205, "logps/chosen": -54.78464889526367, "logps/rejected": -55.0402946472168, "loss": 0.6921, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0013594934716820717, "rewards/margins": 0.002062887419015169, "rewards/rejected": -0.003422380657866597, "step": 710 }, { "epoch": 0.12405237767057202, "grad_norm": 2.5072357654571533, "learning_rate": 8.266360505166475e-08, "logits/chosen": -3.021775722503662, "logits/rejected": -3.0175387859344482, "logps/chosen": -53.49358367919922, "logps/rejected": -54.68156051635742, "loss": 0.6926, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0017647970234975219, "rewards/margins": 0.0010740322759374976, "rewards/rejected": -0.0028388292994350195, "step": 720 }, { "epoch": 0.12577532736044109, "grad_norm": 2.3823907375335693, "learning_rate": 8.381171067738232e-08, "logits/chosen": -3.1382100582122803, "logits/rejected": -3.1131160259246826, "logps/chosen": -56.51377487182617, "logps/rejected": -52.933837890625, "loss": 0.692, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0018151247641071677, "rewards/margins": 0.002312289085239172, "rewards/rejected": -0.0041274139657616615, "step": 730 }, { "epoch": 0.12749827705031014, "grad_norm": 2.1981279850006104, "learning_rate": 8.495981630309988e-08, "logits/chosen": -3.0200366973876953, "logits/rejected": -2.994292736053467, "logps/chosen": -55.04505157470703, "logps/rejected": -54.068115234375, "loss": 0.692, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0012819484109058976, "rewards/margins": 0.00240796385332942, "rewards/rejected": -0.0036899116821587086, "step": 740 }, { "epoch": 0.1292212267401792, "grad_norm": 2.430384874343872, "learning_rate": 8.610792192881746e-08, "logits/chosen": -3.1935439109802246, "logits/rejected": -3.1655354499816895, "logps/chosen": -56.11168670654297, "logps/rejected": -54.45726776123047, "loss": 0.6914, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.000991794397123158, "rewards/margins": 0.003526625456288457, "rewards/rejected": -0.004518419038504362, "step": 750 }, { "epoch": 0.13094417643004824, "grad_norm": 2.590021848678589, "learning_rate": 8.725602755453502e-08, "logits/chosen": -3.043562412261963, "logits/rejected": -3.004751205444336, "logps/chosen": -54.59693145751953, "logps/rejected": -49.988407135009766, "loss": 0.6916, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0016047193203121424, "rewards/margins": 0.0031021684408187866, "rewards/rejected": -0.004706887062638998, "step": 760 }, { "epoch": 0.1326671261199173, "grad_norm": 2.0914759635925293, "learning_rate": 8.840413318025258e-08, "logits/chosen": -3.093982219696045, "logits/rejected": -3.070664405822754, "logps/chosen": -53.08367156982422, "logps/rejected": -52.61655807495117, "loss": 0.6919, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.001643743016757071, "rewards/margins": 0.002435127506032586, "rewards/rejected": -0.004078870639204979, "step": 770 }, { "epoch": 0.13439007580978635, "grad_norm": 2.6623942852020264, "learning_rate": 8.955223880597015e-08, "logits/chosen": -3.0886335372924805, "logits/rejected": -3.0576045513153076, "logps/chosen": -53.36533737182617, "logps/rejected": -51.709129333496094, "loss": 0.6916, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.002253578510135412, "rewards/margins": 0.003126527415588498, "rewards/rejected": -0.005380106158554554, "step": 780 }, { "epoch": 0.1361130254996554, "grad_norm": 2.2624731063842773, "learning_rate": 9.070034443168771e-08, "logits/chosen": -3.096177339553833, "logits/rejected": -3.062889337539673, "logps/chosen": -54.07451629638672, "logps/rejected": -54.05188751220703, "loss": 0.6916, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0016787642380222678, "rewards/margins": 0.0031538617331534624, "rewards/rejected": -0.004832625854760408, "step": 790 }, { "epoch": 0.13783597518952448, "grad_norm": 2.6842644214630127, "learning_rate": 9.184845005740528e-08, "logits/chosen": -2.9823784828186035, "logits/rejected": -2.956275224685669, "logps/chosen": -55.31383514404297, "logps/rejected": -54.95196533203125, "loss": 0.692, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.003162782173603773, "rewards/margins": 0.002404420403763652, "rewards/rejected": -0.005567202344536781, "step": 800 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -3.157433271408081, "eval_logits/rejected": -3.1517605781555176, "eval_logps/chosen": -58.558616638183594, "eval_logps/rejected": -63.0954704284668, "eval_loss": 0.6928107142448425, "eval_rewards/accuracies": 0.5525093078613281, "eval_rewards/chosen": 0.0015327819855883718, "eval_rewards/margins": 0.0006862352602183819, "eval_rewards/rejected": 0.0008465467253699899, "eval_runtime": 382.9449, "eval_samples_per_second": 11.239, "eval_steps_per_second": 1.405, "step": 800 }, { "epoch": 0.13955892487939353, "grad_norm": 2.204132080078125, "learning_rate": 9.299655568312284e-08, "logits/chosen": -3.0566883087158203, "logits/rejected": -3.0285804271698, "logps/chosen": -56.864402770996094, "logps/rejected": -56.03449630737305, "loss": 0.6919, "rewards/accuracies": 0.59375, "rewards/chosen": -0.002270194236189127, "rewards/margins": 0.00244377669878304, "rewards/rejected": -0.004713970236480236, "step": 810 }, { "epoch": 0.14128187456926258, "grad_norm": 2.1584360599517822, "learning_rate": 9.414466130884042e-08, "logits/chosen": -3.1056180000305176, "logits/rejected": -3.0800392627716064, "logps/chosen": -51.924049377441406, "logps/rejected": -51.29270553588867, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": -0.003763598622754216, "rewards/margins": 0.0018941021990031004, "rewards/rejected": -0.005657701287418604, "step": 820 }, { "epoch": 0.14300482425913164, "grad_norm": 2.4463465213775635, "learning_rate": 9.529276693455798e-08, "logits/chosen": -3.03351092338562, "logits/rejected": -3.0181891918182373, "logps/chosen": -54.88848114013672, "logps/rejected": -54.726600646972656, "loss": 0.691, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0030868500471115112, "rewards/margins": 0.004416943993419409, "rewards/rejected": -0.007503793574869633, "step": 830 }, { "epoch": 0.1447277739490007, "grad_norm": 2.5564217567443848, "learning_rate": 9.644087256027555e-08, "logits/chosen": -3.136671781539917, "logits/rejected": -3.1101253032684326, "logps/chosen": -54.3847770690918, "logps/rejected": -50.26315689086914, "loss": 0.691, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.002898902166634798, "rewards/margins": 0.0043100654147565365, "rewards/rejected": -0.007208968047052622, "step": 840 }, { "epoch": 0.14645072363886974, "grad_norm": 2.2932486534118652, "learning_rate": 9.758897818599311e-08, "logits/chosen": -3.0120320320129395, "logits/rejected": -3.0013999938964844, "logps/chosen": -51.23966598510742, "logps/rejected": -55.53287887573242, "loss": 0.6923, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.004185699392110109, "rewards/margins": 0.0017863849643617868, "rewards/rejected": -0.0059720841236412525, "step": 850 }, { "epoch": 0.1481736733287388, "grad_norm": 2.3204004764556885, "learning_rate": 9.873708381171067e-08, "logits/chosen": -3.037720203399658, "logits/rejected": -3.017627239227295, "logps/chosen": -53.264244079589844, "logps/rejected": -53.15632247924805, "loss": 0.691, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0040500895120203495, "rewards/margins": 0.004448921885341406, "rewards/rejected": -0.008499011397361755, "step": 860 }, { "epoch": 0.14989662301860784, "grad_norm": 1.9834500551223755, "learning_rate": 9.988518943742824e-08, "logits/chosen": -3.111067771911621, "logits/rejected": -3.1072709560394287, "logps/chosen": -51.75238800048828, "logps/rejected": -54.507606506347656, "loss": 0.6912, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.004508176352828741, "rewards/margins": 0.004009455442428589, "rewards/rejected": -0.008517631329596043, "step": 870 }, { "epoch": 0.15161957270847692, "grad_norm": 1.9233384132385254, "learning_rate": 1.0103329506314579e-07, "logits/chosen": -3.024372100830078, "logits/rejected": -3.0014374256134033, "logps/chosen": -52.12944412231445, "logps/rejected": -52.31949996948242, "loss": 0.6905, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.003995339386165142, "rewards/margins": 0.005398184061050415, "rewards/rejected": -0.009393523447215557, "step": 880 }, { "epoch": 0.15334252239834598, "grad_norm": 2.241018772125244, "learning_rate": 1.0218140068886336e-07, "logits/chosen": -3.0522618293762207, "logits/rejected": -3.0149013996124268, "logps/chosen": -58.669822692871094, "logps/rejected": -54.974082946777344, "loss": 0.69, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0018275838810950518, "rewards/margins": 0.006310337223112583, "rewards/rejected": -0.008137920871376991, "step": 890 }, { "epoch": 0.15506547208821503, "grad_norm": 2.1978955268859863, "learning_rate": 1.0332950631458092e-07, "logits/chosen": -3.072181463241577, "logits/rejected": -3.061849355697632, "logps/chosen": -54.629608154296875, "logps/rejected": -53.2902946472168, "loss": 0.6911, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.004928673151880503, "rewards/margins": 0.00411958945915103, "rewards/rejected": -0.009048262611031532, "step": 900 }, { "epoch": 0.15678842177808408, "grad_norm": 2.142141342163086, "learning_rate": 1.044776119402985e-07, "logits/chosen": -3.0427145957946777, "logits/rejected": -3.0346322059631348, "logps/chosen": -51.631103515625, "logps/rejected": -52.90343475341797, "loss": 0.692, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006040601991117001, "rewards/margins": 0.0024170693941414356, "rewards/rejected": -0.008457671850919724, "step": 910 }, { "epoch": 0.15851137146795313, "grad_norm": 2.318749189376831, "learning_rate": 1.0562571756601606e-07, "logits/chosen": -3.08345627784729, "logits/rejected": -3.040051221847534, "logps/chosen": -54.85624313354492, "logps/rejected": -50.60798263549805, "loss": 0.6901, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.005340777337551117, "rewards/margins": 0.0062098256312310696, "rewards/rejected": -0.011550603434443474, "step": 920 }, { "epoch": 0.16023432115782218, "grad_norm": 2.554981231689453, "learning_rate": 1.0677382319173363e-07, "logits/chosen": -3.1441140174865723, "logits/rejected": -3.1281702518463135, "logps/chosen": -53.05458450317383, "logps/rejected": -55.630455017089844, "loss": 0.6904, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.005707239266484976, "rewards/margins": 0.0056891003623604774, "rewards/rejected": -0.01139634009450674, "step": 930 }, { "epoch": 0.16195727084769124, "grad_norm": 2.219067335128784, "learning_rate": 1.0792192881745119e-07, "logits/chosen": -3.1239795684814453, "logits/rejected": -3.0865120887756348, "logps/chosen": -60.679237365722656, "logps/rejected": -55.87761688232422, "loss": 0.6906, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00473904749378562, "rewards/margins": 0.005185187328606844, "rewards/rejected": -0.009924234822392464, "step": 940 }, { "epoch": 0.16368022053756032, "grad_norm": 2.2362265586853027, "learning_rate": 1.0907003444316875e-07, "logits/chosen": -2.916612148284912, "logits/rejected": -2.8993961811065674, "logps/chosen": -56.0440788269043, "logps/rejected": -56.89391326904297, "loss": 0.6908, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.007726993411779404, "rewards/margins": 0.004901893436908722, "rewards/rejected": -0.012628885917365551, "step": 950 }, { "epoch": 0.16540317022742937, "grad_norm": 2.317995071411133, "learning_rate": 1.1021814006888632e-07, "logits/chosen": -2.885284662246704, "logits/rejected": -2.8884658813476562, "logps/chosen": -51.720481872558594, "logps/rejected": -56.540191650390625, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": -0.009791770949959755, "rewards/margins": -0.0003909044316969812, "rewards/rejected": -0.0094008669257164, "step": 960 }, { "epoch": 0.16712611991729842, "grad_norm": 2.359971046447754, "learning_rate": 1.1136624569460388e-07, "logits/chosen": -3.055280923843384, "logits/rejected": -3.0196239948272705, "logps/chosen": -61.49090576171875, "logps/rejected": -53.53166580200195, "loss": 0.6919, "rewards/accuracies": 0.53125, "rewards/chosen": -0.008273814804852009, "rewards/margins": 0.002594894263893366, "rewards/rejected": -0.010868709534406662, "step": 970 }, { "epoch": 0.16884906960716747, "grad_norm": 3.1766042709350586, "learning_rate": 1.1251435132032146e-07, "logits/chosen": -3.1567161083221436, "logits/rejected": -3.136969804763794, "logps/chosen": -56.458778381347656, "logps/rejected": -56.0153694152832, "loss": 0.6894, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.004100353457033634, "rewards/margins": 0.007654036395251751, "rewards/rejected": -0.011754389852285385, "step": 980 }, { "epoch": 0.17057201929703653, "grad_norm": 2.504091262817383, "learning_rate": 1.1366245694603902e-07, "logits/chosen": -3.0233750343322754, "logits/rejected": -3.000009298324585, "logps/chosen": -55.2623291015625, "logps/rejected": -53.919044494628906, "loss": 0.6902, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006930728908628225, "rewards/margins": 0.0060745240189135075, "rewards/rejected": -0.013005253858864307, "step": 990 }, { "epoch": 0.17229496898690558, "grad_norm": 2.235285758972168, "learning_rate": 1.1481056257175659e-07, "logits/chosen": -2.9751508235931396, "logits/rejected": -2.9474892616271973, "logps/chosen": -57.668296813964844, "logps/rejected": -52.80499267578125, "loss": 0.6919, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.008000878617167473, "rewards/margins": 0.0026205407921224833, "rewards/rejected": -0.010621419176459312, "step": 1000 }, { "epoch": 0.17401791867677463, "grad_norm": 2.3575587272644043, "learning_rate": 1.1595866819747415e-07, "logits/chosen": -2.9106006622314453, "logits/rejected": -2.9210867881774902, "logps/chosen": -54.70320510864258, "logps/rejected": -59.270790100097656, "loss": 0.691, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.009453857317566872, "rewards/margins": 0.0043923440389335155, "rewards/rejected": -0.0138462008908391, "step": 1010 }, { "epoch": 0.17574086836664368, "grad_norm": 2.255159378051758, "learning_rate": 1.1710677382319172e-07, "logits/chosen": -3.0922813415527344, "logits/rejected": -3.0569264888763428, "logps/chosen": -57.53850173950195, "logps/rejected": -55.62078094482422, "loss": 0.6885, "rewards/accuracies": 0.71875, "rewards/chosen": -0.004358768928796053, "rewards/margins": 0.009538348764181137, "rewards/rejected": -0.013897115364670753, "step": 1020 }, { "epoch": 0.17746381805651276, "grad_norm": 2.396897554397583, "learning_rate": 1.1825487944890928e-07, "logits/chosen": -3.140779495239258, "logits/rejected": -3.1136345863342285, "logps/chosen": -54.39406204223633, "logps/rejected": -53.09972381591797, "loss": 0.6907, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006752696819603443, "rewards/margins": 0.00511940149590373, "rewards/rejected": -0.011872097849845886, "step": 1030 }, { "epoch": 0.17918676774638181, "grad_norm": 2.4822428226470947, "learning_rate": 1.1940298507462686e-07, "logits/chosen": -3.056056499481201, "logits/rejected": -3.0457711219787598, "logps/chosen": -54.37267303466797, "logps/rejected": -56.8503532409668, "loss": 0.6915, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00974307581782341, "rewards/margins": 0.003385114250704646, "rewards/rejected": -0.013128191232681274, "step": 1040 }, { "epoch": 0.18090971743625087, "grad_norm": 2.523308515548706, "learning_rate": 1.205510907003444e-07, "logits/chosen": -2.9768359661102295, "logits/rejected": -2.941450595855713, "logps/chosen": -57.94348907470703, "logps/rejected": -52.01409912109375, "loss": 0.6901, "rewards/accuracies": 0.625, "rewards/chosen": -0.008004303090274334, "rewards/margins": 0.0064055053517222404, "rewards/rejected": -0.014409807510674, "step": 1050 }, { "epoch": 0.18263266712611992, "grad_norm": 2.466860771179199, "learning_rate": 1.2169919632606198e-07, "logits/chosen": -2.985286235809326, "logits/rejected": -2.966055154800415, "logps/chosen": -57.37578201293945, "logps/rejected": -56.16017532348633, "loss": 0.6917, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008745206519961357, "rewards/margins": 0.003092522034421563, "rewards/rejected": -0.011837727390229702, "step": 1060 }, { "epoch": 0.18435561681598897, "grad_norm": 2.3267135620117188, "learning_rate": 1.2284730195177955e-07, "logits/chosen": -3.128039598464966, "logits/rejected": -3.0941824913024902, "logps/chosen": -56.73784255981445, "logps/rejected": -54.831199645996094, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": -0.008065931499004364, "rewards/margins": 0.007762957364320755, "rewards/rejected": -0.01582888886332512, "step": 1070 }, { "epoch": 0.18607856650585802, "grad_norm": 2.292827606201172, "learning_rate": 1.2399540757749712e-07, "logits/chosen": -3.105078935623169, "logits/rejected": -3.0817818641662598, "logps/chosen": -56.80012130737305, "logps/rejected": -52.38414764404297, "loss": 0.69, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.010143356397747993, "rewards/margins": 0.006392681505531073, "rewards/rejected": -0.016536036506295204, "step": 1080 }, { "epoch": 0.18780151619572708, "grad_norm": 2.412367343902588, "learning_rate": 1.251435132032147e-07, "logits/chosen": -2.9820761680603027, "logits/rejected": -2.9737863540649414, "logps/chosen": -53.27097702026367, "logps/rejected": -54.16463088989258, "loss": 0.6908, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.010393026284873486, "rewards/margins": 0.004915698431432247, "rewards/rejected": -0.015308722853660583, "step": 1090 }, { "epoch": 0.18952446588559613, "grad_norm": 2.1496472358703613, "learning_rate": 1.2629161882893227e-07, "logits/chosen": -3.0467867851257324, "logits/rejected": -3.0438222885131836, "logps/chosen": -51.78764724731445, "logps/rejected": -56.13716506958008, "loss": 0.69, "rewards/accuracies": 0.625, "rewards/chosen": -0.00880212802439928, "rewards/margins": 0.006492135114967823, "rewards/rejected": -0.015294264070689678, "step": 1100 }, { "epoch": 0.1912474155754652, "grad_norm": 2.670915126800537, "learning_rate": 1.2743972445464984e-07, "logits/chosen": -3.0431346893310547, "logits/rejected": -3.055330753326416, "logps/chosen": -54.185951232910156, "logps/rejected": -58.2843132019043, "loss": 0.6906, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.012901161797344685, "rewards/margins": 0.005421520210802555, "rewards/rejected": -0.01832268200814724, "step": 1110 }, { "epoch": 0.19297036526533426, "grad_norm": 2.4121639728546143, "learning_rate": 1.285878300803674e-07, "logits/chosen": -3.0725626945495605, "logits/rejected": -3.0507113933563232, "logps/chosen": -57.42089080810547, "logps/rejected": -55.515769958496094, "loss": 0.6897, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.010498611256480217, "rewards/margins": 0.007218754850327969, "rewards/rejected": -0.01771736517548561, "step": 1120 }, { "epoch": 0.1946933149552033, "grad_norm": 2.235563039779663, "learning_rate": 1.2973593570608496e-07, "logits/chosen": -3.1377358436584473, "logits/rejected": -3.1132779121398926, "logps/chosen": -53.177711486816406, "logps/rejected": -56.41312789916992, "loss": 0.6879, "rewards/accuracies": 0.6875, "rewards/chosen": -0.011453317478299141, "rewards/margins": 0.010754810646176338, "rewards/rejected": -0.02220812812447548, "step": 1130 }, { "epoch": 0.19641626464507236, "grad_norm": 2.4700300693511963, "learning_rate": 1.3088404133180254e-07, "logits/chosen": -3.044579029083252, "logits/rejected": -3.011229991912842, "logps/chosen": -57.91096878051758, "logps/rejected": -54.86474609375, "loss": 0.6891, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010296146385371685, "rewards/margins": 0.008502911776304245, "rewards/rejected": -0.018799057230353355, "step": 1140 }, { "epoch": 0.19813921433494142, "grad_norm": 2.472872734069824, "learning_rate": 1.3203214695752008e-07, "logits/chosen": -2.9843618869781494, "logits/rejected": -2.9652435779571533, "logps/chosen": -54.42316818237305, "logps/rejected": -56.3864631652832, "loss": 0.6891, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.01066792942583561, "rewards/margins": 0.008340856991708279, "rewards/rejected": -0.019008787348866463, "step": 1150 }, { "epoch": 0.19986216402481047, "grad_norm": 2.2732763290405273, "learning_rate": 1.3318025258323766e-07, "logits/chosen": -3.079559087753296, "logits/rejected": -3.070949077606201, "logps/chosen": -54.765602111816406, "logps/rejected": -56.289215087890625, "loss": 0.6883, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.007399639580398798, "rewards/margins": 0.010012407787144184, "rewards/rejected": -0.01741204783320427, "step": 1160 }, { "epoch": 0.20158511371467952, "grad_norm": 2.170346736907959, "learning_rate": 1.3432835820895523e-07, "logits/chosen": -2.9518837928771973, "logits/rejected": -2.939192056655884, "logps/chosen": -53.35938262939453, "logps/rejected": -56.720787048339844, "loss": 0.6895, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.012800860218703747, "rewards/margins": 0.007673123385757208, "rewards/rejected": -0.020473983138799667, "step": 1170 }, { "epoch": 0.2033080634045486, "grad_norm": 2.554912567138672, "learning_rate": 1.354764638346728e-07, "logits/chosen": -2.9232382774353027, "logits/rejected": -2.8921735286712646, "logps/chosen": -54.116477966308594, "logps/rejected": -53.69866943359375, "loss": 0.6864, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.013146810233592987, "rewards/margins": 0.013819403946399689, "rewards/rejected": -0.026966210454702377, "step": 1180 }, { "epoch": 0.20503101309441765, "grad_norm": 2.5223007202148438, "learning_rate": 1.3662456946039035e-07, "logits/chosen": -3.118698835372925, "logits/rejected": -3.0836944580078125, "logps/chosen": -60.1918830871582, "logps/rejected": -52.9239616394043, "loss": 0.6882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.014924025163054466, "rewards/margins": 0.010316677391529083, "rewards/rejected": -0.0252407006919384, "step": 1190 }, { "epoch": 0.2067539627842867, "grad_norm": 2.1731607913970947, "learning_rate": 1.3777267508610792e-07, "logits/chosen": -2.9499802589416504, "logits/rejected": -2.9351534843444824, "logps/chosen": -54.7470588684082, "logps/rejected": -53.473411560058594, "loss": 0.6902, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.015825878828763962, "rewards/margins": 0.006190788466483355, "rewards/rejected": -0.02201666869223118, "step": 1200 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -3.1337568759918213, "eval_logits/rejected": -3.128148317337036, "eval_logps/chosen": -58.61870193481445, "eval_logps/rejected": -63.452667236328125, "eval_loss": 0.691386878490448, "eval_rewards/accuracies": 0.5875929594039917, "eval_rewards/chosen": 0.0009319494129158556, "eval_rewards/margins": 0.003657379886135459, "eval_rewards/rejected": -0.002725430764257908, "eval_runtime": 382.7439, "eval_samples_per_second": 11.245, "eval_steps_per_second": 1.406, "step": 1200 }, { "epoch": 0.20847691247415576, "grad_norm": 2.374234676361084, "learning_rate": 1.389207807118255e-07, "logits/chosen": -3.0486347675323486, "logits/rejected": -3.0146355628967285, "logps/chosen": -55.02860641479492, "logps/rejected": -55.492454528808594, "loss": 0.6872, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.012403490021824837, "rewards/margins": 0.012364232912659645, "rewards/rejected": -0.024767722934484482, "step": 1210 }, { "epoch": 0.2101998621640248, "grad_norm": 2.142857551574707, "learning_rate": 1.4006888633754304e-07, "logits/chosen": -3.058962345123291, "logits/rejected": -3.0320382118225098, "logps/chosen": -54.947837829589844, "logps/rejected": -55.0828857421875, "loss": 0.6872, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.01566535234451294, "rewards/margins": 0.012328686192631721, "rewards/rejected": -0.02799403667449951, "step": 1220 }, { "epoch": 0.21192281185389386, "grad_norm": 2.823793888092041, "learning_rate": 1.4121699196326062e-07, "logits/chosen": -3.1282544136047363, "logits/rejected": -3.086381196975708, "logps/chosen": -57.08588790893555, "logps/rejected": -55.1170654296875, "loss": 0.6856, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.01335026603192091, "rewards/margins": 0.01557250041514635, "rewards/rejected": -0.02892276644706726, "step": 1230 }, { "epoch": 0.2136457615437629, "grad_norm": 2.2466061115264893, "learning_rate": 1.423650975889782e-07, "logits/chosen": -2.9896018505096436, "logits/rejected": -2.972954273223877, "logps/chosen": -54.30778121948242, "logps/rejected": -55.87935256958008, "loss": 0.6874, "rewards/accuracies": 0.65625, "rewards/chosen": -0.018706928938627243, "rewards/margins": 0.011885268613696098, "rewards/rejected": -0.03059219755232334, "step": 1240 }, { "epoch": 0.21536871123363197, "grad_norm": 2.7906625270843506, "learning_rate": 1.4351320321469576e-07, "logits/chosen": -3.1079976558685303, "logits/rejected": -3.0718159675598145, "logps/chosen": -55.47816848754883, "logps/rejected": -54.13701248168945, "loss": 0.6875, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.017741341143846512, "rewards/margins": 0.01175488717854023, "rewards/rejected": -0.029496226459741592, "step": 1250 }, { "epoch": 0.21709166092350105, "grad_norm": 2.181622266769409, "learning_rate": 1.446613088404133e-07, "logits/chosen": -2.956184148788452, "logits/rejected": -2.9249396324157715, "logps/chosen": -54.747764587402344, "logps/rejected": -53.68642044067383, "loss": 0.6853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01772422529757023, "rewards/margins": 0.016296565532684326, "rewards/rejected": -0.034020788967609406, "step": 1260 }, { "epoch": 0.2188146106133701, "grad_norm": 2.5509865283966064, "learning_rate": 1.4580941446613089e-07, "logits/chosen": -3.0164220333099365, "logits/rejected": -3.0100362300872803, "logps/chosen": -53.903045654296875, "logps/rejected": -57.88615798950195, "loss": 0.6877, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.020272260531783104, "rewards/margins": 0.011540655978024006, "rewards/rejected": -0.031812917441129684, "step": 1270 }, { "epoch": 0.22053756030323915, "grad_norm": 2.4054272174835205, "learning_rate": 1.4695752009184846e-07, "logits/chosen": -3.0869784355163574, "logits/rejected": -3.093924045562744, "logps/chosen": -53.26460647583008, "logps/rejected": -62.8282470703125, "loss": 0.6893, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.020930523052811623, "rewards/margins": 0.008218497969210148, "rewards/rejected": -0.029149020090699196, "step": 1280 }, { "epoch": 0.2222605099931082, "grad_norm": 2.3104968070983887, "learning_rate": 1.4810562571756603e-07, "logits/chosen": -2.986077070236206, "logits/rejected": -2.95453143119812, "logps/chosen": -56.5821647644043, "logps/rejected": -53.85289764404297, "loss": 0.6862, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.019812434911727905, "rewards/margins": 0.014519277028739452, "rewards/rejected": -0.03433171287178993, "step": 1290 }, { "epoch": 0.22398345968297725, "grad_norm": 2.2120158672332764, "learning_rate": 1.4925373134328358e-07, "logits/chosen": -3.0651228427886963, "logits/rejected": -3.052384614944458, "logps/chosen": -53.11201095581055, "logps/rejected": -57.268531799316406, "loss": 0.6902, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.028970792889595032, "rewards/margins": 0.006490548141300678, "rewards/rejected": -0.03546133637428284, "step": 1300 }, { "epoch": 0.2257064093728463, "grad_norm": 2.054224729537964, "learning_rate": 1.5040183696900115e-07, "logits/chosen": -3.064964771270752, "logits/rejected": -3.0295050144195557, "logps/chosen": -55.623008728027344, "logps/rejected": -53.59694290161133, "loss": 0.6841, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.022824782878160477, "rewards/margins": 0.018944334238767624, "rewards/rejected": -0.0417691171169281, "step": 1310 }, { "epoch": 0.22742935906271536, "grad_norm": 2.443293809890747, "learning_rate": 1.5154994259471873e-07, "logits/chosen": -3.037405490875244, "logits/rejected": -3.011997699737549, "logps/chosen": -57.403045654296875, "logps/rejected": -54.382476806640625, "loss": 0.6852, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.019832521677017212, "rewards/margins": 0.016630280762910843, "rewards/rejected": -0.036462802439928055, "step": 1320 }, { "epoch": 0.22915230875258444, "grad_norm": 2.500528335571289, "learning_rate": 1.5269804822043627e-07, "logits/chosen": -3.1157054901123047, "logits/rejected": -3.085151195526123, "logps/chosen": -57.14472579956055, "logps/rejected": -53.741905212402344, "loss": 0.6851, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.019271235913038254, "rewards/margins": 0.01675734482705593, "rewards/rejected": -0.036028582602739334, "step": 1330 }, { "epoch": 0.2308752584424535, "grad_norm": 2.3414456844329834, "learning_rate": 1.5384615384615385e-07, "logits/chosen": -2.956932306289673, "logits/rejected": -2.9354348182678223, "logps/chosen": -57.207542419433594, "logps/rejected": -56.502777099609375, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": -0.020681411027908325, "rewards/margins": 0.01566999778151512, "rewards/rejected": -0.03635140508413315, "step": 1340 }, { "epoch": 0.23259820813232254, "grad_norm": 2.3005292415618896, "learning_rate": 1.5499425947187142e-07, "logits/chosen": -3.0213208198547363, "logits/rejected": -2.996798038482666, "logps/chosen": -56.907737731933594, "logps/rejected": -57.270530700683594, "loss": 0.6874, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0280645452439785, "rewards/margins": 0.012301048263907433, "rewards/rejected": -0.04036559537053108, "step": 1350 }, { "epoch": 0.2343211578221916, "grad_norm": 2.42112135887146, "learning_rate": 1.56142365097589e-07, "logits/chosen": -2.978275775909424, "logits/rejected": -2.965906858444214, "logps/chosen": -58.08185958862305, "logps/rejected": -59.959800720214844, "loss": 0.6883, "rewards/accuracies": 0.625, "rewards/chosen": -0.028793543577194214, "rewards/margins": 0.010555420070886612, "rewards/rejected": -0.039348963648080826, "step": 1360 }, { "epoch": 0.23604410751206065, "grad_norm": 2.564072370529175, "learning_rate": 1.5729047072330654e-07, "logits/chosen": -3.0901427268981934, "logits/rejected": -3.0705947875976562, "logps/chosen": -55.34904861450195, "logps/rejected": -57.388999938964844, "loss": 0.6839, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.026742001995444298, "rewards/margins": 0.01927506923675537, "rewards/rejected": -0.04601707309484482, "step": 1370 }, { "epoch": 0.2377670572019297, "grad_norm": 2.3225302696228027, "learning_rate": 1.584385763490241e-07, "logits/chosen": -2.9823920726776123, "logits/rejected": -2.9642717838287354, "logps/chosen": -56.900108337402344, "logps/rejected": -56.28680419921875, "loss": 0.6865, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02567340061068535, "rewards/margins": 0.014020757749676704, "rewards/rejected": -0.0396941602230072, "step": 1380 }, { "epoch": 0.23949000689179875, "grad_norm": 2.4447555541992188, "learning_rate": 1.5958668197474169e-07, "logits/chosen": -3.1299426555633545, "logits/rejected": -3.102720022201538, "logps/chosen": -59.336936950683594, "logps/rejected": -57.85896682739258, "loss": 0.6838, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02443106099963188, "rewards/margins": 0.01959383673965931, "rewards/rejected": -0.04402489587664604, "step": 1390 }, { "epoch": 0.2412129565816678, "grad_norm": 2.328646659851074, "learning_rate": 1.6073478760045923e-07, "logits/chosen": -2.9504268169403076, "logits/rejected": -2.924954891204834, "logps/chosen": -58.6168098449707, "logps/rejected": -57.66655731201172, "loss": 0.6834, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02808014489710331, "rewards/margins": 0.02053389698266983, "rewards/rejected": -0.04861404374241829, "step": 1400 }, { "epoch": 0.24293590627153688, "grad_norm": 2.178410053253174, "learning_rate": 1.618828932261768e-07, "logits/chosen": -3.0127053260803223, "logits/rejected": -3.0005548000335693, "logps/chosen": -57.48663330078125, "logps/rejected": -58.0041389465332, "loss": 0.6851, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.031129727140069008, "rewards/margins": 0.01690850965678692, "rewards/rejected": -0.048038236796855927, "step": 1410 }, { "epoch": 0.24465885596140594, "grad_norm": 2.2679238319396973, "learning_rate": 1.6303099885189438e-07, "logits/chosen": -2.9296984672546387, "logits/rejected": -2.9353113174438477, "logps/chosen": -54.37672805786133, "logps/rejected": -59.217491149902344, "loss": 0.6891, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03622571751475334, "rewards/margins": 0.009116034023463726, "rewards/rejected": -0.045341745018959045, "step": 1420 }, { "epoch": 0.246381805651275, "grad_norm": 2.6715142726898193, "learning_rate": 1.6417910447761195e-07, "logits/chosen": -3.0467171669006348, "logits/rejected": -3.0351033210754395, "logps/chosen": -56.65338134765625, "logps/rejected": -61.58381271362305, "loss": 0.6835, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.030687406659126282, "rewards/margins": 0.020251978188753128, "rewards/rejected": -0.05093938857316971, "step": 1430 }, { "epoch": 0.24810475534114404, "grad_norm": 2.2624526023864746, "learning_rate": 1.653272101033295e-07, "logits/chosen": -2.9727156162261963, "logits/rejected": -2.9416565895080566, "logps/chosen": -54.01691818237305, "logps/rejected": -53.66893768310547, "loss": 0.6849, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03554430976510048, "rewards/margins": 0.01750084012746811, "rewards/rejected": -0.05304514244198799, "step": 1440 }, { "epoch": 0.2498277050310131, "grad_norm": 3.1223669052124023, "learning_rate": 1.6647531572904707e-07, "logits/chosen": -2.984633684158325, "logits/rejected": -2.9475743770599365, "logps/chosen": -58.33637237548828, "logps/rejected": -57.2099609375, "loss": 0.6816, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.028612574562430382, "rewards/margins": 0.024286650121212006, "rewards/rejected": -0.05289921909570694, "step": 1450 }, { "epoch": 0.25155065472088217, "grad_norm": 2.475480318069458, "learning_rate": 1.6762342135476465e-07, "logits/chosen": -2.9874236583709717, "logits/rejected": -2.955101490020752, "logps/chosen": -55.609344482421875, "logps/rejected": -56.4217643737793, "loss": 0.6836, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.03909724950790405, "rewards/margins": 0.02046862244606018, "rewards/rejected": -0.05956587195396423, "step": 1460 }, { "epoch": 0.2532736044107512, "grad_norm": 2.6421852111816406, "learning_rate": 1.687715269804822e-07, "logits/chosen": -3.1511387825012207, "logits/rejected": -3.115359306335449, "logps/chosen": -60.091941833496094, "logps/rejected": -60.51912307739258, "loss": 0.6799, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03068288043141365, "rewards/margins": 0.02774442359805107, "rewards/rejected": -0.05842730402946472, "step": 1470 }, { "epoch": 0.2549965541006203, "grad_norm": 2.4771311283111572, "learning_rate": 1.6991963260619977e-07, "logits/chosen": -3.1178431510925293, "logits/rejected": -3.082062244415283, "logps/chosen": -55.12421417236328, "logps/rejected": -57.14054489135742, "loss": 0.6858, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03813482075929642, "rewards/margins": 0.015866823494434357, "rewards/rejected": -0.054001640528440475, "step": 1480 }, { "epoch": 0.2567195037904893, "grad_norm": 2.576782703399658, "learning_rate": 1.7106773823191734e-07, "logits/chosen": -3.0479483604431152, "logits/rejected": -3.023475408554077, "logps/chosen": -61.34899139404297, "logps/rejected": -59.27949142456055, "loss": 0.6854, "rewards/accuracies": 0.625, "rewards/chosen": -0.03965304046869278, "rewards/margins": 0.01644364930689335, "rewards/rejected": -0.05609668418765068, "step": 1490 }, { "epoch": 0.2584424534803584, "grad_norm": 2.63468337059021, "learning_rate": 1.722158438576349e-07, "logits/chosen": -2.957155466079712, "logits/rejected": -2.954725980758667, "logps/chosen": -54.37346649169922, "logps/rejected": -58.872047424316406, "loss": 0.6848, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04384751245379448, "rewards/margins": 0.018207985907793045, "rewards/rejected": -0.06205549091100693, "step": 1500 }, { "epoch": 0.2601654031702274, "grad_norm": 2.59952712059021, "learning_rate": 1.7336394948335246e-07, "logits/chosen": -2.9072952270507812, "logits/rejected": -2.872072696685791, "logps/chosen": -62.11091232299805, "logps/rejected": -59.270965576171875, "loss": 0.6792, "rewards/accuracies": 0.625, "rewards/chosen": -0.03504981845617294, "rewards/margins": 0.029394099488854408, "rewards/rejected": -0.0644439235329628, "step": 1510 }, { "epoch": 0.2618883528600965, "grad_norm": 2.8741984367370605, "learning_rate": 1.7451205510907003e-07, "logits/chosen": -3.0184130668640137, "logits/rejected": -3.010338544845581, "logps/chosen": -56.67668914794922, "logps/rejected": -56.772361755371094, "loss": 0.6895, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0442604199051857, "rewards/margins": 0.008484233170747757, "rewards/rejected": -0.05274464935064316, "step": 1520 }, { "epoch": 0.26361130254996556, "grad_norm": 2.7853004932403564, "learning_rate": 1.756601607347876e-07, "logits/chosen": -2.9038796424865723, "logits/rejected": -2.8876137733459473, "logps/chosen": -56.257667541503906, "logps/rejected": -56.9356803894043, "loss": 0.6858, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04569043964147568, "rewards/margins": 0.01612422987818718, "rewards/rejected": -0.06181466579437256, "step": 1530 }, { "epoch": 0.2653342522398346, "grad_norm": 2.535432815551758, "learning_rate": 1.7680826636050515e-07, "logits/chosen": -3.03082013130188, "logits/rejected": -2.999049663543701, "logps/chosen": -60.42095184326172, "logps/rejected": -58.095306396484375, "loss": 0.6811, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.045154839754104614, "rewards/margins": 0.02576422318816185, "rewards/rejected": -0.07091905921697617, "step": 1540 }, { "epoch": 0.26705720192970367, "grad_norm": 2.455566883087158, "learning_rate": 1.7795637198622273e-07, "logits/chosen": -2.9487860202789307, "logits/rejected": -2.929208278656006, "logps/chosen": -57.921913146972656, "logps/rejected": -57.43787384033203, "loss": 0.6821, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.045525889843702316, "rewards/margins": 0.023446694016456604, "rewards/rejected": -0.06897258758544922, "step": 1550 }, { "epoch": 0.2687801516195727, "grad_norm": 2.513192653656006, "learning_rate": 1.791044776119403e-07, "logits/chosen": -2.960799217224121, "logits/rejected": -2.9433281421661377, "logps/chosen": -55.99836349487305, "logps/rejected": -57.2844123840332, "loss": 0.6864, "rewards/accuracies": 0.59375, "rewards/chosen": -0.051505934447050095, "rewards/margins": 0.014735942706465721, "rewards/rejected": -0.06624187529087067, "step": 1560 }, { "epoch": 0.2705031013094418, "grad_norm": 2.5723841190338135, "learning_rate": 1.8025258323765787e-07, "logits/chosen": -3.010369300842285, "logits/rejected": -3.0123367309570312, "logps/chosen": -56.7002067565918, "logps/rejected": -62.82074737548828, "loss": 0.6847, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04784129932522774, "rewards/margins": 0.01806465908885002, "rewards/rejected": -0.06590595096349716, "step": 1570 }, { "epoch": 0.2722260509993108, "grad_norm": 2.942056655883789, "learning_rate": 1.8140068886337542e-07, "logits/chosen": -2.943998098373413, "logits/rejected": -2.93428373336792, "logps/chosen": -57.99327850341797, "logps/rejected": -60.913169860839844, "loss": 0.6841, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.050780169665813446, "rewards/margins": 0.019445115700364113, "rewards/rejected": -0.07022528350353241, "step": 1580 }, { "epoch": 0.2739490006891799, "grad_norm": 2.7961862087249756, "learning_rate": 1.82548794489093e-07, "logits/chosen": -2.9881751537323, "logits/rejected": -2.9737207889556885, "logps/chosen": -58.699737548828125, "logps/rejected": -61.12406539916992, "loss": 0.6842, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04569312185049057, "rewards/margins": 0.019125862047076225, "rewards/rejected": -0.06481898576021194, "step": 1590 }, { "epoch": 0.27567195037904896, "grad_norm": 2.5653514862060547, "learning_rate": 1.8369690011481057e-07, "logits/chosen": -2.9790871143341064, "logits/rejected": -2.941410541534424, "logps/chosen": -57.3028450012207, "logps/rejected": -58.16033935546875, "loss": 0.6835, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.051235903054475784, "rewards/margins": 0.020867522805929184, "rewards/rejected": -0.07210342586040497, "step": 1600 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -3.089492082595825, "eval_logits/rejected": -3.08384108543396, "eval_logps/chosen": -60.95978927612305, "eval_logps/rejected": -66.38330078125, "eval_loss": 0.688758134841919, "eval_rewards/accuracies": 0.5864312052726746, "eval_rewards/chosen": -0.02247888222336769, "eval_rewards/margins": 0.009552864357829094, "eval_rewards/rejected": -0.032031744718551636, "eval_runtime": 382.6729, "eval_samples_per_second": 11.247, "eval_steps_per_second": 1.406, "step": 1600 }, { "epoch": 0.277394900068918, "grad_norm": 2.465613842010498, "learning_rate": 1.848450057405281e-07, "logits/chosen": -3.018049716949463, "logits/rejected": -2.9930763244628906, "logps/chosen": -61.77216720581055, "logps/rejected": -65.25670623779297, "loss": 0.6856, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.05878068879246712, "rewards/margins": 0.016800960525870323, "rewards/rejected": -0.07558164745569229, "step": 1610 }, { "epoch": 0.27911784975878706, "grad_norm": 2.5315089225769043, "learning_rate": 1.8599311136624569e-07, "logits/chosen": -2.940201997756958, "logits/rejected": -2.91823410987854, "logps/chosen": -57.43720626831055, "logps/rejected": -62.375892639160156, "loss": 0.6826, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.059971462935209274, "rewards/margins": 0.02313486859202385, "rewards/rejected": -0.08310633152723312, "step": 1620 }, { "epoch": 0.2808407994486561, "grad_norm": 2.86327862739563, "learning_rate": 1.8714121699196326e-07, "logits/chosen": -3.0095534324645996, "logits/rejected": -2.9834182262420654, "logps/chosen": -60.75006866455078, "logps/rejected": -59.5101203918457, "loss": 0.6813, "rewards/accuracies": 0.625, "rewards/chosen": -0.052403099834918976, "rewards/margins": 0.025434961542487144, "rewards/rejected": -0.07783806324005127, "step": 1630 }, { "epoch": 0.28256374913852517, "grad_norm": 2.7473204135894775, "learning_rate": 1.8828932261768083e-07, "logits/chosen": -3.0492942333221436, "logits/rejected": -3.0127370357513428, "logps/chosen": -62.9014892578125, "logps/rejected": -59.5329475402832, "loss": 0.6832, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.05947121977806091, "rewards/margins": 0.02242801897227764, "rewards/rejected": -0.0818992406129837, "step": 1640 }, { "epoch": 0.2842866988283942, "grad_norm": 2.928473949432373, "learning_rate": 1.8943742824339838e-07, "logits/chosen": -3.0190649032592773, "logits/rejected": -2.991811990737915, "logps/chosen": -62.157310485839844, "logps/rejected": -62.88653564453125, "loss": 0.6791, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.049208804965019226, "rewards/margins": 0.029801160097122192, "rewards/rejected": -0.07900996506214142, "step": 1650 }, { "epoch": 0.28600964851826327, "grad_norm": 2.6895644664764404, "learning_rate": 1.9058553386911595e-07, "logits/chosen": -3.053314208984375, "logits/rejected": -3.007117986679077, "logps/chosen": -60.38581466674805, "logps/rejected": -58.10612869262695, "loss": 0.6793, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.05414261296391487, "rewards/margins": 0.02976139448583126, "rewards/rejected": -0.08390400558710098, "step": 1660 }, { "epoch": 0.2877325982081323, "grad_norm": 2.900505304336548, "learning_rate": 1.9173363949483353e-07, "logits/chosen": -3.0389440059661865, "logits/rejected": -3.016704559326172, "logps/chosen": -59.47711944580078, "logps/rejected": -63.44648361206055, "loss": 0.6775, "rewards/accuracies": 0.65625, "rewards/chosen": -0.049625031650066376, "rewards/margins": 0.032785750925540924, "rewards/rejected": -0.0824107900261879, "step": 1670 }, { "epoch": 0.2894555478980014, "grad_norm": 2.6400303840637207, "learning_rate": 1.928817451205511e-07, "logits/chosen": -2.993952512741089, "logits/rejected": -2.9681200981140137, "logps/chosen": -61.86346435546875, "logps/rejected": -63.476051330566406, "loss": 0.6801, "rewards/accuracies": 0.65625, "rewards/chosen": -0.06255128979682922, "rewards/margins": 0.028126254677772522, "rewards/rejected": -0.09067754447460175, "step": 1680 }, { "epoch": 0.29117849758787046, "grad_norm": 2.792930841445923, "learning_rate": 1.9402985074626865e-07, "logits/chosen": -2.9742238521575928, "logits/rejected": -2.958693504333496, "logps/chosen": -61.657623291015625, "logps/rejected": -59.42774200439453, "loss": 0.6887, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.061123304069042206, "rewards/margins": 0.010420399717986584, "rewards/rejected": -0.07154370844364166, "step": 1690 }, { "epoch": 0.2929014472777395, "grad_norm": 2.930274248123169, "learning_rate": 1.9517795637198622e-07, "logits/chosen": -2.8819127082824707, "logits/rejected": -2.8864543437957764, "logps/chosen": -56.14990234375, "logps/rejected": -63.5261116027832, "loss": 0.6859, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06586851179599762, "rewards/margins": 0.016264760866761208, "rewards/rejected": -0.08213327825069427, "step": 1700 }, { "epoch": 0.29462439696760856, "grad_norm": 3.0649280548095703, "learning_rate": 1.963260619977038e-07, "logits/chosen": -3.0112526416778564, "logits/rejected": -2.9711148738861084, "logps/chosen": -64.08343505859375, "logps/rejected": -62.7235221862793, "loss": 0.6813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05673649162054062, "rewards/margins": 0.02566385641694069, "rewards/rejected": -0.08240034431219101, "step": 1710 }, { "epoch": 0.2963473466574776, "grad_norm": 2.632606029510498, "learning_rate": 1.9747416762342134e-07, "logits/chosen": -3.0191071033477783, "logits/rejected": -2.990119457244873, "logps/chosen": -62.924537658691406, "logps/rejected": -59.43379592895508, "loss": 0.6845, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0649864599108696, "rewards/margins": 0.01879298873245716, "rewards/rejected": -0.08377943933010101, "step": 1720 }, { "epoch": 0.29807029634734666, "grad_norm": 2.6209614276885986, "learning_rate": 1.9862227324913891e-07, "logits/chosen": -3.013237953186035, "logits/rejected": -2.996584177017212, "logps/chosen": -60.8912353515625, "logps/rejected": -61.02091598510742, "loss": 0.6863, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.06956915557384491, "rewards/margins": 0.015582660213112831, "rewards/rejected": -0.085151806473732, "step": 1730 }, { "epoch": 0.2997932460372157, "grad_norm": 2.7946395874023438, "learning_rate": 1.997703788748565e-07, "logits/chosen": -2.961860179901123, "logits/rejected": -2.9518237113952637, "logps/chosen": -59.7307243347168, "logps/rejected": -63.86859130859375, "loss": 0.6834, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07629448175430298, "rewards/margins": 0.021778276190161705, "rewards/rejected": -0.09807275980710983, "step": 1740 }, { "epoch": 0.30151619572708477, "grad_norm": 3.019209146499634, "learning_rate": 1.999998713790723e-07, "logits/chosen": -3.002354621887207, "logits/rejected": -2.9911231994628906, "logps/chosen": -61.43891525268555, "logps/rejected": -64.77537536621094, "loss": 0.6816, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.06252885609865189, "rewards/margins": 0.02495855651795864, "rewards/rejected": -0.08748741447925568, "step": 1750 }, { "epoch": 0.30323914541695385, "grad_norm": 2.858008623123169, "learning_rate": 1.999993488571206e-07, "logits/chosen": -2.9882655143737793, "logits/rejected": -2.952385663986206, "logps/chosen": -62.764404296875, "logps/rejected": -63.62236404418945, "loss": 0.6748, "rewards/accuracies": 0.65625, "rewards/chosen": -0.06741251796483994, "rewards/margins": 0.03932555764913559, "rewards/rejected": -0.10673806816339493, "step": 1760 }, { "epoch": 0.3049620951068229, "grad_norm": 2.529355525970459, "learning_rate": 1.9999842439743547e-07, "logits/chosen": -3.0135600566864014, "logits/rejected": -2.9775524139404297, "logps/chosen": -62.50413131713867, "logps/rejected": -60.10844039916992, "loss": 0.676, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07522428780794144, "rewards/margins": 0.03643802925944328, "rewards/rejected": -0.11166232824325562, "step": 1770 }, { "epoch": 0.30668504479669195, "grad_norm": 3.3260445594787598, "learning_rate": 1.999970980037328e-07, "logits/chosen": -2.9299685955047607, "logits/rejected": -2.931786298751831, "logps/chosen": -62.187744140625, "logps/rejected": -69.33695220947266, "loss": 0.6782, "rewards/accuracies": 0.625, "rewards/chosen": -0.07669190317392349, "rewards/margins": 0.032053664326667786, "rewards/rejected": -0.10874556005001068, "step": 1780 }, { "epoch": 0.308407994486561, "grad_norm": 3.0353615283966064, "learning_rate": 1.999953696813438e-07, "logits/chosen": -3.0692267417907715, "logits/rejected": -3.0471017360687256, "logps/chosen": -59.89692306518555, "logps/rejected": -64.16006469726562, "loss": 0.6757, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07901540398597717, "rewards/margins": 0.037279874086380005, "rewards/rejected": -0.11629529297351837, "step": 1790 }, { "epoch": 0.31013094417643006, "grad_norm": 2.8321800231933594, "learning_rate": 1.9999323943721533e-07, "logits/chosen": -3.033538818359375, "logits/rejected": -3.0094218254089355, "logps/chosen": -61.10071563720703, "logps/rejected": -64.22647857666016, "loss": 0.6796, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09298542886972427, "rewards/margins": 0.02937093935906887, "rewards/rejected": -0.1223563551902771, "step": 1800 }, { "epoch": 0.3118538938662991, "grad_norm": 3.0810434818267822, "learning_rate": 1.9999070727990972e-07, "logits/chosen": -3.033754587173462, "logits/rejected": -2.998857021331787, "logps/chosen": -65.45790100097656, "logps/rejected": -65.5028305053711, "loss": 0.6798, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.08840756863355637, "rewards/margins": 0.02971474826335907, "rewards/rejected": -0.11812230199575424, "step": 1810 }, { "epoch": 0.31357684355616816, "grad_norm": 2.9397025108337402, "learning_rate": 1.999877732196047e-07, "logits/chosen": -3.002478837966919, "logits/rejected": -2.9685733318328857, "logps/chosen": -64.43415069580078, "logps/rejected": -62.759742736816406, "loss": 0.6806, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09272059053182602, "rewards/margins": 0.028056111186742783, "rewards/rejected": -0.1207766979932785, "step": 1820 }, { "epoch": 0.31529979324603724, "grad_norm": 3.0365073680877686, "learning_rate": 1.9998443726809344e-07, "logits/chosen": -2.8997771739959717, "logits/rejected": -2.8866164684295654, "logps/chosen": -64.41737365722656, "logps/rejected": -65.77619934082031, "loss": 0.6836, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0887407660484314, "rewards/margins": 0.021692968904972076, "rewards/rejected": -0.11043374240398407, "step": 1830 }, { "epoch": 0.31702274293590627, "grad_norm": 3.104240655899048, "learning_rate": 1.9998069943878452e-07, "logits/chosen": -3.102999210357666, "logits/rejected": -3.0991158485412598, "logps/chosen": -65.93641662597656, "logps/rejected": -68.23253631591797, "loss": 0.6842, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.11116300523281097, "rewards/margins": 0.020699962973594666, "rewards/rejected": -0.13186296820640564, "step": 1840 }, { "epoch": 0.31874569262577535, "grad_norm": 3.036222219467163, "learning_rate": 1.9997655974670177e-07, "logits/chosen": -2.9688706398010254, "logits/rejected": -2.9688243865966797, "logps/chosen": -64.49559020996094, "logps/rejected": -67.51258850097656, "loss": 0.6869, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.112858846783638, "rewards/margins": 0.015191557817161083, "rewards/rejected": -0.12805040180683136, "step": 1850 }, { "epoch": 0.32046864231564437, "grad_norm": 3.0601251125335693, "learning_rate": 1.9997201820848421e-07, "logits/chosen": -2.902102470397949, "logits/rejected": -2.8667800426483154, "logps/chosen": -66.10860443115234, "logps/rejected": -66.68751525878906, "loss": 0.6738, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08816654980182648, "rewards/margins": 0.041354451328516006, "rewards/rejected": -0.12952101230621338, "step": 1860 }, { "epoch": 0.32219159200551345, "grad_norm": 3.619654893875122, "learning_rate": 1.999670748423862e-07, "logits/chosen": -2.9479846954345703, "logits/rejected": -2.9228718280792236, "logps/chosen": -65.58842468261719, "logps/rejected": -66.9212646484375, "loss": 0.6772, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09638717025518417, "rewards/margins": 0.03494124859571457, "rewards/rejected": -0.13132841885089874, "step": 1870 }, { "epoch": 0.3239145416953825, "grad_norm": 2.9565136432647705, "learning_rate": 1.9996172966827712e-07, "logits/chosen": -2.99045991897583, "logits/rejected": -2.953990936279297, "logps/chosen": -61.8843994140625, "logps/rejected": -65.32537841796875, "loss": 0.6717, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.09168566763401031, "rewards/margins": 0.04686136916279793, "rewards/rejected": -0.13854703307151794, "step": 1880 }, { "epoch": 0.32563749138525155, "grad_norm": 3.3497560024261475, "learning_rate": 1.9995598270764132e-07, "logits/chosen": -3.0127041339874268, "logits/rejected": -3.006329298019409, "logps/chosen": -61.48860549926758, "logps/rejected": -67.71702575683594, "loss": 0.6709, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08430846035480499, "rewards/margins": 0.04838328808546066, "rewards/rejected": -0.13269174098968506, "step": 1890 }, { "epoch": 0.32736044107512063, "grad_norm": 3.203680992126465, "learning_rate": 1.9994983398357822e-07, "logits/chosen": -2.9438915252685547, "logits/rejected": -2.9119534492492676, "logps/chosen": -65.86426544189453, "logps/rejected": -64.35572814941406, "loss": 0.6761, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08424468338489532, "rewards/margins": 0.03669450432062149, "rewards/rejected": -0.12093917280435562, "step": 1900 }, { "epoch": 0.32908339076498966, "grad_norm": 3.236323118209839, "learning_rate": 1.9994328352080197e-07, "logits/chosen": -2.8468799591064453, "logits/rejected": -2.812983989715576, "logps/chosen": -64.58326721191406, "logps/rejected": -67.55799865722656, "loss": 0.6741, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10115577280521393, "rewards/margins": 0.04144899174571037, "rewards/rejected": -0.1426047533750534, "step": 1910 }, { "epoch": 0.33080634045485874, "grad_norm": 3.068075656890869, "learning_rate": 1.9993633134564157e-07, "logits/chosen": -2.9570934772491455, "logits/rejected": -2.9299278259277344, "logps/chosen": -65.09169006347656, "logps/rejected": -67.30238342285156, "loss": 0.6753, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09723018109798431, "rewards/margins": 0.0387737974524498, "rewards/rejected": -0.1360039860010147, "step": 1920 }, { "epoch": 0.33252929014472776, "grad_norm": 2.9155988693237305, "learning_rate": 1.9992897748604057e-07, "logits/chosen": -2.903533935546875, "logits/rejected": -2.867462635040283, "logps/chosen": -64.53730773925781, "logps/rejected": -67.99651336669922, "loss": 0.677, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10646890103816986, "rewards/margins": 0.03499242290854454, "rewards/rejected": -0.1414613425731659, "step": 1930 }, { "epoch": 0.33425223983459684, "grad_norm": 3.012897491455078, "learning_rate": 1.9992122197155713e-07, "logits/chosen": -2.9248719215393066, "logits/rejected": -2.9068443775177, "logps/chosen": -61.72211837768555, "logps/rejected": -63.50091552734375, "loss": 0.6812, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10782128572463989, "rewards/margins": 0.02694891020655632, "rewards/rejected": -0.1347701996564865, "step": 1940 }, { "epoch": 0.33597518952446587, "grad_norm": 2.9323947429656982, "learning_rate": 1.9991306483336379e-07, "logits/chosen": -2.9290108680725098, "logits/rejected": -2.927438974380493, "logps/chosen": -63.5114860534668, "logps/rejected": -69.5857162475586, "loss": 0.6763, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11003844439983368, "rewards/margins": 0.03804415836930275, "rewards/rejected": -0.14808261394500732, "step": 1950 }, { "epoch": 0.33769813921433495, "grad_norm": 3.069072961807251, "learning_rate": 1.9990450610424739e-07, "logits/chosen": -2.93648624420166, "logits/rejected": -2.9172258377075195, "logps/chosen": -64.36561584472656, "logps/rejected": -68.8530502319336, "loss": 0.6731, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10006101429462433, "rewards/margins": 0.04350076615810394, "rewards/rejected": -0.14356176555156708, "step": 1960 }, { "epoch": 0.33942108890420397, "grad_norm": 3.011516809463501, "learning_rate": 1.9989554581860885e-07, "logits/chosen": -2.9762659072875977, "logits/rejected": -2.9475595951080322, "logps/chosen": -66.1217041015625, "logps/rejected": -65.79177856445312, "loss": 0.6751, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09975095093250275, "rewards/margins": 0.03971099108457565, "rewards/rejected": -0.1394619345664978, "step": 1970 }, { "epoch": 0.34114403859407305, "grad_norm": 3.3382675647735596, "learning_rate": 1.9988618401246327e-07, "logits/chosen": -2.88445782661438, "logits/rejected": -2.8788936138153076, "logps/chosen": -67.1238021850586, "logps/rejected": -67.88463592529297, "loss": 0.6839, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.11536003649234772, "rewards/margins": 0.02309250645339489, "rewards/rejected": -0.13845254480838776, "step": 1980 }, { "epoch": 0.34286698828394213, "grad_norm": 3.3324198722839355, "learning_rate": 1.9987642072343948e-07, "logits/chosen": -3.0067853927612305, "logits/rejected": -2.9725756645202637, "logps/chosen": -62.18952560424805, "logps/rejected": -64.51532745361328, "loss": 0.6701, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1135512962937355, "rewards/margins": 0.049906175583601, "rewards/rejected": -0.1634574681520462, "step": 1990 }, { "epoch": 0.34458993797381116, "grad_norm": 3.651040554046631, "learning_rate": 1.9986625599078007e-07, "logits/chosen": -2.928395986557007, "logits/rejected": -2.934624195098877, "logps/chosen": -62.90681838989258, "logps/rejected": -73.9938735961914, "loss": 0.6778, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12945915758609772, "rewards/margins": 0.03435561805963516, "rewards/rejected": -0.16381476819515228, "step": 2000 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -3.0270333290100098, "eval_logits/rejected": -3.0213301181793213, "eval_logps/chosen": -65.9486312866211, "eval_logps/rejected": -72.35735321044922, "eval_loss": 0.6844969391822815, "eval_rewards/accuracies": 0.5975836515426636, "eval_rewards/chosen": -0.072367362678051, "eval_rewards/margins": 0.019404985010623932, "eval_rewards/rejected": -0.09177234023809433, "eval_runtime": 382.4638, "eval_samples_per_second": 11.253, "eval_steps_per_second": 1.407, "step": 2000 }, { "epoch": 0.34631288766368024, "grad_norm": 3.3070271015167236, "learning_rate": 1.9985568985534123e-07, "logits/chosen": -2.9318108558654785, "logits/rejected": -2.903789520263672, "logps/chosen": -66.5789794921875, "logps/rejected": -66.69670104980469, "loss": 0.6752, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10977102816104889, "rewards/margins": 0.039171889424324036, "rewards/rejected": -0.14894291758537292, "step": 2010 }, { "epoch": 0.34803583735354926, "grad_norm": 3.1021924018859863, "learning_rate": 1.9984472235959246e-07, "logits/chosen": -2.9195027351379395, "logits/rejected": -2.8980581760406494, "logps/chosen": -62.109039306640625, "logps/rejected": -71.7960205078125, "loss": 0.671, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.12279726564884186, "rewards/margins": 0.04837704449892044, "rewards/rejected": -0.1711743175983429, "step": 2020 }, { "epoch": 0.34975878704341834, "grad_norm": 3.9570415019989014, "learning_rate": 1.9983335354761662e-07, "logits/chosen": -3.003624677658081, "logits/rejected": -2.9824397563934326, "logps/chosen": -68.96121978759766, "logps/rejected": -70.75452423095703, "loss": 0.6767, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.11908937990665436, "rewards/margins": 0.03664074093103409, "rewards/rejected": -0.15573014318943024, "step": 2030 }, { "epoch": 0.35148173673328736, "grad_norm": 3.131145715713501, "learning_rate": 1.9982158346510952e-07, "logits/chosen": -2.868227005004883, "logits/rejected": -2.8557090759277344, "logps/chosen": -64.66195678710938, "logps/rejected": -70.04840087890625, "loss": 0.6745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.11946003139019012, "rewards/margins": 0.04148910567164421, "rewards/rejected": -0.16094914078712463, "step": 2040 }, { "epoch": 0.35320468642315644, "grad_norm": 3.7799129486083984, "learning_rate": 1.998094121593799e-07, "logits/chosen": -2.980715274810791, "logits/rejected": -2.956112861633301, "logps/chosen": -63.249267578125, "logps/rejected": -69.17818450927734, "loss": 0.6813, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.12027259171009064, "rewards/margins": 0.02740442380309105, "rewards/rejected": -0.14767701923847198, "step": 2050 }, { "epoch": 0.3549276361130255, "grad_norm": 3.3789472579956055, "learning_rate": 1.9979683967934911e-07, "logits/chosen": -2.96757173538208, "logits/rejected": -2.929948329925537, "logps/chosen": -66.1915512084961, "logps/rejected": -67.29234313964844, "loss": 0.6708, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10862097889184952, "rewards/margins": 0.04848029464483261, "rewards/rejected": -0.15710125863552094, "step": 2060 }, { "epoch": 0.35665058580289455, "grad_norm": 3.4836807250976562, "learning_rate": 1.9978386607555103e-07, "logits/chosen": -2.983682870864868, "logits/rejected": -2.960925579071045, "logps/chosen": -67.11554718017578, "logps/rejected": -71.07099151611328, "loss": 0.6733, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.11822348833084106, "rewards/margins": 0.04466177895665169, "rewards/rejected": -0.16288527846336365, "step": 2070 }, { "epoch": 0.35837353549276363, "grad_norm": 3.548471450805664, "learning_rate": 1.9977049140013183e-07, "logits/chosen": -2.9241182804107666, "logits/rejected": -2.895223617553711, "logps/chosen": -65.55860137939453, "logps/rejected": -69.46336364746094, "loss": 0.6696, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1369078904390335, "rewards/margins": 0.051416944712400436, "rewards/rejected": -0.18832483887672424, "step": 2080 }, { "epoch": 0.36009648518263265, "grad_norm": 3.2138357162475586, "learning_rate": 1.997567157068497e-07, "logits/chosen": -2.9462406635284424, "logits/rejected": -2.9429733753204346, "logps/chosen": -67.5681381225586, "logps/rejected": -70.5682144165039, "loss": 0.6813, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.13173379004001617, "rewards/margins": 0.027362842112779617, "rewards/rejected": -0.1590966284275055, "step": 2090 }, { "epoch": 0.36181943487250173, "grad_norm": 3.7781026363372803, "learning_rate": 1.997425390510747e-07, "logits/chosen": -2.894129991531372, "logits/rejected": -2.872572422027588, "logps/chosen": -68.9821548461914, "logps/rejected": -68.93914794921875, "loss": 0.6749, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.13635702431201935, "rewards/margins": 0.04054417088627815, "rewards/rejected": -0.1769011914730072, "step": 2100 }, { "epoch": 0.36354238456237076, "grad_norm": 3.3131191730499268, "learning_rate": 1.9972796148978856e-07, "logits/chosen": -2.948805332183838, "logits/rejected": -2.951103448867798, "logps/chosen": -63.747772216796875, "logps/rejected": -73.74634552001953, "loss": 0.6771, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1484815776348114, "rewards/margins": 0.03708801046013832, "rewards/rejected": -0.18556959927082062, "step": 2110 }, { "epoch": 0.36526533425223984, "grad_norm": 3.6554408073425293, "learning_rate": 1.9971298308158441e-07, "logits/chosen": -2.8126590251922607, "logits/rejected": -2.786180019378662, "logps/chosen": -65.94208526611328, "logps/rejected": -67.56100463867188, "loss": 0.6739, "rewards/accuracies": 0.625, "rewards/chosen": -0.1236642599105835, "rewards/margins": 0.04265039786696434, "rewards/rejected": -0.16631464660167694, "step": 2120 }, { "epoch": 0.3669882839421089, "grad_norm": 3.0757296085357666, "learning_rate": 1.9969760388666645e-07, "logits/chosen": -2.846762180328369, "logits/rejected": -2.8287978172302246, "logps/chosen": -68.62953186035156, "logps/rejected": -72.33243560791016, "loss": 0.6674, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.14159207046031952, "rewards/margins": 0.05636315792798996, "rewards/rejected": -0.19795522093772888, "step": 2130 }, { "epoch": 0.36871123363197794, "grad_norm": 3.5154972076416016, "learning_rate": 1.996818239668499e-07, "logits/chosen": -2.8251287937164307, "logits/rejected": -2.817430019378662, "logps/chosen": -64.94082641601562, "logps/rejected": -73.19986724853516, "loss": 0.678, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1628565490245819, "rewards/margins": 0.033618878573179245, "rewards/rejected": -0.19647544622421265, "step": 2140 }, { "epoch": 0.370434183321847, "grad_norm": 3.5373153686523438, "learning_rate": 1.9966564338556065e-07, "logits/chosen": -2.8717055320739746, "logits/rejected": -2.8266823291778564, "logps/chosen": -67.66168212890625, "logps/rejected": -67.52949523925781, "loss": 0.6628, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1297309845685959, "rewards/margins": 0.06547585129737854, "rewards/rejected": -0.19520683586597443, "step": 2150 }, { "epoch": 0.37215713301171605, "grad_norm": 3.6125190258026123, "learning_rate": 1.9964906220783492e-07, "logits/chosen": -2.839944839477539, "logits/rejected": -2.8248395919799805, "logps/chosen": -69.63507080078125, "logps/rejected": -68.79278564453125, "loss": 0.6733, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1487804651260376, "rewards/margins": 0.04360898584127426, "rewards/rejected": -0.19238945841789246, "step": 2160 }, { "epoch": 0.3738800827015851, "grad_norm": 3.6442248821258545, "learning_rate": 1.9963208050031922e-07, "logits/chosen": -2.954251766204834, "logits/rejected": -2.939054250717163, "logps/chosen": -68.50241088867188, "logps/rejected": -70.56828308105469, "loss": 0.6704, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1417117863893509, "rewards/margins": 0.0491500198841095, "rewards/rejected": -0.1908618062734604, "step": 2170 }, { "epoch": 0.37560303239145415, "grad_norm": 3.8122751712799072, "learning_rate": 1.9961469833126987e-07, "logits/chosen": -3.001730442047119, "logits/rejected": -2.96539306640625, "logps/chosen": -73.8094482421875, "logps/rejected": -74.3231430053711, "loss": 0.667, "rewards/accuracies": 0.625, "rewards/chosen": -0.15785323083400726, "rewards/margins": 0.05803511291742325, "rewards/rejected": -0.2158883512020111, "step": 2180 }, { "epoch": 0.37732598208132323, "grad_norm": 3.7190444469451904, "learning_rate": 1.995969157705528e-07, "logits/chosen": -3.0624592304229736, "logits/rejected": -3.056671142578125, "logps/chosen": -69.03919982910156, "logps/rejected": -72.68977355957031, "loss": 0.6822, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.16197362542152405, "rewards/margins": 0.026371484622359276, "rewards/rejected": -0.18834510445594788, "step": 2190 }, { "epoch": 0.37904893177119225, "grad_norm": 3.5991036891937256, "learning_rate": 1.995787328896433e-07, "logits/chosen": -2.8910741806030273, "logits/rejected": -2.8688979148864746, "logps/chosen": -66.85711669921875, "logps/rejected": -76.20103454589844, "loss": 0.6618, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.14548395574092865, "rewards/margins": 0.06858919560909271, "rewards/rejected": -0.21407318115234375, "step": 2200 }, { "epoch": 0.38077188146106133, "grad_norm": 3.7989940643310547, "learning_rate": 1.9956014976162572e-07, "logits/chosen": -2.947467565536499, "logits/rejected": -2.9316937923431396, "logps/chosen": -69.19676971435547, "logps/rejected": -73.41373443603516, "loss": 0.6696, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1447041779756546, "rewards/margins": 0.05233711004257202, "rewards/rejected": -0.19704128801822662, "step": 2210 }, { "epoch": 0.3824948311509304, "grad_norm": 3.6562952995300293, "learning_rate": 1.9954116646119315e-07, "logits/chosen": -2.7782294750213623, "logits/rejected": -2.7709176540374756, "logps/chosen": -68.47245788574219, "logps/rejected": -72.8316421508789, "loss": 0.6741, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.14594446122646332, "rewards/margins": 0.043900761753320694, "rewards/rejected": -0.1898452192544937, "step": 2220 }, { "epoch": 0.38421778084079944, "grad_norm": 3.846614122390747, "learning_rate": 1.9952178306464708e-07, "logits/chosen": -2.9279448986053467, "logits/rejected": -2.8969149589538574, "logps/chosen": -70.74501037597656, "logps/rejected": -71.7994155883789, "loss": 0.6756, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.17067433893680573, "rewards/margins": 0.039899904280900955, "rewards/rejected": -0.21057423949241638, "step": 2230 }, { "epoch": 0.3859407305306685, "grad_norm": 3.7794787883758545, "learning_rate": 1.9950199964989728e-07, "logits/chosen": -2.883044958114624, "logits/rejected": -2.848534107208252, "logps/chosen": -72.35716247558594, "logps/rejected": -72.94497680664062, "loss": 0.6765, "rewards/accuracies": 0.625, "rewards/chosen": -0.15547525882720947, "rewards/margins": 0.03803917020559311, "rewards/rejected": -0.19351443648338318, "step": 2240 }, { "epoch": 0.38766368022053754, "grad_norm": 4.301860332489014, "learning_rate": 1.9948181629646125e-07, "logits/chosen": -2.840498447418213, "logits/rejected": -2.807094097137451, "logps/chosen": -71.03971862792969, "logps/rejected": -73.32917785644531, "loss": 0.6743, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.15689852833747864, "rewards/margins": 0.042939841747283936, "rewards/rejected": -0.19983838498592377, "step": 2250 }, { "epoch": 0.3893866299104066, "grad_norm": 3.486368179321289, "learning_rate": 1.99461233085464e-07, "logits/chosen": -2.800408124923706, "logits/rejected": -2.775947332382202, "logps/chosen": -78.2021484375, "logps/rejected": -77.9363784790039, "loss": 0.6819, "rewards/accuracies": 0.59375, "rewards/chosen": -0.17394503951072693, "rewards/margins": 0.031767066568136215, "rewards/rejected": -0.20571212470531464, "step": 2260 }, { "epoch": 0.39110957960027565, "grad_norm": 4.095204830169678, "learning_rate": 1.9944025009963783e-07, "logits/chosen": -2.784682273864746, "logits/rejected": -2.752465009689331, "logps/chosen": -70.94514465332031, "logps/rejected": -73.77107238769531, "loss": 0.6702, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1688375473022461, "rewards/margins": 0.05170854926109314, "rewards/rejected": -0.22054609656333923, "step": 2270 }, { "epoch": 0.3928325292901447, "grad_norm": 3.7926547527313232, "learning_rate": 1.9941886742332175e-07, "logits/chosen": -2.856924533843994, "logits/rejected": -2.8466103076934814, "logps/chosen": -68.77479553222656, "logps/rejected": -75.07438659667969, "loss": 0.6744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.15331251919269562, "rewards/margins": 0.04212985187768936, "rewards/rejected": -0.1954423487186432, "step": 2280 }, { "epoch": 0.3945554789800138, "grad_norm": 3.7058446407318115, "learning_rate": 1.9939708514246143e-07, "logits/chosen": -2.7546420097351074, "logits/rejected": -2.727194309234619, "logps/chosen": -69.0736312866211, "logps/rejected": -75.74372863769531, "loss": 0.6666, "rewards/accuracies": 0.71875, "rewards/chosen": -0.16190746426582336, "rewards/margins": 0.05792677402496338, "rewards/rejected": -0.21983423829078674, "step": 2290 }, { "epoch": 0.39627842866988283, "grad_norm": 3.861417770385742, "learning_rate": 1.9937490334460857e-07, "logits/chosen": -2.9293429851531982, "logits/rejected": -2.8968148231506348, "logps/chosen": -71.00596618652344, "logps/rejected": -74.87997436523438, "loss": 0.6655, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15060889720916748, "rewards/margins": 0.0616321787238121, "rewards/rejected": -0.21224108338356018, "step": 2300 }, { "epoch": 0.3980013783597519, "grad_norm": 3.798541307449341, "learning_rate": 1.9935232211892083e-07, "logits/chosen": -2.8329617977142334, "logits/rejected": -2.817756175994873, "logps/chosen": -67.91639709472656, "logps/rejected": -73.6534194946289, "loss": 0.6691, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1683826744556427, "rewards/margins": 0.05380697920918465, "rewards/rejected": -0.22218966484069824, "step": 2310 }, { "epoch": 0.39972432804962094, "grad_norm": 3.9587326049804688, "learning_rate": 1.9932934155616127e-07, "logits/chosen": -2.9256093502044678, "logits/rejected": -2.8858180046081543, "logps/chosen": -73.00454711914062, "logps/rejected": -73.79489135742188, "loss": 0.6608, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.15563161671161652, "rewards/margins": 0.07084403187036514, "rewards/rejected": -0.22647564113140106, "step": 2320 }, { "epoch": 0.40144727773949, "grad_norm": 3.805995225906372, "learning_rate": 1.9930596174869797e-07, "logits/chosen": -2.8376352787017822, "logits/rejected": -2.816253900527954, "logps/chosen": -72.3519287109375, "logps/rejected": -76.95233154296875, "loss": 0.6572, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.16542547941207886, "rewards/margins": 0.0815449133515358, "rewards/rejected": -0.24697038531303406, "step": 2330 }, { "epoch": 0.40317022742935904, "grad_norm": 3.8098678588867188, "learning_rate": 1.992821827905039e-07, "logits/chosen": -2.8756024837493896, "logits/rejected": -2.863811492919922, "logps/chosen": -70.9607162475586, "logps/rejected": -77.6399917602539, "loss": 0.6782, "rewards/accuracies": 0.59375, "rewards/chosen": -0.18659153580665588, "rewards/margins": 0.036031901836395264, "rewards/rejected": -0.22262343764305115, "step": 2340 }, { "epoch": 0.4048931771192281, "grad_norm": 3.8881237506866455, "learning_rate": 1.9925800477715623e-07, "logits/chosen": -2.855710506439209, "logits/rejected": -2.8338699340820312, "logps/chosen": -74.32742309570312, "logps/rejected": -78.49640655517578, "loss": 0.6608, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.16066120564937592, "rewards/margins": 0.07114093005657196, "rewards/rejected": -0.2318021059036255, "step": 2350 }, { "epoch": 0.4066161268090972, "grad_norm": 3.6372482776641846, "learning_rate": 1.992334278058362e-07, "logits/chosen": -2.872138738632202, "logits/rejected": -2.859128475189209, "logps/chosen": -67.64442443847656, "logps/rejected": -73.90852355957031, "loss": 0.6686, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1728234887123108, "rewards/margins": 0.054965026676654816, "rewards/rejected": -0.2277885228395462, "step": 2360 }, { "epoch": 0.4083390764989662, "grad_norm": 4.352206707000732, "learning_rate": 1.9920845197532854e-07, "logits/chosen": -2.8816115856170654, "logits/rejected": -2.8750884532928467, "logps/chosen": -72.4279556274414, "logps/rejected": -79.2039794921875, "loss": 0.6624, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1866598129272461, "rewards/margins": 0.06673283874988556, "rewards/rejected": -0.25339263677597046, "step": 2370 }, { "epoch": 0.4100620261888353, "grad_norm": 3.87642765045166, "learning_rate": 1.991830773860212e-07, "logits/chosen": -2.8102853298187256, "logits/rejected": -2.7858879566192627, "logps/chosen": -71.7208251953125, "logps/rejected": -75.96289825439453, "loss": 0.668, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.17378661036491394, "rewards/margins": 0.05719633772969246, "rewards/rejected": -0.2309829741716385, "step": 2380 }, { "epoch": 0.41178497587870433, "grad_norm": 4.1682448387146, "learning_rate": 1.9915730413990486e-07, "logits/chosen": -2.889021158218384, "logits/rejected": -2.8628406524658203, "logps/chosen": -74.9619140625, "logps/rejected": -78.55941772460938, "loss": 0.6629, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18727919459342957, "rewards/margins": 0.06737373024225235, "rewards/rejected": -0.2546529471874237, "step": 2390 }, { "epoch": 0.4135079255685734, "grad_norm": 3.591698408126831, "learning_rate": 1.9913113234057264e-07, "logits/chosen": -2.933715343475342, "logits/rejected": -2.9128355979919434, "logps/chosen": -70.69978332519531, "logps/rejected": -77.25373077392578, "loss": 0.6688, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.18399231135845184, "rewards/margins": 0.055909980088472366, "rewards/rejected": -0.2399022877216339, "step": 2400 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -2.9427647590637207, "eval_logits/rejected": -2.937041997909546, "eval_logps/chosen": -72.73748779296875, "eval_logps/rejected": -80.43446350097656, "eval_loss": 0.6791625618934631, "eval_rewards/accuracies": 0.6031598448753357, "eval_rewards/chosen": -0.14025600254535675, "eval_rewards/margins": 0.032287415117025375, "eval_rewards/rejected": -0.17254340648651123, "eval_runtime": 383.1472, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 2400 }, { "epoch": 0.41523087525844243, "grad_norm": 3.8721492290496826, "learning_rate": 1.9910456209321956e-07, "logits/chosen": -2.9030094146728516, "logits/rejected": -2.8650403022766113, "logps/chosen": -72.07716369628906, "logps/rejected": -74.85731506347656, "loss": 0.6597, "rewards/accuracies": 0.71875, "rewards/chosen": -0.18226821720600128, "rewards/margins": 0.0736222192645073, "rewards/rejected": -0.255890429019928, "step": 2410 }, { "epoch": 0.4169538249483115, "grad_norm": 4.3863983154296875, "learning_rate": 1.9907759350464212e-07, "logits/chosen": -2.8956375122070312, "logits/rejected": -2.8728861808776855, "logps/chosen": -76.08689880371094, "logps/rejected": -81.4808349609375, "loss": 0.6634, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19904786348342896, "rewards/margins": 0.06673558056354523, "rewards/rejected": -0.265783429145813, "step": 2420 }, { "epoch": 0.41867677463818054, "grad_norm": 4.163143157958984, "learning_rate": 1.9905022668323803e-07, "logits/chosen": -2.8559088706970215, "logits/rejected": -2.8340187072753906, "logps/chosen": -75.20133209228516, "logps/rejected": -76.68769073486328, "loss": 0.6737, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2035834789276123, "rewards/margins": 0.04596007615327835, "rewards/rejected": -0.24954351782798767, "step": 2430 }, { "epoch": 0.4203997243280496, "grad_norm": 4.649513244628906, "learning_rate": 1.9902246173900554e-07, "logits/chosen": -2.8532347679138184, "logits/rejected": -2.840039014816284, "logps/chosen": -73.97209930419922, "logps/rejected": -80.97435760498047, "loss": 0.6589, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.18915219604969025, "rewards/margins": 0.0755903422832489, "rewards/rejected": -0.26474255323410034, "step": 2440 }, { "epoch": 0.4221226740179187, "grad_norm": 4.157593250274658, "learning_rate": 1.9899429878354318e-07, "logits/chosen": -2.8119709491729736, "logits/rejected": -2.78918194770813, "logps/chosen": -74.1239013671875, "logps/rejected": -79.02185821533203, "loss": 0.6675, "rewards/accuracies": 0.65625, "rewards/chosen": -0.21065564453601837, "rewards/margins": 0.056983523070812225, "rewards/rejected": -0.2676391303539276, "step": 2450 }, { "epoch": 0.4238456237077877, "grad_norm": 3.678806781768799, "learning_rate": 1.989657379300492e-07, "logits/chosen": -2.7902090549468994, "logits/rejected": -2.7585668563842773, "logps/chosen": -76.15962219238281, "logps/rejected": -78.26510620117188, "loss": 0.6684, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20018188655376434, "rewards/margins": 0.05713933706283569, "rewards/rejected": -0.25732123851776123, "step": 2460 }, { "epoch": 0.4255685733976568, "grad_norm": 4.094069957733154, "learning_rate": 1.9893677929332123e-07, "logits/chosen": -2.9425644874572754, "logits/rejected": -2.9215643405914307, "logps/chosen": -74.55903625488281, "logps/rejected": -80.36302185058594, "loss": 0.658, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19514811038970947, "rewards/margins": 0.08139944821596146, "rewards/rejected": -0.2765475809574127, "step": 2470 }, { "epoch": 0.4272915230875258, "grad_norm": 4.179010391235352, "learning_rate": 1.9890742298975574e-07, "logits/chosen": -2.8428502082824707, "logits/rejected": -2.813183307647705, "logps/chosen": -76.90948486328125, "logps/rejected": -78.09593200683594, "loss": 0.6704, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2253132164478302, "rewards/margins": 0.051389217376708984, "rewards/rejected": -0.2767024636268616, "step": 2480 }, { "epoch": 0.4290144727773949, "grad_norm": 4.350039482116699, "learning_rate": 1.9887766913734748e-07, "logits/chosen": -2.828249216079712, "logits/rejected": -2.8191637992858887, "logps/chosen": -71.01896667480469, "logps/rejected": -79.3835678100586, "loss": 0.6633, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.20980136096477509, "rewards/margins": 0.06808502972126007, "rewards/rejected": -0.27788639068603516, "step": 2490 }, { "epoch": 0.43073742246726393, "grad_norm": 4.809699058532715, "learning_rate": 1.9884751785568928e-07, "logits/chosen": -2.8965935707092285, "logits/rejected": -2.8732640743255615, "logps/chosen": -80.31697082519531, "logps/rejected": -85.91722106933594, "loss": 0.6676, "rewards/accuracies": 0.625, "rewards/chosen": -0.234262615442276, "rewards/margins": 0.059213198721408844, "rewards/rejected": -0.29347580671310425, "step": 2500 }, { "epoch": 0.432460372157133, "grad_norm": 4.566841125488281, "learning_rate": 1.9881696926597125e-07, "logits/chosen": -2.7498779296875, "logits/rejected": -2.7402052879333496, "logps/chosen": -75.13661193847656, "logps/rejected": -82.50086975097656, "loss": 0.6584, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.22750934958457947, "rewards/margins": 0.07768379151821136, "rewards/rejected": -0.30519312620162964, "step": 2510 }, { "epoch": 0.4341833218470021, "grad_norm": 4.17630672454834, "learning_rate": 1.987860234909805e-07, "logits/chosen": -2.735560417175293, "logits/rejected": -2.7091479301452637, "logps/chosen": -74.96757507324219, "logps/rejected": -78.909912109375, "loss": 0.6607, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.22499346733093262, "rewards/margins": 0.07408694922924042, "rewards/rejected": -0.29908040165901184, "step": 2520 }, { "epoch": 0.4359062715368711, "grad_norm": 4.669484615325928, "learning_rate": 1.987546806551006e-07, "logits/chosen": -2.8178813457489014, "logits/rejected": -2.813152551651001, "logps/chosen": -75.33357238769531, "logps/rejected": -82.6530990600586, "loss": 0.6757, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.24164001643657684, "rewards/margins": 0.04336600750684738, "rewards/rejected": -0.28500601649284363, "step": 2530 }, { "epoch": 0.4376292212267402, "grad_norm": 4.2985920906066895, "learning_rate": 1.9872294088431105e-07, "logits/chosen": -2.856102705001831, "logits/rejected": -2.845292568206787, "logps/chosen": -78.64314270019531, "logps/rejected": -86.57007598876953, "loss": 0.6597, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.25102704763412476, "rewards/margins": 0.07619601488113403, "rewards/rejected": -0.3272230923175812, "step": 2540 }, { "epoch": 0.4393521709166092, "grad_norm": 4.795299530029297, "learning_rate": 1.9869080430618684e-07, "logits/chosen": -2.799677610397339, "logits/rejected": -2.7678658962249756, "logps/chosen": -81.40946197509766, "logps/rejected": -86.08482360839844, "loss": 0.6552, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2272636592388153, "rewards/margins": 0.09386886656284332, "rewards/rejected": -0.3211325407028198, "step": 2550 }, { "epoch": 0.4410751206064783, "grad_norm": 4.634361743927002, "learning_rate": 1.9865827104989774e-07, "logits/chosen": -2.8855044841766357, "logits/rejected": -2.8617072105407715, "logps/chosen": -77.7742691040039, "logps/rejected": -83.0869369506836, "loss": 0.665, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22513917088508606, "rewards/margins": 0.06552886217832565, "rewards/rejected": -0.2906680107116699, "step": 2560 }, { "epoch": 0.4427980702963473, "grad_norm": 4.259562969207764, "learning_rate": 1.9862534124620814e-07, "logits/chosen": -2.8211474418640137, "logits/rejected": -2.8036704063415527, "logps/chosen": -81.92951965332031, "logps/rejected": -86.38294982910156, "loss": 0.6652, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.23327799141407013, "rewards/margins": 0.06551525741815567, "rewards/rejected": -0.2987932562828064, "step": 2570 }, { "epoch": 0.4445210199862164, "grad_norm": 4.979623317718506, "learning_rate": 1.9859201502747614e-07, "logits/chosen": -2.836080312728882, "logits/rejected": -2.8181748390197754, "logps/chosen": -79.82194519042969, "logps/rejected": -84.47944641113281, "loss": 0.6733, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.2643613815307617, "rewards/margins": 0.048605453222990036, "rewards/rejected": -0.31296685338020325, "step": 2580 }, { "epoch": 0.4462439696760855, "grad_norm": 4.920752048492432, "learning_rate": 1.985582925276533e-07, "logits/chosen": -2.8163769245147705, "logits/rejected": -2.7815186977386475, "logps/chosen": -80.47447967529297, "logps/rejected": -82.33802032470703, "loss": 0.661, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.23780038952827454, "rewards/margins": 0.07311274856328964, "rewards/rejected": -0.3109131455421448, "step": 2590 }, { "epoch": 0.4479669193659545, "grad_norm": 5.04128885269165, "learning_rate": 1.9852417388228392e-07, "logits/chosen": -2.7918379306793213, "logits/rejected": -2.746398448944092, "logps/chosen": -82.25102233886719, "logps/rejected": -80.16227722167969, "loss": 0.6657, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2515484392642975, "rewards/margins": 0.06543479859828949, "rewards/rejected": -0.3169831931591034, "step": 2600 }, { "epoch": 0.4496898690558236, "grad_norm": 4.687989711761475, "learning_rate": 1.9848965922850464e-07, "logits/chosen": -2.7723536491394043, "logits/rejected": -2.7345359325408936, "logps/chosen": -83.00138092041016, "logps/rejected": -83.72332000732422, "loss": 0.6616, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.24622325599193573, "rewards/margins": 0.07428522408008575, "rewards/rejected": -0.32050850987434387, "step": 2610 }, { "epoch": 0.4514128187456926, "grad_norm": 4.529170989990234, "learning_rate": 1.9845474870504378e-07, "logits/chosen": -2.8512473106384277, "logits/rejected": -2.8228654861450195, "logps/chosen": -75.3122787475586, "logps/rejected": -83.6035385131836, "loss": 0.6521, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2404632866382599, "rewards/margins": 0.09109614789485931, "rewards/rejected": -0.3315594792366028, "step": 2620 }, { "epoch": 0.4531357684355617, "grad_norm": 4.675422191619873, "learning_rate": 1.984194424522208e-07, "logits/chosen": -2.7441370487213135, "logits/rejected": -2.7133846282958984, "logps/chosen": -77.77023315429688, "logps/rejected": -85.73268127441406, "loss": 0.6508, "rewards/accuracies": 0.75, "rewards/chosen": -0.2295670509338379, "rewards/margins": 0.09620517492294312, "rewards/rejected": -0.325772225856781, "step": 2630 }, { "epoch": 0.4548587181254307, "grad_norm": 4.241116523742676, "learning_rate": 1.9838374061194575e-07, "logits/chosen": -2.759497880935669, "logits/rejected": -2.738795518875122, "logps/chosen": -74.74937438964844, "logps/rejected": -81.0289077758789, "loss": 0.6614, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.24653956294059753, "rewards/margins": 0.07387407124042511, "rewards/rejected": -0.32041364908218384, "step": 2640 }, { "epoch": 0.4565816678152998, "grad_norm": 4.396022319793701, "learning_rate": 1.983476433277188e-07, "logits/chosen": -2.730039596557617, "logits/rejected": -2.713178873062134, "logps/chosen": -75.72853088378906, "logps/rejected": -85.95320129394531, "loss": 0.6572, "rewards/accuracies": 0.65625, "rewards/chosen": -0.23297707736492157, "rewards/margins": 0.08371174335479736, "rewards/rejected": -0.3166888356208801, "step": 2650 }, { "epoch": 0.4583046175051689, "grad_norm": 4.776400089263916, "learning_rate": 1.9831115074462944e-07, "logits/chosen": -2.7560505867004395, "logits/rejected": -2.7167718410491943, "logps/chosen": -84.48075866699219, "logps/rejected": -86.92762756347656, "loss": 0.658, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2619093954563141, "rewards/margins": 0.08262166380882263, "rewards/rejected": -0.34453102946281433, "step": 2660 }, { "epoch": 0.4600275671950379, "grad_norm": 4.16822624206543, "learning_rate": 1.982742630093561e-07, "logits/chosen": -2.729417324066162, "logits/rejected": -2.7020583152770996, "logps/chosen": -81.297119140625, "logps/rejected": -87.87712097167969, "loss": 0.6574, "rewards/accuracies": 0.625, "rewards/chosen": -0.2735114097595215, "rewards/margins": 0.08464406430721283, "rewards/rejected": -0.3581554591655731, "step": 2670 }, { "epoch": 0.461750516884907, "grad_norm": 5.366155624389648, "learning_rate": 1.9823698027016548e-07, "logits/chosen": -2.7723546028137207, "logits/rejected": -2.7467968463897705, "logps/chosen": -86.6219253540039, "logps/rejected": -87.81824493408203, "loss": 0.6694, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.2837437391281128, "rewards/margins": 0.05781107023358345, "rewards/rejected": -0.34155479073524475, "step": 2680 }, { "epoch": 0.463473466574776, "grad_norm": 4.76614236831665, "learning_rate": 1.98199302676912e-07, "logits/chosen": -2.7463812828063965, "logits/rejected": -2.7254321575164795, "logps/chosen": -78.85734558105469, "logps/rejected": -86.11651611328125, "loss": 0.6582, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.28216251730918884, "rewards/margins": 0.08013720065355301, "rewards/rejected": -0.36229974031448364, "step": 2690 }, { "epoch": 0.4651964162646451, "grad_norm": 4.918941497802734, "learning_rate": 1.9816123038103701e-07, "logits/chosen": -2.7731640338897705, "logits/rejected": -2.7475695610046387, "logps/chosen": -81.00579071044922, "logps/rejected": -88.36027526855469, "loss": 0.6624, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2777502238750458, "rewards/margins": 0.07108843326568604, "rewards/rejected": -0.3488386571407318, "step": 2700 }, { "epoch": 0.4669193659545141, "grad_norm": 5.564276218414307, "learning_rate": 1.9812276353556852e-07, "logits/chosen": -2.7926807403564453, "logits/rejected": -2.774944543838501, "logps/chosen": -86.74163055419922, "logps/rejected": -88.55531311035156, "loss": 0.6688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.27294981479644775, "rewards/margins": 0.059096015989780426, "rewards/rejected": -0.33204585313796997, "step": 2710 }, { "epoch": 0.4686423156443832, "grad_norm": 5.371655464172363, "learning_rate": 1.9808390229512026e-07, "logits/chosen": -2.784646511077881, "logits/rejected": -2.7882354259490967, "logps/chosen": -80.06517028808594, "logps/rejected": -89.0752182006836, "loss": 0.6631, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2801792025566101, "rewards/margins": 0.07294531166553497, "rewards/rejected": -0.35312455892562866, "step": 2720 }, { "epoch": 0.4703652653342522, "grad_norm": 5.554327487945557, "learning_rate": 1.980446468158912e-07, "logits/chosen": -2.7829484939575195, "logits/rejected": -2.764782667160034, "logps/chosen": -87.25930786132812, "logps/rejected": -90.30667114257812, "loss": 0.6665, "rewards/accuracies": 0.625, "rewards/chosen": -0.2869259715080261, "rewards/margins": 0.06388825923204422, "rewards/rejected": -0.35081425309181213, "step": 2730 }, { "epoch": 0.4720882150241213, "grad_norm": 5.064526081085205, "learning_rate": 1.9800499725566506e-07, "logits/chosen": -2.7626073360443115, "logits/rejected": -2.742617130279541, "logps/chosen": -81.52635192871094, "logps/rejected": -84.69229125976562, "loss": 0.6665, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.28667086362838745, "rewards/margins": 0.06642518192529678, "rewards/rejected": -0.353096067905426, "step": 2740 }, { "epoch": 0.4738111647139904, "grad_norm": 5.177687644958496, "learning_rate": 1.9796495377380933e-07, "logits/chosen": -2.696913242340088, "logits/rejected": -2.692275047302246, "logps/chosen": -78.03648376464844, "logps/rejected": -90.65962219238281, "loss": 0.6574, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2877274453639984, "rewards/margins": 0.08459886163473129, "rewards/rejected": -0.3723262846469879, "step": 2750 }, { "epoch": 0.4755341144038594, "grad_norm": 5.387844562530518, "learning_rate": 1.9792451653127496e-07, "logits/chosen": -2.7484130859375, "logits/rejected": -2.7317137718200684, "logps/chosen": -82.39192199707031, "logps/rejected": -91.6829605102539, "loss": 0.6564, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.28116923570632935, "rewards/margins": 0.08760281652212143, "rewards/rejected": -0.36877208948135376, "step": 2760 }, { "epoch": 0.4772570640937285, "grad_norm": 6.261878490447998, "learning_rate": 1.9788368569059551e-07, "logits/chosen": -2.866014003753662, "logits/rejected": -2.8366146087646484, "logps/chosen": -84.28050231933594, "logps/rejected": -88.85655212402344, "loss": 0.6653, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2906574010848999, "rewards/margins": 0.06752223521471024, "rewards/rejected": -0.35817962884902954, "step": 2770 }, { "epoch": 0.4789800137835975, "grad_norm": 5.086827754974365, "learning_rate": 1.9784246141588662e-07, "logits/chosen": -2.690002918243408, "logits/rejected": -2.661464214324951, "logps/chosen": -83.01453399658203, "logps/rejected": -87.8421859741211, "loss": 0.6639, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3018570840358734, "rewards/margins": 0.07041344046592712, "rewards/rejected": -0.37227049469947815, "step": 2780 }, { "epoch": 0.4807029634734666, "grad_norm": 4.994007587432861, "learning_rate": 1.9780084387284535e-07, "logits/chosen": -2.798659563064575, "logits/rejected": -2.7668426036834717, "logps/chosen": -80.95232391357422, "logps/rejected": -86.96568298339844, "loss": 0.6599, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.29192060232162476, "rewards/margins": 0.07660763710737228, "rewards/rejected": -0.3685282766819, "step": 2790 }, { "epoch": 0.4824259131633356, "grad_norm": 5.847742557525635, "learning_rate": 1.977588332287493e-07, "logits/chosen": -2.784656286239624, "logits/rejected": -2.760721206665039, "logps/chosen": -92.74882507324219, "logps/rejected": -95.89972686767578, "loss": 0.6675, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.32713156938552856, "rewards/margins": 0.06348985433578491, "rewards/rejected": -0.3906214237213135, "step": 2800 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -2.86350417137146, "eval_logits/rejected": -2.857557773590088, "eval_logps/chosen": -81.5435791015625, "eval_logps/rejected": -90.7352523803711, "eval_loss": 0.6732187867164612, "eval_rewards/accuracies": 0.6057156324386597, "eval_rewards/chosen": -0.22831681370735168, "eval_rewards/margins": 0.0472344309091568, "eval_rewards/rejected": -0.2755512595176697, "eval_runtime": 382.7509, "eval_samples_per_second": 11.245, "eval_steps_per_second": 1.406, "step": 2800 }, { "epoch": 0.4841488628532047, "grad_norm": 6.044860363006592, "learning_rate": 1.9771642965245623e-07, "logits/chosen": -2.6763038635253906, "logits/rejected": -2.6524975299835205, "logps/chosen": -85.02580261230469, "logps/rejected": -94.72159576416016, "loss": 0.6535, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2969000041484833, "rewards/margins": 0.09268021583557129, "rewards/rejected": -0.38958021998405457, "step": 2810 }, { "epoch": 0.48587181254307377, "grad_norm": 5.906956672668457, "learning_rate": 1.9767363331440324e-07, "logits/chosen": -2.819481372833252, "logits/rejected": -2.8079781532287598, "logps/chosen": -84.50180053710938, "logps/rejected": -89.02869415283203, "loss": 0.6699, "rewards/accuracies": 0.625, "rewards/chosen": -0.30174607038497925, "rewards/margins": 0.05834274739027023, "rewards/rejected": -0.3600887656211853, "step": 2820 }, { "epoch": 0.4875947622329428, "grad_norm": 5.0853800773620605, "learning_rate": 1.9763044438660606e-07, "logits/chosen": -2.6298704147338867, "logits/rejected": -2.61495304107666, "logps/chosen": -84.22966766357422, "logps/rejected": -95.0593032836914, "loss": 0.6485, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.28729158639907837, "rewards/margins": 0.11365656554698944, "rewards/rejected": -0.40094810724258423, "step": 2830 }, { "epoch": 0.48931771192281187, "grad_norm": 5.301225662231445, "learning_rate": 1.9758686304265845e-07, "logits/chosen": -2.7911629676818848, "logits/rejected": -2.7801711559295654, "logps/chosen": -83.1259994506836, "logps/rejected": -89.50416564941406, "loss": 0.6615, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3048972487449646, "rewards/margins": 0.07634397596120834, "rewards/rejected": -0.38124123215675354, "step": 2840 }, { "epoch": 0.4910406616126809, "grad_norm": 6.339087963104248, "learning_rate": 1.975428894577314e-07, "logits/chosen": -2.749023675918579, "logits/rejected": -2.730348825454712, "logps/chosen": -84.1259994506836, "logps/rejected": -97.1973876953125, "loss": 0.6493, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.31879574060440063, "rewards/margins": 0.10469119250774384, "rewards/rejected": -0.42348694801330566, "step": 2850 }, { "epoch": 0.49276361130255, "grad_norm": 5.268611431121826, "learning_rate": 1.9749852380857247e-07, "logits/chosen": -2.73268461227417, "logits/rejected": -2.7092766761779785, "logps/chosen": -85.26544952392578, "logps/rejected": -92.43037414550781, "loss": 0.6503, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3030092120170593, "rewards/margins": 0.10151870548725128, "rewards/rejected": -0.4045279622077942, "step": 2860 }, { "epoch": 0.494486560992419, "grad_norm": 6.336197853088379, "learning_rate": 1.9745376627350515e-07, "logits/chosen": -2.770284652709961, "logits/rejected": -2.7484183311462402, "logps/chosen": -86.85330963134766, "logps/rejected": -90.93848419189453, "loss": 0.6643, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.30737534165382385, "rewards/margins": 0.07163959741592407, "rewards/rejected": -0.3790149688720703, "step": 2870 }, { "epoch": 0.4962095106822881, "grad_norm": 7.067396640777588, "learning_rate": 1.9740861703242797e-07, "logits/chosen": -2.8400158882141113, "logits/rejected": -2.814021587371826, "logps/chosen": -87.88722229003906, "logps/rejected": -92.84794616699219, "loss": 0.6548, "rewards/accuracies": 0.6875, "rewards/chosen": -0.325391948223114, "rewards/margins": 0.09382455050945282, "rewards/rejected": -0.4192165434360504, "step": 2880 }, { "epoch": 0.49793246037215716, "grad_norm": 5.568375587463379, "learning_rate": 1.97363076266814e-07, "logits/chosen": -2.847001075744629, "logits/rejected": -2.841013193130493, "logps/chosen": -83.87931823730469, "logps/rejected": -94.69303131103516, "loss": 0.6453, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.30921635031700134, "rewards/margins": 0.11252041906118393, "rewards/rejected": -0.42173680663108826, "step": 2890 }, { "epoch": 0.4996554100620262, "grad_norm": 5.770374298095703, "learning_rate": 1.9731714415970998e-07, "logits/chosen": -2.7627999782562256, "logits/rejected": -2.753859281539917, "logps/chosen": -82.06367492675781, "logps/rejected": -89.87281036376953, "loss": 0.6576, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2957562506198883, "rewards/margins": 0.08762596547603607, "rewards/rejected": -0.3833822011947632, "step": 2900 }, { "epoch": 0.5013783597518953, "grad_norm": 6.5320916175842285, "learning_rate": 1.9727082089573552e-07, "logits/chosen": -2.8038830757141113, "logits/rejected": -2.7914395332336426, "logps/chosen": -86.36532592773438, "logps/rejected": -99.83525085449219, "loss": 0.6332, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.30258145928382874, "rewards/margins": 0.13676601648330688, "rewards/rejected": -0.43934744596481323, "step": 2910 }, { "epoch": 0.5031013094417643, "grad_norm": 6.383522987365723, "learning_rate": 1.9722410666108251e-07, "logits/chosen": -2.7484636306762695, "logits/rejected": -2.744802474975586, "logps/chosen": -87.0767593383789, "logps/rejected": -100.16703033447266, "loss": 0.6474, "rewards/accuracies": 0.65625, "rewards/chosen": -0.33506637811660767, "rewards/margins": 0.11154337227344513, "rewards/rejected": -0.4466097354888916, "step": 2920 }, { "epoch": 0.5048242591316333, "grad_norm": 5.712911128997803, "learning_rate": 1.9717700164351435e-07, "logits/chosen": -2.7168633937835693, "logits/rejected": -2.6899285316467285, "logps/chosen": -85.99942779541016, "logps/rejected": -95.53802490234375, "loss": 0.6447, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.32548198103904724, "rewards/margins": 0.11321850121021271, "rewards/rejected": -0.43870049715042114, "step": 2930 }, { "epoch": 0.5065472088215024, "grad_norm": 5.64019775390625, "learning_rate": 1.9712950603236508e-07, "logits/chosen": -2.745177745819092, "logits/rejected": -2.7103383541107178, "logps/chosen": -84.22530364990234, "logps/rejected": -90.6386489868164, "loss": 0.6651, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.32764965295791626, "rewards/margins": 0.07088522613048553, "rewards/rejected": -0.3985348641872406, "step": 2940 }, { "epoch": 0.5082701585113715, "grad_norm": 6.2562994956970215, "learning_rate": 1.9708162001853873e-07, "logits/chosen": -2.7670552730560303, "logits/rejected": -2.7497591972351074, "logps/chosen": -86.6700668334961, "logps/rejected": -98.61284637451172, "loss": 0.6444, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.3234894871711731, "rewards/margins": 0.11730514466762543, "rewards/rejected": -0.44079461693763733, "step": 2950 }, { "epoch": 0.5099931082012406, "grad_norm": 6.609861373901367, "learning_rate": 1.9703334379450855e-07, "logits/chosen": -2.7286181449890137, "logits/rejected": -2.702838182449341, "logps/chosen": -86.29169464111328, "logps/rejected": -95.97723388671875, "loss": 0.6496, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.32343727350234985, "rewards/margins": 0.1173778548836708, "rewards/rejected": -0.44081512093544006, "step": 2960 }, { "epoch": 0.5117160578911096, "grad_norm": 6.852441310882568, "learning_rate": 1.969846775543161e-07, "logits/chosen": -2.6641182899475098, "logits/rejected": -2.6382086277008057, "logps/chosen": -91.28890228271484, "logps/rejected": -96.90291595458984, "loss": 0.6594, "rewards/accuracies": 0.625, "rewards/chosen": -0.360513836145401, "rewards/margins": 0.08437645435333252, "rewards/rejected": -0.44489026069641113, "step": 2970 }, { "epoch": 0.5134390075809786, "grad_norm": 6.942179203033447, "learning_rate": 1.9693562149357072e-07, "logits/chosen": -2.6576151847839355, "logits/rejected": -2.632829189300537, "logps/chosen": -86.34416961669922, "logps/rejected": -96.5618896484375, "loss": 0.6436, "rewards/accuracies": 0.6875, "rewards/chosen": -0.324049711227417, "rewards/margins": 0.1187606081366539, "rewards/rejected": -0.4428102970123291, "step": 2980 }, { "epoch": 0.5151619572708477, "grad_norm": 6.220181941986084, "learning_rate": 1.9688617580944843e-07, "logits/chosen": -2.702073812484741, "logits/rejected": -2.6761860847473145, "logps/chosen": -93.23859405517578, "logps/rejected": -100.00730895996094, "loss": 0.6582, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.366287499666214, "rewards/margins": 0.08763855695724487, "rewards/rejected": -0.4539260268211365, "step": 2990 }, { "epoch": 0.5168849069607168, "grad_norm": 5.371936321258545, "learning_rate": 1.9683634070069143e-07, "logits/chosen": -2.7104570865631104, "logits/rejected": -2.7045562267303467, "logps/chosen": -85.96195983886719, "logps/rejected": -97.77884674072266, "loss": 0.6586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.33949604630470276, "rewards/margins": 0.09934855997562408, "rewards/rejected": -0.43884459137916565, "step": 3000 }, { "epoch": 0.5186078566505858, "grad_norm": 6.489460468292236, "learning_rate": 1.967861163676071e-07, "logits/chosen": -2.733281373977661, "logits/rejected": -2.7114508152008057, "logps/chosen": -92.49397277832031, "logps/rejected": -98.18407440185547, "loss": 0.662, "rewards/accuracies": 0.625, "rewards/chosen": -0.3730631172657013, "rewards/margins": 0.07783212512731552, "rewards/rejected": -0.4508952498435974, "step": 3010 }, { "epoch": 0.5203308063404548, "grad_norm": 6.768235683441162, "learning_rate": 1.9673550301206733e-07, "logits/chosen": -2.808366060256958, "logits/rejected": -2.778860092163086, "logps/chosen": -93.14388275146484, "logps/rejected": -99.8772201538086, "loss": 0.6571, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.38905346393585205, "rewards/margins": 0.10656271129846573, "rewards/rejected": -0.4956161379814148, "step": 3020 }, { "epoch": 0.5220537560303239, "grad_norm": 6.508683681488037, "learning_rate": 1.9668450083750762e-07, "logits/chosen": -2.727835178375244, "logits/rejected": -2.7069993019104004, "logps/chosen": -93.80626678466797, "logps/rejected": -99.74931335449219, "loss": 0.6678, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3813135027885437, "rewards/margins": 0.06728534400463104, "rewards/rejected": -0.4485989212989807, "step": 3030 }, { "epoch": 0.523776705720193, "grad_norm": 6.18773078918457, "learning_rate": 1.9663311004892628e-07, "logits/chosen": -2.815324306488037, "logits/rejected": -2.8085083961486816, "logps/chosen": -87.60061645507812, "logps/rejected": -101.3532485961914, "loss": 0.6486, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3641282916069031, "rewards/margins": 0.10624756664037704, "rewards/rejected": -0.4703758656978607, "step": 3040 }, { "epoch": 0.525499655410062, "grad_norm": 7.5851149559021, "learning_rate": 1.9658133085288365e-07, "logits/chosen": -2.6851487159729004, "logits/rejected": -2.673161029815674, "logps/chosen": -90.39569854736328, "logps/rejected": -100.58292388916016, "loss": 0.6598, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3683760166168213, "rewards/margins": 0.08387802541255951, "rewards/rejected": -0.4522539973258972, "step": 3050 }, { "epoch": 0.5272226050999311, "grad_norm": 7.758167743682861, "learning_rate": 1.965291634575011e-07, "logits/chosen": -2.714484214782715, "logits/rejected": -2.6961376667022705, "logps/chosen": -92.89412689208984, "logps/rejected": -101.2403564453125, "loss": 0.6545, "rewards/accuracies": 0.65625, "rewards/chosen": -0.37218329310417175, "rewards/margins": 0.0965455174446106, "rewards/rejected": -0.46872884035110474, "step": 3060 }, { "epoch": 0.5289455547898001, "grad_norm": 6.4621477127075195, "learning_rate": 1.9647660807246063e-07, "logits/chosen": -2.679758071899414, "logits/rejected": -2.6543891429901123, "logps/chosen": -95.65350341796875, "logps/rejected": -102.36808013916016, "loss": 0.6485, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.37710902094841003, "rewards/margins": 0.10719014704227448, "rewards/rejected": -0.4842991232872009, "step": 3070 }, { "epoch": 0.5306685044796692, "grad_norm": 6.19595193862915, "learning_rate": 1.9642366490900337e-07, "logits/chosen": -2.5938596725463867, "logits/rejected": -2.5772056579589844, "logps/chosen": -91.1055908203125, "logps/rejected": -105.87294006347656, "loss": 0.6573, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4024714529514313, "rewards/margins": 0.10824602842330933, "rewards/rejected": -0.510717511177063, "step": 3080 }, { "epoch": 0.5323914541695383, "grad_norm": 6.512645244598389, "learning_rate": 1.9637033417992936e-07, "logits/chosen": -2.6764976978302, "logits/rejected": -2.6590423583984375, "logps/chosen": -92.03446960449219, "logps/rejected": -103.1094741821289, "loss": 0.6477, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3842238485813141, "rewards/margins": 0.10982821881771088, "rewards/rejected": -0.49405208230018616, "step": 3090 }, { "epoch": 0.5341144038594073, "grad_norm": 8.616137504577637, "learning_rate": 1.9631661609959628e-07, "logits/chosen": -2.6933939456939697, "logits/rejected": -2.665048599243164, "logps/chosen": -91.06763458251953, "logps/rejected": -101.47669982910156, "loss": 0.6419, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.37147217988967896, "rewards/margins": 0.12758705019950867, "rewards/rejected": -0.49905920028686523, "step": 3100 }, { "epoch": 0.5358373535492763, "grad_norm": 7.095156192779541, "learning_rate": 1.9626251088391876e-07, "logits/chosen": -2.6438610553741455, "logits/rejected": -2.652470350265503, "logps/chosen": -92.75547790527344, "logps/rejected": -106.07810974121094, "loss": 0.6578, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4214300215244293, "rewards/margins": 0.08987243473529816, "rewards/rejected": -0.5113024711608887, "step": 3110 }, { "epoch": 0.5375603032391454, "grad_norm": 6.420622825622559, "learning_rate": 1.9620801875036753e-07, "logits/chosen": -2.6833789348602295, "logits/rejected": -2.6618340015411377, "logps/chosen": -93.40895080566406, "logps/rejected": -107.32088470458984, "loss": 0.6337, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3739815950393677, "rewards/margins": 0.14117519557476044, "rewards/rejected": -0.5151568651199341, "step": 3120 }, { "epoch": 0.5392832529290145, "grad_norm": 6.547939300537109, "learning_rate": 1.9615313991796843e-07, "logits/chosen": -2.6036171913146973, "logits/rejected": -2.594822645187378, "logps/chosen": -89.96730041503906, "logps/rejected": -101.12642669677734, "loss": 0.6542, "rewards/accuracies": 0.625, "rewards/chosen": -0.3788634240627289, "rewards/margins": 0.09900254011154175, "rewards/rejected": -0.477865993976593, "step": 3130 }, { "epoch": 0.5410062026188835, "grad_norm": 7.179741859436035, "learning_rate": 1.960978746073016e-07, "logits/chosen": -2.701573133468628, "logits/rejected": -2.687563419342041, "logps/chosen": -94.88987731933594, "logps/rejected": -110.26725769042969, "loss": 0.6407, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.43192750215530396, "rewards/margins": 0.1333298236131668, "rewards/rejected": -0.5652573108673096, "step": 3140 }, { "epoch": 0.5427291523087526, "grad_norm": 7.94010591506958, "learning_rate": 1.9604222304050074e-07, "logits/chosen": -2.7010350227355957, "logits/rejected": -2.673696517944336, "logps/chosen": -96.3143310546875, "logps/rejected": -106.46876525878906, "loss": 0.6503, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4194537103176117, "rewards/margins": 0.10921194404363632, "rewards/rejected": -0.5286656618118286, "step": 3150 }, { "epoch": 0.5444521019986216, "grad_norm": 8.393180847167969, "learning_rate": 1.9598618544125184e-07, "logits/chosen": -2.630312204360962, "logits/rejected": -2.6040008068084717, "logps/chosen": -95.65312957763672, "logps/rejected": -107.04170989990234, "loss": 0.6428, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.41506823897361755, "rewards/margins": 0.12901504337787628, "rewards/rejected": -0.544083297252655, "step": 3160 }, { "epoch": 0.5461750516884907, "grad_norm": 6.8773908615112305, "learning_rate": 1.9592976203479266e-07, "logits/chosen": -2.6714940071105957, "logits/rejected": -2.641033411026001, "logps/chosen": -98.60674285888672, "logps/rejected": -106.5346908569336, "loss": 0.6407, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.42717328667640686, "rewards/margins": 0.143313467502594, "rewards/rejected": -0.5704867243766785, "step": 3170 }, { "epoch": 0.5478980013783598, "grad_norm": 6.960276126861572, "learning_rate": 1.9587295304791164e-07, "logits/chosen": -2.725450038909912, "logits/rejected": -2.690068244934082, "logps/chosen": -95.82426452636719, "logps/rejected": -105.81490325927734, "loss": 0.6465, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.42567867040634155, "rewards/margins": 0.11819012463092804, "rewards/rejected": -0.5438688397407532, "step": 3180 }, { "epoch": 0.5496209510682288, "grad_norm": 7.8112053871154785, "learning_rate": 1.95815758708947e-07, "logits/chosen": -2.7114315032958984, "logits/rejected": -2.7045350074768066, "logps/chosen": -93.44751739501953, "logps/rejected": -117.18695068359375, "loss": 0.6181, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.4114983081817627, "rewards/margins": 0.19371816515922546, "rewards/rejected": -0.6052165031433105, "step": 3190 }, { "epoch": 0.5513439007580979, "grad_norm": 7.148016929626465, "learning_rate": 1.957581792477859e-07, "logits/chosen": -2.6281604766845703, "logits/rejected": -2.6143136024475098, "logps/chosen": -96.54183197021484, "logps/rejected": -106.0041275024414, "loss": 0.6437, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.4121316373348236, "rewards/margins": 0.12658736109733582, "rewards/rejected": -0.5387190580368042, "step": 3200 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -2.760655403137207, "eval_logits/rejected": -2.754596710205078, "eval_logps/chosen": -94.27955627441406, "eval_logps/rejected": -105.83222198486328, "eval_loss": 0.6645947098731995, "eval_rewards/accuracies": 0.6119888424873352, "eval_rewards/chosen": -0.3556765913963318, "eval_rewards/margins": 0.0708443745970726, "eval_rewards/rejected": -0.426520973443985, "eval_runtime": 382.9537, "eval_samples_per_second": 11.239, "eval_steps_per_second": 1.405, "step": 3200 }, { "epoch": 0.5530668504479669, "grad_norm": 9.388651847839355, "learning_rate": 1.9570021489586344e-07, "logits/chosen": -2.5501980781555176, "logits/rejected": -2.5273725986480713, "logps/chosen": -99.22215270996094, "logps/rejected": -108.93840026855469, "loss": 0.6486, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4527076780796051, "rewards/margins": 0.11170338094234467, "rewards/rejected": -0.5644110441207886, "step": 3210 }, { "epoch": 0.554789800137836, "grad_norm": 7.1553521156311035, "learning_rate": 1.956418658861617e-07, "logits/chosen": -2.5821588039398193, "logits/rejected": -2.5769202709198, "logps/chosen": -94.36402130126953, "logps/rejected": -108.30653381347656, "loss": 0.6499, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4402020573616028, "rewards/margins": 0.11403479427099228, "rewards/rejected": -0.5542367696762085, "step": 3220 }, { "epoch": 0.556512749827705, "grad_norm": 8.118807792663574, "learning_rate": 1.9558313245320888e-07, "logits/chosen": -2.63601016998291, "logits/rejected": -2.6221766471862793, "logps/chosen": -98.5857162475586, "logps/rejected": -110.01104736328125, "loss": 0.6584, "rewards/accuracies": 0.625, "rewards/chosen": -0.4442833960056305, "rewards/margins": 0.09203995019197464, "rewards/rejected": -0.5363233685493469, "step": 3230 }, { "epoch": 0.5582356995175741, "grad_norm": 7.089800834655762, "learning_rate": 1.955240148330784e-07, "logits/chosen": -2.7510554790496826, "logits/rejected": -2.7306642532348633, "logps/chosen": -105.6661376953125, "logps/rejected": -110.5372543334961, "loss": 0.6636, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4942947030067444, "rewards/margins": 0.08564119040966034, "rewards/rejected": -0.5799359083175659, "step": 3240 }, { "epoch": 0.5599586492074431, "grad_norm": 8.295072555541992, "learning_rate": 1.954645132633878e-07, "logits/chosen": -2.637881278991699, "logits/rejected": -2.6184113025665283, "logps/chosen": -95.91987609863281, "logps/rejected": -110.17707824707031, "loss": 0.6399, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4334731101989746, "rewards/margins": 0.13521215319633484, "rewards/rejected": -0.5686852335929871, "step": 3250 }, { "epoch": 0.5616815988973122, "grad_norm": 7.124377727508545, "learning_rate": 1.9540462798329788e-07, "logits/chosen": -2.655236005783081, "logits/rejected": -2.6376397609710693, "logps/chosen": -94.26342010498047, "logps/rejected": -111.944091796875, "loss": 0.6335, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.42727988958358765, "rewards/margins": 0.16477133333683014, "rewards/rejected": -0.5920512080192566, "step": 3260 }, { "epoch": 0.5634045485871813, "grad_norm": 7.825984477996826, "learning_rate": 1.953443592335118e-07, "logits/chosen": -2.70601224899292, "logits/rejected": -2.7016124725341797, "logps/chosen": -102.71690368652344, "logps/rejected": -119.76414489746094, "loss": 0.6376, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.47587305307388306, "rewards/margins": 0.1428886353969574, "rewards/rejected": -0.6187616586685181, "step": 3270 }, { "epoch": 0.5651274982770503, "grad_norm": 8.646442413330078, "learning_rate": 1.9528370725627393e-07, "logits/chosen": -2.702390670776367, "logits/rejected": -2.6908962726593018, "logps/chosen": -94.289306640625, "logps/rejected": -109.12205505371094, "loss": 0.6474, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.44868603348731995, "rewards/margins": 0.11855638027191162, "rewards/rejected": -0.567242443561554, "step": 3280 }, { "epoch": 0.5668504479669194, "grad_norm": 7.565232276916504, "learning_rate": 1.9522267229536907e-07, "logits/chosen": -2.7394003868103027, "logits/rejected": -2.709407091140747, "logps/chosen": -98.76429748535156, "logps/rejected": -113.52610778808594, "loss": 0.6365, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.45598071813583374, "rewards/margins": 0.14605209231376648, "rewards/rejected": -0.6020327806472778, "step": 3290 }, { "epoch": 0.5685733976567884, "grad_norm": 8.72111701965332, "learning_rate": 1.9516125459612133e-07, "logits/chosen": -2.6218557357788086, "logits/rejected": -2.6021950244903564, "logps/chosen": -105.73250579833984, "logps/rejected": -115.9620590209961, "loss": 0.6523, "rewards/accuracies": 0.625, "rewards/chosen": -0.5081601738929749, "rewards/margins": 0.11569005250930786, "rewards/rejected": -0.6238502264022827, "step": 3300 }, { "epoch": 0.5702963473466575, "grad_norm": 7.255007743835449, "learning_rate": 1.9509945440539328e-07, "logits/chosen": -2.556091070175171, "logits/rejected": -2.5282187461853027, "logps/chosen": -100.8625717163086, "logps/rejected": -112.12745666503906, "loss": 0.6411, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.47763776779174805, "rewards/margins": 0.13664332032203674, "rewards/rejected": -0.6142809987068176, "step": 3310 }, { "epoch": 0.5720192970365265, "grad_norm": 9.09738826751709, "learning_rate": 1.9503727197158475e-07, "logits/chosen": -2.612281560897827, "logits/rejected": -2.5788486003875732, "logps/chosen": -102.8791275024414, "logps/rejected": -110.78157806396484, "loss": 0.6458, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4917148947715759, "rewards/margins": 0.12664994597434998, "rewards/rejected": -0.6183648705482483, "step": 3320 }, { "epoch": 0.5737422467263956, "grad_norm": 7.518438816070557, "learning_rate": 1.949747075446321e-07, "logits/chosen": -2.7091097831726074, "logits/rejected": -2.6815297603607178, "logps/chosen": -99.87444305419922, "logps/rejected": -116.1915512084961, "loss": 0.6286, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4566105008125305, "rewards/margins": 0.16854511201381683, "rewards/rejected": -0.6251556873321533, "step": 3330 }, { "epoch": 0.5754651964162646, "grad_norm": 9.097807884216309, "learning_rate": 1.9491176137600695e-07, "logits/chosen": -2.6520981788635254, "logits/rejected": -2.6339221000671387, "logps/chosen": -106.4759521484375, "logps/rejected": -119.20014953613281, "loss": 0.637, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5056281685829163, "rewards/margins": 0.1465793251991272, "rewards/rejected": -0.6522074341773987, "step": 3340 }, { "epoch": 0.5771881461061337, "grad_norm": 8.62578010559082, "learning_rate": 1.9484843371871538e-07, "logits/chosen": -2.545076370239258, "logits/rejected": -2.530463933944702, "logps/chosen": -105.77069091796875, "logps/rejected": -119.581787109375, "loss": 0.6471, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.49992021918296814, "rewards/margins": 0.12265481799840927, "rewards/rejected": -0.6225749850273132, "step": 3350 }, { "epoch": 0.5789110957960028, "grad_norm": 7.896541118621826, "learning_rate": 1.9478472482729677e-07, "logits/chosen": -2.649341106414795, "logits/rejected": -2.618516445159912, "logps/chosen": -102.70530700683594, "logps/rejected": -112.48616790771484, "loss": 0.6476, "rewards/accuracies": 0.625, "rewards/chosen": -0.4789007604122162, "rewards/margins": 0.12364625930786133, "rewards/rejected": -0.6025470495223999, "step": 3360 }, { "epoch": 0.5806340454858718, "grad_norm": 8.509905815124512, "learning_rate": 1.947206349578229e-07, "logits/chosen": -2.603302240371704, "logits/rejected": -2.594250202178955, "logps/chosen": -95.80339813232422, "logps/rejected": -114.97441101074219, "loss": 0.6297, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.44848114252090454, "rewards/margins": 0.1611265242099762, "rewards/rejected": -0.6096076369285583, "step": 3370 }, { "epoch": 0.5823569951757409, "grad_norm": 8.58442497253418, "learning_rate": 1.9465616436789683e-07, "logits/chosen": -2.685575485229492, "logits/rejected": -2.6578125953674316, "logps/chosen": -100.21031951904297, "logps/rejected": -107.1775131225586, "loss": 0.6475, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.44749441742897034, "rewards/margins": 0.12284334003925323, "rewards/rejected": -0.57033771276474, "step": 3380 }, { "epoch": 0.5840799448656099, "grad_norm": 8.441810607910156, "learning_rate": 1.9459131331665183e-07, "logits/chosen": -2.606628656387329, "logits/rejected": -2.582200765609741, "logps/chosen": -104.09676361083984, "logps/rejected": -112.6886215209961, "loss": 0.6599, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5036908388137817, "rewards/margins": 0.10885962098836899, "rewards/rejected": -0.6125504970550537, "step": 3390 }, { "epoch": 0.585802894555479, "grad_norm": 7.212275981903076, "learning_rate": 1.9452608206475044e-07, "logits/chosen": -2.643218755722046, "logits/rejected": -2.6058707237243652, "logps/chosen": -99.18804168701172, "logps/rejected": -114.4738540649414, "loss": 0.6285, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.46331319212913513, "rewards/margins": 0.16757920384407043, "rewards/rejected": -0.6308923959732056, "step": 3400 }, { "epoch": 0.587525844245348, "grad_norm": 10.034046173095703, "learning_rate": 1.9446047087438342e-07, "logits/chosen": -2.543471097946167, "logits/rejected": -2.515993118286133, "logps/chosen": -92.6207275390625, "logps/rejected": -103.92476654052734, "loss": 0.6382, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4304746985435486, "rewards/margins": 0.13809053599834442, "rewards/rejected": -0.5685652494430542, "step": 3410 }, { "epoch": 0.5892487939352171, "grad_norm": 9.963957786560059, "learning_rate": 1.9439448000926859e-07, "logits/chosen": -2.567805528640747, "logits/rejected": -2.5440430641174316, "logps/chosen": -96.545166015625, "logps/rejected": -112.3097915649414, "loss": 0.6323, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.43777838349342346, "rewards/margins": 0.15233097970485687, "rewards/rejected": -0.5901094675064087, "step": 3420 }, { "epoch": 0.5909717436250862, "grad_norm": 7.235942363739014, "learning_rate": 1.9432810973464988e-07, "logits/chosen": -2.6433088779449463, "logits/rejected": -2.6230881214141846, "logps/chosen": -97.69755554199219, "logps/rejected": -115.63801574707031, "loss": 0.6353, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.46356654167175293, "rewards/margins": 0.14649716019630432, "rewards/rejected": -0.6100637316703796, "step": 3430 }, { "epoch": 0.5926946933149552, "grad_norm": 10.129012107849121, "learning_rate": 1.942613603172962e-07, "logits/chosen": -2.561520576477051, "logits/rejected": -2.5459980964660645, "logps/chosen": -104.91939544677734, "logps/rejected": -118.30328369140625, "loss": 0.6436, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5234652161598206, "rewards/margins": 0.13155731558799744, "rewards/rejected": -0.6550225019454956, "step": 3440 }, { "epoch": 0.5944176430048242, "grad_norm": 10.185026168823242, "learning_rate": 1.9419423202550037e-07, "logits/chosen": -2.780900478363037, "logits/rejected": -2.7437312602996826, "logps/chosen": -112.07551574707031, "logps/rejected": -121.580810546875, "loss": 0.643, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5269603133201599, "rewards/margins": 0.13937778770923615, "rewards/rejected": -0.6663380861282349, "step": 3450 }, { "epoch": 0.5961405926946933, "grad_norm": 9.492935180664062, "learning_rate": 1.9412672512907812e-07, "logits/chosen": -2.5769448280334473, "logits/rejected": -2.5640530586242676, "logps/chosen": -98.23934173583984, "logps/rejected": -112.2879409790039, "loss": 0.6454, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.460599422454834, "rewards/margins": 0.12627866864204407, "rewards/rejected": -0.5868780612945557, "step": 3460 }, { "epoch": 0.5978635423845624, "grad_norm": 8.52971363067627, "learning_rate": 1.940588398993669e-07, "logits/chosen": -2.652571439743042, "logits/rejected": -2.6381473541259766, "logps/chosen": -107.09454345703125, "logps/rejected": -122.2138900756836, "loss": 0.6393, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5128433704376221, "rewards/margins": 0.1659374237060547, "rewards/rejected": -0.6787808537483215, "step": 3470 }, { "epoch": 0.5995864920744314, "grad_norm": 8.951239585876465, "learning_rate": 1.9399057660922482e-07, "logits/chosen": -2.583982467651367, "logits/rejected": -2.5551466941833496, "logps/chosen": -109.26493072509766, "logps/rejected": -121.90692138671875, "loss": 0.6445, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5415034294128418, "rewards/margins": 0.15873458981513977, "rewards/rejected": -0.7002379298210144, "step": 3480 }, { "epoch": 0.6013094417643005, "grad_norm": 12.640382766723633, "learning_rate": 1.939219355330296e-07, "logits/chosen": -2.5890984535217285, "logits/rejected": -2.5623886585235596, "logps/chosen": -108.95796203613281, "logps/rejected": -125.61396789550781, "loss": 0.631, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5468857884407043, "rewards/margins": 0.164025217294693, "rewards/rejected": -0.7109109163284302, "step": 3490 }, { "epoch": 0.6030323914541695, "grad_norm": 9.470446586608887, "learning_rate": 1.9385291694667742e-07, "logits/chosen": -2.538329601287842, "logits/rejected": -2.531400442123413, "logps/chosen": -106.4929428100586, "logps/rejected": -120.96571350097656, "loss": 0.6506, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5449027419090271, "rewards/margins": 0.11813728511333466, "rewards/rejected": -0.663040041923523, "step": 3500 }, { "epoch": 0.6047553411440386, "grad_norm": 7.284107685089111, "learning_rate": 1.9378352112758182e-07, "logits/chosen": -2.47944974899292, "logits/rejected": -2.4565722942352295, "logps/chosen": -108.488525390625, "logps/rejected": -117.57743835449219, "loss": 0.6528, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5492126941680908, "rewards/margins": 0.11511759459972382, "rewards/rejected": -0.6643303036689758, "step": 3510 }, { "epoch": 0.6064782908339077, "grad_norm": 9.556658744812012, "learning_rate": 1.937137483546726e-07, "logits/chosen": -2.514282464981079, "logits/rejected": -2.495223045349121, "logps/chosen": -107.26896667480469, "logps/rejected": -116.1639404296875, "loss": 0.6492, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5193467140197754, "rewards/margins": 0.12163758277893066, "rewards/rejected": -0.6409842371940613, "step": 3520 }, { "epoch": 0.6082012405237767, "grad_norm": 10.154839515686035, "learning_rate": 1.936435989083947e-07, "logits/chosen": -2.6612658500671387, "logits/rejected": -2.6214406490325928, "logps/chosen": -107.97154235839844, "logps/rejected": -120.12506103515625, "loss": 0.6297, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5201407670974731, "rewards/margins": 0.171969935297966, "rewards/rejected": -0.6921107172966003, "step": 3530 }, { "epoch": 0.6099241902136457, "grad_norm": 9.493666648864746, "learning_rate": 1.9357307307070706e-07, "logits/chosen": -2.5017638206481934, "logits/rejected": -2.480271816253662, "logps/chosen": -107.59370422363281, "logps/rejected": -121.32795715332031, "loss": 0.6428, "rewards/accuracies": 0.625, "rewards/chosen": -0.5368096828460693, "rewards/margins": 0.14079925417900085, "rewards/rejected": -0.677608847618103, "step": 3540 }, { "epoch": 0.6116471399035148, "grad_norm": 9.391371726989746, "learning_rate": 1.9350217112508145e-07, "logits/chosen": -2.5833280086517334, "logits/rejected": -2.5742995738983154, "logps/chosen": -112.63236999511719, "logps/rejected": -122.10465240478516, "loss": 0.6552, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5806922912597656, "rewards/margins": 0.11484189331531525, "rewards/rejected": -0.6955341696739197, "step": 3550 }, { "epoch": 0.6133700895933839, "grad_norm": 8.803918838500977, "learning_rate": 1.934308933565014e-07, "logits/chosen": -2.5108482837677, "logits/rejected": -2.486712694168091, "logps/chosen": -109.10555267333984, "logps/rejected": -121.26512145996094, "loss": 0.65, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5620559453964233, "rewards/margins": 0.125113382935524, "rewards/rejected": -0.6871693134307861, "step": 3560 }, { "epoch": 0.6150930392832529, "grad_norm": 10.215635299682617, "learning_rate": 1.9335924005146106e-07, "logits/chosen": -2.635714530944824, "logits/rejected": -2.6039419174194336, "logps/chosen": -117.2509765625, "logps/rejected": -122.74668884277344, "loss": 0.6779, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.603782594203949, "rewards/margins": 0.09146207571029663, "rewards/rejected": -0.6952446699142456, "step": 3570 }, { "epoch": 0.616815988973122, "grad_norm": 8.089561462402344, "learning_rate": 1.9328721149796392e-07, "logits/chosen": -2.5828123092651367, "logits/rejected": -2.5639376640319824, "logps/chosen": -112.38520812988281, "logps/rejected": -126.69588470458984, "loss": 0.6375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5419041514396667, "rewards/margins": 0.150339737534523, "rewards/rejected": -0.6922438740730286, "step": 3580 }, { "epoch": 0.618538938662991, "grad_norm": 8.862298011779785, "learning_rate": 1.9321480798552184e-07, "logits/chosen": -2.5745413303375244, "logits/rejected": -2.5591864585876465, "logps/chosen": -113.92549133300781, "logps/rejected": -127.76713562011719, "loss": 0.6494, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5950049161911011, "rewards/margins": 0.14280660450458527, "rewards/rejected": -0.7378115653991699, "step": 3590 }, { "epoch": 0.6202618883528601, "grad_norm": 10.446368217468262, "learning_rate": 1.9314202980515378e-07, "logits/chosen": -2.5526838302612305, "logits/rejected": -2.5284199714660645, "logps/chosen": -107.75823974609375, "logps/rejected": -116.94807434082031, "loss": 0.6516, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5311118364334106, "rewards/margins": 0.12276077270507812, "rewards/rejected": -0.653872549533844, "step": 3600 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -2.6612462997436523, "eval_logits/rejected": -2.6547348499298096, "eval_logps/chosen": -99.96431732177734, "eval_logps/rejected": -112.99539184570312, "eval_loss": 0.6601956486701965, "eval_rewards/accuracies": 0.6177973747253418, "eval_rewards/chosen": -0.4125242233276367, "eval_rewards/margins": 0.0856284648180008, "eval_rewards/rejected": -0.4981527328491211, "eval_runtime": 382.6214, "eval_samples_per_second": 11.249, "eval_steps_per_second": 1.406, "step": 3600 }, { "epoch": 0.6219848380427292, "grad_norm": 8.14836597442627, "learning_rate": 1.9306887724938452e-07, "logits/chosen": -2.520163059234619, "logits/rejected": -2.5044562816619873, "logps/chosen": -107.54792785644531, "logps/rejected": -117.61376953125, "loss": 0.6539, "rewards/accuracies": 0.625, "rewards/chosen": -0.5605472922325134, "rewards/margins": 0.12117090076208115, "rewards/rejected": -0.6817181706428528, "step": 3610 }, { "epoch": 0.6237077877325982, "grad_norm": 9.616267204284668, "learning_rate": 1.929953506122438e-07, "logits/chosen": -2.4756884574890137, "logits/rejected": -2.4509835243225098, "logps/chosen": -101.85519409179688, "logps/rejected": -119.7405014038086, "loss": 0.6128, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.4634733200073242, "rewards/margins": 0.197960764169693, "rewards/rejected": -0.6614340543746948, "step": 3620 }, { "epoch": 0.6254307374224672, "grad_norm": 9.02615737915039, "learning_rate": 1.9292145018926478e-07, "logits/chosen": -2.6216444969177246, "logits/rejected": -2.614626407623291, "logps/chosen": -102.73152923583984, "logps/rejected": -127.5108871459961, "loss": 0.6145, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5159082412719727, "rewards/margins": 0.2023492306470871, "rewards/rejected": -0.718257486820221, "step": 3630 }, { "epoch": 0.6271536871123363, "grad_norm": 10.349617004394531, "learning_rate": 1.9284717627748308e-07, "logits/chosen": -2.575312376022339, "logits/rejected": -2.561131000518799, "logps/chosen": -104.86790466308594, "logps/rejected": -123.4863052368164, "loss": 0.6254, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.49824801087379456, "rewards/margins": 0.17481979727745056, "rewards/rejected": -0.6730678081512451, "step": 3640 }, { "epoch": 0.6288766368022054, "grad_norm": 11.060629844665527, "learning_rate": 1.9277252917543557e-07, "logits/chosen": -2.5221903324127197, "logits/rejected": -2.524599552154541, "logps/chosen": -102.74385833740234, "logps/rejected": -124.79080963134766, "loss": 0.6239, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5166233777999878, "rewards/margins": 0.18595512211322784, "rewards/rejected": -0.7025784850120544, "step": 3650 }, { "epoch": 0.6305995864920745, "grad_norm": 9.572096824645996, "learning_rate": 1.92697509183159e-07, "logits/chosen": -2.557387351989746, "logits/rejected": -2.5276927947998047, "logps/chosen": -113.19233703613281, "logps/rejected": -132.50997924804688, "loss": 0.6212, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5933358073234558, "rewards/margins": 0.19637107849121094, "rewards/rejected": -0.7897068858146667, "step": 3660 }, { "epoch": 0.6323225361819435, "grad_norm": 10.522073745727539, "learning_rate": 1.926221166021891e-07, "logits/chosen": -2.5148067474365234, "logits/rejected": -2.4904632568359375, "logps/chosen": -114.75852966308594, "logps/rejected": -127.93223571777344, "loss": 0.6534, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6190735697746277, "rewards/margins": 0.13477261364459991, "rewards/rejected": -0.7538461089134216, "step": 3670 }, { "epoch": 0.6340454858718125, "grad_norm": 9.894550323486328, "learning_rate": 1.9254635173555895e-07, "logits/chosen": -2.541055679321289, "logits/rejected": -2.5070159435272217, "logps/chosen": -116.0576400756836, "logps/rejected": -125.56229400634766, "loss": 0.6492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6138181686401367, "rewards/margins": 0.14079603552818298, "rewards/rejected": -0.7546142339706421, "step": 3680 }, { "epoch": 0.6357684355616816, "grad_norm": 11.63806438446045, "learning_rate": 1.9247021488779817e-07, "logits/chosen": -2.482922077178955, "logits/rejected": -2.4876933097839355, "logps/chosen": -107.5826416015625, "logps/rejected": -136.26242065429688, "loss": 0.6146, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5548892021179199, "rewards/margins": 0.21679215133190155, "rewards/rejected": -0.7716813087463379, "step": 3690 }, { "epoch": 0.6374913852515507, "grad_norm": 10.26459789276123, "learning_rate": 1.923937063649315e-07, "logits/chosen": -2.519047260284424, "logits/rejected": -2.4944159984588623, "logps/chosen": -115.27989196777344, "logps/rejected": -134.55308532714844, "loss": 0.6318, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5768235921859741, "rewards/margins": 0.17483854293823242, "rewards/rejected": -0.7516621351242065, "step": 3700 }, { "epoch": 0.6392143349414197, "grad_norm": 10.61504077911377, "learning_rate": 1.9231682647447757e-07, "logits/chosen": -2.5426928997039795, "logits/rejected": -2.5145814418792725, "logps/chosen": -113.14909362792969, "logps/rejected": -125.3436050415039, "loss": 0.644, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5928040742874146, "rewards/margins": 0.1502991020679474, "rewards/rejected": -0.7431031465530396, "step": 3710 }, { "epoch": 0.6409372846312887, "grad_norm": 8.570870399475098, "learning_rate": 1.9223957552544762e-07, "logits/chosen": -2.553067207336426, "logits/rejected": -2.5354630947113037, "logps/chosen": -106.05216979980469, "logps/rejected": -128.2598419189453, "loss": 0.6134, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5415540933609009, "rewards/margins": 0.20719122886657715, "rewards/rejected": -0.748745322227478, "step": 3720 }, { "epoch": 0.6426602343211578, "grad_norm": 10.781991958618164, "learning_rate": 1.9216195382834445e-07, "logits/chosen": -2.514934539794922, "logits/rejected": -2.4916977882385254, "logps/chosen": -113.71707916259766, "logps/rejected": -129.2454833984375, "loss": 0.6254, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5919945240020752, "rewards/margins": 0.19340413808822632, "rewards/rejected": -0.7853986024856567, "step": 3730 }, { "epoch": 0.6443831840110269, "grad_norm": 11.913883209228516, "learning_rate": 1.9208396169516092e-07, "logits/chosen": -2.5093138217926025, "logits/rejected": -2.484928607940674, "logps/chosen": -113.5191650390625, "logps/rejected": -135.16090393066406, "loss": 0.6264, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.600841224193573, "rewards/margins": 0.18667316436767578, "rewards/rejected": -0.7875143885612488, "step": 3740 }, { "epoch": 0.646106133700896, "grad_norm": 9.88403606414795, "learning_rate": 1.9200559943937895e-07, "logits/chosen": -2.5667591094970703, "logits/rejected": -2.5409789085388184, "logps/chosen": -115.15147399902344, "logps/rejected": -131.19515991210938, "loss": 0.6395, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6183281540870667, "rewards/margins": 0.16283147037029266, "rewards/rejected": -0.7811595797538757, "step": 3750 }, { "epoch": 0.647829083390765, "grad_norm": 14.569170951843262, "learning_rate": 1.91926867375968e-07, "logits/chosen": -2.5101914405822754, "logits/rejected": -2.5057315826416016, "logps/chosen": -124.47206115722656, "logps/rejected": -136.46078491210938, "loss": 0.6619, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6859841346740723, "rewards/margins": 0.11802731454372406, "rewards/rejected": -0.8040115237236023, "step": 3760 }, { "epoch": 0.649552033080634, "grad_norm": 11.981207847595215, "learning_rate": 1.9184776582138408e-07, "logits/chosen": -2.497622013092041, "logits/rejected": -2.4783318042755127, "logps/chosen": -116.83412170410156, "logps/rejected": -131.28945922851562, "loss": 0.6503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6329242587089539, "rewards/margins": 0.13447043299674988, "rewards/rejected": -0.7673946619033813, "step": 3770 }, { "epoch": 0.6512749827705031, "grad_norm": 8.799779891967773, "learning_rate": 1.9176829509356817e-07, "logits/chosen": -2.5327377319335938, "logits/rejected": -2.4975881576538086, "logps/chosen": -116.349853515625, "logps/rejected": -129.2434844970703, "loss": 0.6319, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6227124929428101, "rewards/margins": 0.17571154236793518, "rewards/rejected": -0.7984240651130676, "step": 3780 }, { "epoch": 0.6529979324603722, "grad_norm": 9.702862739562988, "learning_rate": 1.9168845551194526e-07, "logits/chosen": -2.4920172691345215, "logits/rejected": -2.465822219848633, "logps/chosen": -113.99241638183594, "logps/rejected": -130.11317443847656, "loss": 0.6261, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6000428199768066, "rewards/margins": 0.1911824643611908, "rewards/rejected": -0.7912253737449646, "step": 3790 }, { "epoch": 0.6547208821502413, "grad_norm": 14.17463207244873, "learning_rate": 1.916082473974228e-07, "logits/chosen": -2.5084242820739746, "logits/rejected": -2.4781010150909424, "logps/chosen": -117.53678894042969, "logps/rejected": -131.0016326904297, "loss": 0.6248, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5984351634979248, "rewards/margins": 0.18290486931800842, "rewards/rejected": -0.7813400030136108, "step": 3800 }, { "epoch": 0.6564438318401102, "grad_norm": 14.620065689086914, "learning_rate": 1.9152767107238957e-07, "logits/chosen": -2.520502805709839, "logits/rejected": -2.48211407661438, "logps/chosen": -124.7673110961914, "logps/rejected": -135.38389587402344, "loss": 0.6401, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6729920506477356, "rewards/margins": 0.16020779311656952, "rewards/rejected": -0.8331997990608215, "step": 3810 }, { "epoch": 0.6581667815299793, "grad_norm": 10.79979419708252, "learning_rate": 1.9144672686071437e-07, "logits/chosen": -2.4938557147979736, "logits/rejected": -2.4761714935302734, "logps/chosen": -115.97245025634766, "logps/rejected": -133.52169799804688, "loss": 0.6249, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6187423467636108, "rewards/margins": 0.1855977475643158, "rewards/rejected": -0.8043401837348938, "step": 3820 }, { "epoch": 0.6598897312198484, "grad_norm": 12.743062973022461, "learning_rate": 1.913654150877446e-07, "logits/chosen": -2.5338752269744873, "logits/rejected": -2.492554187774658, "logps/chosen": -125.34283447265625, "logps/rejected": -134.8113250732422, "loss": 0.6326, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6666573286056519, "rewards/margins": 0.16785266995429993, "rewards/rejected": -0.8345099687576294, "step": 3830 }, { "epoch": 0.6616126809097175, "grad_norm": 9.466545104980469, "learning_rate": 1.9128373608030513e-07, "logits/chosen": -2.4681429862976074, "logits/rejected": -2.4535365104675293, "logps/chosen": -116.46714782714844, "logps/rejected": -146.2787322998047, "loss": 0.6043, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6238076090812683, "rewards/margins": 0.24866314232349396, "rewards/rejected": -0.8724706768989563, "step": 3840 }, { "epoch": 0.6633356305995864, "grad_norm": 10.31405258178711, "learning_rate": 1.9120169016669683e-07, "logits/chosen": -2.5796279907226562, "logits/rejected": -2.564216136932373, "logps/chosen": -122.28788757324219, "logps/rejected": -137.9895782470703, "loss": 0.624, "rewards/accuracies": 0.625, "rewards/chosen": -0.6498677134513855, "rewards/margins": 0.19236890971660614, "rewards/rejected": -0.8422366380691528, "step": 3850 }, { "epoch": 0.6650585802894555, "grad_norm": 12.240978240966797, "learning_rate": 1.9111927767669531e-07, "logits/chosen": -2.577061176300049, "logits/rejected": -2.553990125656128, "logps/chosen": -124.99577331542969, "logps/rejected": -136.03915405273438, "loss": 0.6561, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7081534266471863, "rewards/margins": 0.12281046062707901, "rewards/rejected": -0.8309639096260071, "step": 3860 }, { "epoch": 0.6667815299793246, "grad_norm": 14.770833015441895, "learning_rate": 1.9103649894154965e-07, "logits/chosen": -2.467924118041992, "logits/rejected": -2.4463753700256348, "logps/chosen": -125.22386169433594, "logps/rejected": -144.5591278076172, "loss": 0.6129, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.663710355758667, "rewards/margins": 0.22524094581604004, "rewards/rejected": -0.8889514207839966, "step": 3870 }, { "epoch": 0.6685044796691937, "grad_norm": 11.515402793884277, "learning_rate": 1.90953354293981e-07, "logits/chosen": -2.509796142578125, "logits/rejected": -2.514373540878296, "logps/chosen": -122.02311706542969, "logps/rejected": -137.66896057128906, "loss": 0.6515, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.711441159248352, "rewards/margins": 0.13033203780651093, "rewards/rejected": -0.8417732119560242, "step": 3880 }, { "epoch": 0.6702274293590628, "grad_norm": 10.92282772064209, "learning_rate": 1.908698440681812e-07, "logits/chosen": -2.5480504035949707, "logits/rejected": -2.524164915084839, "logps/chosen": -118.02398681640625, "logps/rejected": -135.6583251953125, "loss": 0.6239, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6383758783340454, "rewards/margins": 0.196736142039299, "rewards/rejected": -0.8351120948791504, "step": 3890 }, { "epoch": 0.6719503790489317, "grad_norm": 11.879098892211914, "learning_rate": 1.9078596859981163e-07, "logits/chosen": -2.5341193675994873, "logits/rejected": -2.484837532043457, "logps/chosen": -123.85850524902344, "logps/rejected": -134.4687042236328, "loss": 0.6352, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6702991724014282, "rewards/margins": 0.1717648208141327, "rewards/rejected": -0.8420640230178833, "step": 3900 }, { "epoch": 0.6736733287388008, "grad_norm": 10.429671287536621, "learning_rate": 1.9070172822600152e-07, "logits/chosen": -2.5086872577667236, "logits/rejected": -2.4919564723968506, "logps/chosen": -124.47847747802734, "logps/rejected": -147.82156372070312, "loss": 0.6062, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6883711814880371, "rewards/margins": 0.24378545582294464, "rewards/rejected": -0.9321566820144653, "step": 3910 }, { "epoch": 0.6753962784286699, "grad_norm": 10.995051383972168, "learning_rate": 1.90617123285347e-07, "logits/chosen": -2.5535805225372314, "logits/rejected": -2.5131783485412598, "logps/chosen": -124.58341979980469, "logps/rejected": -136.53872680664062, "loss": 0.625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6591401100158691, "rewards/margins": 0.20176732540130615, "rewards/rejected": -0.8609074354171753, "step": 3920 }, { "epoch": 0.677119228118539, "grad_norm": 14.10901927947998, "learning_rate": 1.9053215411790945e-07, "logits/chosen": -2.5570924282073975, "logits/rejected": -2.551347017288208, "logps/chosen": -119.05476379394531, "logps/rejected": -138.29513549804688, "loss": 0.6376, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6779693961143494, "rewards/margins": 0.17716078460216522, "rewards/rejected": -0.8551301956176758, "step": 3930 }, { "epoch": 0.6788421778084079, "grad_norm": 13.000458717346191, "learning_rate": 1.9044682106521428e-07, "logits/chosen": -2.404637336730957, "logits/rejected": -2.387035846710205, "logps/chosen": -117.692626953125, "logps/rejected": -136.10415649414062, "loss": 0.6281, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6419261693954468, "rewards/margins": 0.1821344792842865, "rewards/rejected": -0.8240607380867004, "step": 3940 }, { "epoch": 0.680565127498277, "grad_norm": 11.085746765136719, "learning_rate": 1.903611244702494e-07, "logits/chosen": -2.3949618339538574, "logits/rejected": -2.3535492420196533, "logps/chosen": -121.14100646972656, "logps/rejected": -135.42344665527344, "loss": 0.6269, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6566764712333679, "rewards/margins": 0.1880909651517868, "rewards/rejected": -0.8447673916816711, "step": 3950 }, { "epoch": 0.6822880771881461, "grad_norm": 11.905729293823242, "learning_rate": 1.9027506467746404e-07, "logits/chosen": -2.492781639099121, "logits/rejected": -2.489943027496338, "logps/chosen": -120.00762939453125, "logps/rejected": -153.7581024169922, "loss": 0.6004, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6730445623397827, "rewards/margins": 0.2705869674682617, "rewards/rejected": -0.943631649017334, "step": 3960 }, { "epoch": 0.6840110268780152, "grad_norm": 12.589399337768555, "learning_rate": 1.901886420327672e-07, "logits/chosen": -2.486377477645874, "logits/rejected": -2.4649770259857178, "logps/chosen": -122.77647399902344, "logps/rejected": -142.85531616210938, "loss": 0.6216, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.687035858631134, "rewards/margins": 0.21148011088371277, "rewards/rejected": -0.8985159993171692, "step": 3970 }, { "epoch": 0.6857339765678843, "grad_norm": 11.657106399536133, "learning_rate": 1.9010185688352643e-07, "logits/chosen": -2.3694095611572266, "logits/rejected": -2.3498167991638184, "logps/chosen": -123.49932861328125, "logps/rejected": -147.36175537109375, "loss": 0.6094, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7015981674194336, "rewards/margins": 0.24004308879375458, "rewards/rejected": -0.9416411519050598, "step": 3980 }, { "epoch": 0.6874569262577532, "grad_norm": 10.493300437927246, "learning_rate": 1.9001470957856615e-07, "logits/chosen": -2.459582805633545, "logits/rejected": -2.4476981163024902, "logps/chosen": -125.8658676147461, "logps/rejected": -145.31004333496094, "loss": 0.639, "rewards/accuracies": 0.625, "rewards/chosen": -0.7264918088912964, "rewards/margins": 0.1739417016506195, "rewards/rejected": -0.9004335403442383, "step": 3990 }, { "epoch": 0.6891798759476223, "grad_norm": 11.909307479858398, "learning_rate": 1.8992720046816664e-07, "logits/chosen": -2.5346808433532715, "logits/rejected": -2.5022826194763184, "logps/chosen": -130.37509155273438, "logps/rejected": -145.04820251464844, "loss": 0.6264, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7317085266113281, "rewards/margins": 0.1911483258008957, "rewards/rejected": -0.9228569269180298, "step": 4000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -2.5324018001556396, "eval_logits/rejected": -2.5251705646514893, "eval_logps/chosen": -117.29443359375, "eval_logps/rejected": -133.6785125732422, "eval_loss": 0.6514254808425903, "eval_rewards/accuracies": 0.63150554895401, "eval_rewards/chosen": -0.5858253836631775, "eval_rewards/margins": 0.11915845423936844, "eval_rewards/rejected": -0.7049838900566101, "eval_runtime": 384.7783, "eval_samples_per_second": 11.186, "eval_steps_per_second": 1.398, "step": 4000 }, { "epoch": 0.6909028256374914, "grad_norm": 10.766988754272461, "learning_rate": 1.8983932990406229e-07, "logits/chosen": -2.4037575721740723, "logits/rejected": -2.391706943511963, "logps/chosen": -120.1826400756836, "logps/rejected": -148.49423217773438, "loss": 0.6115, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7216323614120483, "rewards/margins": 0.23313522338867188, "rewards/rejected": -0.9547675251960754, "step": 4010 }, { "epoch": 0.6926257753273605, "grad_norm": 12.511350631713867, "learning_rate": 1.8975109823944039e-07, "logits/chosen": -2.412825107574463, "logits/rejected": -2.400951385498047, "logps/chosen": -122.57711029052734, "logps/rejected": -142.6312713623047, "loss": 0.6284, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.703916072845459, "rewards/margins": 0.1911393702030182, "rewards/rejected": -0.8950554132461548, "step": 4020 }, { "epoch": 0.6943487250172296, "grad_norm": 10.538165092468262, "learning_rate": 1.8966250582893953e-07, "logits/chosen": -2.4241652488708496, "logits/rejected": -2.4017891883850098, "logps/chosen": -120.19538879394531, "logps/rejected": -143.36984252929688, "loss": 0.6065, "rewards/accuracies": 0.75, "rewards/chosen": -0.6414139270782471, "rewards/margins": 0.23436374962329865, "rewards/rejected": -0.8757776021957397, "step": 4030 }, { "epoch": 0.6960716747070985, "grad_norm": 13.03740406036377, "learning_rate": 1.8957355302864842e-07, "logits/chosen": -2.489417314529419, "logits/rejected": -2.4761695861816406, "logps/chosen": -125.21080017089844, "logps/rejected": -147.3764190673828, "loss": 0.6079, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6901857852935791, "rewards/margins": 0.24308910965919495, "rewards/rejected": -0.933275043964386, "step": 4040 }, { "epoch": 0.6977946243969676, "grad_norm": 11.657258987426758, "learning_rate": 1.894842401961042e-07, "logits/chosen": -2.3974480628967285, "logits/rejected": -2.378633737564087, "logps/chosen": -116.8731689453125, "logps/rejected": -145.9193115234375, "loss": 0.6029, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6630774736404419, "rewards/margins": 0.26788225769996643, "rewards/rejected": -0.9309597015380859, "step": 4050 }, { "epoch": 0.6995175740868367, "grad_norm": 12.036991119384766, "learning_rate": 1.8939456769029122e-07, "logits/chosen": -2.394946575164795, "logits/rejected": -2.3649299144744873, "logps/chosen": -135.7313995361328, "logps/rejected": -149.5013885498047, "loss": 0.634, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7937172651290894, "rewards/margins": 0.179894357919693, "rewards/rejected": -0.9736116528511047, "step": 4060 }, { "epoch": 0.7012405237767058, "grad_norm": 11.258556365966797, "learning_rate": 1.8930453587163949e-07, "logits/chosen": -2.38643217086792, "logits/rejected": -2.364267349243164, "logps/chosen": -130.9126739501953, "logps/rejected": -153.92453002929688, "loss": 0.6031, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.770807683467865, "rewards/margins": 0.23965749144554138, "rewards/rejected": -1.0104650259017944, "step": 4070 }, { "epoch": 0.7029634734665747, "grad_norm": 11.272856712341309, "learning_rate": 1.8921414510202317e-07, "logits/chosen": -2.3410181999206543, "logits/rejected": -2.3312878608703613, "logps/chosen": -126.29512023925781, "logps/rejected": -151.48361206054688, "loss": 0.6124, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7439530491828918, "rewards/margins": 0.23066206276416779, "rewards/rejected": -0.9746150970458984, "step": 4080 }, { "epoch": 0.7046864231564438, "grad_norm": 11.280725479125977, "learning_rate": 1.8912339574475925e-07, "logits/chosen": -2.4383339881896973, "logits/rejected": -2.4078497886657715, "logps/chosen": -126.5016860961914, "logps/rejected": -149.1123504638672, "loss": 0.6162, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7102442383766174, "rewards/margins": 0.23562097549438477, "rewards/rejected": -0.9458651542663574, "step": 4090 }, { "epoch": 0.7064093728463129, "grad_norm": 15.935708045959473, "learning_rate": 1.8903228816460598e-07, "logits/chosen": -2.373109817504883, "logits/rejected": -2.346482753753662, "logps/chosen": -124.05479431152344, "logps/rejected": -145.33522033691406, "loss": 0.6066, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7121135592460632, "rewards/margins": 0.24879172444343567, "rewards/rejected": -0.9609053730964661, "step": 4100 }, { "epoch": 0.708132322536182, "grad_norm": 11.416752815246582, "learning_rate": 1.8894082272776156e-07, "logits/chosen": -2.3425259590148926, "logits/rejected": -2.331468105316162, "logps/chosen": -138.4196014404297, "logps/rejected": -147.4912872314453, "loss": 0.664, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8235718011856079, "rewards/margins": 0.13630910217761993, "rewards/rejected": -0.9598809480667114, "step": 4110 }, { "epoch": 0.709855272226051, "grad_norm": 11.454225540161133, "learning_rate": 1.8884899980186248e-07, "logits/chosen": -2.3015871047973633, "logits/rejected": -2.3006720542907715, "logps/chosen": -130.07545471191406, "logps/rejected": -158.34634399414062, "loss": 0.616, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8011056184768677, "rewards/margins": 0.23448316752910614, "rewards/rejected": -1.0355888605117798, "step": 4120 }, { "epoch": 0.71157822191592, "grad_norm": 12.713473320007324, "learning_rate": 1.8875681975598207e-07, "logits/chosen": -2.391552686691284, "logits/rejected": -2.370664119720459, "logps/chosen": -126.0354232788086, "logps/rejected": -149.48287963867188, "loss": 0.5988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7206748723983765, "rewards/margins": 0.2488236129283905, "rewards/rejected": -0.9694985151290894, "step": 4130 }, { "epoch": 0.7133011716057891, "grad_norm": 11.148635864257812, "learning_rate": 1.8866428296062916e-07, "logits/chosen": -2.3824214935302734, "logits/rejected": -2.358895778656006, "logps/chosen": -138.39901733398438, "logps/rejected": -146.14036560058594, "loss": 0.6656, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.8287339210510254, "rewards/margins": 0.12563897669315338, "rewards/rejected": -0.954372763633728, "step": 4140 }, { "epoch": 0.7150241212956582, "grad_norm": 11.041769981384277, "learning_rate": 1.8857138978774647e-07, "logits/chosen": -2.4081053733825684, "logits/rejected": -2.3844470977783203, "logps/chosen": -136.33596801757812, "logps/rejected": -152.49148559570312, "loss": 0.6193, "rewards/accuracies": 0.625, "rewards/chosen": -0.7849429249763489, "rewards/margins": 0.22173313796520233, "rewards/rejected": -1.0066759586334229, "step": 4150 }, { "epoch": 0.7167470709855273, "grad_norm": 13.000495910644531, "learning_rate": 1.8847814061070917e-07, "logits/chosen": -2.3095197677612305, "logits/rejected": -2.2821574211120605, "logps/chosen": -130.6693878173828, "logps/rejected": -151.56906127929688, "loss": 0.6095, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7899118661880493, "rewards/margins": 0.23466038703918457, "rewards/rejected": -1.0245721340179443, "step": 4160 }, { "epoch": 0.7184700206753962, "grad_norm": 13.73046875, "learning_rate": 1.8838453580432328e-07, "logits/chosen": -2.356701374053955, "logits/rejected": -2.347198963165283, "logps/chosen": -126.84205627441406, "logps/rejected": -151.11383056640625, "loss": 0.6175, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7544603943824768, "rewards/margins": 0.2208242416381836, "rewards/rejected": -0.97528475522995, "step": 4170 }, { "epoch": 0.7201929703652653, "grad_norm": 13.165528297424316, "learning_rate": 1.882905757448243e-07, "logits/chosen": -2.337136745452881, "logits/rejected": -2.3184616565704346, "logps/chosen": -131.48397827148438, "logps/rejected": -157.45358276367188, "loss": 0.608, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7613840699195862, "rewards/margins": 0.2502623498439789, "rewards/rejected": -1.0116463899612427, "step": 4180 }, { "epoch": 0.7219159200551344, "grad_norm": 14.414857864379883, "learning_rate": 1.8819626080987567e-07, "logits/chosen": -2.326634168624878, "logits/rejected": -2.3124537467956543, "logps/chosen": -134.14418029785156, "logps/rejected": -160.92916870117188, "loss": 0.6139, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8307603597640991, "rewards/margins": 0.2533273994922638, "rewards/rejected": -1.08408784866333, "step": 4190 }, { "epoch": 0.7236388697450035, "grad_norm": 13.043105125427246, "learning_rate": 1.881015913785671e-07, "logits/chosen": -2.3766627311706543, "logits/rejected": -2.361971378326416, "logps/chosen": -134.19444274902344, "logps/rejected": -143.70457458496094, "loss": 0.6605, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.7848796248435974, "rewards/margins": 0.12627723813056946, "rewards/rejected": -0.9111568331718445, "step": 4200 }, { "epoch": 0.7253618194348725, "grad_norm": 14.597874641418457, "learning_rate": 1.880065678314133e-07, "logits/chosen": -2.3638992309570312, "logits/rejected": -2.343294620513916, "logps/chosen": -138.7797393798828, "logps/rejected": -149.86465454101562, "loss": 0.6656, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8348299264907837, "rewards/margins": 0.13606666028499603, "rewards/rejected": -0.9708966016769409, "step": 4210 }, { "epoch": 0.7270847691247415, "grad_norm": 11.53864860534668, "learning_rate": 1.8791119055035221e-07, "logits/chosen": -2.243114948272705, "logits/rejected": -2.224510431289673, "logps/chosen": -124.7751693725586, "logps/rejected": -145.98992919921875, "loss": 0.6256, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7255649566650391, "rewards/margins": 0.2072831392288208, "rewards/rejected": -0.9328479766845703, "step": 4220 }, { "epoch": 0.7288077188146106, "grad_norm": 11.614204406738281, "learning_rate": 1.8781545991874362e-07, "logits/chosen": -2.4114151000976562, "logits/rejected": -2.39131236076355, "logps/chosen": -132.70994567871094, "logps/rejected": -155.8234405517578, "loss": 0.6129, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7740388512611389, "rewards/margins": 0.24319903552532196, "rewards/rejected": -1.017237901687622, "step": 4230 }, { "epoch": 0.7305306685044797, "grad_norm": 12.244373321533203, "learning_rate": 1.8771937632136753e-07, "logits/chosen": -2.320223331451416, "logits/rejected": -2.2961888313293457, "logps/chosen": -129.49264526367188, "logps/rejected": -157.07241821289062, "loss": 0.6121, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7474852800369263, "rewards/margins": 0.2636989653110504, "rewards/rejected": -1.0111842155456543, "step": 4240 }, { "epoch": 0.7322536181943488, "grad_norm": 15.389056205749512, "learning_rate": 1.8762294014442275e-07, "logits/chosen": -2.3215842247009277, "logits/rejected": -2.305537223815918, "logps/chosen": -127.04652404785156, "logps/rejected": -145.0000457763672, "loss": 0.6248, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7199771404266357, "rewards/margins": 0.20333869755268097, "rewards/rejected": -0.9233158230781555, "step": 4250 }, { "epoch": 0.7339765678842178, "grad_norm": 13.800179481506348, "learning_rate": 1.8752615177552515e-07, "logits/chosen": -2.301645040512085, "logits/rejected": -2.287914276123047, "logps/chosen": -126.49116516113281, "logps/rejected": -148.6343536376953, "loss": 0.6291, "rewards/accuracies": 0.625, "rewards/chosen": -0.7196300625801086, "rewards/margins": 0.21817633509635925, "rewards/rejected": -0.9378064274787903, "step": 4260 }, { "epoch": 0.7356995175740868, "grad_norm": 12.630337715148926, "learning_rate": 1.8742901160370629e-07, "logits/chosen": -2.3138930797576904, "logits/rejected": -2.2898266315460205, "logps/chosen": -116.51651763916016, "logps/rejected": -137.6942138671875, "loss": 0.6192, "rewards/accuracies": 0.6875, "rewards/chosen": -0.649080753326416, "rewards/margins": 0.2223900854587555, "rewards/rejected": -0.8714709281921387, "step": 4270 }, { "epoch": 0.7374224672639559, "grad_norm": 14.70296573638916, "learning_rate": 1.8733152001941162e-07, "logits/chosen": -2.3043904304504395, "logits/rejected": -2.280935764312744, "logps/chosen": -118.82975769042969, "logps/rejected": -144.32821655273438, "loss": 0.6148, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6711223125457764, "rewards/margins": 0.2516058087348938, "rewards/rejected": -0.9227281808853149, "step": 4280 }, { "epoch": 0.739145416953825, "grad_norm": 13.53441047668457, "learning_rate": 1.872336774144992e-07, "logits/chosen": -2.315648317337036, "logits/rejected": -2.2937397956848145, "logps/chosen": -130.9637451171875, "logps/rejected": -150.52886962890625, "loss": 0.6304, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7572386860847473, "rewards/margins": 0.23190510272979736, "rewards/rejected": -0.9891437292098999, "step": 4290 }, { "epoch": 0.740868366643694, "grad_norm": 10.746184349060059, "learning_rate": 1.8713548418223797e-07, "logits/chosen": -2.346978187561035, "logits/rejected": -2.333615779876709, "logps/chosen": -122.73223876953125, "logps/rejected": -146.6703338623047, "loss": 0.6128, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7017590403556824, "rewards/margins": 0.23429390788078308, "rewards/rejected": -0.9360530972480774, "step": 4300 }, { "epoch": 0.742591316333563, "grad_norm": 13.176383972167969, "learning_rate": 1.8703694071730612e-07, "logits/chosen": -2.283977508544922, "logits/rejected": -2.2644619941711426, "logps/chosen": -127.2890625, "logps/rejected": -144.92465209960938, "loss": 0.6235, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7323711514472961, "rewards/margins": 0.2010023295879364, "rewards/rejected": -0.9333734512329102, "step": 4310 }, { "epoch": 0.7443142660234321, "grad_norm": 10.389663696289062, "learning_rate": 1.8693804741578964e-07, "logits/chosen": -2.370084047317505, "logits/rejected": -2.341132402420044, "logps/chosen": -133.34310913085938, "logps/rejected": -160.8234100341797, "loss": 0.5774, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7470110654830933, "rewards/margins": 0.31445130705833435, "rewards/rejected": -1.061462163925171, "step": 4320 }, { "epoch": 0.7460372157133012, "grad_norm": 13.39374828338623, "learning_rate": 1.8683880467518055e-07, "logits/chosen": -2.2760860919952393, "logits/rejected": -2.246753692626953, "logps/chosen": -132.9408416748047, "logps/rejected": -146.30142211914062, "loss": 0.6354, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7745003700256348, "rewards/margins": 0.19058740139007568, "rewards/rejected": -0.9650877714157104, "step": 4330 }, { "epoch": 0.7477601654031703, "grad_norm": 12.95674991607666, "learning_rate": 1.8673921289437554e-07, "logits/chosen": -2.2742409706115723, "logits/rejected": -2.2547922134399414, "logps/chosen": -120.75160217285156, "logps/rejected": -148.62075805664062, "loss": 0.6065, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.717552900314331, "rewards/margins": 0.26367202401161194, "rewards/rejected": -0.9812248945236206, "step": 4340 }, { "epoch": 0.7494831150930393, "grad_norm": 13.893998146057129, "learning_rate": 1.8663927247367407e-07, "logits/chosen": -2.28212308883667, "logits/rejected": -2.2669732570648193, "logps/chosen": -120.74113464355469, "logps/rejected": -147.4693145751953, "loss": 0.6028, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6973130106925964, "rewards/margins": 0.2454204559326172, "rewards/rejected": -0.9427334666252136, "step": 4350 }, { "epoch": 0.7512060647829083, "grad_norm": 10.430903434753418, "learning_rate": 1.865389838147771e-07, "logits/chosen": -2.3136277198791504, "logits/rejected": -2.2893054485321045, "logps/chosen": -134.9327850341797, "logps/rejected": -152.28640747070312, "loss": 0.6473, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8126112818717957, "rewards/margins": 0.19375188648700714, "rewards/rejected": -1.0063631534576416, "step": 4360 }, { "epoch": 0.7529290144727774, "grad_norm": 12.522747993469238, "learning_rate": 1.864383473207852e-07, "logits/chosen": -2.3265700340270996, "logits/rejected": -2.3115689754486084, "logps/chosen": -131.7525634765625, "logps/rejected": -153.37396240234375, "loss": 0.6252, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7679398655891418, "rewards/margins": 0.21781209111213684, "rewards/rejected": -0.9857519268989563, "step": 4370 }, { "epoch": 0.7546519641626465, "grad_norm": 11.650630950927734, "learning_rate": 1.8633736339619702e-07, "logits/chosen": -2.360729932785034, "logits/rejected": -2.338486909866333, "logps/chosen": -128.01376342773438, "logps/rejected": -149.15093994140625, "loss": 0.6255, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7371021509170532, "rewards/margins": 0.23261289298534393, "rewards/rejected": -0.9697149991989136, "step": 4380 }, { "epoch": 0.7563749138525155, "grad_norm": 12.601362228393555, "learning_rate": 1.8623603244690772e-07, "logits/chosen": -2.3034393787384033, "logits/rejected": -2.2892696857452393, "logps/chosen": -126.63724517822266, "logps/rejected": -150.16226196289062, "loss": 0.614, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7340060472488403, "rewards/margins": 0.2326422929763794, "rewards/rejected": -0.966648280620575, "step": 4390 }, { "epoch": 0.7580978635423845, "grad_norm": 13.8900146484375, "learning_rate": 1.861343548802073e-07, "logits/chosen": -2.3105218410491943, "logits/rejected": -2.2936127185821533, "logps/chosen": -132.64756774902344, "logps/rejected": -154.4315185546875, "loss": 0.6109, "rewards/accuracies": 0.6875, "rewards/chosen": -0.757422924041748, "rewards/margins": 0.23268385231494904, "rewards/rejected": -0.9901068806648254, "step": 4400 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -2.412363290786743, "eval_logits/rejected": -2.404085397720337, "eval_logps/chosen": -120.88501739501953, "eval_logps/rejected": -139.04837036132812, "eval_loss": 0.6474176049232483, "eval_rewards/accuracies": 0.6312732100486755, "eval_rewards/chosen": -0.6217312216758728, "eval_rewards/margins": 0.1369512975215912, "eval_rewards/rejected": -0.7586825489997864, "eval_runtime": 384.2018, "eval_samples_per_second": 11.202, "eval_steps_per_second": 1.4, "step": 4400 }, { "epoch": 0.7598208132322536, "grad_norm": 13.378119468688965, "learning_rate": 1.8603233110477884e-07, "logits/chosen": -2.268012762069702, "logits/rejected": -2.2514090538024902, "logps/chosen": -134.5967559814453, "logps/rejected": -156.74072265625, "loss": 0.6342, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7970510721206665, "rewards/margins": 0.22004970908164978, "rewards/rejected": -1.0171008110046387, "step": 4410 }, { "epoch": 0.7615437629221227, "grad_norm": 14.908544540405273, "learning_rate": 1.8592996153069715e-07, "logits/chosen": -2.3738467693328857, "logits/rejected": -2.33332896232605, "logps/chosen": -133.03750610351562, "logps/rejected": -147.9682159423828, "loss": 0.6339, "rewards/accuracies": 0.625, "rewards/chosen": -0.7822148203849792, "rewards/margins": 0.19276809692382812, "rewards/rejected": -0.9749830365180969, "step": 4420 }, { "epoch": 0.7632667126119917, "grad_norm": 13.850336074829102, "learning_rate": 1.8582724656942683e-07, "logits/chosen": -2.265064001083374, "logits/rejected": -2.233612298965454, "logps/chosen": -129.28952026367188, "logps/rejected": -146.53509521484375, "loss": 0.6389, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7550686597824097, "rewards/margins": 0.19707316160202026, "rewards/rejected": -0.9521418809890747, "step": 4430 }, { "epoch": 0.7649896623018608, "grad_norm": 14.160219192504883, "learning_rate": 1.8572418663382074e-07, "logits/chosen": -2.32669734954834, "logits/rejected": -2.2965919971466064, "logps/chosen": -141.6306915283203, "logps/rejected": -165.29336547851562, "loss": 0.6015, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8222866058349609, "rewards/margins": 0.28041699528694153, "rewards/rejected": -1.10270357131958, "step": 4440 }, { "epoch": 0.7667126119917298, "grad_norm": 12.661294937133789, "learning_rate": 1.8562078213811833e-07, "logits/chosen": -2.204195737838745, "logits/rejected": -2.192131757736206, "logps/chosen": -128.37451171875, "logps/rejected": -152.55081176757812, "loss": 0.6088, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7265924215316772, "rewards/margins": 0.25704535841941833, "rewards/rejected": -0.9836376905441284, "step": 4450 }, { "epoch": 0.7684355616815989, "grad_norm": 12.920085906982422, "learning_rate": 1.8551703349794406e-07, "logits/chosen": -2.31882643699646, "logits/rejected": -2.3102874755859375, "logps/chosen": -126.8609848022461, "logps/rejected": -150.83541870117188, "loss": 0.6177, "rewards/accuracies": 0.625, "rewards/chosen": -0.7412899136543274, "rewards/margins": 0.24046726524829865, "rewards/rejected": -0.9817571640014648, "step": 4460 }, { "epoch": 0.770158511371468, "grad_norm": 12.629962921142578, "learning_rate": 1.854129411303055e-07, "logits/chosen": -2.367927312850952, "logits/rejected": -2.33909010887146, "logps/chosen": -130.4448699951172, "logps/rejected": -145.80923461914062, "loss": 0.6269, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7452287077903748, "rewards/margins": 0.2104768306016922, "rewards/rejected": -0.9557055234909058, "step": 4470 }, { "epoch": 0.771881461061337, "grad_norm": 18.944599151611328, "learning_rate": 1.8530850545359193e-07, "logits/chosen": -2.384981393814087, "logits/rejected": -2.3748526573181152, "logps/chosen": -131.04205322265625, "logps/rejected": -155.17080688476562, "loss": 0.6267, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8003128170967102, "rewards/margins": 0.2289293110370636, "rewards/rejected": -1.0292421579360962, "step": 4480 }, { "epoch": 0.7736044107512061, "grad_norm": 15.200418472290039, "learning_rate": 1.8520372688757245e-07, "logits/chosen": -2.2746169567108154, "logits/rejected": -2.24771785736084, "logps/chosen": -130.04827880859375, "logps/rejected": -150.07791137695312, "loss": 0.6338, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7890744805335999, "rewards/margins": 0.21333102881908417, "rewards/rejected": -1.0024055242538452, "step": 4490 }, { "epoch": 0.7753273604410751, "grad_norm": 12.581011772155762, "learning_rate": 1.8509860585339446e-07, "logits/chosen": -2.285783290863037, "logits/rejected": -2.265993595123291, "logps/chosen": -135.75062561035156, "logps/rejected": -159.28656005859375, "loss": 0.6125, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7941531538963318, "rewards/margins": 0.26907879114151, "rewards/rejected": -1.0632318258285522, "step": 4500 }, { "epoch": 0.7770503101309442, "grad_norm": 17.554412841796875, "learning_rate": 1.8499314277358167e-07, "logits/chosen": -2.3803791999816895, "logits/rejected": -2.3529820442199707, "logps/chosen": -133.89425659179688, "logps/rejected": -163.8766632080078, "loss": 0.6036, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.804618239402771, "rewards/margins": 0.2779730558395386, "rewards/rejected": -1.0825912952423096, "step": 4510 }, { "epoch": 0.7787732598208132, "grad_norm": 14.42990779876709, "learning_rate": 1.848873380720329e-07, "logits/chosen": -2.336332321166992, "logits/rejected": -2.314807653427124, "logps/chosen": -135.14236450195312, "logps/rejected": -152.7788543701172, "loss": 0.6363, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8254430890083313, "rewards/margins": 0.2015375792980194, "rewards/rejected": -1.0269806385040283, "step": 4520 }, { "epoch": 0.7804962095106823, "grad_norm": 14.05636978149414, "learning_rate": 1.8478119217401985e-07, "logits/chosen": -2.318453788757324, "logits/rejected": -2.3068490028381348, "logps/chosen": -128.04605102539062, "logps/rejected": -146.30068969726562, "loss": 0.6409, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7427166700363159, "rewards/margins": 0.1854708194732666, "rewards/rejected": -0.9281874895095825, "step": 4530 }, { "epoch": 0.7822191592005513, "grad_norm": 19.649593353271484, "learning_rate": 1.8467470550618574e-07, "logits/chosen": -2.2332546710968018, "logits/rejected": -2.2186062335968018, "logps/chosen": -125.69355773925781, "logps/rejected": -147.77760314941406, "loss": 0.6121, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7296175360679626, "rewards/margins": 0.2388092577457428, "rewards/rejected": -0.9684268236160278, "step": 4540 }, { "epoch": 0.7839421088904204, "grad_norm": 16.796405792236328, "learning_rate": 1.8456787849654347e-07, "logits/chosen": -2.3180625438690186, "logits/rejected": -2.293102741241455, "logps/chosen": -131.48257446289062, "logps/rejected": -154.91592407226562, "loss": 0.6116, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7512409090995789, "rewards/margins": 0.2640026807785034, "rewards/rejected": -1.015243649482727, "step": 4550 }, { "epoch": 0.7856650585802895, "grad_norm": 13.212952613830566, "learning_rate": 1.844607115744739e-07, "logits/chosen": -2.2414722442626953, "logits/rejected": -2.203615665435791, "logps/chosen": -133.26007080078125, "logps/rejected": -159.36962890625, "loss": 0.596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7704302072525024, "rewards/margins": 0.2923758625984192, "rewards/rejected": -1.0628061294555664, "step": 4560 }, { "epoch": 0.7873880082701585, "grad_norm": 19.492271423339844, "learning_rate": 1.8435320517072408e-07, "logits/chosen": -2.24662709236145, "logits/rejected": -2.2220511436462402, "logps/chosen": -140.2692108154297, "logps/rejected": -164.09080505371094, "loss": 0.6398, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8917884826660156, "rewards/margins": 0.23158666491508484, "rewards/rejected": -1.1233750581741333, "step": 4570 }, { "epoch": 0.7891109579600276, "grad_norm": 14.586845397949219, "learning_rate": 1.842453597174057e-07, "logits/chosen": -2.214069128036499, "logits/rejected": -2.1866512298583984, "logps/chosen": -127.54747009277344, "logps/rejected": -150.66769409179688, "loss": 0.6064, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7328786253929138, "rewards/margins": 0.25618213415145874, "rewards/rejected": -0.9890606999397278, "step": 4580 }, { "epoch": 0.7908339076498966, "grad_norm": 14.758131980895996, "learning_rate": 1.841371756479931e-07, "logits/chosen": -2.3069567680358887, "logits/rejected": -2.2812230587005615, "logps/chosen": -140.52098083496094, "logps/rejected": -155.68270874023438, "loss": 0.6581, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8563669323921204, "rewards/margins": 0.17839594185352325, "rewards/rejected": -1.034762978553772, "step": 4590 }, { "epoch": 0.7925568573397657, "grad_norm": 14.336187362670898, "learning_rate": 1.8402865339732171e-07, "logits/chosen": -2.2076327800750732, "logits/rejected": -2.1841959953308105, "logps/chosen": -134.13148498535156, "logps/rejected": -170.171875, "loss": 0.572, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8076637387275696, "rewards/margins": 0.35054484009742737, "rewards/rejected": -1.1582086086273193, "step": 4600 }, { "epoch": 0.7942798070296347, "grad_norm": 15.751847267150879, "learning_rate": 1.8391979340158627e-07, "logits/chosen": -2.219324827194214, "logits/rejected": -2.2080159187316895, "logps/chosen": -141.0472412109375, "logps/rejected": -158.53619384765625, "loss": 0.649, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8990927934646606, "rewards/margins": 0.17679741978645325, "rewards/rejected": -1.075890302658081, "step": 4610 }, { "epoch": 0.7960027567195038, "grad_norm": 15.971353530883789, "learning_rate": 1.8381059609833904e-07, "logits/chosen": -2.2696173191070557, "logits/rejected": -2.243626594543457, "logps/chosen": -134.827392578125, "logps/rejected": -163.3299102783203, "loss": 0.5942, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7910867929458618, "rewards/margins": 0.2870883345603943, "rewards/rejected": -1.0781750679016113, "step": 4620 }, { "epoch": 0.7977257064093728, "grad_norm": 14.739949226379395, "learning_rate": 1.83701061926488e-07, "logits/chosen": -2.2830615043640137, "logits/rejected": -2.247520923614502, "logps/chosen": -144.0731658935547, "logps/rejected": -171.52520751953125, "loss": 0.6025, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8678949475288391, "rewards/margins": 0.2814059257507324, "rewards/rejected": -1.1493009328842163, "step": 4630 }, { "epoch": 0.7994486560992419, "grad_norm": 17.518966674804688, "learning_rate": 1.8359119132629522e-07, "logits/chosen": -2.2724461555480957, "logits/rejected": -2.2560017108917236, "logps/chosen": -151.77694702148438, "logps/rejected": -177.51280212402344, "loss": 0.6233, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9702235460281372, "rewards/margins": 0.24939396977424622, "rewards/rejected": -1.219617486000061, "step": 4640 }, { "epoch": 0.801171605789111, "grad_norm": 16.50297737121582, "learning_rate": 1.8348098473937498e-07, "logits/chosen": -2.2693591117858887, "logits/rejected": -2.2382216453552246, "logps/chosen": -145.84854125976562, "logps/rejected": -167.67819213867188, "loss": 0.6194, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9169775247573853, "rewards/margins": 0.252015620470047, "rewards/rejected": -1.1689932346343994, "step": 4650 }, { "epoch": 0.80289455547898, "grad_norm": 13.565765380859375, "learning_rate": 1.8337044260869195e-07, "logits/chosen": -2.2698116302490234, "logits/rejected": -2.2529296875, "logps/chosen": -135.17910766601562, "logps/rejected": -159.684326171875, "loss": 0.6098, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8087183833122253, "rewards/margins": 0.24533787369728088, "rewards/rejected": -1.0540562868118286, "step": 4660 }, { "epoch": 0.8046175051688491, "grad_norm": 15.313465118408203, "learning_rate": 1.8325956537855964e-07, "logits/chosen": -2.2522294521331787, "logits/rejected": -2.228555202484131, "logps/chosen": -133.0584259033203, "logps/rejected": -157.33486938476562, "loss": 0.6001, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8063106536865234, "rewards/margins": 0.276062548160553, "rewards/rejected": -1.0823732614517212, "step": 4670 }, { "epoch": 0.8063404548587181, "grad_norm": 14.039936065673828, "learning_rate": 1.8314835349463834e-07, "logits/chosen": -2.249237537384033, "logits/rejected": -2.2180287837982178, "logps/chosen": -136.5815887451172, "logps/rejected": -163.4698944091797, "loss": 0.6178, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8409102559089661, "rewards/margins": 0.27688902616500854, "rewards/rejected": -1.1177994012832642, "step": 4680 }, { "epoch": 0.8080634045485872, "grad_norm": 13.222373008728027, "learning_rate": 1.8303680740393354e-07, "logits/chosen": -2.2840068340301514, "logits/rejected": -2.2594456672668457, "logps/chosen": -142.51416015625, "logps/rejected": -176.42428588867188, "loss": 0.5895, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8675103187561035, "rewards/margins": 0.3537306487560272, "rewards/rejected": -1.2212409973144531, "step": 4690 }, { "epoch": 0.8097863542384562, "grad_norm": 16.832406997680664, "learning_rate": 1.829249275547939e-07, "logits/chosen": -2.2416279315948486, "logits/rejected": -2.227914333343506, "logps/chosen": -143.00843811035156, "logps/rejected": -179.38323974609375, "loss": 0.6022, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9127408862113953, "rewards/margins": 0.32419341802597046, "rewards/rejected": -1.2369344234466553, "step": 4700 }, { "epoch": 0.8115093039283253, "grad_norm": 17.441082000732422, "learning_rate": 1.8281271439690972e-07, "logits/chosen": -2.2823708057403564, "logits/rejected": -2.2493019104003906, "logps/chosen": -148.35629272460938, "logps/rejected": -171.28524780273438, "loss": 0.6138, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9040901064872742, "rewards/margins": 0.2849331796169281, "rewards/rejected": -1.1890233755111694, "step": 4710 }, { "epoch": 0.8132322536181944, "grad_norm": 15.347607612609863, "learning_rate": 1.8270016838131098e-07, "logits/chosen": -2.170727491378784, "logits/rejected": -2.140810489654541, "logps/chosen": -144.2772216796875, "logps/rejected": -172.93899536132812, "loss": 0.5901, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8747490048408508, "rewards/margins": 0.3303028643131256, "rewards/rejected": -1.2050518989562988, "step": 4720 }, { "epoch": 0.8149552033080634, "grad_norm": 18.199466705322266, "learning_rate": 1.825872899603655e-07, "logits/chosen": -2.1585171222686768, "logits/rejected": -2.132734537124634, "logps/chosen": -146.39041137695312, "logps/rejected": -168.12551879882812, "loss": 0.6366, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9264416694641113, "rewards/margins": 0.24258160591125488, "rewards/rejected": -1.1690232753753662, "step": 4730 }, { "epoch": 0.8166781529979324, "grad_norm": 13.123265266418457, "learning_rate": 1.824740795877772e-07, "logits/chosen": -2.236109972000122, "logits/rejected": -2.2206127643585205, "logps/chosen": -134.59371948242188, "logps/rejected": -167.7301483154297, "loss": 0.5892, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8015215992927551, "rewards/margins": 0.3370732367038727, "rewards/rejected": -1.1385948657989502, "step": 4740 }, { "epoch": 0.8184011026878015, "grad_norm": 14.976317405700684, "learning_rate": 1.8236053771858428e-07, "logits/chosen": -2.2023322582244873, "logits/rejected": -2.186938524246216, "logps/chosen": -141.57130432128906, "logps/rejected": -165.6098175048828, "loss": 0.612, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8617923855781555, "rewards/margins": 0.24837076663970947, "rewards/rejected": -1.1101632118225098, "step": 4750 }, { "epoch": 0.8201240523776706, "grad_norm": 17.25830078125, "learning_rate": 1.8224666480915732e-07, "logits/chosen": -2.2000133991241455, "logits/rejected": -2.175978660583496, "logps/chosen": -141.7084197998047, "logps/rejected": -169.6337127685547, "loss": 0.6029, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8923633694648743, "rewards/margins": 0.28839367628097534, "rewards/rejected": -1.1807570457458496, "step": 4760 }, { "epoch": 0.8218470020675396, "grad_norm": 12.114882469177246, "learning_rate": 1.8213246131719746e-07, "logits/chosen": -2.258085250854492, "logits/rejected": -2.2356557846069336, "logps/chosen": -157.0770721435547, "logps/rejected": -176.11485290527344, "loss": 0.6458, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9748696088790894, "rewards/margins": 0.23428723216056824, "rewards/rejected": -1.2091567516326904, "step": 4770 }, { "epoch": 0.8235699517574087, "grad_norm": 13.910969734191895, "learning_rate": 1.8201792770173462e-07, "logits/chosen": -2.1453006267547607, "logits/rejected": -2.1221261024475098, "logps/chosen": -137.5989990234375, "logps/rejected": -174.49765014648438, "loss": 0.5723, "rewards/accuracies": 0.71875, "rewards/chosen": -0.861288845539093, "rewards/margins": 0.34425026178359985, "rewards/rejected": -1.2055391073226929, "step": 4780 }, { "epoch": 0.8252929014472777, "grad_norm": 13.05284309387207, "learning_rate": 1.8190306442312565e-07, "logits/chosen": -2.2314467430114746, "logits/rejected": -2.208369731903076, "logps/chosen": -151.15231323242188, "logps/rejected": -163.9604034423828, "loss": 0.6475, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9336099624633789, "rewards/margins": 0.1781330555677414, "rewards/rejected": -1.1117430925369263, "step": 4790 }, { "epoch": 0.8270158511371468, "grad_norm": 15.045424461364746, "learning_rate": 1.8178787194305239e-07, "logits/chosen": -2.179490089416504, "logits/rejected": -2.160287857055664, "logps/chosen": -134.5084228515625, "logps/rejected": -158.16839599609375, "loss": 0.6153, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8251282572746277, "rewards/margins": 0.25325918197631836, "rewards/rejected": -1.0783874988555908, "step": 4800 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -2.3301966190338135, "eval_logits/rejected": -2.3205788135528564, "eval_logps/chosen": -129.8305206298828, "eval_logps/rejected": -150.38137817382812, "eval_loss": 0.6432415246963501, "eval_rewards/accuracies": 0.6266263723373413, "eval_rewards/chosen": -0.7111861705780029, "eval_rewards/margins": 0.16082628071308136, "eval_rewards/rejected": -0.8720124363899231, "eval_runtime": 384.3644, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.4, "step": 4800 }, { "epoch": 0.8287388008270159, "grad_norm": 13.558989524841309, "learning_rate": 1.816723507245199e-07, "logits/chosen": -2.1871228218078613, "logits/rejected": -2.151096820831299, "logps/chosen": -139.2497100830078, "logps/rejected": -166.04983520507812, "loss": 0.5912, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.820334255695343, "rewards/margins": 0.31642287969589233, "rewards/rejected": -1.1367571353912354, "step": 4810 }, { "epoch": 0.8304617505168849, "grad_norm": 13.416345596313477, "learning_rate": 1.8155650123185458e-07, "logits/chosen": -2.226797580718994, "logits/rejected": -2.213923931121826, "logps/chosen": -133.97592163085938, "logps/rejected": -164.05328369140625, "loss": 0.6038, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8281854391098022, "rewards/margins": 0.2941720485687256, "rewards/rejected": -1.1223574876785278, "step": 4820 }, { "epoch": 0.832184700206754, "grad_norm": 13.031325340270996, "learning_rate": 1.8144032393070225e-07, "logits/chosen": -2.2374587059020996, "logits/rejected": -2.2140958309173584, "logps/chosen": -139.13052368164062, "logps/rejected": -157.33258056640625, "loss": 0.6539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8555358648300171, "rewards/margins": 0.19151292741298676, "rewards/rejected": -1.047048807144165, "step": 4830 }, { "epoch": 0.833907649896623, "grad_norm": 12.416264533996582, "learning_rate": 1.8132381928802643e-07, "logits/chosen": -2.192883253097534, "logits/rejected": -2.1586222648620605, "logps/chosen": -144.0219268798828, "logps/rejected": -174.36602783203125, "loss": 0.5878, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8785157203674316, "rewards/margins": 0.3410136103630066, "rewards/rejected": -1.219529390335083, "step": 4840 }, { "epoch": 0.8356305995864921, "grad_norm": 14.267515182495117, "learning_rate": 1.8120698777210626e-07, "logits/chosen": -2.24122953414917, "logits/rejected": -2.222362518310547, "logps/chosen": -140.9573211669922, "logps/rejected": -168.92770385742188, "loss": 0.6085, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8682325482368469, "rewards/margins": 0.28492265939712524, "rewards/rejected": -1.1531550884246826, "step": 4850 }, { "epoch": 0.8373535492763611, "grad_norm": 17.367626190185547, "learning_rate": 1.8108982985253472e-07, "logits/chosen": -2.249375104904175, "logits/rejected": -2.2173800468444824, "logps/chosen": -148.92501831054688, "logps/rejected": -161.76766967773438, "loss": 0.638, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9046787023544312, "rewards/margins": 0.20723576843738556, "rewards/rejected": -1.1119145154953003, "step": 4860 }, { "epoch": 0.8390764989662302, "grad_norm": 17.7515869140625, "learning_rate": 1.8097234600021679e-07, "logits/chosen": -2.258147954940796, "logits/rejected": -2.229663133621216, "logps/chosen": -149.1612091064453, "logps/rejected": -175.52978515625, "loss": 0.6017, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9169160723686218, "rewards/margins": 0.3234608769416809, "rewards/rejected": -1.2403769493103027, "step": 4870 }, { "epoch": 0.8407994486560992, "grad_norm": 15.29587459564209, "learning_rate": 1.8085453668736745e-07, "logits/chosen": -2.1662745475769043, "logits/rejected": -2.133847713470459, "logps/chosen": -138.71878051757812, "logps/rejected": -161.3764190673828, "loss": 0.622, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8553798794746399, "rewards/margins": 0.2521751821041107, "rewards/rejected": -1.1075549125671387, "step": 4880 }, { "epoch": 0.8425223983459683, "grad_norm": 14.815744400024414, "learning_rate": 1.8073640238750988e-07, "logits/chosen": -2.2213339805603027, "logits/rejected": -2.1952900886535645, "logps/chosen": -144.75059509277344, "logps/rejected": -175.25985717773438, "loss": 0.5898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8932998776435852, "rewards/margins": 0.32396167516708374, "rewards/rejected": -1.217261552810669, "step": 4890 }, { "epoch": 0.8442453480358374, "grad_norm": 13.8853178024292, "learning_rate": 1.806179435754735e-07, "logits/chosen": -2.196868419647217, "logits/rejected": -2.164306640625, "logps/chosen": -144.29612731933594, "logps/rejected": -169.0148468017578, "loss": 0.6377, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9109374284744263, "rewards/margins": 0.2767733931541443, "rewards/rejected": -1.1877108812332153, "step": 4900 }, { "epoch": 0.8459682977257064, "grad_norm": 15.792616844177246, "learning_rate": 1.804991607273921e-07, "logits/chosen": -2.306757926940918, "logits/rejected": -2.2741053104400635, "logps/chosen": -147.36080932617188, "logps/rejected": -171.72479248046875, "loss": 0.6104, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9102157354354858, "rewards/margins": 0.2826886773109436, "rewards/rejected": -1.1929042339324951, "step": 4910 }, { "epoch": 0.8476912474155754, "grad_norm": 15.132826805114746, "learning_rate": 1.8038005432070183e-07, "logits/chosen": -2.151120662689209, "logits/rejected": -2.1305134296417236, "logps/chosen": -142.0108184814453, "logps/rejected": -177.2761993408203, "loss": 0.5886, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8817094564437866, "rewards/margins": 0.3563971221446991, "rewards/rejected": -1.238106608390808, "step": 4920 }, { "epoch": 0.8494141971054445, "grad_norm": 15.987706184387207, "learning_rate": 1.8026062483413943e-07, "logits/chosen": -2.318948984146118, "logits/rejected": -2.3007540702819824, "logps/chosen": -147.85739135742188, "logps/rejected": -180.6566619873047, "loss": 0.6267, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.947142481803894, "rewards/margins": 0.27939143776893616, "rewards/rejected": -1.2265340089797974, "step": 4930 }, { "epoch": 0.8511371467953136, "grad_norm": 15.91757869720459, "learning_rate": 1.8014087274774018e-07, "logits/chosen": -2.2689530849456787, "logits/rejected": -2.238652467727661, "logps/chosen": -147.04481506347656, "logps/rejected": -176.6737060546875, "loss": 0.5941, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9112297296524048, "rewards/margins": 0.323239266872406, "rewards/rejected": -1.2344690561294556, "step": 4940 }, { "epoch": 0.8528600964851827, "grad_norm": 15.712467193603516, "learning_rate": 1.8002079854283605e-07, "logits/chosen": -2.0711658000946045, "logits/rejected": -2.0532054901123047, "logps/chosen": -141.21932983398438, "logps/rejected": -167.3578338623047, "loss": 0.6104, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8639213442802429, "rewards/margins": 0.2684737741947174, "rewards/rejected": -1.1323951482772827, "step": 4950 }, { "epoch": 0.8545830461750517, "grad_norm": 19.658069610595703, "learning_rate": 1.799004027020537e-07, "logits/chosen": -2.1870884895324707, "logits/rejected": -2.1787643432617188, "logps/chosen": -137.68484497070312, "logps/rejected": -175.98887634277344, "loss": 0.5754, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8543106317520142, "rewards/margins": 0.34138739109039307, "rewards/rejected": -1.1956980228424072, "step": 4960 }, { "epoch": 0.8563059958649207, "grad_norm": 14.821301460266113, "learning_rate": 1.7977968570931262e-07, "logits/chosen": -2.135411500930786, "logits/rejected": -2.118692636489868, "logps/chosen": -145.12217712402344, "logps/rejected": -184.40428161621094, "loss": 0.5729, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9374872446060181, "rewards/margins": 0.387998104095459, "rewards/rejected": -1.3254854679107666, "step": 4970 }, { "epoch": 0.8580289455547898, "grad_norm": 14.007376670837402, "learning_rate": 1.796586480498231e-07, "logits/chosen": -2.1761584281921387, "logits/rejected": -2.161414384841919, "logps/chosen": -148.2333221435547, "logps/rejected": -183.5047607421875, "loss": 0.5911, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9537581205368042, "rewards/margins": 0.3324365019798279, "rewards/rejected": -1.2861945629119873, "step": 4980 }, { "epoch": 0.8597518952446589, "grad_norm": 18.360790252685547, "learning_rate": 1.7953729021008434e-07, "logits/chosen": -2.110150098800659, "logits/rejected": -2.0944671630859375, "logps/chosen": -151.00375366210938, "logps/rejected": -184.45541381835938, "loss": 0.6066, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9921935200691223, "rewards/margins": 0.30348965525627136, "rewards/rejected": -1.2956831455230713, "step": 4990 }, { "epoch": 0.8614748449345279, "grad_norm": 15.967220306396484, "learning_rate": 1.7941561267788245e-07, "logits/chosen": -2.1210286617279053, "logits/rejected": -2.093567371368408, "logps/chosen": -151.39828491210938, "logps/rejected": -182.8577880859375, "loss": 0.5942, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9559431076049805, "rewards/margins": 0.3455808162689209, "rewards/rejected": -1.3015239238739014, "step": 5000 }, { "epoch": 0.8631977946243969, "grad_norm": 17.03342056274414, "learning_rate": 1.7929361594228852e-07, "logits/chosen": -2.1279661655426025, "logits/rejected": -2.0970072746276855, "logps/chosen": -153.00637817382812, "logps/rejected": -183.12176513671875, "loss": 0.5895, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.999555766582489, "rewards/margins": 0.3205220103263855, "rewards/rejected": -1.320077896118164, "step": 5010 }, { "epoch": 0.864920744314266, "grad_norm": 16.8343448638916, "learning_rate": 1.7917130049365672e-07, "logits/chosen": -2.1045081615448, "logits/rejected": -2.074673652648926, "logps/chosen": -152.74563598632812, "logps/rejected": -183.15377807617188, "loss": 0.6032, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.992462158203125, "rewards/margins": 0.3191111385822296, "rewards/rejected": -1.3115732669830322, "step": 5020 }, { "epoch": 0.8666436940041351, "grad_norm": 19.641525268554688, "learning_rate": 1.7904866682362213e-07, "logits/chosen": -2.0697884559631348, "logits/rejected": -2.053832530975342, "logps/chosen": -151.6670379638672, "logps/rejected": -182.6899871826172, "loss": 0.6099, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9904986619949341, "rewards/margins": 0.2968147397041321, "rewards/rejected": -1.2873132228851318, "step": 5030 }, { "epoch": 0.8683666436940042, "grad_norm": 18.75517463684082, "learning_rate": 1.7892571542509896e-07, "logits/chosen": -2.2520158290863037, "logits/rejected": -2.215366840362549, "logps/chosen": -163.31063842773438, "logps/rejected": -180.81187438964844, "loss": 0.6286, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0615317821502686, "rewards/margins": 0.24211840331554413, "rewards/rejected": -1.3036502599716187, "step": 5040 }, { "epoch": 0.8700895933838731, "grad_norm": 17.165555953979492, "learning_rate": 1.7880244679227853e-07, "logits/chosen": -2.1506662368774414, "logits/rejected": -2.1234805583953857, "logps/chosen": -151.9202880859375, "logps/rejected": -183.21421813964844, "loss": 0.6096, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9631596803665161, "rewards/margins": 0.3219161629676819, "rewards/rejected": -1.2850759029388428, "step": 5050 }, { "epoch": 0.8718125430737422, "grad_norm": 21.54743766784668, "learning_rate": 1.7867886142062717e-07, "logits/chosen": -2.0237479209899902, "logits/rejected": -2.013625383377075, "logps/chosen": -146.44781494140625, "logps/rejected": -173.47976684570312, "loss": 0.6347, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9597591161727905, "rewards/margins": 0.2546851933002472, "rewards/rejected": -1.2144443988800049, "step": 5060 }, { "epoch": 0.8735354927636113, "grad_norm": 16.986095428466797, "learning_rate": 1.785549598068844e-07, "logits/chosen": -2.1160712242126465, "logits/rejected": -2.089094638824463, "logps/chosen": -145.29318237304688, "logps/rejected": -166.83309936523438, "loss": 0.6317, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9254485368728638, "rewards/margins": 0.261610746383667, "rewards/rejected": -1.1870592832565308, "step": 5070 }, { "epoch": 0.8752584424534804, "grad_norm": 11.260225296020508, "learning_rate": 1.7843074244906075e-07, "logits/chosen": -2.313349485397339, "logits/rejected": -2.273677349090576, "logps/chosen": -141.82994079589844, "logps/rejected": -171.1905975341797, "loss": 0.5794, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8432556390762329, "rewards/margins": 0.3597295582294464, "rewards/rejected": -1.202985167503357, "step": 5080 }, { "epoch": 0.8769813921433495, "grad_norm": 19.612459182739258, "learning_rate": 1.7830620984643597e-07, "logits/chosen": -2.1832711696624756, "logits/rejected": -2.146843671798706, "logps/chosen": -141.8723907470703, "logps/rejected": -175.12730407714844, "loss": 0.5775, "rewards/accuracies": 0.75, "rewards/chosen": -0.8758166432380676, "rewards/margins": 0.36045199632644653, "rewards/rejected": -1.2362687587738037, "step": 5090 }, { "epoch": 0.8787043418332184, "grad_norm": 21.284860610961914, "learning_rate": 1.7818136249955678e-07, "logits/chosen": -2.020749568939209, "logits/rejected": -1.985507607460022, "logps/chosen": -154.8480987548828, "logps/rejected": -173.80126953125, "loss": 0.6379, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9978119134902954, "rewards/margins": 0.21697509288787842, "rewards/rejected": -1.2147870063781738, "step": 5100 }, { "epoch": 0.8804272915230875, "grad_norm": 15.660050392150879, "learning_rate": 1.7805620091023505e-07, "logits/chosen": -2.145282745361328, "logits/rejected": -2.1087963581085205, "logps/chosen": -161.85922241210938, "logps/rejected": -181.78924560546875, "loss": 0.6592, "rewards/accuracies": 0.59375, "rewards/chosen": -1.03986394405365, "rewards/margins": 0.2396329641342163, "rewards/rejected": -1.2794967889785767, "step": 5110 }, { "epoch": 0.8821502412129566, "grad_norm": 18.590288162231445, "learning_rate": 1.7793072558154573e-07, "logits/chosen": -2.2191407680511475, "logits/rejected": -2.187811851501465, "logps/chosen": -146.4526824951172, "logps/rejected": -164.67807006835938, "loss": 0.6344, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9257931709289551, "rewards/margins": 0.21812419593334198, "rewards/rejected": -1.1439173221588135, "step": 5120 }, { "epoch": 0.8838731909028257, "grad_norm": 15.55690860748291, "learning_rate": 1.778049370178248e-07, "logits/chosen": -2.229480743408203, "logits/rejected": -2.1971240043640137, "logps/chosen": -149.29954528808594, "logps/rejected": -176.12698364257812, "loss": 0.6021, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9573342204093933, "rewards/margins": 0.3041648268699646, "rewards/rejected": -1.2614991664886475, "step": 5130 }, { "epoch": 0.8855961405926946, "grad_norm": 17.51070213317871, "learning_rate": 1.7767883572466726e-07, "logits/chosen": -2.1650819778442383, "logits/rejected": -2.1310112476348877, "logps/chosen": -149.53402709960938, "logps/rejected": -166.9405975341797, "loss": 0.6326, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9171573519706726, "rewards/margins": 0.2225378453731537, "rewards/rejected": -1.139695167541504, "step": 5140 }, { "epoch": 0.8873190902825637, "grad_norm": 16.04876136779785, "learning_rate": 1.7755242220892507e-07, "logits/chosen": -2.190516233444214, "logits/rejected": -2.175175189971924, "logps/chosen": -142.1367950439453, "logps/rejected": -170.53720092773438, "loss": 0.6166, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8900700807571411, "rewards/margins": 0.26656752824783325, "rewards/rejected": -1.1566376686096191, "step": 5150 }, { "epoch": 0.8890420399724328, "grad_norm": 19.764726638793945, "learning_rate": 1.7742569697870512e-07, "logits/chosen": -2.173518419265747, "logits/rejected": -2.1372475624084473, "logps/chosen": -137.65684509277344, "logps/rejected": -161.5216064453125, "loss": 0.6132, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.862400233745575, "rewards/margins": 0.27269431948661804, "rewards/rejected": -1.135094404220581, "step": 5160 }, { "epoch": 0.8907649896623019, "grad_norm": 15.33234691619873, "learning_rate": 1.7729866054336734e-07, "logits/chosen": -2.157505750656128, "logits/rejected": -2.1266400814056396, "logps/chosen": -136.25643920898438, "logps/rejected": -175.56077575683594, "loss": 0.5571, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.8095985651016235, "rewards/margins": 0.4275360703468323, "rewards/rejected": -1.2371346950531006, "step": 5170 }, { "epoch": 0.892487939352171, "grad_norm": 24.214967727661133, "learning_rate": 1.7717131341352235e-07, "logits/chosen": -2.2335801124572754, "logits/rejected": -2.217602252960205, "logps/chosen": -159.39169311523438, "logps/rejected": -182.57766723632812, "loss": 0.6406, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0398868322372437, "rewards/margins": 0.240803524851799, "rewards/rejected": -1.2806904315948486, "step": 5180 }, { "epoch": 0.8942108890420399, "grad_norm": 19.098445892333984, "learning_rate": 1.770436561010297e-07, "logits/chosen": -2.11548113822937, "logits/rejected": -2.0990681648254395, "logps/chosen": -151.03292846679688, "logps/rejected": -174.67657470703125, "loss": 0.6363, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.981927216053009, "rewards/margins": 0.22593578696250916, "rewards/rejected": -1.2078630924224854, "step": 5190 }, { "epoch": 0.895933838731909, "grad_norm": 14.947031021118164, "learning_rate": 1.7691568911899556e-07, "logits/chosen": -2.1943612098693848, "logits/rejected": -2.1697402000427246, "logps/chosen": -144.95828247070312, "logps/rejected": -175.0249786376953, "loss": 0.6107, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9287967681884766, "rewards/margins": 0.28082185983657837, "rewards/rejected": -1.2096188068389893, "step": 5200 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -2.2476398944854736, "eval_logits/rejected": -2.236311197280884, "eval_logps/chosen": -133.41664123535156, "eval_logps/rejected": -155.67408752441406, "eval_loss": 0.6407474279403687, "eval_rewards/accuracies": 0.6349906921386719, "eval_rewards/chosen": -0.747047483921051, "eval_rewards/margins": 0.17789211869239807, "eval_rewards/rejected": -0.9249395728111267, "eval_runtime": 384.3229, "eval_samples_per_second": 11.199, "eval_steps_per_second": 1.4, "step": 5200 }, { "epoch": 0.8976567884217781, "grad_norm": 14.731963157653809, "learning_rate": 1.7678741298177092e-07, "logits/chosen": -2.138547658920288, "logits/rejected": -2.119096279144287, "logps/chosen": -143.7112274169922, "logps/rejected": -167.04486083984375, "loss": 0.6323, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9207198023796082, "rewards/margins": 0.24972346425056458, "rewards/rejected": -1.1704432964324951, "step": 5210 }, { "epoch": 0.8993797381116472, "grad_norm": 13.480340003967285, "learning_rate": 1.766588282049494e-07, "logits/chosen": -2.1872940063476562, "logits/rejected": -2.1705737113952637, "logps/chosen": -134.97857666015625, "logps/rejected": -158.5009307861328, "loss": 0.6425, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8236749768257141, "rewards/margins": 0.20152375102043152, "rewards/rejected": -1.0251986980438232, "step": 5220 }, { "epoch": 0.9011026878015161, "grad_norm": 16.07693099975586, "learning_rate": 1.7652993530536497e-07, "logits/chosen": -2.1148829460144043, "logits/rejected": -2.096147298812866, "logps/chosen": -139.06875610351562, "logps/rejected": -180.09439086914062, "loss": 0.5651, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8380593061447144, "rewards/margins": 0.40176233649253845, "rewards/rejected": -1.2398215532302856, "step": 5230 }, { "epoch": 0.9028256374913852, "grad_norm": 16.134185791015625, "learning_rate": 1.764007348010903e-07, "logits/chosen": -2.11022686958313, "logits/rejected": -2.07845401763916, "logps/chosen": -138.83816528320312, "logps/rejected": -184.94107055664062, "loss": 0.5544, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8606517910957336, "rewards/margins": 0.4552828371524811, "rewards/rejected": -1.3159345388412476, "step": 5240 }, { "epoch": 0.9045485871812543, "grad_norm": 14.774961471557617, "learning_rate": 1.762712272114343e-07, "logits/chosen": -2.062854051589966, "logits/rejected": -2.043586015701294, "logps/chosen": -137.57249450683594, "logps/rejected": -168.33363342285156, "loss": 0.6076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8659477233886719, "rewards/margins": 0.3085930049419403, "rewards/rejected": -1.174540638923645, "step": 5250 }, { "epoch": 0.9062715368711234, "grad_norm": 25.076032638549805, "learning_rate": 1.7614141305694029e-07, "logits/chosen": -2.1341543197631836, "logits/rejected": -2.0903890132904053, "logps/chosen": -144.52560424804688, "logps/rejected": -169.38229370117188, "loss": 0.6099, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8774024844169617, "rewards/margins": 0.30131369829177856, "rewards/rejected": -1.1787161827087402, "step": 5260 }, { "epoch": 0.9079944865609925, "grad_norm": 13.271434783935547, "learning_rate": 1.7601129285938364e-07, "logits/chosen": -2.2262303829193115, "logits/rejected": -2.208543300628662, "logps/chosen": -142.20352172851562, "logps/rejected": -175.04721069335938, "loss": 0.5965, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9064930081367493, "rewards/margins": 0.33251988887786865, "rewards/rejected": -1.2390129566192627, "step": 5270 }, { "epoch": 0.9097174362508614, "grad_norm": 17.105276107788086, "learning_rate": 1.7588086714177003e-07, "logits/chosen": -2.1332719326019287, "logits/rejected": -2.0988264083862305, "logps/chosen": -152.53726196289062, "logps/rejected": -174.6946563720703, "loss": 0.6222, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9816780090332031, "rewards/margins": 0.2585660517215729, "rewards/rejected": -1.2402441501617432, "step": 5280 }, { "epoch": 0.9114403859407305, "grad_norm": 24.475873947143555, "learning_rate": 1.7575013642833295e-07, "logits/chosen": -2.0810658931732178, "logits/rejected": -2.039071798324585, "logps/chosen": -147.7109832763672, "logps/rejected": -168.82142639160156, "loss": 0.6285, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9185523986816406, "rewards/margins": 0.24926848709583282, "rewards/rejected": -1.167820930480957, "step": 5290 }, { "epoch": 0.9131633356305996, "grad_norm": 13.721856117248535, "learning_rate": 1.7561910124453195e-07, "logits/chosen": -2.087953805923462, "logits/rejected": -2.0669302940368652, "logps/chosen": -144.81130981445312, "logps/rejected": -178.23941040039062, "loss": 0.5901, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8971312642097473, "rewards/margins": 0.3337869942188263, "rewards/rejected": -1.2309181690216064, "step": 5300 }, { "epoch": 0.9148862853204687, "grad_norm": 14.560609817504883, "learning_rate": 1.7548776211705034e-07, "logits/chosen": -2.1995584964752197, "logits/rejected": -2.185981512069702, "logps/chosen": -152.6493682861328, "logps/rejected": -172.60911560058594, "loss": 0.6313, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9451559782028198, "rewards/margins": 0.25696811079978943, "rewards/rejected": -1.2021242380142212, "step": 5310 }, { "epoch": 0.9166092350103378, "grad_norm": 18.688629150390625, "learning_rate": 1.7535611957379302e-07, "logits/chosen": -2.129035472869873, "logits/rejected": -2.074998140335083, "logps/chosen": -151.6221160888672, "logps/rejected": -174.55166625976562, "loss": 0.6073, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9406827092170715, "rewards/margins": 0.30224713683128357, "rewards/rejected": -1.2429296970367432, "step": 5320 }, { "epoch": 0.9183321847002067, "grad_norm": 12.732282638549805, "learning_rate": 1.7522417414388446e-07, "logits/chosen": -2.0805225372314453, "logits/rejected": -2.074148416519165, "logps/chosen": -141.38742065429688, "logps/rejected": -182.73350524902344, "loss": 0.5767, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9053370356559753, "rewards/margins": 0.38774365186691284, "rewards/rejected": -1.2930806875228882, "step": 5330 }, { "epoch": 0.9200551343900758, "grad_norm": 16.328292846679688, "learning_rate": 1.7509192635766664e-07, "logits/chosen": -2.1107449531555176, "logits/rejected": -2.062073230743408, "logps/chosen": -146.4422607421875, "logps/rejected": -170.7941131591797, "loss": 0.6002, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.90080326795578, "rewards/margins": 0.2933877110481262, "rewards/rejected": -1.1941907405853271, "step": 5340 }, { "epoch": 0.9217780840799449, "grad_norm": 19.853805541992188, "learning_rate": 1.7495937674669675e-07, "logits/chosen": -2.062171459197998, "logits/rejected": -2.0297489166259766, "logps/chosen": -144.7355499267578, "logps/rejected": -170.14242553710938, "loss": 0.6209, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9209309816360474, "rewards/margins": 0.2607792317867279, "rewards/rejected": -1.181710124015808, "step": 5350 }, { "epoch": 0.923501033769814, "grad_norm": 16.074657440185547, "learning_rate": 1.7482652584374514e-07, "logits/chosen": -2.160583019256592, "logits/rejected": -2.1394407749176025, "logps/chosen": -142.66517639160156, "logps/rejected": -187.32321166992188, "loss": 0.5671, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8945107460021973, "rewards/margins": 0.4139057993888855, "rewards/rejected": -1.3084166049957275, "step": 5360 }, { "epoch": 0.9252239834596829, "grad_norm": 20.958538055419922, "learning_rate": 1.7469337418279325e-07, "logits/chosen": -2.069669008255005, "logits/rejected": -2.050961971282959, "logps/chosen": -143.38893127441406, "logps/rejected": -171.64334106445312, "loss": 0.6235, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9146418571472168, "rewards/margins": 0.26775017380714417, "rewards/rejected": -1.1823922395706177, "step": 5370 }, { "epoch": 0.926946933149552, "grad_norm": 19.33513069152832, "learning_rate": 1.7455992229903133e-07, "logits/chosen": -2.16186785697937, "logits/rejected": -2.122178077697754, "logps/chosen": -155.24649047851562, "logps/rejected": -181.39578247070312, "loss": 0.5992, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9784836769104004, "rewards/margins": 0.3191712498664856, "rewards/rejected": -1.2976548671722412, "step": 5380 }, { "epoch": 0.9286698828394211, "grad_norm": 19.882524490356445, "learning_rate": 1.7442617072885627e-07, "logits/chosen": -2.08012056350708, "logits/rejected": -2.0367953777313232, "logps/chosen": -161.18124389648438, "logps/rejected": -183.7856903076172, "loss": 0.6114, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0261393785476685, "rewards/margins": 0.29558470845222473, "rewards/rejected": -1.3217241764068604, "step": 5390 }, { "epoch": 0.9303928325292902, "grad_norm": 16.86815643310547, "learning_rate": 1.7429212000986965e-07, "logits/chosen": -2.0811963081359863, "logits/rejected": -2.0536980628967285, "logps/chosen": -143.79843139648438, "logps/rejected": -189.55067443847656, "loss": 0.563, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9269205927848816, "rewards/margins": 0.4296680986881256, "rewards/rejected": -1.35658860206604, "step": 5400 }, { "epoch": 0.9321157822191593, "grad_norm": 27.655717849731445, "learning_rate": 1.7415777068087545e-07, "logits/chosen": -2.086303949356079, "logits/rejected": -2.0706019401550293, "logps/chosen": -162.78172302246094, "logps/rejected": -184.86241149902344, "loss": 0.6245, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0614235401153564, "rewards/margins": 0.2463240623474121, "rewards/rejected": -1.3077476024627686, "step": 5410 }, { "epoch": 0.9338387319090282, "grad_norm": 19.206357955932617, "learning_rate": 1.7402312328187776e-07, "logits/chosen": -2.1491780281066895, "logits/rejected": -2.1271309852600098, "logps/chosen": -153.69996643066406, "logps/rejected": -181.80645751953125, "loss": 0.6144, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9998170733451843, "rewards/margins": 0.28072595596313477, "rewards/rejected": -1.2805430889129639, "step": 5420 }, { "epoch": 0.9355616815988973, "grad_norm": 19.00049591064453, "learning_rate": 1.7388817835407884e-07, "logits/chosen": -2.1044201850891113, "logits/rejected": -2.08538556098938, "logps/chosen": -156.75936889648438, "logps/rejected": -189.45535278320312, "loss": 0.5936, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0316128730773926, "rewards/margins": 0.3454335331916809, "rewards/rejected": -1.3770463466644287, "step": 5430 }, { "epoch": 0.9372846312887664, "grad_norm": 14.75564956665039, "learning_rate": 1.737529364398768e-07, "logits/chosen": -2.086437225341797, "logits/rejected": -2.061494827270508, "logps/chosen": -159.58126831054688, "logps/rejected": -198.78977966308594, "loss": 0.5698, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0337985754013062, "rewards/margins": 0.4016556739807129, "rewards/rejected": -1.4354543685913086, "step": 5440 }, { "epoch": 0.9390075809786355, "grad_norm": 24.83152961730957, "learning_rate": 1.7361739808286343e-07, "logits/chosen": -2.0081675052642822, "logits/rejected": -1.988793969154358, "logps/chosen": -162.98727416992188, "logps/rejected": -194.2002716064453, "loss": 0.5992, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0740816593170166, "rewards/margins": 0.3331459164619446, "rewards/rejected": -1.407227635383606, "step": 5450 }, { "epoch": 0.9407305306685044, "grad_norm": 17.985929489135742, "learning_rate": 1.7348156382782215e-07, "logits/chosen": -2.0747618675231934, "logits/rejected": -2.0514020919799805, "logps/chosen": -161.99270629882812, "logps/rejected": -185.65159606933594, "loss": 0.6304, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0814931392669678, "rewards/margins": 0.2549132704734802, "rewards/rejected": -1.3364064693450928, "step": 5460 }, { "epoch": 0.9424534803583735, "grad_norm": 18.192277908325195, "learning_rate": 1.733454342207256e-07, "logits/chosen": -2.0366196632385254, "logits/rejected": -2.017728328704834, "logps/chosen": -159.3045654296875, "logps/rejected": -180.2318878173828, "loss": 0.6695, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0794590711593628, "rewards/margins": 0.21801552176475525, "rewards/rejected": -1.29747474193573, "step": 5470 }, { "epoch": 0.9441764300482426, "grad_norm": 19.787668228149414, "learning_rate": 1.732090098087336e-07, "logits/chosen": -2.0741186141967773, "logits/rejected": -2.04555082321167, "logps/chosen": -153.68495178222656, "logps/rejected": -189.20364379882812, "loss": 0.5878, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0074174404144287, "rewards/margins": 0.36818450689315796, "rewards/rejected": -1.3756020069122314, "step": 5480 }, { "epoch": 0.9458993797381117, "grad_norm": 14.216739654541016, "learning_rate": 1.7307229114019091e-07, "logits/chosen": -2.0435163974761963, "logits/rejected": -2.010690212249756, "logps/chosen": -154.7379150390625, "logps/rejected": -180.75912475585938, "loss": 0.6133, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9917277097702026, "rewards/margins": 0.29374217987060547, "rewards/rejected": -1.285469889640808, "step": 5490 }, { "epoch": 0.9476223294279807, "grad_norm": 21.31083106994629, "learning_rate": 1.7293527876462504e-07, "logits/chosen": -2.1244988441467285, "logits/rejected": -2.1008124351501465, "logps/chosen": -150.3030242919922, "logps/rejected": -189.2754364013672, "loss": 0.581, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9649218320846558, "rewards/margins": 0.3811757564544678, "rewards/rejected": -1.346097707748413, "step": 5500 }, { "epoch": 0.9493452791178497, "grad_norm": 17.781944274902344, "learning_rate": 1.72797973232744e-07, "logits/chosen": -2.0382659435272217, "logits/rejected": -2.017784833908081, "logps/chosen": -165.6508026123047, "logps/rejected": -191.04080200195312, "loss": 0.6376, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1242339611053467, "rewards/margins": 0.2588551640510559, "rewards/rejected": -1.3830890655517578, "step": 5510 }, { "epoch": 0.9510682288077188, "grad_norm": 21.734769821166992, "learning_rate": 1.726603750964341e-07, "logits/chosen": -2.031649112701416, "logits/rejected": -2.0036509037017822, "logps/chosen": -155.42190551757812, "logps/rejected": -185.03111267089844, "loss": 0.5963, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9959745407104492, "rewards/margins": 0.33889469504356384, "rewards/rejected": -1.334869146347046, "step": 5520 }, { "epoch": 0.9527911784975879, "grad_norm": 20.869234085083008, "learning_rate": 1.725224849087578e-07, "logits/chosen": -2.066890239715576, "logits/rejected": -2.0298147201538086, "logps/chosen": -162.42332458496094, "logps/rejected": -187.11082458496094, "loss": 0.6107, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0251661539077759, "rewards/margins": 0.3152082860469818, "rewards/rejected": -1.34037446975708, "step": 5530 }, { "epoch": 0.954514128187457, "grad_norm": 22.375978469848633, "learning_rate": 1.723843032239514e-07, "logits/chosen": -2.0838558673858643, "logits/rejected": -2.078803539276123, "logps/chosen": -147.0901641845703, "logps/rejected": -188.95034790039062, "loss": 0.584, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9474297761917114, "rewards/margins": 0.37506046891212463, "rewards/rejected": -1.3224903345108032, "step": 5540 }, { "epoch": 0.956237077877326, "grad_norm": 16.93791961669922, "learning_rate": 1.722458305974229e-07, "logits/chosen": -1.950770378112793, "logits/rejected": -1.9323211908340454, "logps/chosen": -156.38357543945312, "logps/rejected": -177.25633239746094, "loss": 0.6578, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0376572608947754, "rewards/margins": 0.19414012134075165, "rewards/rejected": -1.231797456741333, "step": 5550 }, { "epoch": 0.957960027567195, "grad_norm": 17.297195434570312, "learning_rate": 1.7210706758574957e-07, "logits/chosen": -2.0867764949798584, "logits/rejected": -2.0538828372955322, "logps/chosen": -140.0636444091797, "logps/rejected": -172.29058837890625, "loss": 0.5909, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.881769061088562, "rewards/margins": 0.3462349772453308, "rewards/rejected": -1.2280040979385376, "step": 5560 }, { "epoch": 0.9596829772570641, "grad_norm": 16.964792251586914, "learning_rate": 1.71968014746676e-07, "logits/chosen": -2.115882635116577, "logits/rejected": -2.0990474224090576, "logps/chosen": -137.8494415283203, "logps/rejected": -174.75584411621094, "loss": 0.6051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8718293905258179, "rewards/margins": 0.30636200308799744, "rewards/rejected": -1.1781913042068481, "step": 5570 }, { "epoch": 0.9614059269469332, "grad_norm": 14.352399826049805, "learning_rate": 1.7182867263911163e-07, "logits/chosen": -1.999959945678711, "logits/rejected": -1.9783000946044922, "logps/chosen": -146.61141967773438, "logps/rejected": -180.71133422851562, "loss": 0.5877, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.934338390827179, "rewards/margins": 0.3349146246910095, "rewards/rejected": -1.2692530155181885, "step": 5580 }, { "epoch": 0.9631288766368022, "grad_norm": 29.507858276367188, "learning_rate": 1.7168904182312863e-07, "logits/chosen": -2.149902820587158, "logits/rejected": -2.106049060821533, "logps/chosen": -155.22250366210938, "logps/rejected": -183.86978149414062, "loss": 0.6186, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9839137196540833, "rewards/margins": 0.3032621741294861, "rewards/rejected": -1.2871758937835693, "step": 5590 }, { "epoch": 0.9648518263266712, "grad_norm": 18.434415817260742, "learning_rate": 1.715491228599596e-07, "logits/chosen": -2.1140358448028564, "logits/rejected": -2.101685047149658, "logps/chosen": -149.51068115234375, "logps/rejected": -186.94764709472656, "loss": 0.6061, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9700676798820496, "rewards/margins": 0.326327383518219, "rewards/rejected": -1.296394944190979, "step": 5600 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -2.1859350204467773, "eval_logits/rejected": -2.1733474731445312, "eval_logps/chosen": -137.2255096435547, "eval_logps/rejected": -160.406982421875, "eval_loss": 0.6391822695732117, "eval_rewards/accuracies": 0.63150554895401, "eval_rewards/chosen": -0.785136342048645, "eval_rewards/margins": 0.1871323138475418, "eval_rewards/rejected": -0.9722687005996704, "eval_runtime": 384.4615, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 5600 }, { "epoch": 0.9665747760165403, "grad_norm": 16.30390739440918, "learning_rate": 1.7140891631199533e-07, "logits/chosen": -2.121743679046631, "logits/rejected": -2.09936785697937, "logps/chosen": -145.47705078125, "logps/rejected": -184.77903747558594, "loss": 0.5891, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9307653307914734, "rewards/margins": 0.3514396548271179, "rewards/rejected": -1.2822052240371704, "step": 5610 }, { "epoch": 0.9682977257064094, "grad_norm": 22.45163345336914, "learning_rate": 1.7126842274278245e-07, "logits/chosen": -2.036102771759033, "logits/rejected": -2.013686418533325, "logps/chosen": -157.7967987060547, "logps/rejected": -178.23263549804688, "loss": 0.6294, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0277711153030396, "rewards/margins": 0.2493065595626831, "rewards/rejected": -1.2770777940750122, "step": 5620 }, { "epoch": 0.9700206753962785, "grad_norm": 18.08307456970215, "learning_rate": 1.7112764271702135e-07, "logits/chosen": -2.1119580268859863, "logits/rejected": -2.081120729446411, "logps/chosen": -154.29696655273438, "logps/rejected": -172.24014282226562, "loss": 0.6441, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9838927388191223, "rewards/margins": 0.226963073015213, "rewards/rejected": -1.2108559608459473, "step": 5630 }, { "epoch": 0.9717436250861475, "grad_norm": 15.348020553588867, "learning_rate": 1.7098657680056373e-07, "logits/chosen": -2.0765204429626465, "logits/rejected": -2.053340196609497, "logps/chosen": -139.92092895507812, "logps/rejected": -175.59921264648438, "loss": 0.6002, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8597497940063477, "rewards/margins": 0.3353044092655182, "rewards/rejected": -1.195054054260254, "step": 5640 }, { "epoch": 0.9734665747760165, "grad_norm": 12.885562896728516, "learning_rate": 1.7084522556041049e-07, "logits/chosen": -2.015348434448242, "logits/rejected": -1.9957529306411743, "logps/chosen": -138.64785766601562, "logps/rejected": -175.66854858398438, "loss": 0.593, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8663733601570129, "rewards/margins": 0.3682767152786255, "rewards/rejected": -1.2346501350402832, "step": 5650 }, { "epoch": 0.9751895244658856, "grad_norm": 16.89599609375, "learning_rate": 1.7070358956470923e-07, "logits/chosen": -2.0125277042388916, "logits/rejected": -1.9972074031829834, "logps/chosen": -148.81884765625, "logps/rejected": -184.15538024902344, "loss": 0.6028, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9695499539375305, "rewards/margins": 0.3619958460330963, "rewards/rejected": -1.3315457105636597, "step": 5660 }, { "epoch": 0.9769124741557547, "grad_norm": 19.88907241821289, "learning_rate": 1.705616693827522e-07, "logits/chosen": -2.034695863723755, "logits/rejected": -2.0065276622772217, "logps/chosen": -140.55987548828125, "logps/rejected": -173.42185974121094, "loss": 0.5839, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8651742935180664, "rewards/margins": 0.3406986594200134, "rewards/rejected": -1.2058730125427246, "step": 5670 }, { "epoch": 0.9786354238456237, "grad_norm": 23.023468017578125, "learning_rate": 1.7041946558497388e-07, "logits/chosen": -2.056576728820801, "logits/rejected": -2.014734983444214, "logps/chosen": -158.1607666015625, "logps/rejected": -190.6458282470703, "loss": 0.5909, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0378226041793823, "rewards/margins": 0.3521498739719391, "rewards/rejected": -1.389972448348999, "step": 5680 }, { "epoch": 0.9803583735354927, "grad_norm": 16.29109764099121, "learning_rate": 1.7027697874294867e-07, "logits/chosen": -2.086655855178833, "logits/rejected": -2.049394130706787, "logps/chosen": -158.97598266601562, "logps/rejected": -188.78822326660156, "loss": 0.5832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0097744464874268, "rewards/margins": 0.3614472448825836, "rewards/rejected": -1.3712217807769775, "step": 5690 }, { "epoch": 0.9820813232253618, "grad_norm": 29.7178955078125, "learning_rate": 1.7013420942938876e-07, "logits/chosen": -1.9082027673721313, "logits/rejected": -1.887874960899353, "logps/chosen": -162.42620849609375, "logps/rejected": -187.9700469970703, "loss": 0.627, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0464036464691162, "rewards/margins": 0.30022451281547546, "rewards/rejected": -1.346628189086914, "step": 5700 }, { "epoch": 0.9838042729152309, "grad_norm": 19.80901336669922, "learning_rate": 1.6999115821814155e-07, "logits/chosen": -2.0822360515594482, "logits/rejected": -2.052886486053467, "logps/chosen": -158.82598876953125, "logps/rejected": -193.48214721679688, "loss": 0.5986, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0371277332305908, "rewards/margins": 0.3718651533126831, "rewards/rejected": -1.4089930057525635, "step": 5710 }, { "epoch": 0.9855272226051, "grad_norm": 17.461763381958008, "learning_rate": 1.6984782568418766e-07, "logits/chosen": -2.0551178455352783, "logits/rejected": -2.0250158309936523, "logps/chosen": -150.8203887939453, "logps/rejected": -192.51698303222656, "loss": 0.5634, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9880839586257935, "rewards/margins": 0.4387296736240387, "rewards/rejected": -1.4268134832382202, "step": 5720 }, { "epoch": 0.987250172294969, "grad_norm": 23.234567642211914, "learning_rate": 1.697042124036383e-07, "logits/chosen": -2.117779493331909, "logits/rejected": -2.1022117137908936, "logps/chosen": -154.16024780273438, "logps/rejected": -186.769287109375, "loss": 0.6134, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.024356484413147, "rewards/margins": 0.324504554271698, "rewards/rejected": -1.3488609790802002, "step": 5730 }, { "epoch": 0.988973121984838, "grad_norm": 17.887039184570312, "learning_rate": 1.6956031895373327e-07, "logits/chosen": -2.0169944763183594, "logits/rejected": -1.9780069589614868, "logps/chosen": -170.80160522460938, "logps/rejected": -204.9024658203125, "loss": 0.5819, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1159305572509766, "rewards/margins": 0.38865792751312256, "rewards/rejected": -1.5045883655548096, "step": 5740 }, { "epoch": 0.9906960716747071, "grad_norm": 18.053930282592773, "learning_rate": 1.6941614591283834e-07, "logits/chosen": -2.1555233001708984, "logits/rejected": -2.126132011413574, "logps/chosen": -162.896484375, "logps/rejected": -179.764892578125, "loss": 0.6383, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0623223781585693, "rewards/margins": 0.23261627554893494, "rewards/rejected": -1.2949388027191162, "step": 5750 }, { "epoch": 0.9924190213645762, "grad_norm": 18.37833595275879, "learning_rate": 1.6927169386044313e-07, "logits/chosen": -2.0529894828796387, "logits/rejected": -2.0172603130340576, "logps/chosen": -156.34518432617188, "logps/rejected": -190.53683471679688, "loss": 0.5971, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9993540048599243, "rewards/margins": 0.3503246307373047, "rewards/rejected": -1.34967839717865, "step": 5760 }, { "epoch": 0.9941419710544452, "grad_norm": 17.409399032592773, "learning_rate": 1.691269633771588e-07, "logits/chosen": -2.0082180500030518, "logits/rejected": -1.975886583328247, "logps/chosen": -144.2236328125, "logps/rejected": -185.03004455566406, "loss": 0.5754, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9236896634101868, "rewards/margins": 0.38613972067832947, "rewards/rejected": -1.3098294734954834, "step": 5770 }, { "epoch": 0.9958649207443143, "grad_norm": 18.62607192993164, "learning_rate": 1.6898195504471552e-07, "logits/chosen": -2.02447509765625, "logits/rejected": -1.9947541952133179, "logps/chosen": -151.97547912597656, "logps/rejected": -192.77957153320312, "loss": 0.5888, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9951189756393433, "rewards/margins": 0.39299628138542175, "rewards/rejected": -1.3881151676177979, "step": 5780 }, { "epoch": 0.9975878704341833, "grad_norm": 14.568831443786621, "learning_rate": 1.688366694459603e-07, "logits/chosen": -1.955736517906189, "logits/rejected": -1.9136276245117188, "logps/chosen": -142.00694274902344, "logps/rejected": -181.83102416992188, "loss": 0.5724, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9053285717964172, "rewards/margins": 0.37364915013313293, "rewards/rejected": -1.278977632522583, "step": 5790 }, { "epoch": 0.9993108201240524, "grad_norm": 18.89543342590332, "learning_rate": 1.6869110716485456e-07, "logits/chosen": -2.0314784049987793, "logits/rejected": -1.9947092533111572, "logps/chosen": -153.63650512695312, "logps/rejected": -198.30746459960938, "loss": 0.5658, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0288561582565308, "rewards/margins": 0.4323458671569824, "rewards/rejected": -1.4612020254135132, "step": 5800 }, { "epoch": 1.0010337698139213, "grad_norm": 18.528011322021484, "learning_rate": 1.6854526878647186e-07, "logits/chosen": -2.0780978202819824, "logits/rejected": -2.0521152019500732, "logps/chosen": -163.8245086669922, "logps/rejected": -199.75381469726562, "loss": 0.5987, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.097534418106079, "rewards/margins": 0.36402350664138794, "rewards/rejected": -1.4615581035614014, "step": 5810 }, { "epoch": 1.0027567195037905, "grad_norm": 18.017547607421875, "learning_rate": 1.6839915489699545e-07, "logits/chosen": -2.062495708465576, "logits/rejected": -2.0183212757110596, "logps/chosen": -163.41049194335938, "logps/rejected": -210.4263916015625, "loss": 0.5397, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1038446426391602, "rewards/margins": 0.5071204304695129, "rewards/rejected": -1.6109651327133179, "step": 5820 }, { "epoch": 1.0044796691936595, "grad_norm": 26.828323364257812, "learning_rate": 1.682527660837161e-07, "logits/chosen": -1.954272985458374, "logits/rejected": -1.9177658557891846, "logps/chosen": -158.9732666015625, "logps/rejected": -200.26998901367188, "loss": 0.5782, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0547670125961304, "rewards/margins": 0.4192841053009033, "rewards/rejected": -1.4740509986877441, "step": 5830 }, { "epoch": 1.0062026188835287, "grad_norm": 15.48154354095459, "learning_rate": 1.6810610293502944e-07, "logits/chosen": -1.9852135181427002, "logits/rejected": -1.9541308879852295, "logps/chosen": -156.57272338867188, "logps/rejected": -206.9540252685547, "loss": 0.5533, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0418469905853271, "rewards/margins": 0.4819900393486023, "rewards/rejected": -1.5238369703292847, "step": 5840 }, { "epoch": 1.0079255685733977, "grad_norm": 19.590543746948242, "learning_rate": 1.679591660404339e-07, "logits/chosen": -2.031728982925415, "logits/rejected": -1.9990825653076172, "logps/chosen": -155.19406127929688, "logps/rejected": -213.0499725341797, "loss": 0.5278, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.042792797088623, "rewards/margins": 0.5427265167236328, "rewards/rejected": -1.5855190753936768, "step": 5850 }, { "epoch": 1.0096485182632666, "grad_norm": 15.18802547454834, "learning_rate": 1.6781195599052807e-07, "logits/chosen": -1.8982799053192139, "logits/rejected": -1.8733913898468018, "logps/chosen": -160.97203063964844, "logps/rejected": -217.6151123046875, "loss": 0.5504, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0919866561889648, "rewards/margins": 0.5518103837966919, "rewards/rejected": -1.6437969207763672, "step": 5860 }, { "epoch": 1.0113714679531358, "grad_norm": 27.201187133789062, "learning_rate": 1.6766447337700865e-07, "logits/chosen": -1.9276330471038818, "logits/rejected": -1.8953415155410767, "logps/chosen": -177.0043487548828, "logps/rejected": -222.3863067626953, "loss": 0.5629, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2082350254058838, "rewards/margins": 0.4653599262237549, "rewards/rejected": -1.6735950708389282, "step": 5870 }, { "epoch": 1.0130944176430048, "grad_norm": 22.19955825805664, "learning_rate": 1.6751671879266769e-07, "logits/chosen": -1.9740365743637085, "logits/rejected": -1.9525846242904663, "logps/chosen": -168.96153259277344, "logps/rejected": -208.54513549804688, "loss": 0.5868, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1421959400177002, "rewards/margins": 0.4119526743888855, "rewards/rejected": -1.5541484355926514, "step": 5880 }, { "epoch": 1.014817367332874, "grad_norm": 27.656230926513672, "learning_rate": 1.673686928313905e-07, "logits/chosen": -1.9965871572494507, "logits/rejected": -1.968044638633728, "logps/chosen": -170.11952209472656, "logps/rejected": -217.99942016601562, "loss": 0.5625, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1495411396026611, "rewards/margins": 0.44836121797561646, "rewards/rejected": -1.597902536392212, "step": 5890 }, { "epoch": 1.016540317022743, "grad_norm": 18.465089797973633, "learning_rate": 1.6722039608815315e-07, "logits/chosen": -1.9055583477020264, "logits/rejected": -1.8747007846832275, "logps/chosen": -174.04336547851562, "logps/rejected": -228.24948120117188, "loss": 0.5291, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1837732791900635, "rewards/margins": 0.5520281791687012, "rewards/rejected": -1.7358014583587646, "step": 5900 }, { "epoch": 1.018263266712612, "grad_norm": 20.942235946655273, "learning_rate": 1.670718291590201e-07, "logits/chosen": -1.9033527374267578, "logits/rejected": -1.8923523426055908, "logps/chosen": -169.00942993164062, "logps/rejected": -211.85433959960938, "loss": 0.5828, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1649696826934814, "rewards/margins": 0.3764055669307709, "rewards/rejected": -1.5413752794265747, "step": 5910 }, { "epoch": 1.019986216402481, "grad_norm": 25.44744873046875, "learning_rate": 1.6692299264114178e-07, "logits/chosen": -1.9109452962875366, "logits/rejected": -1.8838907480239868, "logps/chosen": -172.25010681152344, "logps/rejected": -210.68661499023438, "loss": 0.6112, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2076642513275146, "rewards/margins": 0.3641297221183777, "rewards/rejected": -1.5717939138412476, "step": 5920 }, { "epoch": 1.02170916609235, "grad_norm": 15.050667762756348, "learning_rate": 1.6677388713275224e-07, "logits/chosen": -1.9521329402923584, "logits/rejected": -1.9319088459014893, "logps/chosen": -176.0291290283203, "logps/rejected": -216.5508270263672, "loss": 0.6086, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2580091953277588, "rewards/margins": 0.39582735300064087, "rewards/rejected": -1.6538364887237549, "step": 5930 }, { "epoch": 1.0234321157822193, "grad_norm": 18.323843002319336, "learning_rate": 1.6662451323316663e-07, "logits/chosen": -1.946067452430725, "logits/rejected": -1.9008439779281616, "logps/chosen": -157.49227905273438, "logps/rejected": -209.2881622314453, "loss": 0.5384, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0325511693954468, "rewards/margins": 0.529666543006897, "rewards/rejected": -1.5622177124023438, "step": 5940 }, { "epoch": 1.0251550654720882, "grad_norm": 14.298197746276855, "learning_rate": 1.6647487154277897e-07, "logits/chosen": -1.8379230499267578, "logits/rejected": -1.8213342428207397, "logps/chosen": -162.3971405029297, "logps/rejected": -204.32907104492188, "loss": 0.5784, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0869194269180298, "rewards/margins": 0.4166596531867981, "rewards/rejected": -1.5035792589187622, "step": 5950 }, { "epoch": 1.0268780151619572, "grad_norm": 14.643122673034668, "learning_rate": 1.6632496266305958e-07, "logits/chosen": -1.8929506540298462, "logits/rejected": -1.8443950414657593, "logps/chosen": -174.22885131835938, "logps/rejected": -198.41079711914062, "loss": 0.6354, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1590553522109985, "rewards/margins": 0.30704209208488464, "rewards/rejected": -1.4660975933074951, "step": 5960 }, { "epoch": 1.0286009648518264, "grad_norm": 17.613201141357422, "learning_rate": 1.661747871965527e-07, "logits/chosen": -1.8706638813018799, "logits/rejected": -1.8416591882705688, "logps/chosen": -161.9023895263672, "logps/rejected": -208.8050994873047, "loss": 0.5632, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1000502109527588, "rewards/margins": 0.48885440826416016, "rewards/rejected": -1.588904857635498, "step": 5970 }, { "epoch": 1.0303239145416954, "grad_norm": 21.86261749267578, "learning_rate": 1.6602434574687417e-07, "logits/chosen": -1.883933424949646, "logits/rejected": -1.8692964315414429, "logps/chosen": -155.93893432617188, "logps/rejected": -211.02001953125, "loss": 0.5424, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.033689260482788, "rewards/margins": 0.5113634467124939, "rewards/rejected": -1.5450528860092163, "step": 5980 }, { "epoch": 1.0320468642315643, "grad_norm": 25.24880027770996, "learning_rate": 1.658736389187089e-07, "logits/chosen": -1.933140516281128, "logits/rejected": -1.8884773254394531, "logps/chosen": -170.1532745361328, "logps/rejected": -207.06857299804688, "loss": 0.5904, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.150805115699768, "rewards/margins": 0.39992770552635193, "rewards/rejected": -1.5507327318191528, "step": 5990 }, { "epoch": 1.0337698139214335, "grad_norm": 26.61385726928711, "learning_rate": 1.6572266731780842e-07, "logits/chosen": -1.9090297222137451, "logits/rejected": -1.877764344215393, "logps/chosen": -169.45245361328125, "logps/rejected": -214.60702514648438, "loss": 0.5701, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1734836101531982, "rewards/margins": 0.44985073804855347, "rewards/rejected": -1.623334288597107, "step": 6000 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -2.029165029525757, "eval_logits/rejected": -2.0121874809265137, "eval_logps/chosen": -159.05809020996094, "eval_logps/rejected": -187.67584228515625, "eval_loss": 0.6356069445610046, "eval_rewards/accuracies": 0.6291821599006653, "eval_rewards/chosen": -1.0034619569778442, "eval_rewards/margins": 0.2414952963590622, "eval_rewards/rejected": -1.2449570894241333, "eval_runtime": 384.1954, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 6000 }, { "epoch": 1.0354927636113025, "grad_norm": 20.915616989135742, "learning_rate": 1.655714315509885e-07, "logits/chosen": -1.9238481521606445, "logits/rejected": -1.9024574756622314, "logps/chosen": -169.83511352539062, "logps/rejected": -201.43551635742188, "loss": 0.6298, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1679986715316772, "rewards/margins": 0.3322969377040863, "rewards/rejected": -1.5002957582473755, "step": 6010 }, { "epoch": 1.0372157133011717, "grad_norm": 16.894756317138672, "learning_rate": 1.654199322261267e-07, "logits/chosen": -2.0713367462158203, "logits/rejected": -2.0292153358459473, "logps/chosen": -170.26837158203125, "logps/rejected": -205.8231964111328, "loss": 0.61, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1588108539581299, "rewards/margins": 0.40042543411254883, "rewards/rejected": -1.5592362880706787, "step": 6020 }, { "epoch": 1.0389386629910407, "grad_norm": 17.70704460144043, "learning_rate": 1.6526816995215995e-07, "logits/chosen": -1.775228500366211, "logits/rejected": -1.7515833377838135, "logps/chosen": -160.17477416992188, "logps/rejected": -196.04910278320312, "loss": 0.6146, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1033680438995361, "rewards/margins": 0.3539595901966095, "rewards/rejected": -1.4573276042938232, "step": 6030 }, { "epoch": 1.0406616126809096, "grad_norm": 20.00725746154785, "learning_rate": 1.651161453390821e-07, "logits/chosen": -1.9683364629745483, "logits/rejected": -1.9444955587387085, "logps/chosen": -154.38311767578125, "logps/rejected": -191.213623046875, "loss": 0.578, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0067485570907593, "rewards/margins": 0.3886244297027588, "rewards/rejected": -1.3953731060028076, "step": 6040 }, { "epoch": 1.0423845623707788, "grad_norm": 18.94013786315918, "learning_rate": 1.6496385899794135e-07, "logits/chosen": -1.873522400856018, "logits/rejected": -1.8403030633926392, "logps/chosen": -168.37149047851562, "logps/rejected": -204.64468383789062, "loss": 0.5708, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0902937650680542, "rewards/margins": 0.419158399105072, "rewards/rejected": -1.5094521045684814, "step": 6050 }, { "epoch": 1.0441075120606478, "grad_norm": 18.95502281188965, "learning_rate": 1.64811311540838e-07, "logits/chosen": -1.9236536026000977, "logits/rejected": -1.890880823135376, "logps/chosen": -158.81686401367188, "logps/rejected": -197.5561981201172, "loss": 0.5943, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0532209873199463, "rewards/margins": 0.40269631147384644, "rewards/rejected": -1.4559170007705688, "step": 6060 }, { "epoch": 1.045830461750517, "grad_norm": 24.840234756469727, "learning_rate": 1.6465850358092184e-07, "logits/chosen": -1.9287309646606445, "logits/rejected": -1.8931522369384766, "logps/chosen": -159.619384765625, "logps/rejected": -200.43862915039062, "loss": 0.5665, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0567705631256104, "rewards/margins": 0.43039005994796753, "rewards/rejected": -1.487160563468933, "step": 6070 }, { "epoch": 1.047553411440386, "grad_norm": 16.706314086914062, "learning_rate": 1.645054357323897e-07, "logits/chosen": -1.9066253900527954, "logits/rejected": -1.8798481225967407, "logps/chosen": -169.624755859375, "logps/rejected": -208.3575439453125, "loss": 0.5701, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1054916381835938, "rewards/margins": 0.4423116147518158, "rewards/rejected": -1.5478031635284424, "step": 6080 }, { "epoch": 1.049276361130255, "grad_norm": 21.22140121459961, "learning_rate": 1.6435210861048302e-07, "logits/chosen": -1.9500453472137451, "logits/rejected": -1.9190781116485596, "logps/chosen": -159.1141357421875, "logps/rejected": -212.4842071533203, "loss": 0.5258, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.077060341835022, "rewards/margins": 0.5342671871185303, "rewards/rejected": -1.6113275289535522, "step": 6090 }, { "epoch": 1.050999310820124, "grad_norm": 19.14046287536621, "learning_rate": 1.6419852283148535e-07, "logits/chosen": -1.916421890258789, "logits/rejected": -1.8944313526153564, "logps/chosen": -177.45870971679688, "logps/rejected": -230.8317413330078, "loss": 0.5576, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2067549228668213, "rewards/margins": 0.56285560131073, "rewards/rejected": -1.7696106433868408, "step": 6100 }, { "epoch": 1.052722260509993, "grad_norm": 17.196231842041016, "learning_rate": 1.6404467901271998e-07, "logits/chosen": -1.9056333303451538, "logits/rejected": -1.872330904006958, "logps/chosen": -179.09910583496094, "logps/rejected": -231.61837768554688, "loss": 0.5544, "rewards/accuracies": 0.75, "rewards/chosen": -1.2740018367767334, "rewards/margins": 0.5072966814041138, "rewards/rejected": -1.7812986373901367, "step": 6110 }, { "epoch": 1.0544452101998623, "grad_norm": 21.852598190307617, "learning_rate": 1.6389057777254722e-07, "logits/chosen": -1.995835542678833, "logits/rejected": -1.9330600500106812, "logps/chosen": -174.4574432373047, "logps/rejected": -239.14688110351562, "loss": 0.5111, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2013572454452515, "rewards/margins": 0.6553361415863037, "rewards/rejected": -1.8566935062408447, "step": 6120 }, { "epoch": 1.0561681598897312, "grad_norm": 15.418428421020508, "learning_rate": 1.6373621973036224e-07, "logits/chosen": -1.896733045578003, "logits/rejected": -1.8596582412719727, "logps/chosen": -176.39080810546875, "logps/rejected": -231.16891479492188, "loss": 0.5421, "rewards/accuracies": 0.75, "rewards/chosen": -1.2199915647506714, "rewards/margins": 0.5522419810295105, "rewards/rejected": -1.7722337245941162, "step": 6130 }, { "epoch": 1.0578911095796002, "grad_norm": 33.1326904296875, "learning_rate": 1.6358160550659213e-07, "logits/chosen": -1.9121757745742798, "logits/rejected": -1.8819659948349, "logps/chosen": -178.7617645263672, "logps/rejected": -224.37741088867188, "loss": 0.5819, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2568280696868896, "rewards/margins": 0.4621516168117523, "rewards/rejected": -1.7189795970916748, "step": 6140 }, { "epoch": 1.0596140592694694, "grad_norm": 24.849287033081055, "learning_rate": 1.6342673572269398e-07, "logits/chosen": -1.8367702960968018, "logits/rejected": -1.81298828125, "logps/chosen": -182.08493041992188, "logps/rejected": -224.43612670898438, "loss": 0.6006, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2907702922821045, "rewards/margins": 0.45370227098464966, "rewards/rejected": -1.7444725036621094, "step": 6150 }, { "epoch": 1.0613370089593384, "grad_norm": 27.562496185302734, "learning_rate": 1.632716110011519e-07, "logits/chosen": -1.76801335811615, "logits/rejected": -1.7465245723724365, "logps/chosen": -173.99270629882812, "logps/rejected": -217.61288452148438, "loss": 0.5933, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2301690578460693, "rewards/margins": 0.42046982049942017, "rewards/rejected": -1.6506388187408447, "step": 6160 }, { "epoch": 1.0630599586492075, "grad_norm": 20.34008026123047, "learning_rate": 1.6311623196547474e-07, "logits/chosen": -1.8881728649139404, "logits/rejected": -1.8623225688934326, "logps/chosen": -203.97837829589844, "logps/rejected": -253.9075927734375, "loss": 0.5641, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.445177435874939, "rewards/margins": 0.5291798710823059, "rewards/rejected": -1.9743572473526, "step": 6170 }, { "epoch": 1.0647829083390765, "grad_norm": 23.07094955444336, "learning_rate": 1.6296059924019353e-07, "logits/chosen": -1.8910945653915405, "logits/rejected": -1.859135627746582, "logps/chosen": -198.97625732421875, "logps/rejected": -236.320068359375, "loss": 0.6122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4082109928131104, "rewards/margins": 0.4251943528652191, "rewards/rejected": -1.8334052562713623, "step": 6180 }, { "epoch": 1.0665058580289455, "grad_norm": 25.002544403076172, "learning_rate": 1.6280471345085901e-07, "logits/chosen": -1.8824446201324463, "logits/rejected": -1.8468780517578125, "logps/chosen": -192.41334533691406, "logps/rejected": -240.2723388671875, "loss": 0.571, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3423736095428467, "rewards/margins": 0.5303239226341248, "rewards/rejected": -1.8726978302001953, "step": 6190 }, { "epoch": 1.0682288077188147, "grad_norm": 18.83368682861328, "learning_rate": 1.6264857522403906e-07, "logits/chosen": -1.7882111072540283, "logits/rejected": -1.7458820343017578, "logps/chosen": -180.8072967529297, "logps/rejected": -237.64926147460938, "loss": 0.5446, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2629281282424927, "rewards/margins": 0.5611923933029175, "rewards/rejected": -1.824120283126831, "step": 6200 }, { "epoch": 1.0699517574086836, "grad_norm": 22.740522384643555, "learning_rate": 1.6249218518731623e-07, "logits/chosen": -1.8761451244354248, "logits/rejected": -1.8383567333221436, "logps/chosen": -190.98704528808594, "logps/rejected": -237.24948120117188, "loss": 0.5727, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3616178035736084, "rewards/margins": 0.481916606426239, "rewards/rejected": -1.843534231185913, "step": 6210 }, { "epoch": 1.0716747070985528, "grad_norm": 19.40971565246582, "learning_rate": 1.6233554396928515e-07, "logits/chosen": -1.8758461475372314, "logits/rejected": -1.8481992483139038, "logps/chosen": -179.4745330810547, "logps/rejected": -222.5402069091797, "loss": 0.5813, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2733550071716309, "rewards/margins": 0.4265773892402649, "rewards/rejected": -1.699932336807251, "step": 6220 }, { "epoch": 1.0733976567884218, "grad_norm": 18.235477447509766, "learning_rate": 1.6217865219955008e-07, "logits/chosen": -2.0155832767486572, "logits/rejected": -1.972914457321167, "logps/chosen": -177.2640838623047, "logps/rejected": -251.619873046875, "loss": 0.4886, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1843140125274658, "rewards/margins": 0.7663615942001343, "rewards/rejected": -1.950675368309021, "step": 6230 }, { "epoch": 1.0751206064782908, "grad_norm": 20.55992317199707, "learning_rate": 1.6202151050872242e-07, "logits/chosen": -1.8177560567855835, "logits/rejected": -1.7749807834625244, "logps/chosen": -177.9635009765625, "logps/rejected": -224.63125610351562, "loss": 0.574, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2157669067382812, "rewards/margins": 0.49352017045021057, "rewards/rejected": -1.7092870473861694, "step": 6240 }, { "epoch": 1.07684355616816, "grad_norm": 14.043688774108887, "learning_rate": 1.618641195284179e-07, "logits/chosen": -1.9259369373321533, "logits/rejected": -1.894171953201294, "logps/chosen": -178.1907958984375, "logps/rejected": -209.7275848388672, "loss": 0.6409, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2246299982070923, "rewards/margins": 0.33370086550712585, "rewards/rejected": -1.5583306550979614, "step": 6250 }, { "epoch": 1.078566505858029, "grad_norm": 21.307506561279297, "learning_rate": 1.6170647989125455e-07, "logits/chosen": -1.8041244745254517, "logits/rejected": -1.777611494064331, "logps/chosen": -180.18528747558594, "logps/rejected": -208.56021118164062, "loss": 0.6312, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2626087665557861, "rewards/margins": 0.31918349862098694, "rewards/rejected": -1.5817922353744507, "step": 6260 }, { "epoch": 1.080289455547898, "grad_norm": 16.19000816345215, "learning_rate": 1.6154859223084953e-07, "logits/chosen": -2.079878807067871, "logits/rejected": -2.0693135261535645, "logps/chosen": -171.21466064453125, "logps/rejected": -208.9565887451172, "loss": 0.6056, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1831395626068115, "rewards/margins": 0.36004436016082764, "rewards/rejected": -1.5431840419769287, "step": 6270 }, { "epoch": 1.082012405237767, "grad_norm": 19.230445861816406, "learning_rate": 1.613904571818171e-07, "logits/chosen": -1.7808287143707275, "logits/rejected": -1.7513635158538818, "logps/chosen": -161.0174102783203, "logps/rejected": -203.79562377929688, "loss": 0.5709, "rewards/accuracies": 0.6875, "rewards/chosen": -1.066051721572876, "rewards/margins": 0.44547238945961, "rewards/rejected": -1.5115240812301636, "step": 6280 }, { "epoch": 1.083735354927636, "grad_norm": 16.056062698364258, "learning_rate": 1.6123207537976588e-07, "logits/chosen": -1.8716799020767212, "logits/rejected": -1.835736870765686, "logps/chosen": -161.36973571777344, "logps/rejected": -206.4115753173828, "loss": 0.5785, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0999131202697754, "rewards/margins": 0.4404403269290924, "rewards/rejected": -1.5403534173965454, "step": 6290 }, { "epoch": 1.0854583046175053, "grad_norm": 23.695697784423828, "learning_rate": 1.6107344746129622e-07, "logits/chosen": -1.9100465774536133, "logits/rejected": -1.8787978887557983, "logps/chosen": -172.82638549804688, "logps/rejected": -209.90701293945312, "loss": 0.6107, "rewards/accuracies": 0.625, "rewards/chosen": -1.2039012908935547, "rewards/margins": 0.3747173845767975, "rewards/rejected": -1.5786187648773193, "step": 6300 }, { "epoch": 1.0871812543073742, "grad_norm": 19.707185745239258, "learning_rate": 1.609145740639977e-07, "logits/chosen": -1.9088573455810547, "logits/rejected": -1.8732635974884033, "logps/chosen": -156.1880645751953, "logps/rejected": -189.70230102539062, "loss": 0.6169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0573114156723022, "rewards/margins": 0.3241928219795227, "rewards/rejected": -1.3815044164657593, "step": 6310 }, { "epoch": 1.0889042039972432, "grad_norm": 18.144315719604492, "learning_rate": 1.6075545582644663e-07, "logits/chosen": -1.895162582397461, "logits/rejected": -1.8665847778320312, "logps/chosen": -159.209228515625, "logps/rejected": -199.00975036621094, "loss": 0.5928, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.044872760772705, "rewards/margins": 0.41198864579200745, "rewards/rejected": -1.4568613767623901, "step": 6320 }, { "epoch": 1.0906271536871124, "grad_norm": 18.30056381225586, "learning_rate": 1.6059609338820342e-07, "logits/chosen": -1.9051834344863892, "logits/rejected": -1.8644979000091553, "logps/chosen": -154.34178161621094, "logps/rejected": -213.75088500976562, "loss": 0.5157, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0105177164077759, "rewards/margins": 0.590909481048584, "rewards/rejected": -1.6014270782470703, "step": 6330 }, { "epoch": 1.0923501033769814, "grad_norm": 17.638629913330078, "learning_rate": 1.6043648738981e-07, "logits/chosen": -1.9253466129302979, "logits/rejected": -1.8944011926651, "logps/chosen": -162.9455108642578, "logps/rejected": -203.7376251220703, "loss": 0.5806, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0735816955566406, "rewards/margins": 0.43518805503845215, "rewards/rejected": -1.5087696313858032, "step": 6340 }, { "epoch": 1.0940730530668505, "grad_norm": 24.398740768432617, "learning_rate": 1.6027663847278725e-07, "logits/chosen": -1.848207712173462, "logits/rejected": -1.8280870914459229, "logps/chosen": -172.33726501464844, "logps/rejected": -217.809814453125, "loss": 0.5685, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.168224573135376, "rewards/margins": 0.4468640387058258, "rewards/rejected": -1.615088701248169, "step": 6350 }, { "epoch": 1.0957960027567195, "grad_norm": 18.46108627319336, "learning_rate": 1.6011654727963252e-07, "logits/chosen": -1.8542156219482422, "logits/rejected": -1.8329334259033203, "logps/chosen": -169.8433380126953, "logps/rejected": -220.50668334960938, "loss": 0.5608, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1637474298477173, "rewards/margins": 0.4954819679260254, "rewards/rejected": -1.6592292785644531, "step": 6360 }, { "epoch": 1.0975189524465885, "grad_norm": 18.699954986572266, "learning_rate": 1.599562144538169e-07, "logits/chosen": -1.8681213855743408, "logits/rejected": -1.8547241687774658, "logps/chosen": -174.8009490966797, "logps/rejected": -216.6031036376953, "loss": 0.6111, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2165663242340088, "rewards/margins": 0.4050436019897461, "rewards/rejected": -1.6216099262237549, "step": 6370 }, { "epoch": 1.0992419021364577, "grad_norm": 22.507064819335938, "learning_rate": 1.597956406397827e-07, "logits/chosen": -1.9227724075317383, "logits/rejected": -1.8930885791778564, "logps/chosen": -175.95718383789062, "logps/rejected": -230.57723999023438, "loss": 0.5499, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2322263717651367, "rewards/margins": 0.5083431005477905, "rewards/rejected": -1.7405694723129272, "step": 6380 }, { "epoch": 1.1009648518263266, "grad_norm": 18.604408264160156, "learning_rate": 1.5963482648294085e-07, "logits/chosen": -1.9683860540390015, "logits/rejected": -1.9260038137435913, "logps/chosen": -171.10850524902344, "logps/rejected": -217.3498992919922, "loss": 0.5586, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1502472162246704, "rewards/margins": 0.5132304430007935, "rewards/rejected": -1.6634775400161743, "step": 6390 }, { "epoch": 1.1026878015161956, "grad_norm": 21.85063362121582, "learning_rate": 1.5947377262966842e-07, "logits/chosen": -1.9116064310073853, "logits/rejected": -1.8747844696044922, "logps/chosen": -167.57177734375, "logps/rejected": -214.62210083007812, "loss": 0.5557, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0987837314605713, "rewards/margins": 0.48204049468040466, "rewards/rejected": -1.5808241367340088, "step": 6400 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -1.9953449964523315, "eval_logits/rejected": -1.977708101272583, "eval_logps/chosen": -161.66815185546875, "eval_logps/rejected": -191.02622985839844, "eval_loss": 0.6357540488243103, "eval_rewards/accuracies": 0.6322026252746582, "eval_rewards/chosen": -1.0295625925064087, "eval_rewards/margins": 0.248898446559906, "eval_rewards/rejected": -1.27846097946167, "eval_runtime": 383.9272, "eval_samples_per_second": 11.21, "eval_steps_per_second": 1.401, "step": 6400 }, { "epoch": 1.1044107512060648, "grad_norm": 29.006738662719727, "learning_rate": 1.5931247972730572e-07, "logits/chosen": -1.966294288635254, "logits/rejected": -1.9305827617645264, "logps/chosen": -180.78768920898438, "logps/rejected": -228.24093627929688, "loss": 0.5943, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2642327547073364, "rewards/margins": 0.5007797479629517, "rewards/rejected": -1.765012502670288, "step": 6410 }, { "epoch": 1.1061337008959338, "grad_norm": 22.37037467956543, "learning_rate": 1.591509484241541e-07, "logits/chosen": -1.8983854055404663, "logits/rejected": -1.8700834512710571, "logps/chosen": -185.94509887695312, "logps/rejected": -225.1170196533203, "loss": 0.6176, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3115462064743042, "rewards/margins": 0.40037521719932556, "rewards/rejected": -1.7119213342666626, "step": 6420 }, { "epoch": 1.107856650585803, "grad_norm": 23.627182006835938, "learning_rate": 1.5898917936947297e-07, "logits/chosen": -1.8346779346466064, "logits/rejected": -1.81487238407135, "logps/chosen": -162.68484497070312, "logps/rejected": -192.02468872070312, "loss": 0.6214, "rewards/accuracies": 0.625, "rewards/chosen": -1.1229327917099, "rewards/margins": 0.31496235728263855, "rewards/rejected": -1.4378952980041504, "step": 6430 }, { "epoch": 1.109579600275672, "grad_norm": 20.39024543762207, "learning_rate": 1.5882717321347752e-07, "logits/chosen": -1.8643989562988281, "logits/rejected": -1.8361200094223022, "logps/chosen": -172.6422882080078, "logps/rejected": -217.25473022460938, "loss": 0.5813, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1938320398330688, "rewards/margins": 0.4467882513999939, "rewards/rejected": -1.640620470046997, "step": 6440 }, { "epoch": 1.111302549965541, "grad_norm": 16.047563552856445, "learning_rate": 1.5866493060733576e-07, "logits/chosen": -1.8755505084991455, "logits/rejected": -1.8299802541732788, "logps/chosen": -165.16580200195312, "logps/rejected": -213.8719940185547, "loss": 0.5564, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.088411569595337, "rewards/margins": 0.5280265212059021, "rewards/rejected": -1.6164381504058838, "step": 6450 }, { "epoch": 1.11302549965541, "grad_norm": 25.144058227539062, "learning_rate": 1.585024522031663e-07, "logits/chosen": -1.818200707435608, "logits/rejected": -1.801749587059021, "logps/chosen": -164.95494079589844, "logps/rejected": -234.9093780517578, "loss": 0.5135, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1307909488677979, "rewards/margins": 0.6390718221664429, "rewards/rejected": -1.7698627710342407, "step": 6460 }, { "epoch": 1.114748449345279, "grad_norm": 16.80597686767578, "learning_rate": 1.5833973865403533e-07, "logits/chosen": -1.8051636219024658, "logits/rejected": -1.7716310024261475, "logps/chosen": -155.55227661132812, "logps/rejected": -197.94503784179688, "loss": 0.5708, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.027890682220459, "rewards/margins": 0.43396225571632385, "rewards/rejected": -1.46185302734375, "step": 6470 }, { "epoch": 1.1164713990351482, "grad_norm": 24.668655395507812, "learning_rate": 1.5817679061395426e-07, "logits/chosen": -1.8865272998809814, "logits/rejected": -1.8435176610946655, "logps/chosen": -164.46475219726562, "logps/rejected": -199.07980346679688, "loss": 0.584, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0789591073989868, "rewards/margins": 0.40726250410079956, "rewards/rejected": -1.4862215518951416, "step": 6480 }, { "epoch": 1.1181943487250172, "grad_norm": 18.045318603515625, "learning_rate": 1.5801360873787704e-07, "logits/chosen": -2.040658473968506, "logits/rejected": -2.015576124191284, "logps/chosen": -172.48171997070312, "logps/rejected": -218.50021362304688, "loss": 0.5734, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1886144876480103, "rewards/margins": 0.45509299635887146, "rewards/rejected": -1.643707513809204, "step": 6490 }, { "epoch": 1.1199172984148862, "grad_norm": 21.18434715270996, "learning_rate": 1.5785019368169748e-07, "logits/chosen": -1.8881008625030518, "logits/rejected": -1.8674548864364624, "logps/chosen": -169.26629638671875, "logps/rejected": -207.85452270507812, "loss": 0.5721, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1305954456329346, "rewards/margins": 0.4208308160305023, "rewards/rejected": -1.5514262914657593, "step": 6500 }, { "epoch": 1.1216402481047554, "grad_norm": 18.61454963684082, "learning_rate": 1.5768654610224664e-07, "logits/chosen": -1.8763364553451538, "logits/rejected": -1.8204272985458374, "logps/chosen": -173.53945922851562, "logps/rejected": -221.5675506591797, "loss": 0.5626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1675516366958618, "rewards/margins": 0.5180361866950989, "rewards/rejected": -1.6855876445770264, "step": 6510 }, { "epoch": 1.1233631977946243, "grad_norm": 20.92111587524414, "learning_rate": 1.575226666572901e-07, "logits/chosen": -1.8613507747650146, "logits/rejected": -1.829530954360962, "logps/chosen": -166.28549194335938, "logps/rejected": -207.47384643554688, "loss": 0.5742, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.126740574836731, "rewards/margins": 0.43342143297195435, "rewards/rejected": -1.560161828994751, "step": 6520 }, { "epoch": 1.1250861474844935, "grad_norm": 22.111543655395508, "learning_rate": 1.573585560055256e-07, "logits/chosen": -1.809640645980835, "logits/rejected": -1.7645851373672485, "logps/chosen": -166.86082458496094, "logps/rejected": -224.10556030273438, "loss": 0.5233, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1323686838150024, "rewards/margins": 0.5942455530166626, "rewards/rejected": -1.726613998413086, "step": 6530 }, { "epoch": 1.1268090971743625, "grad_norm": 19.58144760131836, "learning_rate": 1.5719421480657996e-07, "logits/chosen": -1.8357940912246704, "logits/rejected": -1.8000843524932861, "logps/chosen": -185.57937622070312, "logps/rejected": -224.1998291015625, "loss": 0.6105, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2820502519607544, "rewards/margins": 0.39689844846725464, "rewards/rejected": -1.6789487600326538, "step": 6540 }, { "epoch": 1.1285320468642315, "grad_norm": 30.742877960205078, "learning_rate": 1.570296437210068e-07, "logits/chosen": -1.7698867321014404, "logits/rejected": -1.7366396188735962, "logps/chosen": -173.15988159179688, "logps/rejected": -211.2397003173828, "loss": 0.62, "rewards/accuracies": 0.65625, "rewards/chosen": -1.216139316558838, "rewards/margins": 0.37928685545921326, "rewards/rejected": -1.595426321029663, "step": 6550 }, { "epoch": 1.1302549965541007, "grad_norm": 20.883285522460938, "learning_rate": 1.5686484341028374e-07, "logits/chosen": -1.884711503982544, "logits/rejected": -1.8411281108856201, "logps/chosen": -167.2931671142578, "logps/rejected": -211.8631134033203, "loss": 0.5624, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1128895282745361, "rewards/margins": 0.48710498213768005, "rewards/rejected": -1.599994421005249, "step": 6560 }, { "epoch": 1.1319779462439696, "grad_norm": 18.167383193969727, "learning_rate": 1.566998145368097e-07, "logits/chosen": -1.8972285985946655, "logits/rejected": -1.8509814739227295, "logps/chosen": -163.0162353515625, "logps/rejected": -214.9351043701172, "loss": 0.5346, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0945223569869995, "rewards/margins": 0.5278555154800415, "rewards/rejected": -1.6223779916763306, "step": 6570 }, { "epoch": 1.1337008959338388, "grad_norm": 18.919946670532227, "learning_rate": 1.5653455776390235e-07, "logits/chosen": -1.892338514328003, "logits/rejected": -1.849704384803772, "logps/chosen": -170.286865234375, "logps/rejected": -201.15330505371094, "loss": 0.6021, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1525856256484985, "rewards/margins": 0.35892918705940247, "rewards/rejected": -1.5115145444869995, "step": 6580 }, { "epoch": 1.1354238456237078, "grad_norm": 19.923118591308594, "learning_rate": 1.563690737557953e-07, "logits/chosen": -1.8471940755844116, "logits/rejected": -1.8094072341918945, "logps/chosen": -164.0819549560547, "logps/rejected": -214.5155792236328, "loss": 0.5563, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1124200820922852, "rewards/margins": 0.4979260563850403, "rewards/rejected": -1.6103461980819702, "step": 6590 }, { "epoch": 1.1371467953135768, "grad_norm": 26.111726760864258, "learning_rate": 1.562033631776356e-07, "logits/chosen": -1.9286270141601562, "logits/rejected": -1.8974510431289673, "logps/chosen": -170.0607147216797, "logps/rejected": -214.9732666015625, "loss": 0.5814, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1442835330963135, "rewards/margins": 0.46957072615623474, "rewards/rejected": -1.613854169845581, "step": 6600 }, { "epoch": 1.138869745003446, "grad_norm": 16.342317581176758, "learning_rate": 1.560374266954809e-07, "logits/chosen": -1.8816630840301514, "logits/rejected": -1.8479684591293335, "logps/chosen": -169.02830505371094, "logps/rejected": -223.63723754882812, "loss": 0.5361, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1706207990646362, "rewards/margins": 0.5482100248336792, "rewards/rejected": -1.7188308238983154, "step": 6610 }, { "epoch": 1.140592694693315, "grad_norm": 23.36205291748047, "learning_rate": 1.5587126497629686e-07, "logits/chosen": -1.8299745321273804, "logits/rejected": -1.7964084148406982, "logps/chosen": -183.80242919921875, "logps/rejected": -225.1468505859375, "loss": 0.6006, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3141157627105713, "rewards/margins": 0.41576558351516724, "rewards/rejected": -1.7298812866210938, "step": 6620 }, { "epoch": 1.1423156443831841, "grad_norm": 20.460121154785156, "learning_rate": 1.557048786879545e-07, "logits/chosen": -1.803312063217163, "logits/rejected": -1.768874168395996, "logps/chosen": -168.17320251464844, "logps/rejected": -201.20533752441406, "loss": 0.5978, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1486170291900635, "rewards/margins": 0.37151283025741577, "rewards/rejected": -1.5201297998428345, "step": 6630 }, { "epoch": 1.144038594073053, "grad_norm": 24.55524253845215, "learning_rate": 1.5553826849922747e-07, "logits/chosen": -1.8844077587127686, "logits/rejected": -1.839284896850586, "logps/chosen": -170.33155822753906, "logps/rejected": -212.1918182373047, "loss": 0.5843, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1722571849822998, "rewards/margins": 0.4536797106266022, "rewards/rejected": -1.62593674659729, "step": 6640 }, { "epoch": 1.145761543762922, "grad_norm": 21.957923889160156, "learning_rate": 1.553714350797893e-07, "logits/chosen": -1.930938959121704, "logits/rejected": -1.894940972328186, "logps/chosen": -166.409423828125, "logps/rejected": -220.80581665039062, "loss": 0.5335, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1429588794708252, "rewards/margins": 0.5315641164779663, "rewards/rejected": -1.6745229959487915, "step": 6650 }, { "epoch": 1.1474844934527912, "grad_norm": 18.168514251708984, "learning_rate": 1.5520437910021084e-07, "logits/chosen": -1.9338651895523071, "logits/rejected": -1.9020391702651978, "logps/chosen": -162.00460815429688, "logps/rejected": -216.5800018310547, "loss": 0.5419, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.102439045906067, "rewards/margins": 0.5217219591140747, "rewards/rejected": -1.6241611242294312, "step": 6660 }, { "epoch": 1.1492074431426602, "grad_norm": 22.49798011779785, "learning_rate": 1.550371012319575e-07, "logits/chosen": -1.8237295150756836, "logits/rejected": -1.798282265663147, "logps/chosen": -174.2585906982422, "logps/rejected": -247.76937866210938, "loss": 0.5025, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2435170412063599, "rewards/margins": 0.6812580227851868, "rewards/rejected": -1.9247751235961914, "step": 6670 }, { "epoch": 1.1509303928325294, "grad_norm": 23.629316329956055, "learning_rate": 1.5486960214738648e-07, "logits/chosen": -1.8039436340332031, "logits/rejected": -1.7624622583389282, "logps/chosen": -183.01832580566406, "logps/rejected": -225.6625518798828, "loss": 0.5919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2919310331344604, "rewards/margins": 0.44403982162475586, "rewards/rejected": -1.7359708547592163, "step": 6680 }, { "epoch": 1.1526533425223984, "grad_norm": 18.57203483581543, "learning_rate": 1.547018825197443e-07, "logits/chosen": -1.8125450611114502, "logits/rejected": -1.776877760887146, "logps/chosen": -175.30438232421875, "logps/rejected": -237.8402099609375, "loss": 0.5092, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1814472675323486, "rewards/margins": 0.6386057138442993, "rewards/rejected": -1.8200527429580688, "step": 6690 }, { "epoch": 1.1543762922122673, "grad_norm": 33.86186599731445, "learning_rate": 1.5453394302316366e-07, "logits/chosen": -1.775186538696289, "logits/rejected": -1.7512357234954834, "logps/chosen": -197.62759399414062, "logps/rejected": -245.69482421875, "loss": 0.5945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4149936437606812, "rewards/margins": 0.4829793870449066, "rewards/rejected": -1.8979730606079102, "step": 6700 }, { "epoch": 1.1560992419021365, "grad_norm": 17.650436401367188, "learning_rate": 1.5436578433266126e-07, "logits/chosen": -1.7831552028656006, "logits/rejected": -1.735656499862671, "logps/chosen": -192.37847900390625, "logps/rejected": -256.8363952636719, "loss": 0.5364, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3545328378677368, "rewards/margins": 0.6783474683761597, "rewards/rejected": -2.0328803062438965, "step": 6710 }, { "epoch": 1.1578221915920055, "grad_norm": 19.204593658447266, "learning_rate": 1.5419740712413472e-07, "logits/chosen": -1.7737830877304077, "logits/rejected": -1.7366644144058228, "logps/chosen": -181.94137573242188, "logps/rejected": -244.99801635742188, "loss": 0.5295, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3052650690078735, "rewards/margins": 0.6292239427566528, "rewards/rejected": -1.9344890117645264, "step": 6720 }, { "epoch": 1.1595451412818747, "grad_norm": 29.945493698120117, "learning_rate": 1.5402881207436e-07, "logits/chosen": -1.725109338760376, "logits/rejected": -1.6990764141082764, "logps/chosen": -194.0322265625, "logps/rejected": -235.67431640625, "loss": 0.6083, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.373313546180725, "rewards/margins": 0.4306615889072418, "rewards/rejected": -1.8039751052856445, "step": 6730 }, { "epoch": 1.1612680909717437, "grad_norm": 27.677576065063477, "learning_rate": 1.5385999986098858e-07, "logits/chosen": -1.7985633611679077, "logits/rejected": -1.774356484413147, "logps/chosen": -179.78250122070312, "logps/rejected": -237.5058135986328, "loss": 0.5401, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.253493070602417, "rewards/margins": 0.5606546401977539, "rewards/rejected": -1.814147710800171, "step": 6740 }, { "epoch": 1.1629910406616126, "grad_norm": 19.740476608276367, "learning_rate": 1.5369097116254493e-07, "logits/chosen": -1.8279622793197632, "logits/rejected": -1.795508623123169, "logps/chosen": -190.2114715576172, "logps/rejected": -249.172119140625, "loss": 0.555, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.377532720565796, "rewards/margins": 0.577313244342804, "rewards/rejected": -1.9548461437225342, "step": 6750 }, { "epoch": 1.1647139903514818, "grad_norm": 34.197540283203125, "learning_rate": 1.5352172665842351e-07, "logits/chosen": -1.7786098718643188, "logits/rejected": -1.7375552654266357, "logps/chosen": -183.84373474121094, "logps/rejected": -226.8520050048828, "loss": 0.611, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3179031610488892, "rewards/margins": 0.4632466435432434, "rewards/rejected": -1.7811496257781982, "step": 6760 }, { "epoch": 1.1664369400413508, "grad_norm": 29.12902069091797, "learning_rate": 1.5335226702888636e-07, "logits/chosen": -1.814850091934204, "logits/rejected": -1.790254831314087, "logps/chosen": -175.25399780273438, "logps/rejected": -229.5296630859375, "loss": 0.557, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2141906023025513, "rewards/margins": 0.5248024463653564, "rewards/rejected": -1.7389930486679077, "step": 6770 }, { "epoch": 1.1681598897312198, "grad_norm": 23.56708526611328, "learning_rate": 1.5318259295506004e-07, "logits/chosen": -1.8106752634048462, "logits/rejected": -1.7647393941879272, "logps/chosen": -184.96701049804688, "logps/rejected": -229.0535125732422, "loss": 0.5885, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2846510410308838, "rewards/margins": 0.4726200997829437, "rewards/rejected": -1.75727117061615, "step": 6780 }, { "epoch": 1.169882839421089, "grad_norm": 18.515710830688477, "learning_rate": 1.5301270511893315e-07, "logits/chosen": -1.8454278707504272, "logits/rejected": -1.8067982196807861, "logps/chosen": -164.33692932128906, "logps/rejected": -224.3966827392578, "loss": 0.5411, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1381299495697021, "rewards/margins": 0.5746427774429321, "rewards/rejected": -1.7127727270126343, "step": 6790 }, { "epoch": 1.171605789110958, "grad_norm": 14.323341369628906, "learning_rate": 1.5284260420335345e-07, "logits/chosen": -1.7431209087371826, "logits/rejected": -1.698785424232483, "logps/chosen": -167.23757934570312, "logps/rejected": -224.02554321289062, "loss": 0.5292, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1481921672821045, "rewards/margins": 0.5911040306091309, "rewards/rejected": -1.7392959594726562, "step": 6800 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -1.9159184694290161, "eval_logits/rejected": -1.8969097137451172, "eval_logps/chosen": -167.49000549316406, "eval_logps/rejected": -198.10008239746094, "eval_loss": 0.6332610249519348, "eval_rewards/accuracies": 0.6312732100486755, "eval_rewards/chosen": -1.0877811908721924, "eval_rewards/margins": 0.2614184319972992, "eval_rewards/rejected": -1.3491995334625244, "eval_runtime": 384.0244, "eval_samples_per_second": 11.208, "eval_steps_per_second": 1.401, "step": 6800 }, { "epoch": 1.173328738800827, "grad_norm": 20.453310012817383, "learning_rate": 1.5267229089202514e-07, "logits/chosen": -1.7973759174346924, "logits/rejected": -1.76103937625885, "logps/chosen": -181.02047729492188, "logps/rejected": -228.26077270507812, "loss": 0.5642, "rewards/accuracies": 0.6875, "rewards/chosen": -1.253167748451233, "rewards/margins": 0.5119723677635193, "rewards/rejected": -1.765140175819397, "step": 6810 }, { "epoch": 1.175051688490696, "grad_norm": 24.659183502197266, "learning_rate": 1.5250176586950615e-07, "logits/chosen": -1.8925281763076782, "logits/rejected": -1.8536806106567383, "logps/chosen": -181.89085388183594, "logps/rejected": -227.59896850585938, "loss": 0.5735, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2686735391616821, "rewards/margins": 0.47714129090309143, "rewards/rejected": -1.7458146810531616, "step": 6820 }, { "epoch": 1.176774638180565, "grad_norm": 21.895891189575195, "learning_rate": 1.523310298212054e-07, "logits/chosen": -1.9211881160736084, "logits/rejected": -1.894505262374878, "logps/chosen": -168.66549682617188, "logps/rejected": -219.8590850830078, "loss": 0.5657, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1476205587387085, "rewards/margins": 0.5208449959754944, "rewards/rejected": -1.668465256690979, "step": 6830 }, { "epoch": 1.1784975878704342, "grad_norm": 19.686376571655273, "learning_rate": 1.5216008343337987e-07, "logits/chosen": -1.863774061203003, "logits/rejected": -1.8285160064697266, "logps/chosen": -182.8833770751953, "logps/rejected": -227.8834991455078, "loss": 0.6113, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2595279216766357, "rewards/margins": 0.46531790494918823, "rewards/rejected": -1.7248455286026, "step": 6840 }, { "epoch": 1.1802205375603032, "grad_norm": 19.36058807373047, "learning_rate": 1.5198892739313216e-07, "logits/chosen": -1.753993034362793, "logits/rejected": -1.7107868194580078, "logps/chosen": -165.95858764648438, "logps/rejected": -216.3078155517578, "loss": 0.5649, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1179423332214355, "rewards/margins": 0.5087266564369202, "rewards/rejected": -1.626668930053711, "step": 6850 }, { "epoch": 1.1819434872501722, "grad_norm": 19.93489646911621, "learning_rate": 1.518175623884074e-07, "logits/chosen": -1.8617608547210693, "logits/rejected": -1.8141686916351318, "logps/chosen": -180.40768432617188, "logps/rejected": -219.93545532226562, "loss": 0.573, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2129299640655518, "rewards/margins": 0.4632079005241394, "rewards/rejected": -1.676137924194336, "step": 6860 }, { "epoch": 1.1836664369400414, "grad_norm": 29.238536834716797, "learning_rate": 1.516459891079907e-07, "logits/chosen": -1.7542908191680908, "logits/rejected": -1.7299522161483765, "logps/chosen": -177.2001495361328, "logps/rejected": -225.78271484375, "loss": 0.5667, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2611467838287354, "rewards/margins": 0.4789832532405853, "rewards/rejected": -1.740130066871643, "step": 6870 }, { "epoch": 1.1853893866299103, "grad_norm": 17.341331481933594, "learning_rate": 1.5147420824150435e-07, "logits/chosen": -1.8384802341461182, "logits/rejected": -1.7911484241485596, "logps/chosen": -177.47169494628906, "logps/rejected": -230.1194305419922, "loss": 0.5446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2320877313613892, "rewards/margins": 0.5574935078620911, "rewards/rejected": -1.789581298828125, "step": 6880 }, { "epoch": 1.1871123363197795, "grad_norm": 20.969749450683594, "learning_rate": 1.5130222047940492e-07, "logits/chosen": -1.7615314722061157, "logits/rejected": -1.7270469665527344, "logps/chosen": -173.20687866210938, "logps/rejected": -231.8846435546875, "loss": 0.5449, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2089654207229614, "rewards/margins": 0.5859875679016113, "rewards/rejected": -1.7949531078338623, "step": 6890 }, { "epoch": 1.1888352860096485, "grad_norm": 25.91954231262207, "learning_rate": 1.5113002651298062e-07, "logits/chosen": -1.8158423900604248, "logits/rejected": -1.7804663181304932, "logps/chosen": -184.11569213867188, "logps/rejected": -229.144775390625, "loss": 0.5881, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3056111335754395, "rewards/margins": 0.4514707028865814, "rewards/rejected": -1.7570817470550537, "step": 6900 }, { "epoch": 1.1905582356995175, "grad_norm": 17.826147079467773, "learning_rate": 1.509576270343485e-07, "logits/chosen": -1.84814453125, "logits/rejected": -1.8017213344573975, "logps/chosen": -183.55357360839844, "logps/rejected": -244.83389282226562, "loss": 0.5295, "rewards/accuracies": 0.71875, "rewards/chosen": -1.266675353050232, "rewards/margins": 0.6295980215072632, "rewards/rejected": -1.8962733745574951, "step": 6910 }, { "epoch": 1.1922811853893867, "grad_norm": 20.14065170288086, "learning_rate": 1.5078502273645164e-07, "logits/chosen": -1.9001047611236572, "logits/rejected": -1.8588473796844482, "logps/chosen": -191.18313598632812, "logps/rejected": -231.6808624267578, "loss": 0.6016, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3507914543151855, "rewards/margins": 0.4283156991004944, "rewards/rejected": -1.779106855392456, "step": 6920 }, { "epoch": 1.1940041350792556, "grad_norm": 22.96848487854004, "learning_rate": 1.5061221431305632e-07, "logits/chosen": -1.7364161014556885, "logits/rejected": -1.6910498142242432, "logps/chosen": -178.3186492919922, "logps/rejected": -237.7119140625, "loss": 0.5287, "rewards/accuracies": 0.78125, "rewards/chosen": -1.206847906112671, "rewards/margins": 0.6277521252632141, "rewards/rejected": -1.8345998525619507, "step": 6930 }, { "epoch": 1.1957270847691248, "grad_norm": 25.747020721435547, "learning_rate": 1.5043920245874937e-07, "logits/chosen": -1.7463403940200806, "logits/rejected": -1.6873855590820312, "logps/chosen": -172.3845672607422, "logps/rejected": -225.8962860107422, "loss": 0.5383, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1442162990570068, "rewards/margins": 0.6088359951972961, "rewards/rejected": -1.7530523538589478, "step": 6940 }, { "epoch": 1.1974500344589938, "grad_norm": 19.827890396118164, "learning_rate": 1.5026598786893522e-07, "logits/chosen": -1.7630767822265625, "logits/rejected": -1.7269541025161743, "logps/chosen": -183.65208435058594, "logps/rejected": -248.32949829101562, "loss": 0.5261, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2979530096054077, "rewards/margins": 0.655959963798523, "rewards/rejected": -1.9539129734039307, "step": 6950 }, { "epoch": 1.1991729841488628, "grad_norm": 20.81599998474121, "learning_rate": 1.5009257123983322e-07, "logits/chosen": -1.9348329305648804, "logits/rejected": -1.8880043029785156, "logps/chosen": -195.40646362304688, "logps/rejected": -229.8538055419922, "loss": 0.6051, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4008803367614746, "rewards/margins": 0.41985026001930237, "rewards/rejected": -1.8207308053970337, "step": 6960 }, { "epoch": 1.200895933838732, "grad_norm": 22.377382278442383, "learning_rate": 1.499189532684747e-07, "logits/chosen": -1.8455537557601929, "logits/rejected": -1.8025840520858765, "logps/chosen": -182.77159118652344, "logps/rejected": -233.11825561523438, "loss": 0.557, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2822978496551514, "rewards/margins": 0.5449867248535156, "rewards/rejected": -1.827284574508667, "step": 6970 }, { "epoch": 1.202618883528601, "grad_norm": 23.947980880737305, "learning_rate": 1.4974513465270049e-07, "logits/chosen": -1.773627519607544, "logits/rejected": -1.7299044132232666, "logps/chosen": -184.4539337158203, "logps/rejected": -245.153076171875, "loss": 0.5341, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3022854328155518, "rewards/margins": 0.6191726326942444, "rewards/rejected": -1.9214584827423096, "step": 6980 }, { "epoch": 1.20434183321847, "grad_norm": 24.419055938720703, "learning_rate": 1.4957111609115761e-07, "logits/chosen": -1.7043527364730835, "logits/rejected": -1.6770222187042236, "logps/chosen": -191.4741973876953, "logps/rejected": -235.8734588623047, "loss": 0.6066, "rewards/accuracies": 0.625, "rewards/chosen": -1.3449076414108276, "rewards/margins": 0.4712006449699402, "rewards/rejected": -1.8161083459854126, "step": 6990 }, { "epoch": 1.206064782908339, "grad_norm": 19.808265686035156, "learning_rate": 1.4939689828329694e-07, "logits/chosen": -1.980425238609314, "logits/rejected": -1.9331204891204834, "logps/chosen": -198.2283172607422, "logps/rejected": -255.0526123046875, "loss": 0.5316, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4193394184112549, "rewards/margins": 0.5842828154563904, "rewards/rejected": -2.003622531890869, "step": 7000 }, { "epoch": 1.207787732598208, "grad_norm": 21.359458923339844, "learning_rate": 1.492224819293701e-07, "logits/chosen": -1.826668381690979, "logits/rejected": -1.7854959964752197, "logps/chosen": -186.2130889892578, "logps/rejected": -239.1843719482422, "loss": 0.5612, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2978785037994385, "rewards/margins": 0.5653413534164429, "rewards/rejected": -1.8632196187973022, "step": 7010 }, { "epoch": 1.2095106822880772, "grad_norm": 26.118690490722656, "learning_rate": 1.490478677304268e-07, "logits/chosen": -1.7860314846038818, "logits/rejected": -1.748482346534729, "logps/chosen": -181.08990478515625, "logps/rejected": -227.72579956054688, "loss": 0.5799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.297550916671753, "rewards/margins": 0.45612120628356934, "rewards/rejected": -1.7536720037460327, "step": 7020 }, { "epoch": 1.2112336319779462, "grad_norm": 41.83209991455078, "learning_rate": 1.4887305638831207e-07, "logits/chosen": -1.8422186374664307, "logits/rejected": -1.808428168296814, "logps/chosen": -184.91195678710938, "logps/rejected": -236.88973999023438, "loss": 0.5843, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.318117618560791, "rewards/margins": 0.49964022636413574, "rewards/rejected": -1.8177579641342163, "step": 7030 }, { "epoch": 1.2129565816678154, "grad_norm": 36.24333572387695, "learning_rate": 1.486980486056631e-07, "logits/chosen": -1.759803056716919, "logits/rejected": -1.7243820428848267, "logps/chosen": -194.1247100830078, "logps/rejected": -251.89547729492188, "loss": 0.5415, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.380479335784912, "rewards/margins": 0.5735012292861938, "rewards/rejected": -1.9539806842803955, "step": 7040 }, { "epoch": 1.2146795313576844, "grad_norm": 26.811819076538086, "learning_rate": 1.4852284508590686e-07, "logits/chosen": -1.7498468160629272, "logits/rejected": -1.719957709312439, "logps/chosen": -180.6525115966797, "logps/rejected": -231.7786102294922, "loss": 0.5777, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2919676303863525, "rewards/margins": 0.4920363426208496, "rewards/rejected": -1.7840036153793335, "step": 7050 }, { "epoch": 1.2164024810475533, "grad_norm": 24.70426368713379, "learning_rate": 1.483474465332569e-07, "logits/chosen": -1.882567048072815, "logits/rejected": -1.8586457967758179, "logps/chosen": -184.4115447998047, "logps/rejected": -223.13162231445312, "loss": 0.6135, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3120129108428955, "rewards/margins": 0.3901762366294861, "rewards/rejected": -1.7021892070770264, "step": 7060 }, { "epoch": 1.2181254307374225, "grad_norm": 38.69207000732422, "learning_rate": 1.4817185365271092e-07, "logits/chosen": -1.841565489768982, "logits/rejected": -1.804574966430664, "logps/chosen": -179.31002807617188, "logps/rejected": -210.82839965820312, "loss": 0.6612, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.247070074081421, "rewards/margins": 0.32873469591140747, "rewards/rejected": -1.5758049488067627, "step": 7070 }, { "epoch": 1.2198483804272915, "grad_norm": 16.222440719604492, "learning_rate": 1.4799606715004744e-07, "logits/chosen": -1.9310624599456787, "logits/rejected": -1.8868637084960938, "logps/chosen": -163.29208374023438, "logps/rejected": -211.12222290039062, "loss": 0.554, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0982394218444824, "rewards/margins": 0.47333258390426636, "rewards/rejected": -1.5715720653533936, "step": 7080 }, { "epoch": 1.2215713301171607, "grad_norm": 17.030590057373047, "learning_rate": 1.4782008773182342e-07, "logits/chosen": -1.9744294881820679, "logits/rejected": -1.9437084197998047, "logps/chosen": -166.2434539794922, "logps/rejected": -219.3644561767578, "loss": 0.5454, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1272560358047485, "rewards/margins": 0.5442889332771301, "rewards/rejected": -1.6715450286865234, "step": 7090 }, { "epoch": 1.2232942798070296, "grad_norm": 19.815698623657227, "learning_rate": 1.476439161053711e-07, "logits/chosen": -1.8406782150268555, "logits/rejected": -1.80866277217865, "logps/chosen": -167.04959106445312, "logps/rejected": -208.95480346679688, "loss": 0.5922, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0997631549835205, "rewards/margins": 0.4213009476661682, "rewards/rejected": -1.521064043045044, "step": 7100 }, { "epoch": 1.2250172294968986, "grad_norm": 16.055950164794922, "learning_rate": 1.4746755297879535e-07, "logits/chosen": -1.8774404525756836, "logits/rejected": -1.834254264831543, "logps/chosen": -158.6421356201172, "logps/rejected": -191.69210815429688, "loss": 0.6052, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0516932010650635, "rewards/margins": 0.355400413274765, "rewards/rejected": -1.4070935249328613, "step": 7110 }, { "epoch": 1.2267401791867678, "grad_norm": 29.036243438720703, "learning_rate": 1.4729099906097074e-07, "logits/chosen": -1.8762614727020264, "logits/rejected": -1.8309389352798462, "logps/chosen": -168.1267852783203, "logps/rejected": -191.68255615234375, "loss": 0.6143, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0614054203033447, "rewards/margins": 0.32982301712036133, "rewards/rejected": -1.391228437423706, "step": 7120 }, { "epoch": 1.2284631288766368, "grad_norm": 29.026830673217773, "learning_rate": 1.4711425506153872e-07, "logits/chosen": -1.775841474533081, "logits/rejected": -1.7425835132598877, "logps/chosen": -155.8795928955078, "logps/rejected": -197.4680938720703, "loss": 0.5775, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9953833818435669, "rewards/margins": 0.44751301407814026, "rewards/rejected": -1.4428964853286743, "step": 7130 }, { "epoch": 1.230186078566506, "grad_norm": 18.57213592529297, "learning_rate": 1.4693732169090472e-07, "logits/chosen": -1.8745845556259155, "logits/rejected": -1.8567358255386353, "logps/chosen": -151.21255493164062, "logps/rejected": -193.65370178222656, "loss": 0.5888, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9989484548568726, "rewards/margins": 0.41814273595809937, "rewards/rejected": -1.4170913696289062, "step": 7140 }, { "epoch": 1.231909028256375, "grad_norm": 19.837411880493164, "learning_rate": 1.4676019966023537e-07, "logits/chosen": -1.9492120742797852, "logits/rejected": -1.9174597263336182, "logps/chosen": -176.21762084960938, "logps/rejected": -215.0535430908203, "loss": 0.6, "rewards/accuracies": 0.71875, "rewards/chosen": -1.198301076889038, "rewards/margins": 0.4013351500034332, "rewards/rejected": -1.599636435508728, "step": 7150 }, { "epoch": 1.233631977946244, "grad_norm": 29.906728744506836, "learning_rate": 1.4658288968145556e-07, "logits/chosen": -1.858672857284546, "logits/rejected": -1.8087135553359985, "logps/chosen": -156.7094268798828, "logps/rejected": -208.208740234375, "loss": 0.553, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0304765701293945, "rewards/margins": 0.5108339190483093, "rewards/rejected": -1.5413105487823486, "step": 7160 }, { "epoch": 1.235354927636113, "grad_norm": 23.859086990356445, "learning_rate": 1.4640539246724565e-07, "logits/chosen": -1.8171465396881104, "logits/rejected": -1.7792974710464478, "logps/chosen": -167.0326385498047, "logps/rejected": -208.37393188476562, "loss": 0.5998, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1146633625030518, "rewards/margins": 0.43098729848861694, "rewards/rejected": -1.545650601387024, "step": 7170 }, { "epoch": 1.237077877325982, "grad_norm": 19.75602912902832, "learning_rate": 1.4622770873103857e-07, "logits/chosen": -1.9519678354263306, "logits/rejected": -1.9202091693878174, "logps/chosen": -157.49551391601562, "logps/rejected": -206.5701446533203, "loss": 0.5476, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0219330787658691, "rewards/margins": 0.49904584884643555, "rewards/rejected": -1.5209788084030151, "step": 7180 }, { "epoch": 1.2388008270158513, "grad_norm": 20.737136840820312, "learning_rate": 1.4604983918701692e-07, "logits/chosen": -1.777673363685608, "logits/rejected": -1.73529851436615, "logps/chosen": -161.501220703125, "logps/rejected": -214.7451934814453, "loss": 0.5442, "rewards/accuracies": 0.75, "rewards/chosen": -1.0883067846298218, "rewards/margins": 0.5229275226593018, "rewards/rejected": -1.6112343072891235, "step": 7190 }, { "epoch": 1.2405237767057202, "grad_norm": 27.85216522216797, "learning_rate": 1.4587178455011021e-07, "logits/chosen": -1.7661443948745728, "logits/rejected": -1.7274892330169678, "logps/chosen": -173.51914978027344, "logps/rejected": -233.92068481445312, "loss": 0.5473, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2042179107666016, "rewards/margins": 0.5732482671737671, "rewards/rejected": -1.7774661779403687, "step": 7200 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -1.9226493835449219, "eval_logits/rejected": -1.904362678527832, "eval_logps/chosen": -163.50006103515625, "eval_logps/rejected": -192.75973510742188, "eval_loss": 0.635442316532135, "eval_rewards/accuracies": 0.6261616945266724, "eval_rewards/chosen": -1.0478816032409668, "eval_rewards/margins": 0.24791431427001953, "eval_rewards/rejected": -1.2957961559295654, "eval_runtime": 384.2418, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 7200 }, { "epoch": 1.2422467263955892, "grad_norm": 30.415618896484375, "learning_rate": 1.4569354553599186e-07, "logits/chosen": -1.8696123361587524, "logits/rejected": -1.8457527160644531, "logps/chosen": -190.6040802001953, "logps/rejected": -217.7646942138672, "loss": 0.6321, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3357537984848022, "rewards/margins": 0.31159737706184387, "rewards/rejected": -1.6473512649536133, "step": 7210 }, { "epoch": 1.2439696760854584, "grad_norm": 27.164396286010742, "learning_rate": 1.4551512286107642e-07, "logits/chosen": -1.7603811025619507, "logits/rejected": -1.7127151489257812, "logps/chosen": -172.73391723632812, "logps/rejected": -218.4431915283203, "loss": 0.583, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1820390224456787, "rewards/margins": 0.4857490658760071, "rewards/rejected": -1.6677881479263306, "step": 7220 }, { "epoch": 1.2456926257753274, "grad_norm": 19.279682159423828, "learning_rate": 1.4533651724251654e-07, "logits/chosen": -1.8027547597885132, "logits/rejected": -1.768843650817871, "logps/chosen": -172.1409912109375, "logps/rejected": -215.8574981689453, "loss": 0.5679, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1791961193084717, "rewards/margins": 0.4495778977870941, "rewards/rejected": -1.6287740468978882, "step": 7230 }, { "epoch": 1.2474155754651963, "grad_norm": 19.966888427734375, "learning_rate": 1.4515772939820036e-07, "logits/chosen": -1.8357681035995483, "logits/rejected": -1.8127696514129639, "logps/chosen": -173.6414337158203, "logps/rejected": -216.55142211914062, "loss": 0.5735, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.161946415901184, "rewards/margins": 0.45585179328918457, "rewards/rejected": -1.6177982091903687, "step": 7240 }, { "epoch": 1.2491385251550655, "grad_norm": 21.268489837646484, "learning_rate": 1.4497876004674824e-07, "logits/chosen": -1.883754014968872, "logits/rejected": -1.8425041437149048, "logps/chosen": -168.72447204589844, "logps/rejected": -211.64157104492188, "loss": 0.5734, "rewards/accuracies": 0.75, "rewards/chosen": -1.1415681838989258, "rewards/margins": 0.44630199670791626, "rewards/rejected": -1.5878701210021973, "step": 7250 }, { "epoch": 1.2508614748449345, "grad_norm": 24.966110229492188, "learning_rate": 1.4479960990751037e-07, "logits/chosen": -1.870134949684143, "logits/rejected": -1.8283237218856812, "logps/chosen": -167.22454833984375, "logps/rejected": -217.4507598876953, "loss": 0.5489, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.135387897491455, "rewards/margins": 0.515394926071167, "rewards/rejected": -1.650782823562622, "step": 7260 }, { "epoch": 1.2525844245348035, "grad_norm": 21.891530990600586, "learning_rate": 1.4462027970056336e-07, "logits/chosen": -1.8796777725219727, "logits/rejected": -1.8357088565826416, "logps/chosen": -165.61715698242188, "logps/rejected": -210.3502655029297, "loss": 0.5759, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1130149364471436, "rewards/margins": 0.46413689851760864, "rewards/rejected": -1.5771516561508179, "step": 7270 }, { "epoch": 1.2543073742246726, "grad_norm": 21.23567008972168, "learning_rate": 1.4444077014670767e-07, "logits/chosen": -1.9080616235733032, "logits/rejected": -1.859018087387085, "logps/chosen": -169.698486328125, "logps/rejected": -222.4978790283203, "loss": 0.5285, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1312874555587769, "rewards/margins": 0.5645879507064819, "rewards/rejected": -1.6958754062652588, "step": 7280 }, { "epoch": 1.2560303239145416, "grad_norm": 28.257923126220703, "learning_rate": 1.4426108196746465e-07, "logits/chosen": -1.6983267068862915, "logits/rejected": -1.675920844078064, "logps/chosen": -183.1871337890625, "logps/rejected": -221.4242706298828, "loss": 0.6076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2839581966400146, "rewards/margins": 0.37165379524230957, "rewards/rejected": -1.6556117534637451, "step": 7290 }, { "epoch": 1.2577532736044108, "grad_norm": 21.172204971313477, "learning_rate": 1.4408121588507358e-07, "logits/chosen": -1.6503427028656006, "logits/rejected": -1.6238577365875244, "logps/chosen": -173.98719787597656, "logps/rejected": -218.13619995117188, "loss": 0.6133, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2372645139694214, "rewards/margins": 0.424022912979126, "rewards/rejected": -1.661287546157837, "step": 7300 }, { "epoch": 1.2594762232942798, "grad_norm": 19.044803619384766, "learning_rate": 1.4390117262248886e-07, "logits/chosen": -1.8345047235488892, "logits/rejected": -1.7920280694961548, "logps/chosen": -179.25234985351562, "logps/rejected": -234.50521850585938, "loss": 0.5557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2419331073760986, "rewards/margins": 0.5842617750167847, "rewards/rejected": -1.8261950016021729, "step": 7310 }, { "epoch": 1.2611991729841487, "grad_norm": 18.871639251708984, "learning_rate": 1.4372095290337697e-07, "logits/chosen": -1.876389503479004, "logits/rejected": -1.8411738872528076, "logps/chosen": -164.28530883789062, "logps/rejected": -204.193359375, "loss": 0.5911, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0993173122406006, "rewards/margins": 0.4310569167137146, "rewards/rejected": -1.53037428855896, "step": 7320 }, { "epoch": 1.262922122674018, "grad_norm": 29.29106330871582, "learning_rate": 1.4354055745211372e-07, "logits/chosen": -1.7525503635406494, "logits/rejected": -1.7058188915252686, "logps/chosen": -164.0477752685547, "logps/rejected": -222.71859741210938, "loss": 0.5294, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1098519563674927, "rewards/margins": 0.6016186475753784, "rewards/rejected": -1.711470365524292, "step": 7330 }, { "epoch": 1.264645072363887, "grad_norm": 19.067596435546875, "learning_rate": 1.4335998699378123e-07, "logits/chosen": -1.8152233362197876, "logits/rejected": -1.7801440954208374, "logps/chosen": -185.6068572998047, "logps/rejected": -224.10043334960938, "loss": 0.6139, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.302257776260376, "rewards/margins": 0.3995913863182068, "rewards/rejected": -1.7018489837646484, "step": 7340 }, { "epoch": 1.266368022053756, "grad_norm": 21.694255828857422, "learning_rate": 1.4317924225416493e-07, "logits/chosen": -1.901818871498108, "logits/rejected": -1.8575118780136108, "logps/chosen": -161.57791137695312, "logps/rejected": -211.4538116455078, "loss": 0.5496, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0465854406356812, "rewards/margins": 0.550262451171875, "rewards/rejected": -1.5968478918075562, "step": 7350 }, { "epoch": 1.268090971743625, "grad_norm": 26.77164077758789, "learning_rate": 1.42998323959751e-07, "logits/chosen": -1.795769453048706, "logits/rejected": -1.7647924423217773, "logps/chosen": -173.7707977294922, "logps/rejected": -217.08798217773438, "loss": 0.5753, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1947288513183594, "rewards/margins": 0.4542997479438782, "rewards/rejected": -1.6490285396575928, "step": 7360 }, { "epoch": 1.269813921433494, "grad_norm": 26.229225158691406, "learning_rate": 1.4281723283772297e-07, "logits/chosen": -1.7144826650619507, "logits/rejected": -1.681382417678833, "logps/chosen": -166.12722778320312, "logps/rejected": -211.6783447265625, "loss": 0.5726, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.133207082748413, "rewards/margins": 0.4711657464504242, "rewards/rejected": -1.6043727397918701, "step": 7370 }, { "epoch": 1.2715368711233632, "grad_norm": 25.314279556274414, "learning_rate": 1.4263596961595913e-07, "logits/chosen": -1.7785885334014893, "logits/rejected": -1.7429101467132568, "logps/chosen": -173.8869171142578, "logps/rejected": -227.0846710205078, "loss": 0.5579, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2071913480758667, "rewards/margins": 0.5267850160598755, "rewards/rejected": -1.7339763641357422, "step": 7380 }, { "epoch": 1.2732598208132322, "grad_norm": 38.555118560791016, "learning_rate": 1.424545350230296e-07, "logits/chosen": -1.7544944286346436, "logits/rejected": -1.7176777124404907, "logps/chosen": -171.19810485839844, "logps/rejected": -223.80068969726562, "loss": 0.559, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.189843773841858, "rewards/margins": 0.5179719924926758, "rewards/rejected": -1.7078157663345337, "step": 7390 }, { "epoch": 1.2749827705031014, "grad_norm": 20.990966796875, "learning_rate": 1.422729297881931e-07, "logits/chosen": -1.7361913919448853, "logits/rejected": -1.678663969039917, "logps/chosen": -187.804443359375, "logps/rejected": -238.89242553710938, "loss": 0.5431, "rewards/accuracies": 0.75, "rewards/chosen": -1.3235337734222412, "rewards/margins": 0.5596829056739807, "rewards/rejected": -1.8832166194915771, "step": 7400 }, { "epoch": 1.2767057201929704, "grad_norm": 32.962459564208984, "learning_rate": 1.4209115464139445e-07, "logits/chosen": -1.7074429988861084, "logits/rejected": -1.659502625465393, "logps/chosen": -181.55169677734375, "logps/rejected": -234.9263916015625, "loss": 0.5702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2779533863067627, "rewards/margins": 0.5410931706428528, "rewards/rejected": -1.8190467357635498, "step": 7410 }, { "epoch": 1.2784286698828393, "grad_norm": 25.36784553527832, "learning_rate": 1.419092103132612e-07, "logits/chosen": -1.6498420238494873, "logits/rejected": -1.617962121963501, "logps/chosen": -178.5230712890625, "logps/rejected": -221.35165405273438, "loss": 0.5868, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2235511541366577, "rewards/margins": 0.45928892493247986, "rewards/rejected": -1.68284010887146, "step": 7420 }, { "epoch": 1.2801516195727085, "grad_norm": 25.063386917114258, "learning_rate": 1.4172709753510117e-07, "logits/chosen": -1.6935100555419922, "logits/rejected": -1.6461021900177002, "logps/chosen": -192.5091094970703, "logps/rejected": -249.29428100585938, "loss": 0.5618, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3910486698150635, "rewards/margins": 0.5831217765808105, "rewards/rejected": -1.9741703271865845, "step": 7430 }, { "epoch": 1.2818745692625775, "grad_norm": 24.478483200073242, "learning_rate": 1.41544817038899e-07, "logits/chosen": -1.7873668670654297, "logits/rejected": -1.7440226078033447, "logps/chosen": -201.0021209716797, "logps/rejected": -244.41189575195312, "loss": 0.6088, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4499728679656982, "rewards/margins": 0.46050310134887695, "rewards/rejected": -1.9104760885238647, "step": 7440 }, { "epoch": 1.2835975189524467, "grad_norm": 24.15538215637207, "learning_rate": 1.4136236955731354e-07, "logits/chosen": -1.9057048559188843, "logits/rejected": -1.8608615398406982, "logps/chosen": -178.9813232421875, "logps/rejected": -223.97671508789062, "loss": 0.5758, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2305552959442139, "rewards/margins": 0.4831448197364807, "rewards/rejected": -1.7137000560760498, "step": 7450 }, { "epoch": 1.2853204686423156, "grad_norm": 19.96843910217285, "learning_rate": 1.4117975582367488e-07, "logits/chosen": -1.7194932699203491, "logits/rejected": -1.701974630355835, "logps/chosen": -170.04483032226562, "logps/rejected": -225.9224853515625, "loss": 0.5591, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1988255977630615, "rewards/margins": 0.5305085182189941, "rewards/rejected": -1.7293342351913452, "step": 7460 }, { "epoch": 1.2870434183321846, "grad_norm": 26.922449111938477, "learning_rate": 1.4099697657198128e-07, "logits/chosen": -1.8264611959457397, "logits/rejected": -1.8030723333358765, "logps/chosen": -172.75778198242188, "logps/rejected": -204.11093139648438, "loss": 0.6341, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1810314655303955, "rewards/margins": 0.34694355726242065, "rewards/rejected": -1.5279749631881714, "step": 7470 }, { "epoch": 1.2887663680220538, "grad_norm": 24.537500381469727, "learning_rate": 1.4081403253689638e-07, "logits/chosen": -1.7876904010772705, "logits/rejected": -1.7532583475112915, "logps/chosen": -161.22964477539062, "logps/rejected": -189.12184143066406, "loss": 0.6182, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0659732818603516, "rewards/margins": 0.3332677483558655, "rewards/rejected": -1.3992409706115723, "step": 7480 }, { "epoch": 1.2904893177119228, "grad_norm": 17.613664627075195, "learning_rate": 1.4063092445374591e-07, "logits/chosen": -1.7610737085342407, "logits/rejected": -1.7352359294891357, "logps/chosen": -169.47122192382812, "logps/rejected": -211.7993927001953, "loss": 0.5807, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.16910719871521, "rewards/margins": 0.4174097180366516, "rewards/rejected": -1.5865167379379272, "step": 7490 }, { "epoch": 1.292212267401792, "grad_norm": 18.988906860351562, "learning_rate": 1.404476530585153e-07, "logits/chosen": -1.7912099361419678, "logits/rejected": -1.7529321908950806, "logps/chosen": -160.61988830566406, "logps/rejected": -204.53854370117188, "loss": 0.5962, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0914565324783325, "rewards/margins": 0.43414705991744995, "rewards/rejected": -1.5256035327911377, "step": 7500 }, { "epoch": 1.293935217091661, "grad_norm": 22.593177795410156, "learning_rate": 1.402642190878462e-07, "logits/chosen": -1.8172283172607422, "logits/rejected": -1.7795774936676025, "logps/chosen": -170.80709838867188, "logps/rejected": -204.64862060546875, "loss": 0.5827, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1354600191116333, "rewards/margins": 0.3988160192966461, "rewards/rejected": -1.534276008605957, "step": 7510 }, { "epoch": 1.29565816678153, "grad_norm": 25.542572021484375, "learning_rate": 1.4008062327903373e-07, "logits/chosen": -1.785715103149414, "logits/rejected": -1.754105806350708, "logps/chosen": -163.73199462890625, "logps/rejected": -216.9349822998047, "loss": 0.5409, "rewards/accuracies": 0.75, "rewards/chosen": -1.1080421209335327, "rewards/margins": 0.5227325558662415, "rewards/rejected": -1.630774736404419, "step": 7520 }, { "epoch": 1.297381116471399, "grad_norm": 19.270915985107422, "learning_rate": 1.398968663700235e-07, "logits/chosen": -1.7185955047607422, "logits/rejected": -1.6924537420272827, "logps/chosen": -163.96920776367188, "logps/rejected": -218.0501251220703, "loss": 0.5497, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1175190210342407, "rewards/margins": 0.528626561164856, "rewards/rejected": -1.646145224571228, "step": 7530 }, { "epoch": 1.299104066161268, "grad_norm": 21.197391510009766, "learning_rate": 1.3971294909940872e-07, "logits/chosen": -1.8468563556671143, "logits/rejected": -1.8190243244171143, "logps/chosen": -165.5609588623047, "logps/rejected": -226.9778289794922, "loss": 0.5295, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1402043104171753, "rewards/margins": 0.6028168797492981, "rewards/rejected": -1.743021011352539, "step": 7540 }, { "epoch": 1.3008270158511372, "grad_norm": 22.384174346923828, "learning_rate": 1.395288722064271e-07, "logits/chosen": -1.7478249073028564, "logits/rejected": -1.715561866760254, "logps/chosen": -174.3957061767578, "logps/rejected": -232.8987579345703, "loss": 0.5479, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2260493040084839, "rewards/margins": 0.5848062634468079, "rewards/rejected": -1.810855507850647, "step": 7550 }, { "epoch": 1.3025499655410062, "grad_norm": 24.416648864746094, "learning_rate": 1.39344636430958e-07, "logits/chosen": -1.85236394405365, "logits/rejected": -1.8107448816299438, "logps/chosen": -167.5206298828125, "logps/rejected": -227.94570922851562, "loss": 0.5487, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1494567394256592, "rewards/margins": 0.5763713717460632, "rewards/rejected": -1.7258281707763672, "step": 7560 }, { "epoch": 1.3042729152308752, "grad_norm": 23.83544921875, "learning_rate": 1.3916024251351922e-07, "logits/chosen": -1.8106807470321655, "logits/rejected": -1.772073745727539, "logps/chosen": -189.89389038085938, "logps/rejected": -243.23342895507812, "loss": 0.5413, "rewards/accuracies": 0.75, "rewards/chosen": -1.3309684991836548, "rewards/margins": 0.5788313150405884, "rewards/rejected": -1.9097998142242432, "step": 7570 }, { "epoch": 1.3059958649207444, "grad_norm": 27.942703247070312, "learning_rate": 1.3897569119526442e-07, "logits/chosen": -1.755277395248413, "logits/rejected": -1.7244822978973389, "logps/chosen": -180.81710815429688, "logps/rejected": -233.8452606201172, "loss": 0.5506, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2480337619781494, "rewards/margins": 0.5438328981399536, "rewards/rejected": -1.791866660118103, "step": 7580 }, { "epoch": 1.3077188146106133, "grad_norm": 19.884885787963867, "learning_rate": 1.387909832179798e-07, "logits/chosen": -1.7783870697021484, "logits/rejected": -1.7140800952911377, "logps/chosen": -197.75082397460938, "logps/rejected": -261.7915954589844, "loss": 0.5158, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.3942798376083374, "rewards/margins": 0.7095158696174622, "rewards/rejected": -2.1037955284118652, "step": 7590 }, { "epoch": 1.3094417643004825, "grad_norm": 37.443603515625, "learning_rate": 1.3860611932408118e-07, "logits/chosen": -1.7441831827163696, "logits/rejected": -1.708203673362732, "logps/chosen": -204.8780975341797, "logps/rejected": -242.55419921875, "loss": 0.6231, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5234088897705078, "rewards/margins": 0.4161691665649414, "rewards/rejected": -1.9395780563354492, "step": 7600 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -1.8558425903320312, "eval_logits/rejected": -1.8355497121810913, "eval_logps/chosen": -180.55352783203125, "eval_logps/rejected": -212.970458984375, "eval_loss": 0.6345874071121216, "eval_rewards/accuracies": 0.6289498209953308, "eval_rewards/chosen": -1.2184159755706787, "eval_rewards/margins": 0.27948716282844543, "eval_rewards/rejected": -1.4979033470153809, "eval_runtime": 383.3126, "eval_samples_per_second": 11.228, "eval_steps_per_second": 1.404, "step": 7600 }, { "epoch": 1.3111647139903515, "grad_norm": 19.79640769958496, "learning_rate": 1.3842110025661126e-07, "logits/chosen": -1.7169148921966553, "logits/rejected": -1.6761764287948608, "logps/chosen": -183.58102416992188, "logps/rejected": -238.1451873779297, "loss": 0.5547, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.306696891784668, "rewards/margins": 0.5725280046463013, "rewards/rejected": -1.8792247772216797, "step": 7610 }, { "epoch": 1.3128876636802205, "grad_norm": 24.631587982177734, "learning_rate": 1.3823592675923625e-07, "logits/chosen": -1.7882808446884155, "logits/rejected": -1.7510807514190674, "logps/chosen": -181.6546173095703, "logps/rejected": -224.89871215820312, "loss": 0.5813, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2414124011993408, "rewards/margins": 0.4696447253227234, "rewards/rejected": -1.7110570669174194, "step": 7620 }, { "epoch": 1.3146106133700897, "grad_norm": 19.975292205810547, "learning_rate": 1.3805059957624318e-07, "logits/chosen": -1.7191193103790283, "logits/rejected": -1.6984857320785522, "logps/chosen": -168.42410278320312, "logps/rejected": -231.50979614257812, "loss": 0.529, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1859110593795776, "rewards/margins": 0.5888235569000244, "rewards/rejected": -1.7747347354888916, "step": 7630 }, { "epoch": 1.3163335630599586, "grad_norm": 32.9302864074707, "learning_rate": 1.3786511945253675e-07, "logits/chosen": -1.6463232040405273, "logits/rejected": -1.6050297021865845, "logps/chosen": -203.4332275390625, "logps/rejected": -259.23309326171875, "loss": 0.5606, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4551560878753662, "rewards/margins": 0.6130838990211487, "rewards/rejected": -2.068239688873291, "step": 7640 }, { "epoch": 1.3180565127498278, "grad_norm": 25.096200942993164, "learning_rate": 1.3767948713363646e-07, "logits/chosen": -1.7451419830322266, "logits/rejected": -1.7046140432357788, "logps/chosen": -219.556884765625, "logps/rejected": -272.8857421875, "loss": 0.5864, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6116056442260742, "rewards/margins": 0.5594407320022583, "rewards/rejected": -2.171046257019043, "step": 7650 }, { "epoch": 1.3197794624396968, "grad_norm": 26.969646453857422, "learning_rate": 1.374937033656735e-07, "logits/chosen": -1.7444837093353271, "logits/rejected": -1.693585753440857, "logps/chosen": -201.104736328125, "logps/rejected": -262.66473388671875, "loss": 0.5437, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4407439231872559, "rewards/margins": 0.6381824016571045, "rewards/rejected": -2.0789265632629395, "step": 7660 }, { "epoch": 1.3215024121295658, "grad_norm": 25.12961196899414, "learning_rate": 1.3730776889538776e-07, "logits/chosen": -1.7061430215835571, "logits/rejected": -1.6663553714752197, "logps/chosen": -192.5684814453125, "logps/rejected": -237.89523315429688, "loss": 0.5899, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3502416610717773, "rewards/margins": 0.49933284521102905, "rewards/rejected": -1.8495744466781616, "step": 7670 }, { "epoch": 1.323225361819435, "grad_norm": 18.33002471923828, "learning_rate": 1.3712168447012493e-07, "logits/chosen": -1.7825815677642822, "logits/rejected": -1.7495508193969727, "logps/chosen": -182.09512329101562, "logps/rejected": -229.66903686523438, "loss": 0.5568, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3145878314971924, "rewards/margins": 0.48502451181411743, "rewards/rejected": -1.799612283706665, "step": 7680 }, { "epoch": 1.324948311509304, "grad_norm": 20.360734939575195, "learning_rate": 1.369354508378334e-07, "logits/chosen": -1.8760316371917725, "logits/rejected": -1.8178390264511108, "logps/chosen": -170.9095001220703, "logps/rejected": -221.64004516601562, "loss": 0.5396, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1592501401901245, "rewards/margins": 0.5665987133979797, "rewards/rejected": -1.725848913192749, "step": 7690 }, { "epoch": 1.3266712611991731, "grad_norm": 16.414636611938477, "learning_rate": 1.3674906874706129e-07, "logits/chosen": -1.7452309131622314, "logits/rejected": -1.6976512670516968, "logps/chosen": -181.38619995117188, "logps/rejected": -235.8791046142578, "loss": 0.5379, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2767575979232788, "rewards/margins": 0.5450109839439392, "rewards/rejected": -1.8217687606811523, "step": 7700 }, { "epoch": 1.328394210889042, "grad_norm": 22.02670669555664, "learning_rate": 1.365625389469534e-07, "logits/chosen": -1.7657630443572998, "logits/rejected": -1.7363771200180054, "logps/chosen": -197.82945251464844, "logps/rejected": -243.3042755126953, "loss": 0.5976, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.462568998336792, "rewards/margins": 0.4612101912498474, "rewards/rejected": -1.9237792491912842, "step": 7710 }, { "epoch": 1.330117160578911, "grad_norm": 30.34868049621582, "learning_rate": 1.363758621872483e-07, "logits/chosen": -1.7491880655288696, "logits/rejected": -1.7047643661499023, "logps/chosen": -195.3718719482422, "logps/rejected": -243.65029907226562, "loss": 0.5547, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3922932147979736, "rewards/margins": 0.5360931158065796, "rewards/rejected": -1.9283863306045532, "step": 7720 }, { "epoch": 1.33184011026878, "grad_norm": 22.87053108215332, "learning_rate": 1.361890392182752e-07, "logits/chosen": -1.7148549556732178, "logits/rejected": -1.6829760074615479, "logps/chosen": -180.99148559570312, "logps/rejected": -230.822509765625, "loss": 0.5567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.270691156387329, "rewards/margins": 0.5148378610610962, "rewards/rejected": -1.7855291366577148, "step": 7730 }, { "epoch": 1.3335630599586492, "grad_norm": 22.639476776123047, "learning_rate": 1.3600207079095097e-07, "logits/chosen": -1.715233564376831, "logits/rejected": -1.6812883615493774, "logps/chosen": -183.4813232421875, "logps/rejected": -249.8083953857422, "loss": 0.5269, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3177640438079834, "rewards/margins": 0.6425153613090515, "rewards/rejected": -1.9602794647216797, "step": 7740 }, { "epoch": 1.3352860096485184, "grad_norm": 21.633207321166992, "learning_rate": 1.3581495765677718e-07, "logits/chosen": -1.724340796470642, "logits/rejected": -1.675937294960022, "logps/chosen": -197.2760009765625, "logps/rejected": -261.220947265625, "loss": 0.5474, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.433332085609436, "rewards/margins": 0.6569215655326843, "rewards/rejected": -2.0902533531188965, "step": 7750 }, { "epoch": 1.3370089593383874, "grad_norm": 28.828248977661133, "learning_rate": 1.3562770056783702e-07, "logits/chosen": -1.622764229774475, "logits/rejected": -1.5804797410964966, "logps/chosen": -179.65103149414062, "logps/rejected": -246.61685180664062, "loss": 0.5334, "rewards/accuracies": 0.75, "rewards/chosen": -1.291443109512329, "rewards/margins": 0.6578264236450195, "rewards/rejected": -1.9492695331573486, "step": 7760 }, { "epoch": 1.3387319090282563, "grad_norm": 22.768138885498047, "learning_rate": 1.3544030027679232e-07, "logits/chosen": -1.6900676488876343, "logits/rejected": -1.646816611289978, "logps/chosen": -185.71786499023438, "logps/rejected": -239.11392211914062, "loss": 0.5586, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3360167741775513, "rewards/margins": 0.5299959182739258, "rewards/rejected": -1.8660128116607666, "step": 7770 }, { "epoch": 1.3404548587181253, "grad_norm": 27.77067756652832, "learning_rate": 1.3525275753688042e-07, "logits/chosen": -1.8046811819076538, "logits/rejected": -1.779531717300415, "logps/chosen": -201.95101928710938, "logps/rejected": -256.29058837890625, "loss": 0.5928, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5041205883026123, "rewards/margins": 0.5199225544929504, "rewards/rejected": -2.024043321609497, "step": 7780 }, { "epoch": 1.3421778084079945, "grad_norm": 27.013347625732422, "learning_rate": 1.350650731019113e-07, "logits/chosen": -1.7433545589447021, "logits/rejected": -1.7085548639297485, "logps/chosen": -195.16302490234375, "logps/rejected": -261.06683349609375, "loss": 0.5275, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4359261989593506, "rewards/margins": 0.6557725071907043, "rewards/rejected": -2.0916988849639893, "step": 7790 }, { "epoch": 1.3439007580978635, "grad_norm": 26.0377140045166, "learning_rate": 1.3487724772626439e-07, "logits/chosen": -1.7717100381851196, "logits/rejected": -1.7416073083877563, "logps/chosen": -196.32034301757812, "logps/rejected": -263.3025817871094, "loss": 0.5472, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4233986139297485, "rewards/margins": 0.6623138785362244, "rewards/rejected": -2.0857126712799072, "step": 7800 }, { "epoch": 1.3456237077877327, "grad_norm": 42.655330657958984, "learning_rate": 1.346892821648857e-07, "logits/chosen": -1.7774578332901, "logits/rejected": -1.7309677600860596, "logps/chosen": -198.44158935546875, "logps/rejected": -244.404296875, "loss": 0.5825, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.394239902496338, "rewards/margins": 0.5022423267364502, "rewards/rejected": -1.8964821100234985, "step": 7810 }, { "epoch": 1.3473466574776016, "grad_norm": 26.43970489501953, "learning_rate": 1.3450117717328468e-07, "logits/chosen": -1.764257788658142, "logits/rejected": -1.7166850566864014, "logps/chosen": -190.58175659179688, "logps/rejected": -257.0692443847656, "loss": 0.5496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.342162847518921, "rewards/margins": 0.6754031777381897, "rewards/rejected": -2.017565965652466, "step": 7820 }, { "epoch": 1.3490696071674706, "grad_norm": 29.67559051513672, "learning_rate": 1.3431293350753115e-07, "logits/chosen": -1.704307198524475, "logits/rejected": -1.672844648361206, "logps/chosen": -186.94482421875, "logps/rejected": -250.39633178710938, "loss": 0.551, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3714474439620972, "rewards/margins": 0.5982364416122437, "rewards/rejected": -1.9696840047836304, "step": 7830 }, { "epoch": 1.3507925568573398, "grad_norm": 23.183025360107422, "learning_rate": 1.341245519242524e-07, "logits/chosen": -1.6737979650497437, "logits/rejected": -1.6364225149154663, "logps/chosen": -184.36581420898438, "logps/rejected": -234.9649658203125, "loss": 0.571, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3079830408096313, "rewards/margins": 0.499339759349823, "rewards/rejected": -1.8073227405548096, "step": 7840 }, { "epoch": 1.3525155065472088, "grad_norm": 23.90206527709961, "learning_rate": 1.3393603318063e-07, "logits/chosen": -1.6684505939483643, "logits/rejected": -1.6141875982284546, "logps/chosen": -190.3370819091797, "logps/rejected": -236.37905883789062, "loss": 0.5752, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.347599983215332, "rewards/margins": 0.5324249267578125, "rewards/rejected": -1.8800249099731445, "step": 7850 }, { "epoch": 1.354238456237078, "grad_norm": 23.96225357055664, "learning_rate": 1.3374737803439685e-07, "logits/chosen": -1.694443941116333, "logits/rejected": -1.6376886367797852, "logps/chosen": -188.59872436523438, "logps/rejected": -254.9580841064453, "loss": 0.5265, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.357022762298584, "rewards/margins": 0.6660300493240356, "rewards/rejected": -2.02305269241333, "step": 7860 }, { "epoch": 1.355961405926947, "grad_norm": 35.446407318115234, "learning_rate": 1.3355858724383415e-07, "logits/chosen": -1.708349585533142, "logits/rejected": -1.6715790033340454, "logps/chosen": -196.78343200683594, "logps/rejected": -252.73013305664062, "loss": 0.5663, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4110945463180542, "rewards/margins": 0.5725733041763306, "rewards/rejected": -1.9836680889129639, "step": 7870 }, { "epoch": 1.3576843556168159, "grad_norm": 25.436092376708984, "learning_rate": 1.3336966156776822e-07, "logits/chosen": -1.7855867147445679, "logits/rejected": -1.7552499771118164, "logps/chosen": -191.6211395263672, "logps/rejected": -239.4647674560547, "loss": 0.5904, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.385075569152832, "rewards/margins": 0.4844435155391693, "rewards/rejected": -1.8695189952850342, "step": 7880 }, { "epoch": 1.359407305306685, "grad_norm": 22.131240844726562, "learning_rate": 1.3318060176556756e-07, "logits/chosen": -1.7470848560333252, "logits/rejected": -1.7071688175201416, "logps/chosen": -184.20083618164062, "logps/rejected": -242.040771484375, "loss": 0.5498, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2982476949691772, "rewards/margins": 0.5696505308151245, "rewards/rejected": -1.8678982257843018, "step": 7890 }, { "epoch": 1.361130254996554, "grad_norm": 23.41493797302246, "learning_rate": 1.3299140859713983e-07, "logits/chosen": -1.7388805150985718, "logits/rejected": -1.7148288488388062, "logps/chosen": -182.5740203857422, "logps/rejected": -238.2061004638672, "loss": 0.5593, "rewards/accuracies": 0.71875, "rewards/chosen": -1.303978443145752, "rewards/margins": 0.5415924787521362, "rewards/rejected": -1.8455708026885986, "step": 7900 }, { "epoch": 1.3628532046864232, "grad_norm": 27.00302505493164, "learning_rate": 1.3280208282292878e-07, "logits/chosen": -1.7452290058135986, "logits/rejected": -1.703344702720642, "logps/chosen": -188.55166625976562, "logps/rejected": -232.04141235351562, "loss": 0.5916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3172823190689087, "rewards/margins": 0.466630220413208, "rewards/rejected": -1.7839124202728271, "step": 7910 }, { "epoch": 1.3645761543762922, "grad_norm": 26.739322662353516, "learning_rate": 1.3261262520391097e-07, "logits/chosen": -1.7465155124664307, "logits/rejected": -1.7103631496429443, "logps/chosen": -181.1072998046875, "logps/rejected": -227.1074981689453, "loss": 0.577, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2628659009933472, "rewards/margins": 0.493598073720932, "rewards/rejected": -1.7564637660980225, "step": 7920 }, { "epoch": 1.3662991040661612, "grad_norm": 30.61216926574707, "learning_rate": 1.3242303650159313e-07, "logits/chosen": -1.7928102016448975, "logits/rejected": -1.7448190450668335, "logps/chosen": -187.6417694091797, "logps/rejected": -238.43606567382812, "loss": 0.5541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.293071985244751, "rewards/margins": 0.567300021648407, "rewards/rejected": -1.8603721857070923, "step": 7930 }, { "epoch": 1.3680220537560304, "grad_norm": 29.23872947692871, "learning_rate": 1.3223331747800867e-07, "logits/chosen": -1.7208061218261719, "logits/rejected": -1.6839271783828735, "logps/chosen": -185.5218048095703, "logps/rejected": -242.7389678955078, "loss": 0.5544, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2696150541305542, "rewards/margins": 0.6055715680122375, "rewards/rejected": -1.8751866817474365, "step": 7940 }, { "epoch": 1.3697450034458993, "grad_norm": 17.264358520507812, "learning_rate": 1.3204346889571494e-07, "logits/chosen": -1.6770089864730835, "logits/rejected": -1.6513347625732422, "logps/chosen": -165.56146240234375, "logps/rejected": -223.57083129882812, "loss": 0.5687, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1405541896820068, "rewards/margins": 0.5624696016311646, "rewards/rejected": -1.7030236721038818, "step": 7950 }, { "epoch": 1.3714679531357685, "grad_norm": 21.592418670654297, "learning_rate": 1.3185349151779e-07, "logits/chosen": -1.705940842628479, "logits/rejected": -1.6647335290908813, "logps/chosen": -175.36734008789062, "logps/rejected": -224.5946807861328, "loss": 0.5805, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2488831281661987, "rewards/margins": 0.47221383452415466, "rewards/rejected": -1.7210969924926758, "step": 7960 }, { "epoch": 1.3731909028256375, "grad_norm": 20.810039520263672, "learning_rate": 1.3166338610782957e-07, "logits/chosen": -1.7742341756820679, "logits/rejected": -1.7324060201644897, "logps/chosen": -189.1077117919922, "logps/rejected": -235.3397979736328, "loss": 0.588, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3371036052703857, "rewards/margins": 0.4955763816833496, "rewards/rejected": -1.8326799869537354, "step": 7970 }, { "epoch": 1.3749138525155065, "grad_norm": 24.57638168334961, "learning_rate": 1.31473153429944e-07, "logits/chosen": -1.8342373371124268, "logits/rejected": -1.8030681610107422, "logps/chosen": -182.3204345703125, "logps/rejected": -236.19168090820312, "loss": 0.5656, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.254408359527588, "rewards/margins": 0.5464503765106201, "rewards/rejected": -1.8008590936660767, "step": 7980 }, { "epoch": 1.3766368022053757, "grad_norm": 21.95906639099121, "learning_rate": 1.3128279424875523e-07, "logits/chosen": -1.839450478553772, "logits/rejected": -1.8062585592269897, "logps/chosen": -177.2168731689453, "logps/rejected": -235.6778106689453, "loss": 0.5459, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2654964923858643, "rewards/margins": 0.5746921896934509, "rewards/rejected": -1.84018874168396, "step": 7990 }, { "epoch": 1.3783597518952446, "grad_norm": 19.835283279418945, "learning_rate": 1.3109230932939354e-07, "logits/chosen": -1.6997854709625244, "logits/rejected": -1.6718209981918335, "logps/chosen": -175.2350311279297, "logps/rejected": -238.880859375, "loss": 0.5403, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2071750164031982, "rewards/margins": 0.6192091703414917, "rewards/rejected": -1.82638418674469, "step": 8000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -1.8848055601119995, "eval_logits/rejected": -1.864662766456604, "eval_logps/chosen": -173.08419799804688, "eval_logps/rejected": -204.28668212890625, "eval_loss": 0.6338950991630554, "eval_rewards/accuracies": 0.6263940334320068, "eval_rewards/chosen": -1.1437228918075562, "eval_rewards/margins": 0.2673425078392029, "eval_rewards/rejected": -1.4110653400421143, "eval_runtime": 383.1441, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 8000 }, { "epoch": 1.3800827015851138, "grad_norm": 29.697978973388672, "learning_rate": 1.3090169943749475e-07, "logits/chosen": -1.7395660877227783, "logits/rejected": -1.6922681331634521, "logps/chosen": -180.59262084960938, "logps/rejected": -233.02597045898438, "loss": 0.5438, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.234327793121338, "rewards/margins": 0.5716985464096069, "rewards/rejected": -1.8060262203216553, "step": 8010 }, { "epoch": 1.3818056512749828, "grad_norm": 30.644569396972656, "learning_rate": 1.307109653391969e-07, "logits/chosen": -1.7602342367172241, "logits/rejected": -1.7022281885147095, "logps/chosen": -199.9853973388672, "logps/rejected": -250.7794952392578, "loss": 0.548, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3974756002426147, "rewards/margins": 0.5628911852836609, "rewards/rejected": -1.9603666067123413, "step": 8020 }, { "epoch": 1.3835286009648518, "grad_norm": 27.904361724853516, "learning_rate": 1.3052010780113726e-07, "logits/chosen": -1.7528932094573975, "logits/rejected": -1.7155113220214844, "logps/chosen": -174.95889282226562, "logps/rejected": -240.86398315429688, "loss": 0.5346, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2381556034088135, "rewards/margins": 0.6363126635551453, "rewards/rejected": -1.874468207359314, "step": 8030 }, { "epoch": 1.385251550654721, "grad_norm": 20.0029239654541, "learning_rate": 1.3032912759044937e-07, "logits/chosen": -1.7023286819458008, "logits/rejected": -1.6535085439682007, "logps/chosen": -193.00807189941406, "logps/rejected": -251.18673706054688, "loss": 0.5622, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3858143091201782, "rewards/margins": 0.6057716608047485, "rewards/rejected": -1.9915859699249268, "step": 8040 }, { "epoch": 1.38697450034459, "grad_norm": 17.56081771850586, "learning_rate": 1.301380254747597e-07, "logits/chosen": -1.7698986530303955, "logits/rejected": -1.7235695123672485, "logps/chosen": -182.3731689453125, "logps/rejected": -253.9183349609375, "loss": 0.5107, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2726563215255737, "rewards/margins": 0.7080651521682739, "rewards/rejected": -1.9807217121124268, "step": 8050 }, { "epoch": 1.388697450034459, "grad_norm": 27.260448455810547, "learning_rate": 1.2994680222218478e-07, "logits/chosen": -1.7978088855743408, "logits/rejected": -1.7510201930999756, "logps/chosen": -194.02896118164062, "logps/rejected": -236.45883178710938, "loss": 0.6043, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3869035243988037, "rewards/margins": 0.47681206464767456, "rewards/rejected": -1.8637157678604126, "step": 8060 }, { "epoch": 1.390420399724328, "grad_norm": 33.41956329345703, "learning_rate": 1.29755458601328e-07, "logits/chosen": -1.7090890407562256, "logits/rejected": -1.6601349115371704, "logps/chosen": -193.67401123046875, "logps/rejected": -243.97958374023438, "loss": 0.5677, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4021570682525635, "rewards/margins": 0.5379910469055176, "rewards/rejected": -1.9401483535766602, "step": 8070 }, { "epoch": 1.392143349414197, "grad_norm": 24.615610122680664, "learning_rate": 1.2956399538127665e-07, "logits/chosen": -1.717283844947815, "logits/rejected": -1.688805341720581, "logps/chosen": -174.60430908203125, "logps/rejected": -242.5743865966797, "loss": 0.5228, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2161786556243896, "rewards/margins": 0.6299420595169067, "rewards/rejected": -1.8461205959320068, "step": 8080 }, { "epoch": 1.3938662991040662, "grad_norm": 20.54537582397461, "learning_rate": 1.2937241333159854e-07, "logits/chosen": -1.6753578186035156, "logits/rejected": -1.6328798532485962, "logps/chosen": -182.584716796875, "logps/rejected": -242.11477661132812, "loss": 0.5507, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2946635484695435, "rewards/margins": 0.611949622631073, "rewards/rejected": -1.9066131114959717, "step": 8090 }, { "epoch": 1.3955892487939352, "grad_norm": 25.101362228393555, "learning_rate": 1.2918071322233933e-07, "logits/chosen": -1.7218458652496338, "logits/rejected": -1.7032239437103271, "logps/chosen": -193.86203002929688, "logps/rejected": -240.07717895507812, "loss": 0.5975, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3978016376495361, "rewards/margins": 0.42204397916793823, "rewards/rejected": -1.8198457956314087, "step": 8100 }, { "epoch": 1.3973121984838044, "grad_norm": 29.090593338012695, "learning_rate": 1.2898889582401912e-07, "logits/chosen": -1.756087303161621, "logits/rejected": -1.7036664485931396, "logps/chosen": -185.19163513183594, "logps/rejected": -235.901123046875, "loss": 0.5544, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2913007736206055, "rewards/margins": 0.5572675466537476, "rewards/rejected": -1.848568320274353, "step": 8110 }, { "epoch": 1.3990351481736734, "grad_norm": 28.600011825561523, "learning_rate": 1.287969619076294e-07, "logits/chosen": -1.7284362316131592, "logits/rejected": -1.692164659500122, "logps/chosen": -177.6201934814453, "logps/rejected": -226.9577178955078, "loss": 0.5815, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.235177755355835, "rewards/margins": 0.5055698156356812, "rewards/rejected": -1.7407476902008057, "step": 8120 }, { "epoch": 1.4007580978635423, "grad_norm": 25.980693817138672, "learning_rate": 1.2860491224463003e-07, "logits/chosen": -1.7584434747695923, "logits/rejected": -1.727425217628479, "logps/chosen": -173.2230682373047, "logps/rejected": -219.60348510742188, "loss": 0.5641, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1840296983718872, "rewards/margins": 0.5059834122657776, "rewards/rejected": -1.6900132894515991, "step": 8130 }, { "epoch": 1.4024810475534115, "grad_norm": 23.344661712646484, "learning_rate": 1.2841274760694607e-07, "logits/chosen": -1.722978949546814, "logits/rejected": -1.6852538585662842, "logps/chosen": -175.82997131347656, "logps/rejected": -232.1393585205078, "loss": 0.5572, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2310302257537842, "rewards/margins": 0.5622873902320862, "rewards/rejected": -1.7933177947998047, "step": 8140 }, { "epoch": 1.4042039972432805, "grad_norm": 17.72876739501953, "learning_rate": 1.282204687669648e-07, "logits/chosen": -1.7822058200836182, "logits/rejected": -1.7442264556884766, "logps/chosen": -184.4099884033203, "logps/rejected": -238.8332977294922, "loss": 0.5873, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3226220607757568, "rewards/margins": 0.5349725484848022, "rewards/rejected": -1.8575942516326904, "step": 8150 }, { "epoch": 1.4059269469331497, "grad_norm": 18.7353515625, "learning_rate": 1.280280764975324e-07, "logits/chosen": -1.7413051128387451, "logits/rejected": -1.6966259479522705, "logps/chosen": -173.74789428710938, "logps/rejected": -239.2137908935547, "loss": 0.5069, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.177348017692566, "rewards/margins": 0.6919088363647461, "rewards/rejected": -1.8692569732666016, "step": 8160 }, { "epoch": 1.4076498966230186, "grad_norm": 28.141429901123047, "learning_rate": 1.278355715719511e-07, "logits/chosen": -1.7984817028045654, "logits/rejected": -1.7529537677764893, "logps/chosen": -178.94229125976562, "logps/rejected": -222.255859375, "loss": 0.5441, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1923143863677979, "rewards/margins": 0.5107215642929077, "rewards/rejected": -1.7030360698699951, "step": 8170 }, { "epoch": 1.4093728463128876, "grad_norm": 30.230098724365234, "learning_rate": 1.276429547639758e-07, "logits/chosen": -1.7686141729354858, "logits/rejected": -1.7348991632461548, "logps/chosen": -186.8579864501953, "logps/rejected": -231.6160430908203, "loss": 0.5838, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3056573867797852, "rewards/margins": 0.46039143204689026, "rewards/rejected": -1.7660486698150635, "step": 8180 }, { "epoch": 1.4110957960027566, "grad_norm": 25.567800521850586, "learning_rate": 1.274502268478112e-07, "logits/chosen": -1.721076250076294, "logits/rejected": -1.6785032749176025, "logps/chosen": -186.8155517578125, "logps/rejected": -235.377197265625, "loss": 0.5803, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2883694171905518, "rewards/margins": 0.5267794728279114, "rewards/rejected": -1.815148949623108, "step": 8190 }, { "epoch": 1.4128187456926258, "grad_norm": 34.265472412109375, "learning_rate": 1.2725738859810862e-07, "logits/chosen": -1.7358614206314087, "logits/rejected": -1.6915502548217773, "logps/chosen": -194.3486328125, "logps/rejected": -238.12918090820312, "loss": 0.5919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3935348987579346, "rewards/margins": 0.4622439742088318, "rewards/rejected": -1.8557790517807007, "step": 8200 }, { "epoch": 1.414541695382495, "grad_norm": 22.01947593688965, "learning_rate": 1.270644407899627e-07, "logits/chosen": -1.6316165924072266, "logits/rejected": -1.596050500869751, "logps/chosen": -184.8388214111328, "logps/rejected": -238.46627807617188, "loss": 0.5665, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2973389625549316, "rewards/margins": 0.525777280330658, "rewards/rejected": -1.8231165409088135, "step": 8210 }, { "epoch": 1.416264645072364, "grad_norm": 21.25282859802246, "learning_rate": 1.2687138419890863e-07, "logits/chosen": -1.7528893947601318, "logits/rejected": -1.7010984420776367, "logps/chosen": -177.33926391601562, "logps/rejected": -236.0788116455078, "loss": 0.539, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2424618005752563, "rewards/margins": 0.5878540277481079, "rewards/rejected": -1.8303155899047852, "step": 8220 }, { "epoch": 1.417987594762233, "grad_norm": 21.485868453979492, "learning_rate": 1.2667821960091865e-07, "logits/chosen": -1.7280648946762085, "logits/rejected": -1.6949926614761353, "logps/chosen": -184.8418426513672, "logps/rejected": -234.5526885986328, "loss": 0.5588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.295932412147522, "rewards/margins": 0.5362016558647156, "rewards/rejected": -1.8321340084075928, "step": 8230 }, { "epoch": 1.4197105444521019, "grad_norm": 36.89909744262695, "learning_rate": 1.2648494777239934e-07, "logits/chosen": -1.8095436096191406, "logits/rejected": -1.7656471729278564, "logps/chosen": -191.72720336914062, "logps/rejected": -241.11117553710938, "loss": 0.5616, "rewards/accuracies": 0.75, "rewards/chosen": -1.355535626411438, "rewards/margins": 0.5371559858322144, "rewards/rejected": -1.8926916122436523, "step": 8240 }, { "epoch": 1.421433494141971, "grad_norm": 21.338768005371094, "learning_rate": 1.2629156949018805e-07, "logits/chosen": -1.7479426860809326, "logits/rejected": -1.711358666419983, "logps/chosen": -180.8011474609375, "logps/rejected": -240.23995971679688, "loss": 0.535, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2315177917480469, "rewards/margins": 0.627647340297699, "rewards/rejected": -1.8591649532318115, "step": 8250 }, { "epoch": 1.42315644383184, "grad_norm": 25.411182403564453, "learning_rate": 1.260980855315502e-07, "logits/chosen": -1.8247654438018799, "logits/rejected": -1.787501573562622, "logps/chosen": -184.41542053222656, "logps/rejected": -249.8496856689453, "loss": 0.5518, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2800081968307495, "rewards/margins": 0.6656675934791565, "rewards/rejected": -1.9456758499145508, "step": 8260 }, { "epoch": 1.4248793935217092, "grad_norm": 24.6607608795166, "learning_rate": 1.2590449667417585e-07, "logits/chosen": -1.8074318170547485, "logits/rejected": -1.7805984020233154, "logps/chosen": -183.58489990234375, "logps/rejected": -243.6260986328125, "loss": 0.5597, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3018925189971924, "rewards/margins": 0.5806858539581299, "rewards/rejected": -1.8825784921646118, "step": 8270 }, { "epoch": 1.4266023432115782, "grad_norm": 24.326169967651367, "learning_rate": 1.2571080369617673e-07, "logits/chosen": -1.7513822317123413, "logits/rejected": -1.7262996435165405, "logps/chosen": -184.77059936523438, "logps/rejected": -232.02096557617188, "loss": 0.5994, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3239353895187378, "rewards/margins": 0.4675242304801941, "rewards/rejected": -1.7914596796035767, "step": 8280 }, { "epoch": 1.4283252929014472, "grad_norm": 19.493026733398438, "learning_rate": 1.2551700737608313e-07, "logits/chosen": -1.7309757471084595, "logits/rejected": -1.6682205200195312, "logps/chosen": -179.61337280273438, "logps/rejected": -220.5546875, "loss": 0.5686, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1967713832855225, "rewards/margins": 0.4668409824371338, "rewards/rejected": -1.6636121273040771, "step": 8290 }, { "epoch": 1.4300482425913164, "grad_norm": 18.092647552490234, "learning_rate": 1.253231084928406e-07, "logits/chosen": -1.8834679126739502, "logits/rejected": -1.8440923690795898, "logps/chosen": -184.91363525390625, "logps/rejected": -235.0916748046875, "loss": 0.5922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2733423709869385, "rewards/margins": 0.5322745442390442, "rewards/rejected": -1.8056169748306274, "step": 8300 }, { "epoch": 1.4317711922811853, "grad_norm": 26.49702262878418, "learning_rate": 1.2512910782580704e-07, "logits/chosen": -1.6803478002548218, "logits/rejected": -1.6344587802886963, "logps/chosen": -173.33175659179688, "logps/rejected": -220.29006958007812, "loss": 0.5545, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1706262826919556, "rewards/margins": 0.5207440853118896, "rewards/rejected": -1.6913702487945557, "step": 8310 }, { "epoch": 1.4334941419710545, "grad_norm": 23.054729461669922, "learning_rate": 1.2493500615474937e-07, "logits/chosen": -1.7400583028793335, "logits/rejected": -1.7084745168685913, "logps/chosen": -171.78407287597656, "logps/rejected": -231.578369140625, "loss": 0.5482, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.19270658493042, "rewards/margins": 0.5825310945510864, "rewards/rejected": -1.7752374410629272, "step": 8320 }, { "epoch": 1.4352170916609235, "grad_norm": 25.612323760986328, "learning_rate": 1.2474080425984056e-07, "logits/chosen": -1.7509899139404297, "logits/rejected": -1.7259149551391602, "logps/chosen": -183.274169921875, "logps/rejected": -233.5478057861328, "loss": 0.5994, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3465931415557861, "rewards/margins": 0.45025119185447693, "rewards/rejected": -1.796844244003296, "step": 8330 }, { "epoch": 1.4369400413507925, "grad_norm": 25.223346710205078, "learning_rate": 1.2454650292165634e-07, "logits/chosen": -1.8134841918945312, "logits/rejected": -1.7871147394180298, "logps/chosen": -183.5989990234375, "logps/rejected": -228.4562225341797, "loss": 0.5981, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.291554570198059, "rewards/margins": 0.47639140486717224, "rewards/rejected": -1.7679458856582642, "step": 8340 }, { "epoch": 1.4386629910406616, "grad_norm": 21.51726531982422, "learning_rate": 1.2435210292117223e-07, "logits/chosen": -1.6630241870880127, "logits/rejected": -1.6343367099761963, "logps/chosen": -182.60397338867188, "logps/rejected": -225.54019165039062, "loss": 0.5842, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2943296432495117, "rewards/margins": 0.4351266920566559, "rewards/rejected": -1.7294561862945557, "step": 8350 }, { "epoch": 1.4403859407305306, "grad_norm": 28.087337493896484, "learning_rate": 1.2415760503976027e-07, "logits/chosen": -1.6792490482330322, "logits/rejected": -1.622058629989624, "logps/chosen": -163.5045166015625, "logps/rejected": -222.25582885742188, "loss": 0.5219, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0895042419433594, "rewards/margins": 0.6140426397323608, "rewards/rejected": -1.7035468816757202, "step": 8360 }, { "epoch": 1.4421088904203998, "grad_norm": 28.05508041381836, "learning_rate": 1.2396301005918592e-07, "logits/chosen": -1.7516257762908936, "logits/rejected": -1.712021827697754, "logps/chosen": -174.34732055664062, "logps/rejected": -233.7310791015625, "loss": 0.5609, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.232910394668579, "rewards/margins": 0.5682687759399414, "rewards/rejected": -1.8011791706085205, "step": 8370 }, { "epoch": 1.4438318401102688, "grad_norm": 18.621915817260742, "learning_rate": 1.2376831876160493e-07, "logits/chosen": -1.8677845001220703, "logits/rejected": -1.8293062448501587, "logps/chosen": -157.7315673828125, "logps/rejected": -216.45401000976562, "loss": 0.5486, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0649034976959229, "rewards/margins": 0.5581703186035156, "rewards/rejected": -1.6230738162994385, "step": 8380 }, { "epoch": 1.4455547898001377, "grad_norm": 19.196773529052734, "learning_rate": 1.2357353192956015e-07, "logits/chosen": -1.837416648864746, "logits/rejected": -1.786107063293457, "logps/chosen": -167.60275268554688, "logps/rejected": -239.2596435546875, "loss": 0.487, "rewards/accuracies": 0.75, "rewards/chosen": -1.109065294265747, "rewards/margins": 0.7224443554878235, "rewards/rejected": -1.8315098285675049, "step": 8390 }, { "epoch": 1.447277739490007, "grad_norm": 19.819705963134766, "learning_rate": 1.2337865034597853e-07, "logits/chosen": -1.7919927835464478, "logits/rejected": -1.7521469593048096, "logps/chosen": -157.2268524169922, "logps/rejected": -214.87722778320312, "loss": 0.5444, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.069208025932312, "rewards/margins": 0.571751594543457, "rewards/rejected": -1.6409597396850586, "step": 8400 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -1.8767586946487427, "eval_logits/rejected": -1.8567559719085693, "eval_logps/chosen": -165.97645568847656, "eval_logps/rejected": -196.28274536132812, "eval_loss": 0.6338884234428406, "eval_rewards/accuracies": 0.6287174820899963, "eval_rewards/chosen": -1.072645664215088, "eval_rewards/margins": 0.25838038325309753, "eval_rewards/rejected": -1.3310261964797974, "eval_runtime": 384.2633, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 8400 }, { "epoch": 1.449000689179876, "grad_norm": 18.899368286132812, "learning_rate": 1.2318367479416772e-07, "logits/chosen": -1.7205651998519897, "logits/rejected": -1.6706392765045166, "logps/chosen": -180.8457794189453, "logps/rejected": -242.57177734375, "loss": 0.5331, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2318532466888428, "rewards/margins": 0.6757212281227112, "rewards/rejected": -1.9075744152069092, "step": 8410 }, { "epoch": 1.450723638869745, "grad_norm": 23.191862106323242, "learning_rate": 1.2298860605781317e-07, "logits/chosen": -1.61892831325531, "logits/rejected": -1.5845869779586792, "logps/chosen": -162.18161010742188, "logps/rejected": -229.1549530029297, "loss": 0.5114, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1052982807159424, "rewards/margins": 0.6736155152320862, "rewards/rejected": -1.7789137363433838, "step": 8420 }, { "epoch": 1.452446588559614, "grad_norm": 27.311832427978516, "learning_rate": 1.2279344492097482e-07, "logits/chosen": -1.7439725399017334, "logits/rejected": -1.7103437185287476, "logps/chosen": -173.8157501220703, "logps/rejected": -229.6221160888672, "loss": 0.5683, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2032434940338135, "rewards/margins": 0.5264207720756531, "rewards/rejected": -1.7296644449234009, "step": 8430 }, { "epoch": 1.454169538249483, "grad_norm": 26.43974494934082, "learning_rate": 1.2259819216808406e-07, "logits/chosen": -1.76885187625885, "logits/rejected": -1.7441856861114502, "logps/chosen": -179.97201538085938, "logps/rejected": -224.7045440673828, "loss": 0.5869, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2517940998077393, "rewards/margins": 0.47994908690452576, "rewards/rejected": -1.7317432165145874, "step": 8440 }, { "epoch": 1.4558924879393522, "grad_norm": 21.743513107299805, "learning_rate": 1.2240284858394048e-07, "logits/chosen": -1.640924096107483, "logits/rejected": -1.6128562688827515, "logps/chosen": -175.72862243652344, "logps/rejected": -240.687744140625, "loss": 0.5499, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2771676778793335, "rewards/margins": 0.6006568670272827, "rewards/rejected": -1.8778244256973267, "step": 8450 }, { "epoch": 1.4576154376292212, "grad_norm": 21.22736167907715, "learning_rate": 1.2220741495370875e-07, "logits/chosen": -1.7674133777618408, "logits/rejected": -1.7252048254013062, "logps/chosen": -178.5194854736328, "logps/rejected": -238.6125946044922, "loss": 0.5199, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2571282386779785, "rewards/margins": 0.5916931629180908, "rewards/rejected": -1.8488212823867798, "step": 8460 }, { "epoch": 1.4593383873190904, "grad_norm": 40.24439239501953, "learning_rate": 1.220118920629155e-07, "logits/chosen": -1.720158338546753, "logits/rejected": -1.683738112449646, "logps/chosen": -201.67807006835938, "logps/rejected": -249.63784790039062, "loss": 0.6055, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.456115961074829, "rewards/margins": 0.5088047385215759, "rewards/rejected": -1.9649207592010498, "step": 8470 }, { "epoch": 1.4610613370089593, "grad_norm": 27.819427490234375, "learning_rate": 1.2181628069744613e-07, "logits/chosen": -1.7302448749542236, "logits/rejected": -1.6805111169815063, "logps/chosen": -181.5083465576172, "logps/rejected": -242.5039520263672, "loss": 0.555, "rewards/accuracies": 0.75, "rewards/chosen": -1.284711480140686, "rewards/margins": 0.6244471669197083, "rewards/rejected": -1.90915846824646, "step": 8480 }, { "epoch": 1.4627842866988283, "grad_norm": 18.368925094604492, "learning_rate": 1.216205816435416e-07, "logits/chosen": -1.8100181818008423, "logits/rejected": -1.7824761867523193, "logps/chosen": -175.33273315429688, "logps/rejected": -233.98095703125, "loss": 0.5516, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2059272527694702, "rewards/margins": 0.5618650317192078, "rewards/rejected": -1.7677921056747437, "step": 8490 }, { "epoch": 1.4645072363886975, "grad_norm": 25.975460052490234, "learning_rate": 1.2142479568779545e-07, "logits/chosen": -1.6658073663711548, "logits/rejected": -1.6362226009368896, "logps/chosen": -173.92318725585938, "logps/rejected": -230.8607635498047, "loss": 0.5476, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1961640119552612, "rewards/margins": 0.5779889225959778, "rewards/rejected": -1.7741529941558838, "step": 8500 }, { "epoch": 1.4662301860785665, "grad_norm": 27.53252601623535, "learning_rate": 1.2122892361715042e-07, "logits/chosen": -1.7083604335784912, "logits/rejected": -1.6629486083984375, "logps/chosen": -184.6015167236328, "logps/rejected": -244.2220001220703, "loss": 0.5126, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2675223350524902, "rewards/margins": 0.6594316363334656, "rewards/rejected": -1.9269540309906006, "step": 8510 }, { "epoch": 1.4679531357684357, "grad_norm": 21.84817886352539, "learning_rate": 1.2103296621889531e-07, "logits/chosen": -1.6953309774398804, "logits/rejected": -1.6540002822875977, "logps/chosen": -186.7661590576172, "logps/rejected": -238.6102752685547, "loss": 0.565, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3319623470306396, "rewards/margins": 0.5671849250793457, "rewards/rejected": -1.899147391319275, "step": 8520 }, { "epoch": 1.4696760854583046, "grad_norm": 26.40680503845215, "learning_rate": 1.2083692428066207e-07, "logits/chosen": -1.6670316457748413, "logits/rejected": -1.6319210529327393, "logps/chosen": -188.43687438964844, "logps/rejected": -238.4445037841797, "loss": 0.5843, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.345499038696289, "rewards/margins": 0.5246737003326416, "rewards/rejected": -1.8701727390289307, "step": 8530 }, { "epoch": 1.4713990351481736, "grad_norm": 21.45933723449707, "learning_rate": 1.2064079859042237e-07, "logits/chosen": -1.8144257068634033, "logits/rejected": -1.791337251663208, "logps/chosen": -183.89779663085938, "logps/rejected": -226.5663299560547, "loss": 0.6059, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2920362949371338, "rewards/margins": 0.4292899966239929, "rewards/rejected": -1.721326470375061, "step": 8540 }, { "epoch": 1.4731219848380428, "grad_norm": 21.429597854614258, "learning_rate": 1.204445899364844e-07, "logits/chosen": -1.7466418743133545, "logits/rejected": -1.7070821523666382, "logps/chosen": -183.11488342285156, "logps/rejected": -248.70767211914062, "loss": 0.5287, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2927693128585815, "rewards/margins": 0.6410099267959595, "rewards/rejected": -1.9337793588638306, "step": 8550 }, { "epoch": 1.4748449345279118, "grad_norm": 19.853425979614258, "learning_rate": 1.2024829910749e-07, "logits/chosen": -1.8415504693984985, "logits/rejected": -1.7988510131835938, "logps/chosen": -182.31796264648438, "logps/rejected": -244.3562469482422, "loss": 0.5303, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.281106948852539, "rewards/margins": 0.6391115188598633, "rewards/rejected": -1.9202184677124023, "step": 8560 }, { "epoch": 1.476567884217781, "grad_norm": 26.883686065673828, "learning_rate": 1.2005192689241111e-07, "logits/chosen": -1.7047302722930908, "logits/rejected": -1.6662166118621826, "logps/chosen": -178.93783569335938, "logps/rejected": -230.54995727539062, "loss": 0.5444, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2458077669143677, "rewards/margins": 0.5393422842025757, "rewards/rejected": -1.7851499319076538, "step": 8570 }, { "epoch": 1.47829083390765, "grad_norm": 22.551843643188477, "learning_rate": 1.1985547408054707e-07, "logits/chosen": -1.786651372909546, "logits/rejected": -1.7402511835098267, "logps/chosen": -171.06886291503906, "logps/rejected": -240.85791015625, "loss": 0.4924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1693259477615356, "rewards/margins": 0.6937976479530334, "rewards/rejected": -1.8631235361099243, "step": 8580 }, { "epoch": 1.480013783597519, "grad_norm": 22.17210578918457, "learning_rate": 1.1965894146152083e-07, "logits/chosen": -1.7574405670166016, "logits/rejected": -1.704829454421997, "logps/chosen": -180.36509704589844, "logps/rejected": -240.45803833007812, "loss": 0.5201, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2337052822113037, "rewards/margins": 0.6370530724525452, "rewards/rejected": -1.8707586526870728, "step": 8590 }, { "epoch": 1.481736733287388, "grad_norm": 29.228626251220703, "learning_rate": 1.1946232982527637e-07, "logits/chosen": -1.677152395248413, "logits/rejected": -1.6436678171157837, "logps/chosen": -199.14393615722656, "logps/rejected": -238.5497589111328, "loss": 0.6175, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4570565223693848, "rewards/margins": 0.41096407175064087, "rewards/rejected": -1.8680204153060913, "step": 8600 }, { "epoch": 1.483459682977257, "grad_norm": 35.54603958129883, "learning_rate": 1.1926563996207518e-07, "logits/chosen": -1.7128098011016846, "logits/rejected": -1.6729986667633057, "logps/chosen": -189.81747436523438, "logps/rejected": -245.4787139892578, "loss": 0.5656, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3567256927490234, "rewards/margins": 0.5924991369247437, "rewards/rejected": -1.9492250680923462, "step": 8610 }, { "epoch": 1.4851826326671262, "grad_norm": 22.30665397644043, "learning_rate": 1.1906887266249317e-07, "logits/chosen": -1.6407105922698975, "logits/rejected": -1.6167869567871094, "logps/chosen": -191.15216064453125, "logps/rejected": -230.05224609375, "loss": 0.6086, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.36378014087677, "rewards/margins": 0.4242352545261383, "rewards/rejected": -1.788015365600586, "step": 8620 }, { "epoch": 1.4869055823569952, "grad_norm": 24.072481155395508, "learning_rate": 1.1887202871741757e-07, "logits/chosen": -1.6276092529296875, "logits/rejected": -1.5934003591537476, "logps/chosen": -168.8276824951172, "logps/rejected": -233.3299102783203, "loss": 0.5156, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1712262630462646, "rewards/margins": 0.6414912939071655, "rewards/rejected": -1.8127176761627197, "step": 8630 }, { "epoch": 1.4886285320468642, "grad_norm": 22.44603157043457, "learning_rate": 1.1867510891804353e-07, "logits/chosen": -1.769152045249939, "logits/rejected": -1.7334001064300537, "logps/chosen": -189.02745056152344, "logps/rejected": -233.2024383544922, "loss": 0.5991, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3328588008880615, "rewards/margins": 0.47136467695236206, "rewards/rejected": -1.804223656654358, "step": 8640 }, { "epoch": 1.4903514817367332, "grad_norm": 30.211387634277344, "learning_rate": 1.1847811405587127e-07, "logits/chosen": -1.6871166229248047, "logits/rejected": -1.6457468271255493, "logps/chosen": -183.58999633789062, "logps/rejected": -237.5922393798828, "loss": 0.5813, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2863670587539673, "rewards/margins": 0.5488260984420776, "rewards/rejected": -1.8351930379867554, "step": 8650 }, { "epoch": 1.4920744314266023, "grad_norm": 26.927658081054688, "learning_rate": 1.1828104492270254e-07, "logits/chosen": -1.7137506008148193, "logits/rejected": -1.6803970336914062, "logps/chosen": -181.31094360351562, "logps/rejected": -238.88528442382812, "loss": 0.5594, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2794899940490723, "rewards/margins": 0.5921354293823242, "rewards/rejected": -1.871625542640686, "step": 8660 }, { "epoch": 1.4937973811164715, "grad_norm": 21.608163833618164, "learning_rate": 1.1808390231063783e-07, "logits/chosen": -1.81405770778656, "logits/rejected": -1.7714502811431885, "logps/chosen": -175.96788024902344, "logps/rejected": -238.8325653076172, "loss": 0.5383, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.239720106124878, "rewards/margins": 0.6207379698753357, "rewards/rejected": -1.8604580163955688, "step": 8670 }, { "epoch": 1.4955203308063405, "grad_norm": 30.645854949951172, "learning_rate": 1.1788668701207274e-07, "logits/chosen": -1.7290817499160767, "logits/rejected": -1.7106482982635498, "logps/chosen": -183.47903442382812, "logps/rejected": -224.8456268310547, "loss": 0.6189, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.328007459640503, "rewards/margins": 0.3926061689853668, "rewards/rejected": -1.7206134796142578, "step": 8680 }, { "epoch": 1.4972432804962095, "grad_norm": 22.075122833251953, "learning_rate": 1.1768939981969515e-07, "logits/chosen": -1.7659361362457275, "logits/rejected": -1.7287206649780273, "logps/chosen": -184.3316650390625, "logps/rejected": -222.2124481201172, "loss": 0.6251, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2879536151885986, "rewards/margins": 0.4510854184627533, "rewards/rejected": -1.7390391826629639, "step": 8690 }, { "epoch": 1.4989662301860784, "grad_norm": 24.525606155395508, "learning_rate": 1.1749204152648191e-07, "logits/chosen": -1.7778053283691406, "logits/rejected": -1.7342383861541748, "logps/chosen": -188.58360290527344, "logps/rejected": -223.57955932617188, "loss": 0.6054, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3208611011505127, "rewards/margins": 0.38850995898246765, "rewards/rejected": -1.7093709707260132, "step": 8700 }, { "epoch": 1.5006891798759476, "grad_norm": 32.69189453125, "learning_rate": 1.1729461292569563e-07, "logits/chosen": -1.7515008449554443, "logits/rejected": -1.735420823097229, "logps/chosen": -177.72052001953125, "logps/rejected": -207.87039184570312, "loss": 0.6566, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2488489151000977, "rewards/margins": 0.298393189907074, "rewards/rejected": -1.5472421646118164, "step": 8710 }, { "epoch": 1.5024121295658168, "grad_norm": 26.830543518066406, "learning_rate": 1.1709711481088156e-07, "logits/chosen": -1.8059642314910889, "logits/rejected": -1.7639391422271729, "logps/chosen": -162.46495056152344, "logps/rejected": -211.879638671875, "loss": 0.5446, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0611735582351685, "rewards/margins": 0.5130646824836731, "rewards/rejected": -1.5742381811141968, "step": 8720 }, { "epoch": 1.5041350792556858, "grad_norm": 20.756858825683594, "learning_rate": 1.1689954797586422e-07, "logits/chosen": -1.789809226989746, "logits/rejected": -1.7468286752700806, "logps/chosen": -163.58743286132812, "logps/rejected": -215.0264892578125, "loss": 0.5693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.113161325454712, "rewards/margins": 0.5186198949813843, "rewards/rejected": -1.6317813396453857, "step": 8730 }, { "epoch": 1.5058580289455548, "grad_norm": 32.3831901550293, "learning_rate": 1.1670191321474457e-07, "logits/chosen": -1.7614628076553345, "logits/rejected": -1.7365093231201172, "logps/chosen": -167.097412109375, "logps/rejected": -226.8600616455078, "loss": 0.5446, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1569335460662842, "rewards/margins": 0.5816691517829895, "rewards/rejected": -1.738602638244629, "step": 8740 }, { "epoch": 1.5075809786354237, "grad_norm": 24.4361572265625, "learning_rate": 1.1650421132189634e-07, "logits/chosen": -1.799372673034668, "logits/rejected": -1.7554528713226318, "logps/chosen": -168.343017578125, "logps/rejected": -229.10745239257812, "loss": 0.5329, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1464492082595825, "rewards/margins": 0.6126116514205933, "rewards/rejected": -1.7590608596801758, "step": 8750 }, { "epoch": 1.509303928325293, "grad_norm": 18.993227005004883, "learning_rate": 1.1630644309196327e-07, "logits/chosen": -1.725738525390625, "logits/rejected": -1.7081897258758545, "logps/chosen": -167.4300079345703, "logps/rejected": -217.34671020507812, "loss": 0.5799, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1593310832977295, "rewards/margins": 0.4624145030975342, "rewards/rejected": -1.6217454671859741, "step": 8760 }, { "epoch": 1.5110268780151621, "grad_norm": 21.166257858276367, "learning_rate": 1.1610860931985566e-07, "logits/chosen": -1.7743898630142212, "logits/rejected": -1.730316400527954, "logps/chosen": -176.2876434326172, "logps/rejected": -224.82296752929688, "loss": 0.569, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2121226787567139, "rewards/margins": 0.5385057926177979, "rewards/rejected": -1.7506287097930908, "step": 8770 }, { "epoch": 1.512749827705031, "grad_norm": 27.59946632385254, "learning_rate": 1.1591071080074727e-07, "logits/chosen": -1.799107551574707, "logits/rejected": -1.7788976430892944, "logps/chosen": -169.95645141601562, "logps/rejected": -227.0201873779297, "loss": 0.552, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.208627462387085, "rewards/margins": 0.5270792245864868, "rewards/rejected": -1.7357066869735718, "step": 8780 }, { "epoch": 1.5144727773949, "grad_norm": 26.69658088684082, "learning_rate": 1.1571274833007214e-07, "logits/chosen": -1.8081270456314087, "logits/rejected": -1.7647250890731812, "logps/chosen": -174.61740112304688, "logps/rejected": -226.5430145263672, "loss": 0.5601, "rewards/accuracies": 0.71875, "rewards/chosen": -1.196936011314392, "rewards/margins": 0.5481966137886047, "rewards/rejected": -1.7451328039169312, "step": 8790 }, { "epoch": 1.516195727084769, "grad_norm": 27.48969268798828, "learning_rate": 1.1551472270352125e-07, "logits/chosen": -1.7258638143539429, "logits/rejected": -1.6805026531219482, "logps/chosen": -168.3254852294922, "logps/rejected": -213.35421752929688, "loss": 0.5766, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1332638263702393, "rewards/margins": 0.4753327965736389, "rewards/rejected": -1.6085964441299438, "step": 8800 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -1.9008620977401733, "eval_logits/rejected": -1.8818625211715698, "eval_logps/chosen": -162.34829711914062, "eval_logps/rejected": -191.97488403320312, "eval_loss": 0.6328625082969666, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -1.036363959312439, "eval_rewards/margins": 0.2515837252140045, "eval_rewards/rejected": -1.287947654724121, "eval_runtime": 384.3016, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 8800 }, { "epoch": 1.5179186767746382, "grad_norm": 23.977811813354492, "learning_rate": 1.1531663471703956e-07, "logits/chosen": -1.8277661800384521, "logits/rejected": -1.7861217260360718, "logps/chosen": -168.30447387695312, "logps/rejected": -230.71499633789062, "loss": 0.5309, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.17344069480896, "rewards/margins": 0.6076480150222778, "rewards/rejected": -1.7810888290405273, "step": 8810 }, { "epoch": 1.5196416264645074, "grad_norm": 31.468040466308594, "learning_rate": 1.1511848516682257e-07, "logits/chosen": -1.8036655187606812, "logits/rejected": -1.7669929265975952, "logps/chosen": -164.54380798339844, "logps/rejected": -223.2119903564453, "loss": 0.5289, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1411962509155273, "rewards/margins": 0.5930995345115662, "rewards/rejected": -1.7342956066131592, "step": 8820 }, { "epoch": 1.5213645761543764, "grad_norm": 26.091304779052734, "learning_rate": 1.149202748493133e-07, "logits/chosen": -1.651025414466858, "logits/rejected": -1.6094402074813843, "logps/chosen": -171.48646545410156, "logps/rejected": -223.22842407226562, "loss": 0.5773, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1689049005508423, "rewards/margins": 0.5359553098678589, "rewards/rejected": -1.7048600912094116, "step": 8830 }, { "epoch": 1.5230875258442453, "grad_norm": 23.064006805419922, "learning_rate": 1.1472200456119901e-07, "logits/chosen": -1.6805092096328735, "logits/rejected": -1.6452114582061768, "logps/chosen": -164.11929321289062, "logps/rejected": -228.6237335205078, "loss": 0.5354, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1177093982696533, "rewards/margins": 0.6305631995201111, "rewards/rejected": -1.7482725381851196, "step": 8840 }, { "epoch": 1.5248104755341143, "grad_norm": 27.771137237548828, "learning_rate": 1.1452367509940794e-07, "logits/chosen": -1.841334581375122, "logits/rejected": -1.8025280237197876, "logps/chosen": -162.31332397460938, "logps/rejected": -228.25985717773438, "loss": 0.5362, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1006293296813965, "rewards/margins": 0.6390389204025269, "rewards/rejected": -1.7396682500839233, "step": 8850 }, { "epoch": 1.5265334252239835, "grad_norm": 19.63231086730957, "learning_rate": 1.1432528726110628e-07, "logits/chosen": -1.7424705028533936, "logits/rejected": -1.699713110923767, "logps/chosen": -175.4546661376953, "logps/rejected": -235.0323028564453, "loss": 0.5317, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2227356433868408, "rewards/margins": 0.5989742875099182, "rewards/rejected": -1.8217099905014038, "step": 8860 }, { "epoch": 1.5282563749138525, "grad_norm": 21.968769073486328, "learning_rate": 1.1412684184369478e-07, "logits/chosen": -1.8747831583023071, "logits/rejected": -1.8180469274520874, "logps/chosen": -182.103271484375, "logps/rejected": -244.1374053955078, "loss": 0.5301, "rewards/accuracies": 0.75, "rewards/chosen": -1.253333568572998, "rewards/margins": 0.6377841234207153, "rewards/rejected": -1.8911176919937134, "step": 8870 }, { "epoch": 1.5299793246037217, "grad_norm": 34.719356536865234, "learning_rate": 1.1392833964480564e-07, "logits/chosen": -1.6653770208358765, "logits/rejected": -1.6219761371612549, "logps/chosen": -183.3114776611328, "logps/rejected": -243.77294921875, "loss": 0.5712, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2986027002334595, "rewards/margins": 0.6236371397972107, "rewards/rejected": -1.922239899635315, "step": 8880 }, { "epoch": 1.5317022742935906, "grad_norm": 24.384660720825195, "learning_rate": 1.137297814622993e-07, "logits/chosen": -1.6986202001571655, "logits/rejected": -1.6509069204330444, "logps/chosen": -186.1614227294922, "logps/rejected": -248.8933868408203, "loss": 0.5196, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.316670536994934, "rewards/margins": 0.6645750999450684, "rewards/rejected": -1.9812456369400024, "step": 8890 }, { "epoch": 1.5334252239834596, "grad_norm": 22.604028701782227, "learning_rate": 1.1353116809426121e-07, "logits/chosen": -1.7376683950424194, "logits/rejected": -1.6882107257843018, "logps/chosen": -189.76602172851562, "logps/rejected": -251.9395294189453, "loss": 0.5366, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3487989902496338, "rewards/margins": 0.6406348347663879, "rewards/rejected": -1.9894338846206665, "step": 8900 }, { "epoch": 1.5351481736733288, "grad_norm": 31.242036819458008, "learning_rate": 1.1333250033899867e-07, "logits/chosen": -1.6938707828521729, "logits/rejected": -1.6581497192382812, "logps/chosen": -200.32672119140625, "logps/rejected": -262.11102294921875, "loss": 0.5484, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4465105533599854, "rewards/margins": 0.6440070867538452, "rewards/rejected": -2.09051775932312, "step": 8910 }, { "epoch": 1.5368711233631978, "grad_norm": 22.39651107788086, "learning_rate": 1.131337789950375e-07, "logits/chosen": -1.8044532537460327, "logits/rejected": -1.744747519493103, "logps/chosen": -197.68798828125, "logps/rejected": -268.2427673339844, "loss": 0.5196, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.406423807144165, "rewards/margins": 0.7548932433128357, "rewards/rejected": -2.1613171100616455, "step": 8920 }, { "epoch": 1.538594073053067, "grad_norm": 20.711666107177734, "learning_rate": 1.12935004861119e-07, "logits/chosen": -1.7355296611785889, "logits/rejected": -1.6860911846160889, "logps/chosen": -199.24159240722656, "logps/rejected": -260.26495361328125, "loss": 0.5508, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4455238580703735, "rewards/margins": 0.6514202952384949, "rewards/rejected": -2.0969443321228027, "step": 8930 }, { "epoch": 1.540317022742936, "grad_norm": 29.11483383178711, "learning_rate": 1.1273617873619663e-07, "logits/chosen": -1.720091462135315, "logits/rejected": -1.6819565296173096, "logps/chosen": -192.368896484375, "logps/rejected": -250.0859832763672, "loss": 0.5668, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3673410415649414, "rewards/margins": 0.575615644454956, "rewards/rejected": -1.9429569244384766, "step": 8940 }, { "epoch": 1.5420399724328049, "grad_norm": 34.54115676879883, "learning_rate": 1.1253730141943276e-07, "logits/chosen": -1.5915120840072632, "logits/rejected": -1.5745168924331665, "logps/chosen": -190.63565063476562, "logps/rejected": -244.02066040039062, "loss": 0.5821, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3653391599655151, "rewards/margins": 0.5031275749206543, "rewards/rejected": -1.8684667348861694, "step": 8950 }, { "epoch": 1.5437629221226739, "grad_norm": 29.218130111694336, "learning_rate": 1.1233837371019566e-07, "logits/chosen": -1.7079054117202759, "logits/rejected": -1.6546837091445923, "logps/chosen": -209.74807739257812, "logps/rejected": -284.8275146484375, "loss": 0.5267, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5534542798995972, "rewards/margins": 0.7775411605834961, "rewards/rejected": -2.330995559692383, "step": 8960 }, { "epoch": 1.545485871812543, "grad_norm": 26.836565017700195, "learning_rate": 1.1213939640805594e-07, "logits/chosen": -1.6795122623443604, "logits/rejected": -1.6285619735717773, "logps/chosen": -200.71646118164062, "logps/rejected": -264.31719970703125, "loss": 0.5252, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.474325180053711, "rewards/margins": 0.6578629612922668, "rewards/rejected": -2.132188081741333, "step": 8970 }, { "epoch": 1.5472088215024122, "grad_norm": 31.637868881225586, "learning_rate": 1.1194037031278378e-07, "logits/chosen": -1.7559791803359985, "logits/rejected": -1.720070481300354, "logps/chosen": -221.94912719726562, "logps/rejected": -261.1572570800781, "loss": 0.6517, "rewards/accuracies": 0.625, "rewards/chosen": -1.674683928489685, "rewards/margins": 0.4098898768424988, "rewards/rejected": -2.084573984146118, "step": 8980 }, { "epoch": 1.5489317711922812, "grad_norm": 27.263280868530273, "learning_rate": 1.1174129622434531e-07, "logits/chosen": -1.6646140813827515, "logits/rejected": -1.6244380474090576, "logps/chosen": -190.9231414794922, "logps/rejected": -256.2034606933594, "loss": 0.5111, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3597346544265747, "rewards/margins": 0.6620964407920837, "rewards/rejected": -2.0218310356140137, "step": 8990 }, { "epoch": 1.5506547208821502, "grad_norm": 31.816787719726562, "learning_rate": 1.1154217494289966e-07, "logits/chosen": -1.7356491088867188, "logits/rejected": -1.6923658847808838, "logps/chosen": -204.17465209960938, "logps/rejected": -258.885986328125, "loss": 0.6097, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4676018953323364, "rewards/margins": 0.5954405069351196, "rewards/rejected": -2.063042163848877, "step": 9000 }, { "epoch": 1.5523776705720191, "grad_norm": 23.65191650390625, "learning_rate": 1.1134300726879557e-07, "logits/chosen": -1.7225637435913086, "logits/rejected": -1.6865419149398804, "logps/chosen": -185.95620727539062, "logps/rejected": -229.1414031982422, "loss": 0.599, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3031692504882812, "rewards/margins": 0.4596996307373047, "rewards/rejected": -1.762868881225586, "step": 9010 }, { "epoch": 1.5541006202618883, "grad_norm": 26.136945724487305, "learning_rate": 1.1114379400256828e-07, "logits/chosen": -1.6853822469711304, "logits/rejected": -1.6423343420028687, "logps/chosen": -167.6372833251953, "logps/rejected": -240.2078094482422, "loss": 0.5002, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1371324062347412, "rewards/margins": 0.7557451725006104, "rewards/rejected": -1.8928775787353516, "step": 9020 }, { "epoch": 1.5558235699517575, "grad_norm": 28.88298225402832, "learning_rate": 1.1094453594493634e-07, "logits/chosen": -1.7078361511230469, "logits/rejected": -1.695569396018982, "logps/chosen": -168.51095581054688, "logps/rejected": -224.92550659179688, "loss": 0.553, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1735893487930298, "rewards/margins": 0.5375136137008667, "rewards/rejected": -1.711102843284607, "step": 9030 }, { "epoch": 1.5575465196416265, "grad_norm": 30.364782333374023, "learning_rate": 1.107452338967982e-07, "logits/chosen": -1.7026069164276123, "logits/rejected": -1.673125982284546, "logps/chosen": -173.69056701660156, "logps/rejected": -216.798095703125, "loss": 0.5959, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2252165079116821, "rewards/margins": 0.42275819182395935, "rewards/rejected": -1.6479747295379639, "step": 9040 }, { "epoch": 1.5592694693314955, "grad_norm": 37.99259948730469, "learning_rate": 1.1054588865922931e-07, "logits/chosen": -1.7668405771255493, "logits/rejected": -1.7353569269180298, "logps/chosen": -182.21829223632812, "logps/rejected": -229.96426391601562, "loss": 0.6042, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2846075296401978, "rewards/margins": 0.49061161279678345, "rewards/rejected": -1.7752189636230469, "step": 9050 }, { "epoch": 1.5609924190213644, "grad_norm": 26.010162353515625, "learning_rate": 1.1034650103347856e-07, "logits/chosen": -1.7860959768295288, "logits/rejected": -1.7449796199798584, "logps/chosen": -171.65902709960938, "logps/rejected": -220.1482391357422, "loss": 0.5695, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1694601774215698, "rewards/margins": 0.4999922811985016, "rewards/rejected": -1.6694523096084595, "step": 9060 }, { "epoch": 1.5627153687112336, "grad_norm": 20.758052825927734, "learning_rate": 1.1014707182096525e-07, "logits/chosen": -1.7433793544769287, "logits/rejected": -1.7031667232513428, "logps/chosen": -162.1527862548828, "logps/rejected": -229.52175903320312, "loss": 0.5027, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0758721828460693, "rewards/margins": 0.6713493466377258, "rewards/rejected": -1.7472217082977295, "step": 9070 }, { "epoch": 1.5644383184011028, "grad_norm": 18.145647048950195, "learning_rate": 1.0994760182327593e-07, "logits/chosen": -1.764984130859375, "logits/rejected": -1.74074387550354, "logps/chosen": -156.34017944335938, "logps/rejected": -213.22970581054688, "loss": 0.5387, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0435948371887207, "rewards/margins": 0.5344707369804382, "rewards/rejected": -1.5780656337738037, "step": 9080 }, { "epoch": 1.5661612680909718, "grad_norm": 20.801746368408203, "learning_rate": 1.0974809184216094e-07, "logits/chosen": -1.7175157070159912, "logits/rejected": -1.6671241521835327, "logps/chosen": -172.3372344970703, "logps/rejected": -223.4744415283203, "loss": 0.53, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1706864833831787, "rewards/margins": 0.5599038600921631, "rewards/rejected": -1.7305902242660522, "step": 9090 }, { "epoch": 1.5678842177808407, "grad_norm": 23.008760452270508, "learning_rate": 1.0954854267953146e-07, "logits/chosen": -1.7476812601089478, "logits/rejected": -1.7103242874145508, "logps/chosen": -192.20645141601562, "logps/rejected": -228.8697509765625, "loss": 0.6045, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.354092001914978, "rewards/margins": 0.3995038866996765, "rewards/rejected": -1.7535959482192993, "step": 9100 }, { "epoch": 1.5696071674707097, "grad_norm": 20.0081844329834, "learning_rate": 1.0934895513745603e-07, "logits/chosen": -1.7460222244262695, "logits/rejected": -1.7097656726837158, "logps/chosen": -184.85659790039062, "logps/rejected": -237.32064819335938, "loss": 0.5706, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2960811853408813, "rewards/margins": 0.5511730909347534, "rewards/rejected": -1.8472541570663452, "step": 9110 }, { "epoch": 1.571330117160579, "grad_norm": 31.030187606811523, "learning_rate": 1.0914933001815754e-07, "logits/chosen": -1.7750955820083618, "logits/rejected": -1.7259628772735596, "logps/chosen": -182.80413818359375, "logps/rejected": -236.4357147216797, "loss": 0.5427, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2252854108810425, "rewards/margins": 0.5880699157714844, "rewards/rejected": -1.8133554458618164, "step": 9120 }, { "epoch": 1.573053066850448, "grad_norm": 22.988636016845703, "learning_rate": 1.0894966812400992e-07, "logits/chosen": -1.7008718252182007, "logits/rejected": -1.662156343460083, "logps/chosen": -183.43167114257812, "logps/rejected": -231.0472412109375, "loss": 0.5892, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2841278314590454, "rewards/margins": 0.48533502221107483, "rewards/rejected": -1.7694628238677979, "step": 9130 }, { "epoch": 1.574776016540317, "grad_norm": 19.436880111694336, "learning_rate": 1.0874997025753482e-07, "logits/chosen": -1.7695186138153076, "logits/rejected": -1.700583815574646, "logps/chosen": -172.9995574951172, "logps/rejected": -237.1913604736328, "loss": 0.5026, "rewards/accuracies": 0.8125, "rewards/chosen": -1.157179832458496, "rewards/margins": 0.7229998707771301, "rewards/rejected": -1.880179762840271, "step": 9140 }, { "epoch": 1.576498966230186, "grad_norm": 28.90360450744629, "learning_rate": 1.0855023722139864e-07, "logits/chosen": -1.7517004013061523, "logits/rejected": -1.702845573425293, "logps/chosen": -190.75489807128906, "logps/rejected": -255.5432586669922, "loss": 0.5388, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3242889642715454, "rewards/margins": 0.686375081539154, "rewards/rejected": -2.0106639862060547, "step": 9150 }, { "epoch": 1.578221915920055, "grad_norm": 24.89302635192871, "learning_rate": 1.0835046981840896e-07, "logits/chosen": -1.763664960861206, "logits/rejected": -1.7383177280426025, "logps/chosen": -169.76939392089844, "logps/rejected": -232.401611328125, "loss": 0.5413, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2061946392059326, "rewards/margins": 0.5869729518890381, "rewards/rejected": -1.7931678295135498, "step": 9160 }, { "epoch": 1.5799448656099242, "grad_norm": 24.991596221923828, "learning_rate": 1.0815066885151165e-07, "logits/chosen": -1.7933380603790283, "logits/rejected": -1.7359075546264648, "logps/chosen": -181.2700958251953, "logps/rejected": -244.0440673828125, "loss": 0.525, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.264285683631897, "rewards/margins": 0.6814316511154175, "rewards/rejected": -1.9457175731658936, "step": 9170 }, { "epoch": 1.5816678152997934, "grad_norm": 21.60300636291504, "learning_rate": 1.0795083512378738e-07, "logits/chosen": -1.7519088983535767, "logits/rejected": -1.7141910791397095, "logps/chosen": -182.85592651367188, "logps/rejected": -233.93008422851562, "loss": 0.5742, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2812501192092896, "rewards/margins": 0.5364077091217041, "rewards/rejected": -1.8176578283309937, "step": 9180 }, { "epoch": 1.5833907649896624, "grad_norm": 21.416772842407227, "learning_rate": 1.077509694384485e-07, "logits/chosen": -1.8482236862182617, "logits/rejected": -1.8232104778289795, "logps/chosen": -186.87045288085938, "logps/rejected": -251.0176544189453, "loss": 0.5146, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.327132225036621, "rewards/margins": 0.6137700080871582, "rewards/rejected": -1.9409021139144897, "step": 9190 }, { "epoch": 1.5851137146795313, "grad_norm": 17.748327255249023, "learning_rate": 1.0755107259883591e-07, "logits/chosen": -1.7413246631622314, "logits/rejected": -1.696459412574768, "logps/chosen": -184.2054443359375, "logps/rejected": -251.5696563720703, "loss": 0.525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3135349750518799, "rewards/margins": 0.6860525012016296, "rewards/rejected": -1.9995874166488647, "step": 9200 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -1.8325300216674805, "eval_logits/rejected": -1.812217116355896, "eval_logps/chosen": -177.4160919189453, "eval_logps/rejected": -209.28688049316406, "eval_loss": 0.6320397853851318, "eval_rewards/accuracies": 0.636617124080658, "eval_rewards/chosen": -1.187041997909546, "eval_rewards/margins": 0.27402570843696594, "eval_rewards/rejected": -1.461067795753479, "eval_runtime": 384.2682, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 9200 }, { "epoch": 1.5868366643694003, "grad_norm": 30.787199020385742, "learning_rate": 1.0735114540841565e-07, "logits/chosen": -1.614463210105896, "logits/rejected": -1.5715653896331787, "logps/chosen": -194.19384765625, "logps/rejected": -247.31533813476562, "loss": 0.566, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3834686279296875, "rewards/margins": 0.5717785954475403, "rewards/rejected": -1.9552472829818726, "step": 9210 }, { "epoch": 1.5885596140592695, "grad_norm": 49.48377990722656, "learning_rate": 1.0715118867077575e-07, "logits/chosen": -1.7034069299697876, "logits/rejected": -1.6654075384140015, "logps/chosen": -182.36663818359375, "logps/rejected": -239.0763397216797, "loss": 0.5678, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3235313892364502, "rewards/margins": 0.5717422962188721, "rewards/rejected": -1.8952735662460327, "step": 9220 }, { "epoch": 1.5902825637491387, "grad_norm": 28.817829132080078, "learning_rate": 1.0695120318962305e-07, "logits/chosen": -1.6265385150909424, "logits/rejected": -1.5868934392929077, "logps/chosen": -178.04183959960938, "logps/rejected": -243.2806396484375, "loss": 0.526, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.287351369857788, "rewards/margins": 0.6169094443321228, "rewards/rejected": -1.9042608737945557, "step": 9230 }, { "epoch": 1.5920055134390076, "grad_norm": 30.303749084472656, "learning_rate": 1.0675118976877989e-07, "logits/chosen": -1.6701663732528687, "logits/rejected": -1.6437671184539795, "logps/chosen": -196.56332397460938, "logps/rejected": -260.31170654296875, "loss": 0.5495, "rewards/accuracies": 0.6875, "rewards/chosen": -1.404131293296814, "rewards/margins": 0.6530481576919556, "rewards/rejected": -2.0571796894073486, "step": 9240 }, { "epoch": 1.5937284631288766, "grad_norm": 18.34560775756836, "learning_rate": 1.0655114921218086e-07, "logits/chosen": -1.665238618850708, "logits/rejected": -1.621919870376587, "logps/chosen": -181.21426391601562, "logps/rejected": -240.7842559814453, "loss": 0.5579, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.264282464981079, "rewards/margins": 0.5974792242050171, "rewards/rejected": -1.8617616891860962, "step": 9250 }, { "epoch": 1.5954514128187456, "grad_norm": 24.00996208190918, "learning_rate": 1.0635108232386976e-07, "logits/chosen": -1.6609874963760376, "logits/rejected": -1.6281172037124634, "logps/chosen": -189.6760711669922, "logps/rejected": -246.7325897216797, "loss": 0.5674, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.416132926940918, "rewards/margins": 0.5612865090370178, "rewards/rejected": -1.9774194955825806, "step": 9260 }, { "epoch": 1.5971743625086148, "grad_norm": 20.443849563598633, "learning_rate": 1.0615098990799607e-07, "logits/chosen": -1.7815415859222412, "logits/rejected": -1.7329866886138916, "logps/chosen": -193.226806640625, "logps/rejected": -249.8080291748047, "loss": 0.538, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3833353519439697, "rewards/margins": 0.5966004133224487, "rewards/rejected": -1.979935646057129, "step": 9270 }, { "epoch": 1.598897312198484, "grad_norm": 22.499149322509766, "learning_rate": 1.05950872768812e-07, "logits/chosen": -1.7292639017105103, "logits/rejected": -1.6948598623275757, "logps/chosen": -181.84854125976562, "logps/rejected": -237.987548828125, "loss": 0.5637, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2827826738357544, "rewards/margins": 0.5591589212417603, "rewards/rejected": -1.8419415950775146, "step": 9280 }, { "epoch": 1.600620261888353, "grad_norm": 25.416688919067383, "learning_rate": 1.0575073171066906e-07, "logits/chosen": -1.6332378387451172, "logits/rejected": -1.6004530191421509, "logps/chosen": -177.553955078125, "logps/rejected": -220.517822265625, "loss": 0.5932, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2289899587631226, "rewards/margins": 0.4675261080265045, "rewards/rejected": -1.6965160369873047, "step": 9290 }, { "epoch": 1.602343211578222, "grad_norm": 35.59848403930664, "learning_rate": 1.0555056753801493e-07, "logits/chosen": -1.694219946861267, "logits/rejected": -1.6638238430023193, "logps/chosen": -190.98104858398438, "logps/rejected": -259.08929443359375, "loss": 0.5266, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3729770183563232, "rewards/margins": 0.6592379808425903, "rewards/rejected": -2.032215118408203, "step": 9300 }, { "epoch": 1.6040661612680909, "grad_norm": 21.09294891357422, "learning_rate": 1.0535038105539014e-07, "logits/chosen": -1.713945746421814, "logits/rejected": -1.6748058795928955, "logps/chosen": -183.74774169921875, "logps/rejected": -238.228515625, "loss": 0.5547, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3000211715698242, "rewards/margins": 0.5785013437271118, "rewards/rejected": -1.8785226345062256, "step": 9310 }, { "epoch": 1.60578911095796, "grad_norm": 26.233116149902344, "learning_rate": 1.0515017306742504e-07, "logits/chosen": -1.7640018463134766, "logits/rejected": -1.7215381860733032, "logps/chosen": -193.1175079345703, "logps/rejected": -259.2463073730469, "loss": 0.532, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4228532314300537, "rewards/margins": 0.6254833936691284, "rewards/rejected": -2.0483365058898926, "step": 9320 }, { "epoch": 1.607512060647829, "grad_norm": 31.612598419189453, "learning_rate": 1.0494994437883619e-07, "logits/chosen": -1.740483283996582, "logits/rejected": -1.6970884799957275, "logps/chosen": -192.74623107910156, "logps/rejected": -248.35781860351562, "loss": 0.5737, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3616818189620972, "rewards/margins": 0.5712953209877014, "rewards/rejected": -1.932977318763733, "step": 9330 }, { "epoch": 1.6092350103376982, "grad_norm": 17.047866821289062, "learning_rate": 1.0474969579442356e-07, "logits/chosen": -1.65829598903656, "logits/rejected": -1.6235958337783813, "logps/chosen": -190.55990600585938, "logps/rejected": -253.1373748779297, "loss": 0.5435, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3875072002410889, "rewards/margins": 0.627979040145874, "rewards/rejected": -2.015486240386963, "step": 9340 }, { "epoch": 1.6109579600275672, "grad_norm": 42.927005767822266, "learning_rate": 1.0454942811906703e-07, "logits/chosen": -1.735147476196289, "logits/rejected": -1.6905243396759033, "logps/chosen": -175.52894592285156, "logps/rejected": -230.59426879882812, "loss": 0.547, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.229788899421692, "rewards/margins": 0.5569666028022766, "rewards/rejected": -1.7867555618286133, "step": 9350 }, { "epoch": 1.6126809097174362, "grad_norm": 24.976329803466797, "learning_rate": 1.0434914215772318e-07, "logits/chosen": -1.7689049243927002, "logits/rejected": -1.7112462520599365, "logps/chosen": -185.17877197265625, "logps/rejected": -254.5945281982422, "loss": 0.5023, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2849639654159546, "rewards/margins": 0.7352157831192017, "rewards/rejected": -2.0201797485351562, "step": 9360 }, { "epoch": 1.6144038594073054, "grad_norm": 27.026674270629883, "learning_rate": 1.0414883871542208e-07, "logits/chosen": -1.7741403579711914, "logits/rejected": -1.712407112121582, "logps/chosen": -188.32443237304688, "logps/rejected": -253.3127899169922, "loss": 0.5304, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3448514938354492, "rewards/margins": 0.6742678880691528, "rewards/rejected": -2.0191197395324707, "step": 9370 }, { "epoch": 1.6161268090971743, "grad_norm": 27.5206356048584, "learning_rate": 1.0394851859726408e-07, "logits/chosen": -1.7602388858795166, "logits/rejected": -1.7300840616226196, "logps/chosen": -187.4597625732422, "logps/rejected": -241.29885864257812, "loss": 0.5922, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.330510139465332, "rewards/margins": 0.5449169278144836, "rewards/rejected": -1.8754268884658813, "step": 9380 }, { "epoch": 1.6178497587870435, "grad_norm": 32.495243072509766, "learning_rate": 1.0374818260841663e-07, "logits/chosen": -1.6052782535552979, "logits/rejected": -1.5658015012741089, "logps/chosen": -188.24819946289062, "logps/rejected": -255.52261352539062, "loss": 0.5263, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3514819145202637, "rewards/margins": 0.6556193232536316, "rewards/rejected": -2.007101535797119, "step": 9390 }, { "epoch": 1.6195727084769125, "grad_norm": 22.966014862060547, "learning_rate": 1.035478315541108e-07, "logits/chosen": -1.6737964153289795, "logits/rejected": -1.6404335498809814, "logps/chosen": -181.5256805419922, "logps/rejected": -230.23092651367188, "loss": 0.5898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2745777368545532, "rewards/margins": 0.4850188195705414, "rewards/rejected": -1.759596586227417, "step": 9400 }, { "epoch": 1.6212956581667815, "grad_norm": 27.858112335205078, "learning_rate": 1.0334746623963843e-07, "logits/chosen": -1.6823469400405884, "logits/rejected": -1.6463171243667603, "logps/chosen": -185.46388244628906, "logps/rejected": -244.6988525390625, "loss": 0.5469, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.315901517868042, "rewards/margins": 0.6097933650016785, "rewards/rejected": -1.9256948232650757, "step": 9410 }, { "epoch": 1.6230186078566504, "grad_norm": 34.802608489990234, "learning_rate": 1.031470874703485e-07, "logits/chosen": -1.714709997177124, "logits/rejected": -1.6808593273162842, "logps/chosen": -196.0220184326172, "logps/rejected": -241.30258178710938, "loss": 0.5762, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4098308086395264, "rewards/margins": 0.48128652572631836, "rewards/rejected": -1.8911174535751343, "step": 9420 }, { "epoch": 1.6247415575465196, "grad_norm": 23.209945678710938, "learning_rate": 1.0294669605164417e-07, "logits/chosen": -1.6820461750030518, "logits/rejected": -1.6425174474716187, "logps/chosen": -186.3541259765625, "logps/rejected": -246.5086212158203, "loss": 0.5807, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3392202854156494, "rewards/margins": 0.6365295648574829, "rewards/rejected": -1.9757496118545532, "step": 9430 }, { "epoch": 1.6264645072363888, "grad_norm": 29.137161254882812, "learning_rate": 1.0274629278897941e-07, "logits/chosen": -1.6789467334747314, "logits/rejected": -1.6548280715942383, "logps/chosen": -176.45706176757812, "logps/rejected": -227.3600616455078, "loss": 0.5697, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2682231664657593, "rewards/margins": 0.4807654917240143, "rewards/rejected": -1.7489887475967407, "step": 9440 }, { "epoch": 1.6281874569262578, "grad_norm": 22.850292205810547, "learning_rate": 1.0254587848785574e-07, "logits/chosen": -1.8444761037826538, "logits/rejected": -1.8191983699798584, "logps/chosen": -186.48324584960938, "logps/rejected": -228.24008178710938, "loss": 0.5955, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3309428691864014, "rewards/margins": 0.44507789611816406, "rewards/rejected": -1.7760207653045654, "step": 9450 }, { "epoch": 1.6299104066161267, "grad_norm": 20.438674926757812, "learning_rate": 1.0234545395381922e-07, "logits/chosen": -1.7006847858428955, "logits/rejected": -1.6650491952896118, "logps/chosen": -164.76817321777344, "logps/rejected": -252.7565460205078, "loss": 0.4427, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1251728534698486, "rewards/margins": 0.8621363639831543, "rewards/rejected": -1.987309217453003, "step": 9460 }, { "epoch": 1.6316333563059957, "grad_norm": 33.695003509521484, "learning_rate": 1.021450199924568e-07, "logits/chosen": -1.5540118217468262, "logits/rejected": -1.5089060068130493, "logps/chosen": -181.3240203857422, "logps/rejected": -224.87307739257812, "loss": 0.5968, "rewards/accuracies": 0.6875, "rewards/chosen": -1.286789059638977, "rewards/margins": 0.4572557806968689, "rewards/rejected": -1.7440446615219116, "step": 9470 }, { "epoch": 1.633356305995865, "grad_norm": 26.688095092773438, "learning_rate": 1.0194457740939353e-07, "logits/chosen": -1.6772072315216064, "logits/rejected": -1.6208477020263672, "logps/chosen": -186.79238891601562, "logps/rejected": -247.66085815429688, "loss": 0.5373, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3279988765716553, "rewards/margins": 0.622045636177063, "rewards/rejected": -1.9500446319580078, "step": 9480 }, { "epoch": 1.635079255685734, "grad_norm": 25.796838760375977, "learning_rate": 1.0174412701028899e-07, "logits/chosen": -1.6382777690887451, "logits/rejected": -1.6029354333877563, "logps/chosen": -189.98422241210938, "logps/rejected": -245.23324584960938, "loss": 0.5523, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.358717679977417, "rewards/margins": 0.5708493590354919, "rewards/rejected": -1.9295669794082642, "step": 9490 }, { "epoch": 1.636802205375603, "grad_norm": 21.41780662536621, "learning_rate": 1.0154366960083422e-07, "logits/chosen": -1.6924152374267578, "logits/rejected": -1.6522552967071533, "logps/chosen": -184.74864196777344, "logps/rejected": -259.6523132324219, "loss": 0.4885, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.3110222816467285, "rewards/margins": 0.7347725629806519, "rewards/rejected": -2.04579496383667, "step": 9500 }, { "epoch": 1.638525155065472, "grad_norm": 22.247453689575195, "learning_rate": 1.0134320598674846e-07, "logits/chosen": -1.5799000263214111, "logits/rejected": -1.5400402545928955, "logps/chosen": -191.2025909423828, "logps/rejected": -265.931884765625, "loss": 0.5267, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4031956195831299, "rewards/margins": 0.7167037129402161, "rewards/rejected": -2.1198995113372803, "step": 9510 }, { "epoch": 1.640248104755341, "grad_norm": 19.012386322021484, "learning_rate": 1.0114273697377583e-07, "logits/chosen": -1.7850033044815063, "logits/rejected": -1.7514865398406982, "logps/chosen": -206.412109375, "logps/rejected": -278.20819091796875, "loss": 0.5608, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.540743112564087, "rewards/margins": 0.6628342270851135, "rewards/rejected": -2.2035775184631348, "step": 9520 }, { "epoch": 1.6419710544452102, "grad_norm": 36.910980224609375, "learning_rate": 1.0094226336768224e-07, "logits/chosen": -1.6955476999282837, "logits/rejected": -1.6423709392547607, "logps/chosen": -193.9436492919922, "logps/rejected": -268.7176818847656, "loss": 0.5012, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.410476565361023, "rewards/margins": 0.7678488492965698, "rewards/rejected": -2.178325653076172, "step": 9530 }, { "epoch": 1.6436940041350794, "grad_norm": 26.08768653869629, "learning_rate": 1.0074178597425194e-07, "logits/chosen": -1.605597734451294, "logits/rejected": -1.5660779476165771, "logps/chosen": -198.6016082763672, "logps/rejected": -266.4854736328125, "loss": 0.514, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4448301792144775, "rewards/margins": 0.6859620213508606, "rewards/rejected": -2.1307921409606934, "step": 9540 }, { "epoch": 1.6454169538249483, "grad_norm": 22.80348777770996, "learning_rate": 1.0054130559928451e-07, "logits/chosen": -1.6636934280395508, "logits/rejected": -1.6406666040420532, "logps/chosen": -190.80972290039062, "logps/rejected": -261.4796447753906, "loss": 0.5443, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3758639097213745, "rewards/margins": 0.6709795594215393, "rewards/rejected": -2.0468432903289795, "step": 9550 }, { "epoch": 1.6471399035148173, "grad_norm": 29.717512130737305, "learning_rate": 1.0034082304859144e-07, "logits/chosen": -1.7326301336288452, "logits/rejected": -1.7004493474960327, "logps/chosen": -198.12722778320312, "logps/rejected": -260.0203552246094, "loss": 0.5488, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4450441598892212, "rewards/margins": 0.6245887279510498, "rewards/rejected": -2.0696330070495605, "step": 9560 }, { "epoch": 1.6488628532046863, "grad_norm": 27.482389450073242, "learning_rate": 1.00140339127993e-07, "logits/chosen": -1.620723009109497, "logits/rejected": -1.591686725616455, "logps/chosen": -193.84017944335938, "logps/rejected": -257.40740966796875, "loss": 0.5577, "rewards/accuracies": 0.71875, "rewards/chosen": -1.435881495475769, "rewards/margins": 0.6315909624099731, "rewards/rejected": -2.067472457885742, "step": 9570 }, { "epoch": 1.6505858028945555, "grad_norm": 34.06911087036133, "learning_rate": 9.9939854643315e-08, "logits/chosen": -1.7072927951812744, "logits/rejected": -1.6677078008651733, "logps/chosen": -198.4580841064453, "logps/rejected": -262.29534912109375, "loss": 0.5489, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4397019147872925, "rewards/margins": 0.6138035655021667, "rewards/rejected": -2.0535054206848145, "step": 9580 }, { "epoch": 1.6523087525844247, "grad_norm": 24.818511962890625, "learning_rate": 9.973937040038544e-08, "logits/chosen": -1.796342134475708, "logits/rejected": -1.7624422311782837, "logps/chosen": -194.32196044921875, "logps/rejected": -249.00344848632812, "loss": 0.5681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3822523355484009, "rewards/margins": 0.5446712374687195, "rewards/rejected": -1.9269235134124756, "step": 9590 }, { "epoch": 1.6540317022742936, "grad_norm": 29.23993682861328, "learning_rate": 9.953888720503145e-08, "logits/chosen": -1.6400010585784912, "logits/rejected": -1.5827388763427734, "logps/chosen": -192.125732421875, "logps/rejected": -262.93731689453125, "loss": 0.5174, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3706529140472412, "rewards/margins": 0.7350988984107971, "rewards/rejected": -2.1057517528533936, "step": 9600 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -1.7810282707214355, "eval_logits/rejected": -1.7596746683120728, "eval_logps/chosen": -185.33482360839844, "eval_logps/rejected": -219.24375915527344, "eval_loss": 0.631009042263031, "eval_rewards/accuracies": 0.6375464797019958, "eval_rewards/chosen": -1.266229271888733, "eval_rewards/margins": 0.2944071292877197, "eval_rewards/rejected": -1.5606361627578735, "eval_runtime": 384.4232, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 9600 }, { "epoch": 1.6557546519641626, "grad_norm": 25.812196731567383, "learning_rate": 9.933840586307579e-08, "logits/chosen": -1.646094560623169, "logits/rejected": -1.6073650121688843, "logps/chosen": -182.9886474609375, "logps/rejected": -252.9163360595703, "loss": 0.5064, "rewards/accuracies": 0.75, "rewards/chosen": -1.266803503036499, "rewards/margins": 0.7127724885940552, "rewards/rejected": -1.9795758724212646, "step": 9610 }, { "epoch": 1.6574776016540316, "grad_norm": 33.57108688354492, "learning_rate": 9.913792718033396e-08, "logits/chosen": -1.7659132480621338, "logits/rejected": -1.7371089458465576, "logps/chosen": -196.85321044921875, "logps/rejected": -243.35916137695312, "loss": 0.607, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4226679801940918, "rewards/margins": 0.4569169580936432, "rewards/rejected": -1.8795849084854126, "step": 9620 }, { "epoch": 1.6592005513439008, "grad_norm": 32.600406646728516, "learning_rate": 9.893745196261062e-08, "logits/chosen": -1.6798865795135498, "logits/rejected": -1.6302530765533447, "logps/chosen": -204.86288452148438, "logps/rejected": -260.3321838378906, "loss": 0.5815, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4940460920333862, "rewards/margins": 0.5789082050323486, "rewards/rejected": -2.0729544162750244, "step": 9630 }, { "epoch": 1.66092350103377, "grad_norm": 27.517127990722656, "learning_rate": 9.873698101569657e-08, "logits/chosen": -1.7465555667877197, "logits/rejected": -1.706447958946228, "logps/chosen": -190.49761962890625, "logps/rejected": -239.1614227294922, "loss": 0.5647, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.345007300376892, "rewards/margins": 0.5413269400596619, "rewards/rejected": -1.8863341808319092, "step": 9640 }, { "epoch": 1.662646450723639, "grad_norm": 27.598669052124023, "learning_rate": 9.853651514536552e-08, "logits/chosen": -1.6575853824615479, "logits/rejected": -1.6214959621429443, "logps/chosen": -188.8939666748047, "logps/rejected": -233.7520294189453, "loss": 0.5979, "rewards/accuracies": 0.6875, "rewards/chosen": -1.334161400794983, "rewards/margins": 0.46343547105789185, "rewards/rejected": -1.7975966930389404, "step": 9650 }, { "epoch": 1.664369400413508, "grad_norm": 25.109174728393555, "learning_rate": 9.833605515737058e-08, "logits/chosen": -1.669508934020996, "logits/rejected": -1.642592430114746, "logps/chosen": -172.75863647460938, "logps/rejected": -231.64077758789062, "loss": 0.5652, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2217415571212769, "rewards/margins": 0.558169960975647, "rewards/rejected": -1.7799113988876343, "step": 9660 }, { "epoch": 1.6660923501033769, "grad_norm": 28.79549217224121, "learning_rate": 9.813560185744138e-08, "logits/chosen": -1.755334496498108, "logits/rejected": -1.706169843673706, "logps/chosen": -177.9940643310547, "logps/rejected": -247.0950469970703, "loss": 0.5132, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.218774676322937, "rewards/margins": 0.7187453508377075, "rewards/rejected": -1.9375197887420654, "step": 9670 }, { "epoch": 1.667815299793246, "grad_norm": 29.618350982666016, "learning_rate": 9.79351560512806e-08, "logits/chosen": -1.6942167282104492, "logits/rejected": -1.6706167459487915, "logps/chosen": -180.63156127929688, "logps/rejected": -221.5099334716797, "loss": 0.6191, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2616262435913086, "rewards/margins": 0.4315156936645508, "rewards/rejected": -1.6931419372558594, "step": 9680 }, { "epoch": 1.6695382494831152, "grad_norm": 26.86222267150879, "learning_rate": 9.773471854456087e-08, "logits/chosen": -1.6257823705673218, "logits/rejected": -1.593782901763916, "logps/chosen": -181.58224487304688, "logps/rejected": -229.3589324951172, "loss": 0.5738, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2575411796569824, "rewards/margins": 0.48993149399757385, "rewards/rejected": -1.7474727630615234, "step": 9690 }, { "epoch": 1.6712611991729842, "grad_norm": 26.250883102416992, "learning_rate": 9.753429014292132e-08, "logits/chosen": -1.676490068435669, "logits/rejected": -1.633374571800232, "logps/chosen": -175.6143798828125, "logps/rejected": -227.7067108154297, "loss": 0.5811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2290782928466797, "rewards/margins": 0.5252577066421509, "rewards/rejected": -1.7543357610702515, "step": 9700 }, { "epoch": 1.6729841488628532, "grad_norm": 25.921573638916016, "learning_rate": 9.73338716519646e-08, "logits/chosen": -1.5699330568313599, "logits/rejected": -1.5422900915145874, "logps/chosen": -176.53219604492188, "logps/rejected": -222.19589233398438, "loss": 0.5927, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2290360927581787, "rewards/margins": 0.47371214628219604, "rewards/rejected": -1.7027482986450195, "step": 9710 }, { "epoch": 1.6747070985527222, "grad_norm": 23.1956787109375, "learning_rate": 9.713346387725355e-08, "logits/chosen": -1.7528129816055298, "logits/rejected": -1.7244980335235596, "logps/chosen": -172.8015594482422, "logps/rejected": -218.1025390625, "loss": 0.5783, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1944303512573242, "rewards/margins": 0.4731059968471527, "rewards/rejected": -1.6675363779067993, "step": 9720 }, { "epoch": 1.6764300482425913, "grad_norm": 22.841461181640625, "learning_rate": 9.693306762430782e-08, "logits/chosen": -1.7413135766983032, "logits/rejected": -1.7077674865722656, "logps/chosen": -162.9296875, "logps/rejected": -226.3254852294922, "loss": 0.5319, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.091930627822876, "rewards/margins": 0.6269403696060181, "rewards/rejected": -1.7188708782196045, "step": 9730 }, { "epoch": 1.6781529979324605, "grad_norm": 22.896677017211914, "learning_rate": 9.673268369860086e-08, "logits/chosen": -1.7438064813613892, "logits/rejected": -1.7046003341674805, "logps/chosen": -173.13571166992188, "logps/rejected": -228.1545867919922, "loss": 0.5517, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1866159439086914, "rewards/margins": 0.5622804164886475, "rewards/rejected": -1.7488963603973389, "step": 9740 }, { "epoch": 1.6798759476223295, "grad_norm": 17.43238067626953, "learning_rate": 9.653231290555647e-08, "logits/chosen": -1.7511183023452759, "logits/rejected": -1.688676118850708, "logps/chosen": -173.98178100585938, "logps/rejected": -228.1248779296875, "loss": 0.5351, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.188538908958435, "rewards/margins": 0.5991718769073486, "rewards/rejected": -1.7877107858657837, "step": 9750 }, { "epoch": 1.6815988973121985, "grad_norm": 18.74418067932129, "learning_rate": 9.633195605054573e-08, "logits/chosen": -1.7443416118621826, "logits/rejected": -1.698015570640564, "logps/chosen": -180.7736358642578, "logps/rejected": -236.44351196289062, "loss": 0.5572, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2759031057357788, "rewards/margins": 0.5757189989089966, "rewards/rejected": -1.8516219854354858, "step": 9760 }, { "epoch": 1.6833218470020674, "grad_norm": 21.315763473510742, "learning_rate": 9.613161393888372e-08, "logits/chosen": -1.6301326751708984, "logits/rejected": -1.5908677577972412, "logps/chosen": -178.78009033203125, "logps/rejected": -235.99789428710938, "loss": 0.552, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2467206716537476, "rewards/margins": 0.5737360715866089, "rewards/rejected": -1.8204567432403564, "step": 9770 }, { "epoch": 1.6850447966919366, "grad_norm": 20.305156707763672, "learning_rate": 9.593128737582623e-08, "logits/chosen": -1.7256050109863281, "logits/rejected": -1.6650545597076416, "logps/chosen": -186.75709533691406, "logps/rejected": -243.6442413330078, "loss": 0.5472, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2587451934814453, "rewards/margins": 0.6616265177726746, "rewards/rejected": -1.920371651649475, "step": 9780 }, { "epoch": 1.6867677463818056, "grad_norm": 25.12137794494629, "learning_rate": 9.57309771665665e-08, "logits/chosen": -1.6799179315567017, "logits/rejected": -1.6594957113265991, "logps/chosen": -192.07211303710938, "logps/rejected": -258.8779296875, "loss": 0.566, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3866052627563477, "rewards/margins": 0.6263383030891418, "rewards/rejected": -2.012943744659424, "step": 9790 }, { "epoch": 1.6884906960716748, "grad_norm": 32.45927810668945, "learning_rate": 9.553068411623211e-08, "logits/chosen": -1.7195956707000732, "logits/rejected": -1.6630808115005493, "logps/chosen": -191.19000244140625, "logps/rejected": -257.57318115234375, "loss": 0.5365, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3438773155212402, "rewards/margins": 0.6948517560958862, "rewards/rejected": -2.038729190826416, "step": 9800 }, { "epoch": 1.6902136457615438, "grad_norm": 24.958833694458008, "learning_rate": 9.533040902988164e-08, "logits/chosen": -1.6515401601791382, "logits/rejected": -1.6001765727996826, "logps/chosen": -191.26797485351562, "logps/rejected": -253.69271850585938, "loss": 0.57, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3584883213043213, "rewards/margins": 0.6395347714424133, "rewards/rejected": -1.9980227947235107, "step": 9810 }, { "epoch": 1.6919365954514127, "grad_norm": 24.197202682495117, "learning_rate": 9.51301527125015e-08, "logits/chosen": -1.6761776208877563, "logits/rejected": -1.6309750080108643, "logps/chosen": -185.36248779296875, "logps/rejected": -252.4319305419922, "loss": 0.5254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3124070167541504, "rewards/margins": 0.6722649335861206, "rewards/rejected": -1.98467218875885, "step": 9820 }, { "epoch": 1.693659545141282, "grad_norm": 28.450220108032227, "learning_rate": 9.492991596900265e-08, "logits/chosen": -1.7091443538665771, "logits/rejected": -1.6807483434677124, "logps/chosen": -200.43643188476562, "logps/rejected": -254.0392303466797, "loss": 0.6132, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4244849681854248, "rewards/margins": 0.5424485206604004, "rewards/rejected": -1.9669336080551147, "step": 9830 }, { "epoch": 1.6953824948311509, "grad_norm": 21.36317253112793, "learning_rate": 9.47296996042173e-08, "logits/chosen": -1.6725046634674072, "logits/rejected": -1.6177743673324585, "logps/chosen": -187.1649627685547, "logps/rejected": -257.66131591796875, "loss": 0.5377, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3128710985183716, "rewards/margins": 0.7233718633651733, "rewards/rejected": -2.036242961883545, "step": 9840 }, { "epoch": 1.69710544452102, "grad_norm": 24.398290634155273, "learning_rate": 9.452950442289582e-08, "logits/chosen": -1.6904996633529663, "logits/rejected": -1.6662309169769287, "logps/chosen": -180.9543914794922, "logps/rejected": -229.668212890625, "loss": 0.566, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2635273933410645, "rewards/margins": 0.5165113210678101, "rewards/rejected": -1.780038833618164, "step": 9850 }, { "epoch": 1.698828394210889, "grad_norm": 25.69253921508789, "learning_rate": 9.432933122970347e-08, "logits/chosen": -1.7612245082855225, "logits/rejected": -1.734895944595337, "logps/chosen": -197.70860290527344, "logps/rejected": -244.4144744873047, "loss": 0.6001, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4138227701187134, "rewards/margins": 0.4914781451225281, "rewards/rejected": -1.9053010940551758, "step": 9860 }, { "epoch": 1.700551343900758, "grad_norm": 21.9511661529541, "learning_rate": 9.412918082921706e-08, "logits/chosen": -1.720068335533142, "logits/rejected": -1.6764943599700928, "logps/chosen": -183.56893920898438, "logps/rejected": -238.9677734375, "loss": 0.5642, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3083776235580444, "rewards/margins": 0.572445809841156, "rewards/rejected": -1.8808234930038452, "step": 9870 }, { "epoch": 1.7022742935906272, "grad_norm": 19.187746047973633, "learning_rate": 9.39290540259218e-08, "logits/chosen": -1.6332343816757202, "logits/rejected": -1.6056232452392578, "logps/chosen": -183.8154754638672, "logps/rejected": -251.66244506835938, "loss": 0.528, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3174991607666016, "rewards/margins": 0.6446079015731812, "rewards/rejected": -1.9621070623397827, "step": 9880 }, { "epoch": 1.7039972432804962, "grad_norm": 23.748544692993164, "learning_rate": 9.372895162420808e-08, "logits/chosen": -1.6163705587387085, "logits/rejected": -1.5838301181793213, "logps/chosen": -187.1710205078125, "logps/rejected": -254.9395751953125, "loss": 0.5086, "rewards/accuracies": 0.75, "rewards/chosen": -1.307533621788025, "rewards/margins": 0.7030706405639648, "rewards/rejected": -2.0106043815612793, "step": 9890 }, { "epoch": 1.7057201929703654, "grad_norm": 28.16010093688965, "learning_rate": 9.352887442836816e-08, "logits/chosen": -1.6951067447662354, "logits/rejected": -1.6399049758911133, "logps/chosen": -177.525634765625, "logps/rejected": -260.8652038574219, "loss": 0.4753, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2581908702850342, "rewards/margins": 0.8525069952011108, "rewards/rejected": -2.1106979846954346, "step": 9900 }, { "epoch": 1.7074431426602343, "grad_norm": 19.513193130493164, "learning_rate": 9.332882324259306e-08, "logits/chosen": -1.7475383281707764, "logits/rejected": -1.6854770183563232, "logps/chosen": -193.81378173828125, "logps/rejected": -252.58413696289062, "loss": 0.5561, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3624550104141235, "rewards/margins": 0.6568828821182251, "rewards/rejected": -2.0193378925323486, "step": 9910 }, { "epoch": 1.7091660923501033, "grad_norm": 25.817169189453125, "learning_rate": 9.312879887096923e-08, "logits/chosen": -1.7400646209716797, "logits/rejected": -1.6931641101837158, "logps/chosen": -187.50587463378906, "logps/rejected": -254.0473175048828, "loss": 0.5126, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3481698036193848, "rewards/margins": 0.6990029811859131, "rewards/rejected": -2.047173023223877, "step": 9920 }, { "epoch": 1.7108890420399723, "grad_norm": 30.18265151977539, "learning_rate": 9.292880211747528e-08, "logits/chosen": -1.6739486455917358, "logits/rejected": -1.644385576248169, "logps/chosen": -180.66278076171875, "logps/rejected": -250.3004913330078, "loss": 0.5329, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.287561297416687, "rewards/margins": 0.6668421030044556, "rewards/rejected": -1.954403281211853, "step": 9930 }, { "epoch": 1.7126119917298415, "grad_norm": 20.01820182800293, "learning_rate": 9.27288337859789e-08, "logits/chosen": -1.7534430027008057, "logits/rejected": -1.7249292135238647, "logps/chosen": -198.48370361328125, "logps/rejected": -257.41363525390625, "loss": 0.573, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4342199563980103, "rewards/margins": 0.5325092077255249, "rewards/rejected": -1.966728925704956, "step": 9940 }, { "epoch": 1.7143349414197107, "grad_norm": 43.556434631347656, "learning_rate": 9.252889468023348e-08, "logits/chosen": -1.7074737548828125, "logits/rejected": -1.6514174938201904, "logps/chosen": -194.29640197753906, "logps/rejected": -261.7345886230469, "loss": 0.5279, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4033207893371582, "rewards/margins": 0.7007296085357666, "rewards/rejected": -2.104050397872925, "step": 9950 }, { "epoch": 1.7160578911095796, "grad_norm": 39.64878845214844, "learning_rate": 9.232898560387503e-08, "logits/chosen": -1.7442843914031982, "logits/rejected": -1.7110795974731445, "logps/chosen": -207.5657196044922, "logps/rejected": -256.70965576171875, "loss": 0.6004, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5412859916687012, "rewards/margins": 0.4797201156616211, "rewards/rejected": -2.0210063457489014, "step": 9960 }, { "epoch": 1.7177808407994486, "grad_norm": 26.249374389648438, "learning_rate": 9.212910736041868e-08, "logits/chosen": -1.7053654193878174, "logits/rejected": -1.6709177494049072, "logps/chosen": -192.18594360351562, "logps/rejected": -255.30746459960938, "loss": 0.5629, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4082581996917725, "rewards/margins": 0.6271167993545532, "rewards/rejected": -2.0353751182556152, "step": 9970 }, { "epoch": 1.7195037904893176, "grad_norm": 27.637008666992188, "learning_rate": 9.19292607532558e-08, "logits/chosen": -1.6315752267837524, "logits/rejected": -1.595414400100708, "logps/chosen": -198.72315979003906, "logps/rejected": -270.51312255859375, "loss": 0.5359, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4592723846435547, "rewards/margins": 0.7141053080558777, "rewards/rejected": -2.1733779907226562, "step": 9980 }, { "epoch": 1.7212267401791868, "grad_norm": 40.08787155151367, "learning_rate": 9.172944658565057e-08, "logits/chosen": -1.7109102010726929, "logits/rejected": -1.6611464023590088, "logps/chosen": -204.422119140625, "logps/rejected": -250.1943817138672, "loss": 0.5955, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5068941116333008, "rewards/margins": 0.49588704109191895, "rewards/rejected": -2.002781391143799, "step": 9990 }, { "epoch": 1.722949689869056, "grad_norm": 21.382776260375977, "learning_rate": 9.15296656607367e-08, "logits/chosen": -1.7126632928848267, "logits/rejected": -1.6783558130264282, "logps/chosen": -196.45445251464844, "logps/rejected": -258.8180847167969, "loss": 0.5312, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4070589542388916, "rewards/margins": 0.6114452481269836, "rewards/rejected": -2.0185041427612305, "step": 10000 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -1.7847836017608643, "eval_logits/rejected": -1.762927532196045, "eval_logps/chosen": -188.505615234375, "eval_logps/rejected": -223.30810546875, "eval_loss": 0.6312531232833862, "eval_rewards/accuracies": 0.6359200477600098, "eval_rewards/chosen": -1.2979371547698975, "eval_rewards/margins": 0.3033425211906433, "eval_rewards/rejected": -1.601279616355896, "eval_runtime": 384.1037, "eval_samples_per_second": 11.205, "eval_steps_per_second": 1.401, "step": 10000 }, { "epoch": 1.724672639558925, "grad_norm": 17.818395614624023, "learning_rate": 9.132991878151444e-08, "logits/chosen": -1.7125742435455322, "logits/rejected": -1.6681439876556396, "logps/chosen": -187.5587158203125, "logps/rejected": -259.1612854003906, "loss": 0.5015, "rewards/accuracies": 0.75, "rewards/chosen": -1.3517935276031494, "rewards/margins": 0.7120456099510193, "rewards/rejected": -2.0638391971588135, "step": 10010 }, { "epoch": 1.7263955892487939, "grad_norm": 31.448246002197266, "learning_rate": 9.113020675084693e-08, "logits/chosen": -1.6409164667129517, "logits/rejected": -1.590031385421753, "logps/chosen": -197.2374267578125, "logps/rejected": -255.70834350585938, "loss": 0.544, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4356483221054077, "rewards/margins": 0.6116400957107544, "rewards/rejected": -2.047288417816162, "step": 10020 }, { "epoch": 1.7281185389386629, "grad_norm": 43.61131286621094, "learning_rate": 9.093053037145756e-08, "logits/chosen": -1.6513713598251343, "logits/rejected": -1.6045444011688232, "logps/chosen": -202.1380615234375, "logps/rejected": -255.1143341064453, "loss": 0.5476, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4474685192108154, "rewards/margins": 0.5823658108711243, "rewards/rejected": -2.029834270477295, "step": 10030 }, { "epoch": 1.729841488628532, "grad_norm": 18.31475067138672, "learning_rate": 9.073089044592619e-08, "logits/chosen": -1.8138395547866821, "logits/rejected": -1.7728168964385986, "logps/chosen": -197.71688842773438, "logps/rejected": -262.2447204589844, "loss": 0.5454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4287588596343994, "rewards/margins": 0.6493293046951294, "rewards/rejected": -2.0780882835388184, "step": 10040 }, { "epoch": 1.7315644383184012, "grad_norm": 26.210664749145508, "learning_rate": 9.053128777668629e-08, "logits/chosen": -1.6510467529296875, "logits/rejected": -1.629809021949768, "logps/chosen": -202.47584533691406, "logps/rejected": -253.1685333251953, "loss": 0.5772, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4850410223007202, "rewards/margins": 0.4842820167541504, "rewards/rejected": -1.969322919845581, "step": 10050 }, { "epoch": 1.7332873880082702, "grad_norm": 33.081329345703125, "learning_rate": 9.033172316602148e-08, "logits/chosen": -1.6137077808380127, "logits/rejected": -1.5819133520126343, "logps/chosen": -189.3477325439453, "logps/rejected": -262.2599182128906, "loss": 0.5484, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.406105875968933, "rewards/margins": 0.6926298141479492, "rewards/rejected": -2.0987353324890137, "step": 10060 }, { "epoch": 1.7350103376981392, "grad_norm": 24.314748764038086, "learning_rate": 9.013219741606244e-08, "logits/chosen": -1.5984914302825928, "logits/rejected": -1.5605851411819458, "logps/chosen": -187.48721313476562, "logps/rejected": -260.6324462890625, "loss": 0.5132, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3503230810165405, "rewards/margins": 0.7312024831771851, "rewards/rejected": -2.0815258026123047, "step": 10070 }, { "epoch": 1.7367332873880081, "grad_norm": 34.41969299316406, "learning_rate": 8.993271132878371e-08, "logits/chosen": -1.7018215656280518, "logits/rejected": -1.6646702289581299, "logps/chosen": -197.0696258544922, "logps/rejected": -255.782958984375, "loss": 0.5736, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.405316948890686, "rewards/margins": 0.6016537547111511, "rewards/rejected": -2.0069708824157715, "step": 10080 }, { "epoch": 1.7384562370778773, "grad_norm": 20.763137817382812, "learning_rate": 8.973326570600038e-08, "logits/chosen": -1.728520393371582, "logits/rejected": -1.681694746017456, "logps/chosen": -189.59129333496094, "logps/rejected": -262.61431884765625, "loss": 0.5276, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3521802425384521, "rewards/margins": 0.6968034505844116, "rewards/rejected": -2.0489840507507324, "step": 10090 }, { "epoch": 1.7401791867677465, "grad_norm": 36.174373626708984, "learning_rate": 8.953386134936489e-08, "logits/chosen": -1.6898266077041626, "logits/rejected": -1.6627204418182373, "logps/chosen": -190.21826171875, "logps/rejected": -250.46133422851562, "loss": 0.562, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.352323293685913, "rewards/margins": 0.6072344183921814, "rewards/rejected": -1.9595577716827393, "step": 10100 }, { "epoch": 1.7419021364576155, "grad_norm": 34.1463737487793, "learning_rate": 8.933449906036373e-08, "logits/chosen": -1.7423661947250366, "logits/rejected": -1.7222964763641357, "logps/chosen": -194.2482147216797, "logps/rejected": -260.20965576171875, "loss": 0.5566, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4367506504058838, "rewards/margins": 0.6188918352127075, "rewards/rejected": -2.0556423664093018, "step": 10110 }, { "epoch": 1.7436250861474845, "grad_norm": 20.162803649902344, "learning_rate": 8.913517964031447e-08, "logits/chosen": -1.7082332372665405, "logits/rejected": -1.6626908779144287, "logps/chosen": -187.05923461914062, "logps/rejected": -255.07669067382812, "loss": 0.5193, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3333557844161987, "rewards/margins": 0.7113418579101562, "rewards/rejected": -2.0446975231170654, "step": 10120 }, { "epoch": 1.7453480358373534, "grad_norm": 27.919374465942383, "learning_rate": 8.893590389036226e-08, "logits/chosen": -1.722672462463379, "logits/rejected": -1.6737346649169922, "logps/chosen": -183.01498413085938, "logps/rejected": -258.7102966308594, "loss": 0.5057, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2966862916946411, "rewards/margins": 0.7539176344871521, "rewards/rejected": -2.0506038665771484, "step": 10130 }, { "epoch": 1.7470709855272226, "grad_norm": 28.644804000854492, "learning_rate": 8.873667261147673e-08, "logits/chosen": -1.68411123752594, "logits/rejected": -1.6239120960235596, "logps/chosen": -205.28250122070312, "logps/rejected": -262.96441650390625, "loss": 0.5664, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4485670328140259, "rewards/margins": 0.6414085030555725, "rewards/rejected": -2.089975357055664, "step": 10140 }, { "epoch": 1.7487939352170918, "grad_norm": 24.981163024902344, "learning_rate": 8.853748660444881e-08, "logits/chosen": -1.631771445274353, "logits/rejected": -1.582779049873352, "logps/chosen": -190.84922790527344, "logps/rejected": -261.8558044433594, "loss": 0.5279, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.354757308959961, "rewards/margins": 0.7097658514976501, "rewards/rejected": -2.064523220062256, "step": 10150 }, { "epoch": 1.7505168849069608, "grad_norm": 23.57000732421875, "learning_rate": 8.833834666988738e-08, "logits/chosen": -1.65898859500885, "logits/rejected": -1.6106218099594116, "logps/chosen": -187.1697235107422, "logps/rejected": -263.8644714355469, "loss": 0.5142, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3509787321090698, "rewards/margins": 0.7656160593032837, "rewards/rejected": -2.1165947914123535, "step": 10160 }, { "epoch": 1.7522398345968297, "grad_norm": 22.85065269470215, "learning_rate": 8.813925360821624e-08, "logits/chosen": -1.6529548168182373, "logits/rejected": -1.6138883829116821, "logps/chosen": -196.12030029296875, "logps/rejected": -268.4806213378906, "loss": 0.5297, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4121040105819702, "rewards/margins": 0.7286542057991028, "rewards/rejected": -2.140758514404297, "step": 10170 }, { "epoch": 1.7539627842866987, "grad_norm": 25.705286026000977, "learning_rate": 8.794020821967075e-08, "logits/chosen": -1.5729020833969116, "logits/rejected": -1.514883041381836, "logps/chosen": -197.0308837890625, "logps/rejected": -276.1694641113281, "loss": 0.512, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4458892345428467, "rewards/margins": 0.7827070951461792, "rewards/rejected": -2.2285962104797363, "step": 10180 }, { "epoch": 1.755685733976568, "grad_norm": 40.89603805541992, "learning_rate": 8.774121130429464e-08, "logits/chosen": -1.6025034189224243, "logits/rejected": -1.5513317584991455, "logps/chosen": -190.3941650390625, "logps/rejected": -263.5399169921875, "loss": 0.5224, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3943601846694946, "rewards/margins": 0.7269679307937622, "rewards/rejected": -2.121328115463257, "step": 10190 }, { "epoch": 1.757408683666437, "grad_norm": 23.062108993530273, "learning_rate": 8.754226366193677e-08, "logits/chosen": -1.6597881317138672, "logits/rejected": -1.6177161931991577, "logps/chosen": -200.96263122558594, "logps/rejected": -270.7159118652344, "loss": 0.5379, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4854190349578857, "rewards/margins": 0.7124817371368408, "rewards/rejected": -2.1979007720947266, "step": 10200 }, { "epoch": 1.759131633356306, "grad_norm": 45.000587463378906, "learning_rate": 8.734336609224798e-08, "logits/chosen": -1.6575438976287842, "logits/rejected": -1.6288673877716064, "logps/chosen": -218.801513671875, "logps/rejected": -302.0950622558594, "loss": 0.5392, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6478855609893799, "rewards/margins": 0.8018059730529785, "rewards/rejected": -2.4496912956237793, "step": 10210 }, { "epoch": 1.760854583046175, "grad_norm": 31.537446975708008, "learning_rate": 8.714451939467793e-08, "logits/chosen": -1.5889657735824585, "logits/rejected": -1.5587692260742188, "logps/chosen": -202.49703979492188, "logps/rejected": -272.3978576660156, "loss": 0.5276, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5411155223846436, "rewards/margins": 0.6716324090957642, "rewards/rejected": -2.212747573852539, "step": 10220 }, { "epoch": 1.762577532736044, "grad_norm": 22.315608978271484, "learning_rate": 8.69457243684717e-08, "logits/chosen": -1.5495671033859253, "logits/rejected": -1.5024573802947998, "logps/chosen": -207.17160034179688, "logps/rejected": -268.1032409667969, "loss": 0.5932, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.537306547164917, "rewards/margins": 0.6251362562179565, "rewards/rejected": -2.162442684173584, "step": 10230 }, { "epoch": 1.7643004824259132, "grad_norm": 19.749860763549805, "learning_rate": 8.67469818126667e-08, "logits/chosen": -1.619339942932129, "logits/rejected": -1.5651066303253174, "logps/chosen": -204.11798095703125, "logps/rejected": -297.8780517578125, "loss": 0.5045, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5266096591949463, "rewards/margins": 0.928512454032898, "rewards/rejected": -2.4551219940185547, "step": 10240 }, { "epoch": 1.7660234321157822, "grad_norm": 50.124698638916016, "learning_rate": 8.654829252608947e-08, "logits/chosen": -1.6453462839126587, "logits/rejected": -1.5897960662841797, "logps/chosen": -208.56918334960938, "logps/rejected": -266.9615173339844, "loss": 0.5292, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5179365873336792, "rewards/margins": 0.6548843383789062, "rewards/rejected": -2.172820568084717, "step": 10250 }, { "epoch": 1.7677463818056514, "grad_norm": 29.26951789855957, "learning_rate": 8.634965730735238e-08, "logits/chosen": -1.6260560750961304, "logits/rejected": -1.6039345264434814, "logps/chosen": -197.04734802246094, "logps/rejected": -271.06866455078125, "loss": 0.5231, "rewards/accuracies": 0.75, "rewards/chosen": -1.4378256797790527, "rewards/margins": 0.7041373252868652, "rewards/rejected": -2.141962766647339, "step": 10260 }, { "epoch": 1.7694693314955203, "grad_norm": 30.01377296447754, "learning_rate": 8.615107695485059e-08, "logits/chosen": -1.6363704204559326, "logits/rejected": -1.5963335037231445, "logps/chosen": -202.7434539794922, "logps/rejected": -271.08990478515625, "loss": 0.5355, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.473127007484436, "rewards/margins": 0.6813068389892578, "rewards/rejected": -2.1544339656829834, "step": 10270 }, { "epoch": 1.7711922811853893, "grad_norm": 27.83128547668457, "learning_rate": 8.595255226675867e-08, "logits/chosen": -1.6075325012207031, "logits/rejected": -1.578054428100586, "logps/chosen": -211.5869598388672, "logps/rejected": -255.16415405273438, "loss": 0.6148, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5725781917572021, "rewards/margins": 0.44869309663772583, "rewards/rejected": -2.0212714672088623, "step": 10280 }, { "epoch": 1.7729152308752585, "grad_norm": 20.72820472717285, "learning_rate": 8.575408404102739e-08, "logits/chosen": -1.5998598337173462, "logits/rejected": -1.5654428005218506, "logps/chosen": -184.80645751953125, "logps/rejected": -267.18109130859375, "loss": 0.5129, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.333701729774475, "rewards/margins": 0.7832706570625305, "rewards/rejected": -2.1169724464416504, "step": 10290 }, { "epoch": 1.7746381805651275, "grad_norm": 25.413875579833984, "learning_rate": 8.555567307538067e-08, "logits/chosen": -1.6682040691375732, "logits/rejected": -1.6348912715911865, "logps/chosen": -195.970947265625, "logps/rejected": -248.9465789794922, "loss": 0.5872, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4132314920425415, "rewards/margins": 0.531366229057312, "rewards/rejected": -1.944597601890564, "step": 10300 }, { "epoch": 1.7763611302549966, "grad_norm": 22.22412872314453, "learning_rate": 8.53573201673122e-08, "logits/chosen": -1.6083345413208008, "logits/rejected": -1.566024899482727, "logps/chosen": -199.2248077392578, "logps/rejected": -263.5713195800781, "loss": 0.5186, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4296208620071411, "rewards/margins": 0.6915571093559265, "rewards/rejected": -2.121178150177002, "step": 10310 }, { "epoch": 1.7780840799448656, "grad_norm": 26.842947006225586, "learning_rate": 8.515902611408245e-08, "logits/chosen": -1.659576416015625, "logits/rejected": -1.6150563955307007, "logps/chosen": -199.6532440185547, "logps/rejected": -253.9457550048828, "loss": 0.601, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.428916096687317, "rewards/margins": 0.5753315687179565, "rewards/rejected": -2.0042474269866943, "step": 10320 }, { "epoch": 1.7798070296347346, "grad_norm": 45.35322570800781, "learning_rate": 8.496079171271512e-08, "logits/chosen": -1.6947290897369385, "logits/rejected": -1.6549278497695923, "logps/chosen": -187.60887145996094, "logps/rejected": -251.35610961914062, "loss": 0.5305, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3546451330184937, "rewards/margins": 0.6137539148330688, "rewards/rejected": -1.9683990478515625, "step": 10330 }, { "epoch": 1.7815299793246038, "grad_norm": 25.27505111694336, "learning_rate": 8.476261775999432e-08, "logits/chosen": -1.7075321674346924, "logits/rejected": -1.6534080505371094, "logps/chosen": -185.04373168945312, "logps/rejected": -261.29937744140625, "loss": 0.5162, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2895625829696655, "rewards/margins": 0.7777486443519592, "rewards/rejected": -2.0673112869262695, "step": 10340 }, { "epoch": 1.7832529290144727, "grad_norm": 28.808042526245117, "learning_rate": 8.45645050524611e-08, "logits/chosen": -1.7838119268417358, "logits/rejected": -1.7406063079833984, "logps/chosen": -186.5411834716797, "logps/rejected": -245.5423126220703, "loss": 0.5485, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3245922327041626, "rewards/margins": 0.6202830672264099, "rewards/rejected": -1.9448751211166382, "step": 10350 }, { "epoch": 1.784975878704342, "grad_norm": 30.061277389526367, "learning_rate": 8.436645438641038e-08, "logits/chosen": -1.6315284967422485, "logits/rejected": -1.5859811305999756, "logps/chosen": -192.25270080566406, "logps/rejected": -238.0504150390625, "loss": 0.5994, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3630584478378296, "rewards/margins": 0.48939284682273865, "rewards/rejected": -1.8524513244628906, "step": 10360 }, { "epoch": 1.786698828394211, "grad_norm": 31.28719711303711, "learning_rate": 8.416846655788774e-08, "logits/chosen": -1.526947021484375, "logits/rejected": -1.4788224697113037, "logps/chosen": -178.3394012451172, "logps/rejected": -239.5234375, "loss": 0.5311, "rewards/accuracies": 0.75, "rewards/chosen": -1.2636200189590454, "rewards/margins": 0.602812647819519, "rewards/rejected": -1.866432785987854, "step": 10370 }, { "epoch": 1.7884217780840799, "grad_norm": 36.3031120300293, "learning_rate": 8.397054236268611e-08, "logits/chosen": -1.669136643409729, "logits/rejected": -1.6464998722076416, "logps/chosen": -196.32110595703125, "logps/rejected": -231.94418334960938, "loss": 0.6529, "rewards/accuracies": 0.625, "rewards/chosen": -1.4444220066070557, "rewards/margins": 0.35197263956069946, "rewards/rejected": -1.7963947057724, "step": 10380 }, { "epoch": 1.7901447277739488, "grad_norm": 19.98095703125, "learning_rate": 8.37726825963427e-08, "logits/chosen": -1.7154592275619507, "logits/rejected": -1.666921854019165, "logps/chosen": -180.02867126464844, "logps/rejected": -242.6608123779297, "loss": 0.5435, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2421947717666626, "rewards/margins": 0.6435031890869141, "rewards/rejected": -1.8856979608535767, "step": 10390 }, { "epoch": 1.791867677463818, "grad_norm": 25.061344146728516, "learning_rate": 8.357488805413576e-08, "logits/chosen": -1.6696021556854248, "logits/rejected": -1.631299376487732, "logps/chosen": -188.3632354736328, "logps/rejected": -265.38037109375, "loss": 0.4923, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3622297048568726, "rewards/margins": 0.7384769320487976, "rewards/rejected": -2.1007065773010254, "step": 10400 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -1.796593189239502, "eval_logits/rejected": -1.7754329442977905, "eval_logps/chosen": -174.6746063232422, "eval_logps/rejected": -207.29551696777344, "eval_loss": 0.6312093734741211, "eval_rewards/accuracies": 0.6333643198013306, "eval_rewards/chosen": -1.1596269607543945, "eval_rewards/margins": 0.28152674436569214, "eval_rewards/rejected": -1.441153883934021, "eval_runtime": 384.1355, "eval_samples_per_second": 11.204, "eval_steps_per_second": 1.401, "step": 10400 }, { "epoch": 1.7935906271536872, "grad_norm": 27.34046173095703, "learning_rate": 8.337715953108133e-08, "logits/chosen": -1.644966721534729, "logits/rejected": -1.5948392152786255, "logps/chosen": -190.6090545654297, "logps/rejected": -245.26416015625, "loss": 0.5639, "rewards/accuracies": 0.75, "rewards/chosen": -1.3409154415130615, "rewards/margins": 0.580964207649231, "rewards/rejected": -1.921879529953003, "step": 10410 }, { "epoch": 1.7953135768435562, "grad_norm": 38.71763229370117, "learning_rate": 8.317949782193021e-08, "logits/chosen": -1.6703846454620361, "logits/rejected": -1.6184704303741455, "logps/chosen": -185.3733367919922, "logps/rejected": -248.4432830810547, "loss": 0.5412, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3125276565551758, "rewards/margins": 0.6306508183479309, "rewards/rejected": -1.9431785345077515, "step": 10420 }, { "epoch": 1.7970365265334252, "grad_norm": 24.359373092651367, "learning_rate": 8.298190372116449e-08, "logits/chosen": -1.747746229171753, "logits/rejected": -1.7087604999542236, "logps/chosen": -185.07826232910156, "logps/rejected": -240.5188446044922, "loss": 0.5752, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.302076816558838, "rewards/margins": 0.57256019115448, "rewards/rejected": -1.874637246131897, "step": 10430 }, { "epoch": 1.7987594762232941, "grad_norm": 25.13093376159668, "learning_rate": 8.278437802299462e-08, "logits/chosen": -1.7765858173370361, "logits/rejected": -1.7487947940826416, "logps/chosen": -191.94027709960938, "logps/rejected": -241.29458618164062, "loss": 0.579, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3407024145126343, "rewards/margins": 0.4993131160736084, "rewards/rejected": -1.8400154113769531, "step": 10440 }, { "epoch": 1.8004824259131633, "grad_norm": 25.234697341918945, "learning_rate": 8.258692152135605e-08, "logits/chosen": -1.6926829814910889, "logits/rejected": -1.665917158126831, "logps/chosen": -191.83380126953125, "logps/rejected": -258.0459899902344, "loss": 0.5337, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3821178674697876, "rewards/margins": 0.6219611763954163, "rewards/rejected": -2.0040788650512695, "step": 10450 }, { "epoch": 1.8022053756030325, "grad_norm": 21.314926147460938, "learning_rate": 8.238953500990624e-08, "logits/chosen": -1.6903746128082275, "logits/rejected": -1.644550085067749, "logps/chosen": -183.9580078125, "logps/rejected": -237.87936401367188, "loss": 0.5601, "rewards/accuracies": 0.71875, "rewards/chosen": -1.293830156326294, "rewards/margins": 0.5546851754188538, "rewards/rejected": -1.848515272140503, "step": 10460 }, { "epoch": 1.8039283252929015, "grad_norm": 40.113525390625, "learning_rate": 8.219221928202108e-08, "logits/chosen": -1.5459710359573364, "logits/rejected": -1.506653904914856, "logps/chosen": -181.66893005371094, "logps/rejected": -239.0261688232422, "loss": 0.5778, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.297624945640564, "rewards/margins": 0.5848358273506165, "rewards/rejected": -1.882460594177246, "step": 10470 }, { "epoch": 1.8056512749827704, "grad_norm": 24.86050033569336, "learning_rate": 8.199497513079219e-08, "logits/chosen": -1.6869032382965088, "logits/rejected": -1.6301963329315186, "logps/chosen": -185.90304565429688, "logps/rejected": -254.4204559326172, "loss": 0.5409, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2993216514587402, "rewards/margins": 0.720859944820404, "rewards/rejected": -2.020181655883789, "step": 10480 }, { "epoch": 1.8073742246726394, "grad_norm": 22.12966537475586, "learning_rate": 8.179780334902338e-08, "logits/chosen": -1.6784626245498657, "logits/rejected": -1.6307865381240845, "logps/chosen": -171.83340454101562, "logps/rejected": -238.9564666748047, "loss": 0.524, "rewards/accuracies": 0.71875, "rewards/chosen": -1.180229663848877, "rewards/margins": 0.6838089823722839, "rewards/rejected": -1.8640388250350952, "step": 10490 }, { "epoch": 1.8090971743625086, "grad_norm": 24.905609130859375, "learning_rate": 8.16007047292276e-08, "logits/chosen": -1.6610034704208374, "logits/rejected": -1.6133527755737305, "logps/chosen": -187.67446899414062, "logps/rejected": -266.53692626953125, "loss": 0.5196, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.34975004196167, "rewards/margins": 0.7645037770271301, "rewards/rejected": -2.1142537593841553, "step": 10500 }, { "epoch": 1.8108201240523778, "grad_norm": 26.100194931030273, "learning_rate": 8.140368006362378e-08, "logits/chosen": -1.6718677282333374, "logits/rejected": -1.6218767166137695, "logps/chosen": -185.58583068847656, "logps/rejected": -251.987060546875, "loss": 0.5291, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2999773025512695, "rewards/margins": 0.6881439089775085, "rewards/rejected": -1.9881212711334229, "step": 10510 }, { "epoch": 1.8125430737422468, "grad_norm": 38.36790084838867, "learning_rate": 8.120673014413346e-08, "logits/chosen": -1.7166264057159424, "logits/rejected": -1.685956597328186, "logps/chosen": -188.8399658203125, "logps/rejected": -272.0633850097656, "loss": 0.5197, "rewards/accuracies": 0.75, "rewards/chosen": -1.3835443258285522, "rewards/margins": 0.7916911244392395, "rewards/rejected": -2.1752355098724365, "step": 10520 }, { "epoch": 1.8142660234321157, "grad_norm": 24.703962326049805, "learning_rate": 8.100985576237789e-08, "logits/chosen": -1.6094785928726196, "logits/rejected": -1.5705649852752686, "logps/chosen": -196.53831481933594, "logps/rejected": -254.22103881835938, "loss": 0.5587, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4319744110107422, "rewards/margins": 0.5889819860458374, "rewards/rejected": -2.020956516265869, "step": 10530 }, { "epoch": 1.8159889731219847, "grad_norm": 20.878108978271484, "learning_rate": 8.081305770967466e-08, "logits/chosen": -1.5575045347213745, "logits/rejected": -1.5127557516098022, "logps/chosen": -193.12832641601562, "logps/rejected": -258.6274719238281, "loss": 0.5252, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4079763889312744, "rewards/margins": 0.6843532919883728, "rewards/rejected": -2.092329502105713, "step": 10540 }, { "epoch": 1.817711922811854, "grad_norm": 30.245561599731445, "learning_rate": 8.061633677703457e-08, "logits/chosen": -1.7191566228866577, "logits/rejected": -1.6862128973007202, "logps/chosen": -210.8649444580078, "logps/rejected": -280.70245361328125, "loss": 0.5475, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5711400508880615, "rewards/margins": 0.6780726313591003, "rewards/rejected": -2.2492125034332275, "step": 10550 }, { "epoch": 1.819434872501723, "grad_norm": 32.06334686279297, "learning_rate": 8.041969375515835e-08, "logits/chosen": -1.596172571182251, "logits/rejected": -1.5496784448623657, "logps/chosen": -197.6623992919922, "logps/rejected": -273.310546875, "loss": 0.5502, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4203016757965088, "rewards/margins": 0.7665907740592957, "rewards/rejected": -2.18689227104187, "step": 10560 }, { "epoch": 1.821157822191592, "grad_norm": 34.21003723144531, "learning_rate": 8.022312943443369e-08, "logits/chosen": -1.6808770895004272, "logits/rejected": -1.6430126428604126, "logps/chosen": -201.29342651367188, "logps/rejected": -276.6784362792969, "loss": 0.5171, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4534662961959839, "rewards/margins": 0.7456766366958618, "rewards/rejected": -2.1991429328918457, "step": 10570 }, { "epoch": 1.822880771881461, "grad_norm": 24.851125717163086, "learning_rate": 8.002664460493194e-08, "logits/chosen": -1.705336570739746, "logits/rejected": -1.6608549356460571, "logps/chosen": -184.4915008544922, "logps/rejected": -252.7461395263672, "loss": 0.514, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3112952709197998, "rewards/margins": 0.6880284547805786, "rewards/rejected": -1.9993234872817993, "step": 10580 }, { "epoch": 1.82460372157133, "grad_norm": 20.24405288696289, "learning_rate": 7.983024005640487e-08, "logits/chosen": -1.6469793319702148, "logits/rejected": -1.5926092863082886, "logps/chosen": -190.94544982910156, "logps/rejected": -249.9911651611328, "loss": 0.5273, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3504091501235962, "rewards/margins": 0.6501537561416626, "rewards/rejected": -2.000562906265259, "step": 10590 }, { "epoch": 1.8263266712611992, "grad_norm": 26.51520538330078, "learning_rate": 7.963391657828167e-08, "logits/chosen": -1.7193012237548828, "logits/rejected": -1.6881153583526611, "logps/chosen": -183.8968505859375, "logps/rejected": -246.1622772216797, "loss": 0.5715, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3101425170898438, "rewards/margins": 0.6046009063720703, "rewards/rejected": -1.9147436618804932, "step": 10600 }, { "epoch": 1.8280496209510684, "grad_norm": 24.79945182800293, "learning_rate": 7.943767495966556e-08, "logits/chosen": -1.6367594003677368, "logits/rejected": -1.6014162302017212, "logps/chosen": -187.56796264648438, "logps/rejected": -251.03634643554688, "loss": 0.5404, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3650391101837158, "rewards/margins": 0.6266623139381409, "rewards/rejected": -1.991701364517212, "step": 10610 }, { "epoch": 1.8297725706409373, "grad_norm": 30.169219970703125, "learning_rate": 7.924151598933077e-08, "logits/chosen": -1.5777955055236816, "logits/rejected": -1.5336124897003174, "logps/chosen": -192.7977294921875, "logps/rejected": -256.5873107910156, "loss": 0.5313, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3899385929107666, "rewards/margins": 0.6526475548744202, "rewards/rejected": -2.042586326599121, "step": 10620 }, { "epoch": 1.8314955203308063, "grad_norm": 17.994773864746094, "learning_rate": 7.904544045571942e-08, "logits/chosen": -1.6816425323486328, "logits/rejected": -1.6320663690567017, "logps/chosen": -186.03292846679688, "logps/rejected": -254.05209350585938, "loss": 0.5574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3177438974380493, "rewards/margins": 0.6810862421989441, "rewards/rejected": -1.9988301992416382, "step": 10630 }, { "epoch": 1.8332184700206753, "grad_norm": 29.066410064697266, "learning_rate": 7.884944914693819e-08, "logits/chosen": -1.6614630222320557, "logits/rejected": -1.6166385412216187, "logps/chosen": -193.18630981445312, "logps/rejected": -257.2984924316406, "loss": 0.5533, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3812034130096436, "rewards/margins": 0.6659941673278809, "rewards/rejected": -2.0471975803375244, "step": 10640 }, { "epoch": 1.8349414197105445, "grad_norm": 22.816099166870117, "learning_rate": 7.865354285075517e-08, "logits/chosen": -1.7261043787002563, "logits/rejected": -1.6789547204971313, "logps/chosen": -193.98812866210938, "logps/rejected": -275.81634521484375, "loss": 0.515, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4036664962768555, "rewards/margins": 0.8163207173347473, "rewards/rejected": -2.219987154006958, "step": 10650 }, { "epoch": 1.8366643694004137, "grad_norm": 25.88532066345215, "learning_rate": 7.845772235459687e-08, "logits/chosen": -1.6032822132110596, "logits/rejected": -1.5658022165298462, "logps/chosen": -201.2757568359375, "logps/rejected": -263.31585693359375, "loss": 0.5442, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4657762050628662, "rewards/margins": 0.6155818700790405, "rewards/rejected": -2.0813581943511963, "step": 10660 }, { "epoch": 1.8383873190902826, "grad_norm": 27.669984817504883, "learning_rate": 7.826198844554484e-08, "logits/chosen": -1.6221199035644531, "logits/rejected": -1.5766185522079468, "logps/chosen": -202.876708984375, "logps/rejected": -271.6616516113281, "loss": 0.5582, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5073853731155396, "rewards/margins": 0.6935917735099792, "rewards/rejected": -2.200977325439453, "step": 10670 }, { "epoch": 1.8401102687801516, "grad_norm": 25.442867279052734, "learning_rate": 7.806634191033268e-08, "logits/chosen": -1.6795568466186523, "logits/rejected": -1.6216316223144531, "logps/chosen": -189.34262084960938, "logps/rejected": -251.6562042236328, "loss": 0.5285, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3305351734161377, "rewards/margins": 0.6782180070877075, "rewards/rejected": -2.0087532997131348, "step": 10680 }, { "epoch": 1.8418332184700206, "grad_norm": 25.80929183959961, "learning_rate": 7.787078353534276e-08, "logits/chosen": -1.6476482152938843, "logits/rejected": -1.609718918800354, "logps/chosen": -190.88961791992188, "logps/rejected": -267.1067199707031, "loss": 0.5332, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3713963031768799, "rewards/margins": 0.7359045147895813, "rewards/rejected": -2.1073009967803955, "step": 10690 }, { "epoch": 1.8435561681598898, "grad_norm": 34.373382568359375, "learning_rate": 7.767531410660307e-08, "logits/chosen": -1.701225996017456, "logits/rejected": -1.6454471349716187, "logps/chosen": -188.1253204345703, "logps/rejected": -243.43905639648438, "loss": 0.5365, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3195775747299194, "rewards/margins": 0.6114214658737183, "rewards/rejected": -1.9309990406036377, "step": 10700 }, { "epoch": 1.8452791178497587, "grad_norm": 29.572166442871094, "learning_rate": 7.74799344097841e-08, "logits/chosen": -1.6249849796295166, "logits/rejected": -1.5775136947631836, "logps/chosen": -185.059326171875, "logps/rejected": -244.96792602539062, "loss": 0.5475, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3161468505859375, "rewards/margins": 0.6391556262969971, "rewards/rejected": -1.9553024768829346, "step": 10710 }, { "epoch": 1.847002067539628, "grad_norm": 47.28426742553711, "learning_rate": 7.728464523019574e-08, "logits/chosen": -1.652116060256958, "logits/rejected": -1.6033544540405273, "logps/chosen": -194.1964569091797, "logps/rejected": -249.8228759765625, "loss": 0.5972, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4160946607589722, "rewards/margins": 0.5593221187591553, "rewards/rejected": -1.9754167795181274, "step": 10720 }, { "epoch": 1.848725017229497, "grad_norm": 22.565967559814453, "learning_rate": 7.7089447352784e-08, "logits/chosen": -1.7549766302108765, "logits/rejected": -1.7027647495269775, "logps/chosen": -186.93838500976562, "logps/rejected": -244.3561553955078, "loss": 0.529, "rewards/accuracies": 0.75, "rewards/chosen": -1.3297926187515259, "rewards/margins": 0.614985466003418, "rewards/rejected": -1.9447780847549438, "step": 10730 }, { "epoch": 1.8504479669193659, "grad_norm": 33.7943229675293, "learning_rate": 7.689434156212788e-08, "logits/chosen": -1.6176128387451172, "logits/rejected": -1.579614281654358, "logps/chosen": -198.7300567626953, "logps/rejected": -254.4537811279297, "loss": 0.5909, "rewards/accuracies": 0.6875, "rewards/chosen": -1.440397024154663, "rewards/margins": 0.5526378154754639, "rewards/rejected": -1.9930353164672852, "step": 10740 }, { "epoch": 1.852170916609235, "grad_norm": 21.347455978393555, "learning_rate": 7.669932864243627e-08, "logits/chosen": -1.612450361251831, "logits/rejected": -1.5564180612564087, "logps/chosen": -184.0469207763672, "logps/rejected": -260.59716796875, "loss": 0.5242, "rewards/accuracies": 0.75, "rewards/chosen": -1.3198864459991455, "rewards/margins": 0.7594377398490906, "rewards/rejected": -2.079324245452881, "step": 10750 }, { "epoch": 1.853893866299104, "grad_norm": 29.009899139404297, "learning_rate": 7.65044093775448e-08, "logits/chosen": -1.6315386295318604, "logits/rejected": -1.6006014347076416, "logps/chosen": -186.20626831054688, "logps/rejected": -243.677978515625, "loss": 0.5665, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3252546787261963, "rewards/margins": 0.5836648941040039, "rewards/rejected": -1.9089195728302002, "step": 10760 }, { "epoch": 1.8556168159889732, "grad_norm": 24.822834014892578, "learning_rate": 7.630958455091266e-08, "logits/chosen": -1.6193573474884033, "logits/rejected": -1.590057611465454, "logps/chosen": -188.15383911132812, "logps/rejected": -246.31069946289062, "loss": 0.5482, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3108214139938354, "rewards/margins": 0.6063941121101379, "rewards/rejected": -1.9172155857086182, "step": 10770 }, { "epoch": 1.8573397656788422, "grad_norm": 25.640043258666992, "learning_rate": 7.611485494561947e-08, "logits/chosen": -1.7544190883636475, "logits/rejected": -1.7139012813568115, "logps/chosen": -186.97499084472656, "logps/rejected": -251.82894897460938, "loss": 0.5634, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3380534648895264, "rewards/margins": 0.6293826699256897, "rewards/rejected": -1.96743643283844, "step": 10780 }, { "epoch": 1.8590627153687111, "grad_norm": 29.79625701904297, "learning_rate": 7.592022134436201e-08, "logits/chosen": -1.787848711013794, "logits/rejected": -1.7382673025131226, "logps/chosen": -173.20809936523438, "logps/rejected": -243.8647003173828, "loss": 0.5089, "rewards/accuracies": 0.75, "rewards/chosen": -1.208669900894165, "rewards/margins": 0.6959220767021179, "rewards/rejected": -1.9045919179916382, "step": 10790 }, { "epoch": 1.8607856650585803, "grad_norm": 23.23958969116211, "learning_rate": 7.57256845294513e-08, "logits/chosen": -1.768815279006958, "logits/rejected": -1.7291584014892578, "logps/chosen": -181.27464294433594, "logps/rejected": -250.6145477294922, "loss": 0.5386, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2931530475616455, "rewards/margins": 0.673789381980896, "rewards/rejected": -1.9669424295425415, "step": 10800 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -1.7721792459487915, "eval_logits/rejected": -1.7500107288360596, "eval_logps/chosen": -185.76849365234375, "eval_logps/rejected": -220.52793884277344, "eval_loss": 0.6304465532302856, "eval_rewards/accuracies": 0.6373141407966614, "eval_rewards/chosen": -1.2705661058425903, "eval_rewards/margins": 0.3029119074344635, "eval_rewards/rejected": -1.573478102684021, "eval_runtime": 383.7392, "eval_samples_per_second": 11.216, "eval_steps_per_second": 1.402, "step": 10800 }, { "epoch": 1.8625086147484493, "grad_norm": 17.646251678466797, "learning_rate": 7.553124528280928e-08, "logits/chosen": -1.6951982975006104, "logits/rejected": -1.6370052099227905, "logps/chosen": -196.0543670654297, "logps/rejected": -254.51473999023438, "loss": 0.5238, "rewards/accuracies": 0.71875, "rewards/chosen": -1.386745572090149, "rewards/margins": 0.6572982668876648, "rewards/rejected": -2.044044017791748, "step": 10810 }, { "epoch": 1.8642315644383185, "grad_norm": 28.62160873413086, "learning_rate": 7.533690438596583e-08, "logits/chosen": -1.622422218322754, "logits/rejected": -1.5795962810516357, "logps/chosen": -177.42306518554688, "logps/rejected": -246.3397979736328, "loss": 0.526, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2432811260223389, "rewards/margins": 0.7060755491256714, "rewards/rejected": -1.9493564367294312, "step": 10820 }, { "epoch": 1.8659545141281875, "grad_norm": 43.903541564941406, "learning_rate": 7.514266262005528e-08, "logits/chosen": -1.6371924877166748, "logits/rejected": -1.5897127389907837, "logps/chosen": -200.0693359375, "logps/rejected": -261.50897216796875, "loss": 0.5597, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4825317859649658, "rewards/margins": 0.5994998216629028, "rewards/rejected": -2.082031726837158, "step": 10830 }, { "epoch": 1.8676774638180564, "grad_norm": 27.840375900268555, "learning_rate": 7.494852076581377e-08, "logits/chosen": -1.680965781211853, "logits/rejected": -1.6448853015899658, "logps/chosen": -187.4963836669922, "logps/rejected": -240.0545654296875, "loss": 0.584, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3450555801391602, "rewards/margins": 0.5464226603507996, "rewards/rejected": -1.8914783000946045, "step": 10840 }, { "epoch": 1.8694004135079254, "grad_norm": 26.34052085876465, "learning_rate": 7.475447960357572e-08, "logits/chosen": -1.6020797491073608, "logits/rejected": -1.5666046142578125, "logps/chosen": -180.9534912109375, "logps/rejected": -240.6409912109375, "loss": 0.5594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.307927131652832, "rewards/margins": 0.5539305210113525, "rewards/rejected": -1.8618577718734741, "step": 10850 }, { "epoch": 1.8711233631977946, "grad_norm": 29.617908477783203, "learning_rate": 7.456053991327083e-08, "logits/chosen": -1.7373720407485962, "logits/rejected": -1.6805378198623657, "logps/chosen": -185.09268188476562, "logps/rejected": -254.7001495361328, "loss": 0.5303, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.298338770866394, "rewards/margins": 0.7245272397994995, "rewards/rejected": -2.0228660106658936, "step": 10860 }, { "epoch": 1.8728463128876638, "grad_norm": 19.6914119720459, "learning_rate": 7.436670247442107e-08, "logits/chosen": -1.5922746658325195, "logits/rejected": -1.5555670261383057, "logps/chosen": -169.83578491210938, "logps/rejected": -254.55313110351562, "loss": 0.4986, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.1935006380081177, "rewards/margins": 0.7914150357246399, "rewards/rejected": -1.9849159717559814, "step": 10870 }, { "epoch": 1.8745692625775328, "grad_norm": 25.92415428161621, "learning_rate": 7.417296806613718e-08, "logits/chosen": -1.6392488479614258, "logits/rejected": -1.5973610877990723, "logps/chosen": -194.64828491210938, "logps/rejected": -260.9826354980469, "loss": 0.5367, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4043915271759033, "rewards/margins": 0.6784892678260803, "rewards/rejected": -2.082880973815918, "step": 10880 }, { "epoch": 1.8762922122674017, "grad_norm": 25.264381408691406, "learning_rate": 7.397933746711603e-08, "logits/chosen": -1.681231141090393, "logits/rejected": -1.642806053161621, "logps/chosen": -192.63475036621094, "logps/rejected": -264.21405029296875, "loss": 0.5312, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3947433233261108, "rewards/margins": 0.6991499662399292, "rewards/rejected": -2.09389328956604, "step": 10890 }, { "epoch": 1.8780151619572707, "grad_norm": 29.97552490234375, "learning_rate": 7.378581145563709e-08, "logits/chosen": -1.7367448806762695, "logits/rejected": -1.678744912147522, "logps/chosen": -190.0194549560547, "logps/rejected": -254.99465942382812, "loss": 0.5247, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3502709865570068, "rewards/margins": 0.6936368346214294, "rewards/rejected": -2.043907880783081, "step": 10900 }, { "epoch": 1.8797381116471399, "grad_norm": 30.631378173828125, "learning_rate": 7.35923908095595e-08, "logits/chosen": -1.6269598007202148, "logits/rejected": -1.5821008682250977, "logps/chosen": -195.05043029785156, "logps/rejected": -251.58560180664062, "loss": 0.5723, "rewards/accuracies": 0.71875, "rewards/chosen": -1.393839955329895, "rewards/margins": 0.5802146792411804, "rewards/rejected": -1.9740545749664307, "step": 10910 }, { "epoch": 1.881461061337009, "grad_norm": 35.906612396240234, "learning_rate": 7.339907630631886e-08, "logits/chosen": -1.6025956869125366, "logits/rejected": -1.5546543598175049, "logps/chosen": -201.29962158203125, "logps/rejected": -268.4058837890625, "loss": 0.5154, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4546197652816772, "rewards/margins": 0.7002333402633667, "rewards/rejected": -2.154853105545044, "step": 10920 }, { "epoch": 1.883184011026878, "grad_norm": 30.438188552856445, "learning_rate": 7.320586872292413e-08, "logits/chosen": -1.559808373451233, "logits/rejected": -1.5186632871627808, "logps/chosen": -199.2595977783203, "logps/rejected": -255.65878295898438, "loss": 0.5548, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.450476050376892, "rewards/margins": 0.5912362933158875, "rewards/rejected": -2.041712522506714, "step": 10930 }, { "epoch": 1.884906960716747, "grad_norm": 45.64797592163086, "learning_rate": 7.301276883595463e-08, "logits/chosen": -1.551182508468628, "logits/rejected": -1.5193589925765991, "logps/chosen": -204.142822265625, "logps/rejected": -257.69952392578125, "loss": 0.5659, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4768362045288086, "rewards/margins": 0.5484408736228943, "rewards/rejected": -2.0252771377563477, "step": 10940 }, { "epoch": 1.886629910406616, "grad_norm": 33.39741897583008, "learning_rate": 7.281977742155669e-08, "logits/chosen": -1.6167634725570679, "logits/rejected": -1.5732940435409546, "logps/chosen": -201.14071655273438, "logps/rejected": -268.1167907714844, "loss": 0.5262, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4653427600860596, "rewards/margins": 0.6688799262046814, "rewards/rejected": -2.1342225074768066, "step": 10950 }, { "epoch": 1.8883528600964852, "grad_norm": 35.17470169067383, "learning_rate": 7.262689525544067e-08, "logits/chosen": -1.6693687438964844, "logits/rejected": -1.6231985092163086, "logps/chosen": -196.91688537597656, "logps/rejected": -267.0567932128906, "loss": 0.5512, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4310643672943115, "rewards/margins": 0.7137660980224609, "rewards/rejected": -2.1448302268981934, "step": 10960 }, { "epoch": 1.8900758097863544, "grad_norm": 36.11570739746094, "learning_rate": 7.243412311287782e-08, "logits/chosen": -1.6791913509368896, "logits/rejected": -1.6281063556671143, "logps/chosen": -193.53982543945312, "logps/rejected": -258.5443420410156, "loss": 0.5791, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.366768717765808, "rewards/margins": 0.6775639653205872, "rewards/rejected": -2.044332504272461, "step": 10970 }, { "epoch": 1.8917987594762233, "grad_norm": 28.242664337158203, "learning_rate": 7.224146176869717e-08, "logits/chosen": -1.6848160028457642, "logits/rejected": -1.6459327936172485, "logps/chosen": -201.38186645507812, "logps/rejected": -272.21429443359375, "loss": 0.5731, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4680874347686768, "rewards/margins": 0.6922693252563477, "rewards/rejected": -2.1603565216064453, "step": 10980 }, { "epoch": 1.8935217091660923, "grad_norm": 24.009624481201172, "learning_rate": 7.204891199728241e-08, "logits/chosen": -1.6162869930267334, "logits/rejected": -1.576981782913208, "logps/chosen": -179.42196655273438, "logps/rejected": -247.52578735351562, "loss": 0.5222, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2693443298339844, "rewards/margins": 0.6809569597244263, "rewards/rejected": -1.9503014087677002, "step": 10990 }, { "epoch": 1.8952446588559613, "grad_norm": 22.96497917175293, "learning_rate": 7.185647457256879e-08, "logits/chosen": -1.7962024211883545, "logits/rejected": -1.7659142017364502, "logps/chosen": -193.4381866455078, "logps/rejected": -259.83184814453125, "loss": 0.5676, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4054908752441406, "rewards/margins": 0.6229832768440247, "rewards/rejected": -2.0284743309020996, "step": 11000 }, { "epoch": 1.8969676085458305, "grad_norm": 26.895023345947266, "learning_rate": 7.166415026803991e-08, "logits/chosen": -1.7166029214859009, "logits/rejected": -1.6600326299667358, "logps/chosen": -186.55538940429688, "logps/rejected": -261.6690673828125, "loss": 0.4973, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3334908485412598, "rewards/margins": 0.7684970498085022, "rewards/rejected": -2.101987600326538, "step": 11010 }, { "epoch": 1.8986905582356997, "grad_norm": 19.827083587646484, "learning_rate": 7.147193985672477e-08, "logits/chosen": -1.7519909143447876, "logits/rejected": -1.7119252681732178, "logps/chosen": -187.62266540527344, "logps/rejected": -248.4136505126953, "loss": 0.5336, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3261867761611938, "rewards/margins": 0.6275477409362793, "rewards/rejected": -1.9537343978881836, "step": 11020 }, { "epoch": 1.9004135079255686, "grad_norm": 23.33296012878418, "learning_rate": 7.127984411119461e-08, "logits/chosen": -1.7048231363296509, "logits/rejected": -1.675737738609314, "logps/chosen": -193.9524688720703, "logps/rejected": -260.84259033203125, "loss": 0.5546, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3916425704956055, "rewards/margins": 0.6718882322311401, "rewards/rejected": -2.063530921936035, "step": 11030 }, { "epoch": 1.9021364576154376, "grad_norm": 22.681320190429688, "learning_rate": 7.108786380355971e-08, "logits/chosen": -1.736576795578003, "logits/rejected": -1.6962627172470093, "logps/chosen": -196.87417602539062, "logps/rejected": -259.28131103515625, "loss": 0.5495, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.40407395362854, "rewards/margins": 0.6333260536193848, "rewards/rejected": -2.037400007247925, "step": 11040 }, { "epoch": 1.9038594073053066, "grad_norm": 25.510026931762695, "learning_rate": 7.089599970546642e-08, "logits/chosen": -1.6437184810638428, "logits/rejected": -1.5956861972808838, "logps/chosen": -179.71234130859375, "logps/rejected": -259.95782470703125, "loss": 0.4912, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.277482032775879, "rewards/margins": 0.7694389224052429, "rewards/rejected": -2.0469212532043457, "step": 11050 }, { "epoch": 1.9055823569951758, "grad_norm": 28.625423431396484, "learning_rate": 7.070425258809394e-08, "logits/chosen": -1.6748847961425781, "logits/rejected": -1.605231523513794, "logps/chosen": -192.73068237304688, "logps/rejected": -267.40386962890625, "loss": 0.499, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.384558081626892, "rewards/margins": 0.7721072435379028, "rewards/rejected": -2.156665325164795, "step": 11060 }, { "epoch": 1.907305306685045, "grad_norm": 20.10926055908203, "learning_rate": 7.051262322215128e-08, "logits/chosen": -1.6803903579711914, "logits/rejected": -1.6113357543945312, "logps/chosen": -185.80337524414062, "logps/rejected": -265.29815673828125, "loss": 0.4953, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3262794017791748, "rewards/margins": 0.8310495615005493, "rewards/rejected": -2.1573286056518555, "step": 11070 }, { "epoch": 1.909028256374914, "grad_norm": 22.918128967285156, "learning_rate": 7.032111237787424e-08, "logits/chosen": -1.672593355178833, "logits/rejected": -1.626828908920288, "logps/chosen": -202.33993530273438, "logps/rejected": -269.45245361328125, "loss": 0.5286, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4683647155761719, "rewards/margins": 0.7012184858322144, "rewards/rejected": -2.169583320617676, "step": 11080 }, { "epoch": 1.9107512060647829, "grad_norm": 29.926124572753906, "learning_rate": 7.01297208250222e-08, "logits/chosen": -1.6541608572006226, "logits/rejected": -1.6142412424087524, "logps/chosen": -204.04806518554688, "logps/rejected": -259.0303955078125, "loss": 0.5442, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4622454643249512, "rewards/margins": 0.6007913947105408, "rewards/rejected": -2.0630366802215576, "step": 11090 }, { "epoch": 1.9124741557546519, "grad_norm": 26.798297882080078, "learning_rate": 6.993844933287496e-08, "logits/chosen": -1.5548121929168701, "logits/rejected": -1.518174171447754, "logps/chosen": -181.76849365234375, "logps/rejected": -250.3037567138672, "loss": 0.5323, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3089042901992798, "rewards/margins": 0.6539050340652466, "rewards/rejected": -1.9628093242645264, "step": 11100 }, { "epoch": 1.914197105444521, "grad_norm": 25.093984603881836, "learning_rate": 6.974729867022989e-08, "logits/chosen": -1.6413952112197876, "logits/rejected": -1.587660789489746, "logps/chosen": -214.73489379882812, "logps/rejected": -304.73065185546875, "loss": 0.5347, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6083320379257202, "rewards/margins": 0.9037650227546692, "rewards/rejected": -2.512097120285034, "step": 11110 }, { "epoch": 1.9159200551343902, "grad_norm": 33.20296096801758, "learning_rate": 6.955626960539855e-08, "logits/chosen": -1.7690269947052002, "logits/rejected": -1.7244031429290771, "logps/chosen": -202.90762329101562, "logps/rejected": -263.71588134765625, "loss": 0.5506, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4339269399642944, "rewards/margins": 0.6664804220199585, "rewards/rejected": -2.100407600402832, "step": 11120 }, { "epoch": 1.9176430048242592, "grad_norm": 34.29365921020508, "learning_rate": 6.936536290620393e-08, "logits/chosen": -1.5904325246810913, "logits/rejected": -1.5409746170043945, "logps/chosen": -205.6435089111328, "logps/rejected": -265.16888427734375, "loss": 0.5487, "rewards/accuracies": 0.75, "rewards/chosen": -1.4976742267608643, "rewards/margins": 0.6221583485603333, "rewards/rejected": -2.119832754135132, "step": 11130 }, { "epoch": 1.9193659545141282, "grad_norm": 23.378873825073242, "learning_rate": 6.917457933997706e-08, "logits/chosen": -1.5747454166412354, "logits/rejected": -1.536361813545227, "logps/chosen": -187.07211303710938, "logps/rejected": -245.0606689453125, "loss": 0.5473, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3325872421264648, "rewards/margins": 0.6022460460662842, "rewards/rejected": -1.9348335266113281, "step": 11140 }, { "epoch": 1.9210889042039971, "grad_norm": 29.120763778686523, "learning_rate": 6.898391967355405e-08, "logits/chosen": -1.6491073369979858, "logits/rejected": -1.6144596338272095, "logps/chosen": -204.37586975097656, "logps/rejected": -264.35272216796875, "loss": 0.5523, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4628231525421143, "rewards/margins": 0.6155986785888672, "rewards/rejected": -2.0784218311309814, "step": 11150 }, { "epoch": 1.9228118538938663, "grad_norm": 29.497880935668945, "learning_rate": 6.879338467327302e-08, "logits/chosen": -1.7183611392974854, "logits/rejected": -1.6814098358154297, "logps/chosen": -186.4435272216797, "logps/rejected": -240.2172393798828, "loss": 0.5788, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3432486057281494, "rewards/margins": 0.54877108335495, "rewards/rejected": -1.8920198678970337, "step": 11160 }, { "epoch": 1.9245348035837355, "grad_norm": 34.64222717285156, "learning_rate": 6.860297510497104e-08, "logits/chosen": -1.5618165731430054, "logits/rejected": -1.5183881521224976, "logps/chosen": -196.0279998779297, "logps/rejected": -236.39517211914062, "loss": 0.5921, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3886585235595703, "rewards/margins": 0.4850701689720154, "rewards/rejected": -1.8737287521362305, "step": 11170 }, { "epoch": 1.9262577532736045, "grad_norm": 24.274341583251953, "learning_rate": 6.841269173398107e-08, "logits/chosen": -1.7048511505126953, "logits/rejected": -1.6627804040908813, "logps/chosen": -198.62307739257812, "logps/rejected": -262.12396240234375, "loss": 0.5452, "rewards/accuracies": 0.6875, "rewards/chosen": -1.446572184562683, "rewards/margins": 0.666427493095398, "rewards/rejected": -2.112999677658081, "step": 11180 }, { "epoch": 1.9279807029634735, "grad_norm": 26.194883346557617, "learning_rate": 6.82225353251286e-08, "logits/chosen": -1.7092431783676147, "logits/rejected": -1.6481502056121826, "logps/chosen": -178.11065673828125, "logps/rejected": -257.3678283691406, "loss": 0.4607, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.232288122177124, "rewards/margins": 0.8250058889389038, "rewards/rejected": -2.0572941303253174, "step": 11190 }, { "epoch": 1.9297036526533424, "grad_norm": 32.451969146728516, "learning_rate": 6.80325066427291e-08, "logits/chosen": -1.7004636526107788, "logits/rejected": -1.6555591821670532, "logps/chosen": -179.03671264648438, "logps/rejected": -248.1265869140625, "loss": 0.5178, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.257236123085022, "rewards/margins": 0.6865662336349487, "rewards/rejected": -1.9438022375106812, "step": 11200 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -1.7501182556152344, "eval_logits/rejected": -1.7272056341171265, "eval_logps/chosen": -187.30357360839844, "eval_logps/rejected": -223.25985717773438, "eval_loss": 0.629496157169342, "eval_rewards/accuracies": 0.6442843675613403, "eval_rewards/chosen": -1.2859166860580444, "eval_rewards/margins": 0.31488049030303955, "eval_rewards/rejected": -1.600797176361084, "eval_runtime": 384.6652, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 11200 }, { "epoch": 1.9314266023432116, "grad_norm": 28.927274703979492, "learning_rate": 6.784260645058445e-08, "logits/chosen": -1.5466166734695435, "logits/rejected": -1.500182867050171, "logps/chosen": -189.25881958007812, "logps/rejected": -272.9542541503906, "loss": 0.5235, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3884804248809814, "rewards/margins": 0.8016069531440735, "rewards/rejected": -2.19008731842041, "step": 11210 }, { "epoch": 1.9331495520330806, "grad_norm": 31.34682273864746, "learning_rate": 6.765283551198016e-08, "logits/chosen": -1.6393709182739258, "logits/rejected": -1.601914644241333, "logps/chosen": -192.19395446777344, "logps/rejected": -259.9654541015625, "loss": 0.5799, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4014427661895752, "rewards/margins": 0.6413668990135193, "rewards/rejected": -2.04280948638916, "step": 11220 }, { "epoch": 1.9348725017229498, "grad_norm": 26.979076385498047, "learning_rate": 6.746319458968226e-08, "logits/chosen": -1.6130883693695068, "logits/rejected": -1.5812774896621704, "logps/chosen": -206.37557983398438, "logps/rejected": -262.9778747558594, "loss": 0.5763, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.506066083908081, "rewards/margins": 0.6084599494934082, "rewards/rejected": -2.1145262718200684, "step": 11230 }, { "epoch": 1.9365954514128187, "grad_norm": 19.683889389038086, "learning_rate": 6.727368444593408e-08, "logits/chosen": -1.627913475036621, "logits/rejected": -1.5929877758026123, "logps/chosen": -191.50289916992188, "logps/rejected": -249.98178100585938, "loss": 0.5739, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3685283660888672, "rewards/margins": 0.5968562960624695, "rewards/rejected": -1.9653844833374023, "step": 11240 }, { "epoch": 1.9383184011026877, "grad_norm": 21.013734817504883, "learning_rate": 6.708430584245337e-08, "logits/chosen": -1.6611820459365845, "logits/rejected": -1.6172800064086914, "logps/chosen": -184.84738159179688, "logps/rejected": -252.0414276123047, "loss": 0.5245, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3055918216705322, "rewards/margins": 0.6785596609115601, "rewards/rejected": -1.9841514825820923, "step": 11250 }, { "epoch": 1.940041350792557, "grad_norm": 25.888309478759766, "learning_rate": 6.689505954042913e-08, "logits/chosen": -1.611316442489624, "logits/rejected": -1.561462640762329, "logps/chosen": -182.65574645996094, "logps/rejected": -233.82638549804688, "loss": 0.5611, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2482223510742188, "rewards/margins": 0.5744415521621704, "rewards/rejected": -1.8226640224456787, "step": 11260 }, { "epoch": 1.9417643004824259, "grad_norm": 28.195512771606445, "learning_rate": 6.67059463005187e-08, "logits/chosen": -1.6652758121490479, "logits/rejected": -1.6094785928726196, "logps/chosen": -178.05380249023438, "logps/rejected": -245.34066772460938, "loss": 0.495, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2248725891113281, "rewards/margins": 0.7181330919265747, "rewards/rejected": -1.9430058002471924, "step": 11270 }, { "epoch": 1.943487250172295, "grad_norm": 24.995092391967773, "learning_rate": 6.651696688284438e-08, "logits/chosen": -1.680368185043335, "logits/rejected": -1.633082628250122, "logps/chosen": -196.35501098632812, "logps/rejected": -246.5548095703125, "loss": 0.5885, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4004287719726562, "rewards/margins": 0.541008710861206, "rewards/rejected": -1.9414373636245728, "step": 11280 }, { "epoch": 1.945210199862164, "grad_norm": 27.881084442138672, "learning_rate": 6.632812204699077e-08, "logits/chosen": -1.6912187337875366, "logits/rejected": -1.6491819620132446, "logps/chosen": -184.676025390625, "logps/rejected": -253.5763397216797, "loss": 0.5159, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2471721172332764, "rewards/margins": 0.7271748781204224, "rewards/rejected": -1.9743473529815674, "step": 11290 }, { "epoch": 1.946933149552033, "grad_norm": 27.88582420349121, "learning_rate": 6.613941255200147e-08, "logits/chosen": -1.6142715215682983, "logits/rejected": -1.5784027576446533, "logps/chosen": -208.334228515625, "logps/rejected": -247.870849609375, "loss": 0.634, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.49410080909729, "rewards/margins": 0.4474979043006897, "rewards/rejected": -1.941598892211914, "step": 11300 }, { "epoch": 1.948656099241902, "grad_norm": 33.243408203125, "learning_rate": 6.595083915637602e-08, "logits/chosen": -1.7283128499984741, "logits/rejected": -1.6947336196899414, "logps/chosen": -181.50262451171875, "logps/rejected": -259.31378173828125, "loss": 0.4967, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3103947639465332, "rewards/margins": 0.7675163149833679, "rewards/rejected": -2.077910900115967, "step": 11310 }, { "epoch": 1.9503790489317712, "grad_norm": 17.72072982788086, "learning_rate": 6.576240261806711e-08, "logits/chosen": -1.6511691808700562, "logits/rejected": -1.5994830131530762, "logps/chosen": -189.26026916503906, "logps/rejected": -262.7272033691406, "loss": 0.5067, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3433868885040283, "rewards/margins": 0.7447927594184875, "rewards/rejected": -2.0881800651550293, "step": 11320 }, { "epoch": 1.9521019986216404, "grad_norm": 34.079185485839844, "learning_rate": 6.557410369447712e-08, "logits/chosen": -1.643885612487793, "logits/rejected": -1.5888869762420654, "logps/chosen": -198.7957305908203, "logps/rejected": -277.14239501953125, "loss": 0.4892, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.4623702764511108, "rewards/margins": 0.8277085423469543, "rewards/rejected": -2.2900784015655518, "step": 11330 }, { "epoch": 1.9538249483115093, "grad_norm": 38.53839874267578, "learning_rate": 6.538594314245541e-08, "logits/chosen": -1.5357722043991089, "logits/rejected": -1.4847605228424072, "logps/chosen": -210.06088256835938, "logps/rejected": -287.0927734375, "loss": 0.5374, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5468144416809082, "rewards/margins": 0.7765219807624817, "rewards/rejected": -2.323336124420166, "step": 11340 }, { "epoch": 1.9555478980013783, "grad_norm": 21.716381072998047, "learning_rate": 6.51979217182952e-08, "logits/chosen": -1.6712570190429688, "logits/rejected": -1.631908655166626, "logps/chosen": -218.09591674804688, "logps/rejected": -276.07403564453125, "loss": 0.5825, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6118927001953125, "rewards/margins": 0.5883620977401733, "rewards/rejected": -2.2002549171447754, "step": 11350 }, { "epoch": 1.9572708476912473, "grad_norm": 25.573238372802734, "learning_rate": 6.501004017773049e-08, "logits/chosen": -1.6209056377410889, "logits/rejected": -1.5758745670318604, "logps/chosen": -205.62026977539062, "logps/rejected": -271.5562744140625, "loss": 0.5405, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5336554050445557, "rewards/margins": 0.6963235139846802, "rewards/rejected": -2.2299787998199463, "step": 11360 }, { "epoch": 1.9589937973811165, "grad_norm": 22.5719051361084, "learning_rate": 6.482229927593292e-08, "logits/chosen": -1.6068344116210938, "logits/rejected": -1.5686242580413818, "logps/chosen": -202.43106079101562, "logps/rejected": -264.10528564453125, "loss": 0.5468, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.461557149887085, "rewards/margins": 0.6432181596755981, "rewards/rejected": -2.1047754287719727, "step": 11370 }, { "epoch": 1.9607167470709856, "grad_norm": 38.594482421875, "learning_rate": 6.463469976750894e-08, "logits/chosen": -1.56108558177948, "logits/rejected": -1.512900710105896, "logps/chosen": -202.19317626953125, "logps/rejected": -269.82177734375, "loss": 0.5556, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5035988092422485, "rewards/margins": 0.6939393877983093, "rewards/rejected": -2.197537899017334, "step": 11380 }, { "epoch": 1.9624396967608546, "grad_norm": 27.154815673828125, "learning_rate": 6.444724240649674e-08, "logits/chosen": -1.7020747661590576, "logits/rejected": -1.6445642709732056, "logps/chosen": -185.58892822265625, "logps/rejected": -272.3781433105469, "loss": 0.5032, "rewards/accuracies": 0.75, "rewards/chosen": -1.3326423168182373, "rewards/margins": 0.8379791378974915, "rewards/rejected": -2.170621395111084, "step": 11390 }, { "epoch": 1.9641626464507236, "grad_norm": 27.39109230041504, "learning_rate": 6.425992794636305e-08, "logits/chosen": -1.6527000665664673, "logits/rejected": -1.6063038110733032, "logps/chosen": -188.4795684814453, "logps/rejected": -253.7546844482422, "loss": 0.5426, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3256714344024658, "rewards/margins": 0.6787670850753784, "rewards/rejected": -2.0044384002685547, "step": 11400 }, { "epoch": 1.9658855961405926, "grad_norm": 28.39838981628418, "learning_rate": 6.407275714000029e-08, "logits/chosen": -1.645401954650879, "logits/rejected": -1.5962473154067993, "logps/chosen": -192.2163543701172, "logps/rejected": -256.794677734375, "loss": 0.5371, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3653062582015991, "rewards/margins": 0.6519485712051392, "rewards/rejected": -2.0172548294067383, "step": 11410 }, { "epoch": 1.9676085458304617, "grad_norm": 26.793954849243164, "learning_rate": 6.388573073972341e-08, "logits/chosen": -1.6396955251693726, "logits/rejected": -1.6010195016860962, "logps/chosen": -196.24127197265625, "logps/rejected": -244.29257202148438, "loss": 0.5792, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.425563097000122, "rewards/margins": 0.514776349067688, "rewards/rejected": -1.9403393268585205, "step": 11420 }, { "epoch": 1.969331495520331, "grad_norm": 26.12976837158203, "learning_rate": 6.3698849497267e-08, "logits/chosen": -1.5891997814178467, "logits/rejected": -1.5519187450408936, "logps/chosen": -193.51866149902344, "logps/rejected": -244.13540649414062, "loss": 0.5918, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3854833841323853, "rewards/margins": 0.529236376285553, "rewards/rejected": -1.9147199392318726, "step": 11430 }, { "epoch": 1.9710544452102, "grad_norm": 27.010652542114258, "learning_rate": 6.351211416378221e-08, "logits/chosen": -1.6656444072723389, "logits/rejected": -1.6361305713653564, "logps/chosen": -188.60604858398438, "logps/rejected": -236.7830810546875, "loss": 0.58, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.360559105873108, "rewards/margins": 0.48112016916275024, "rewards/rejected": -1.8416792154312134, "step": 11440 }, { "epoch": 1.9727773949000689, "grad_norm": 28.18216323852539, "learning_rate": 6.332552548983368e-08, "logits/chosen": -1.623583436012268, "logits/rejected": -1.5726052522659302, "logps/chosen": -183.00462341308594, "logps/rejected": -252.4422149658203, "loss": 0.5145, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.302964687347412, "rewards/margins": 0.704452395439148, "rewards/rejected": -2.0074172019958496, "step": 11450 }, { "epoch": 1.9745003445899378, "grad_norm": 26.0721492767334, "learning_rate": 6.313908422539656e-08, "logits/chosen": -1.6321194171905518, "logits/rejected": -1.5928866863250732, "logps/chosen": -181.80203247070312, "logps/rejected": -252.208740234375, "loss": 0.5153, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2626416683197021, "rewards/margins": 0.7160847187042236, "rewards/rejected": -1.9787263870239258, "step": 11460 }, { "epoch": 1.976223294279807, "grad_norm": 17.767175674438477, "learning_rate": 6.295279111985354e-08, "logits/chosen": -1.6835839748382568, "logits/rejected": -1.6219911575317383, "logps/chosen": -193.47625732421875, "logps/rejected": -266.53326416015625, "loss": 0.4831, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3663376569747925, "rewards/margins": 0.7892099618911743, "rewards/rejected": -2.1555473804473877, "step": 11470 }, { "epoch": 1.9779462439696762, "grad_norm": 25.063413619995117, "learning_rate": 6.276664692199175e-08, "logits/chosen": -1.7087328433990479, "logits/rejected": -1.6501919031143188, "logps/chosen": -179.31600952148438, "logps/rejected": -239.3920440673828, "loss": 0.5349, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2322156429290771, "rewards/margins": 0.657407820224762, "rewards/rejected": -1.8896234035491943, "step": 11480 }, { "epoch": 1.9796691936595452, "grad_norm": 23.074682235717773, "learning_rate": 6.258065237999988e-08, "logits/chosen": -1.6238977909088135, "logits/rejected": -1.5825841426849365, "logps/chosen": -195.80282592773438, "logps/rejected": -246.5172119140625, "loss": 0.5983, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4111154079437256, "rewards/margins": 0.5428776741027832, "rewards/rejected": -1.9539932012557983, "step": 11490 }, { "epoch": 1.9813921433494142, "grad_norm": 30.888504028320312, "learning_rate": 6.239480824146503e-08, "logits/chosen": -1.6757621765136719, "logits/rejected": -1.6433537006378174, "logps/chosen": -181.2732696533203, "logps/rejected": -229.3321075439453, "loss": 0.58, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3098499774932861, "rewards/margins": 0.47558125853538513, "rewards/rejected": -1.785431146621704, "step": 11500 }, { "epoch": 1.9831150930392831, "grad_norm": 31.92888069152832, "learning_rate": 6.220911525336977e-08, "logits/chosen": -1.5900300741195679, "logits/rejected": -1.556713342666626, "logps/chosen": -194.94851684570312, "logps/rejected": -261.088623046875, "loss": 0.5596, "rewards/accuracies": 0.6875, "rewards/chosen": -1.394680380821228, "rewards/margins": 0.6835174560546875, "rewards/rejected": -2.078198194503784, "step": 11510 }, { "epoch": 1.9848380427291523, "grad_norm": 24.679962158203125, "learning_rate": 6.202357416208911e-08, "logits/chosen": -1.6697218418121338, "logits/rejected": -1.595521330833435, "logps/chosen": -186.24783325195312, "logps/rejected": -250.603515625, "loss": 0.4994, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2644917964935303, "rewards/margins": 0.7417808771133423, "rewards/rejected": -2.006272792816162, "step": 11520 }, { "epoch": 1.9865609924190215, "grad_norm": 19.900346755981445, "learning_rate": 6.183818571338766e-08, "logits/chosen": -1.661076545715332, "logits/rejected": -1.6121562719345093, "logps/chosen": -179.8099822998047, "logps/rejected": -233.569580078125, "loss": 0.5344, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2634823322296143, "rewards/margins": 0.5821090340614319, "rewards/rejected": -1.8455913066864014, "step": 11530 }, { "epoch": 1.9882839421088905, "grad_norm": 27.895814895629883, "learning_rate": 6.165295065241633e-08, "logits/chosen": -1.6979827880859375, "logits/rejected": -1.6585216522216797, "logps/chosen": -180.02334594726562, "logps/rejected": -265.24761962890625, "loss": 0.4854, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2927592992782593, "rewards/margins": 0.8074501156806946, "rewards/rejected": -2.1002094745635986, "step": 11540 }, { "epoch": 1.9900068917987594, "grad_norm": 24.11145782470703, "learning_rate": 6.146786972370959e-08, "logits/chosen": -1.625811219215393, "logits/rejected": -1.5916721820831299, "logps/chosen": -190.41238403320312, "logps/rejected": -249.06112670898438, "loss": 0.5833, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.39082932472229, "rewards/margins": 0.5714894533157349, "rewards/rejected": -1.962318778038025, "step": 11550 }, { "epoch": 1.9917298414886284, "grad_norm": 24.088064193725586, "learning_rate": 6.128294367118237e-08, "logits/chosen": -1.6487737894058228, "logits/rejected": -1.5956158638000488, "logps/chosen": -194.94357299804688, "logps/rejected": -259.94940185546875, "loss": 0.5307, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3996059894561768, "rewards/margins": 0.6820842027664185, "rewards/rejected": -2.0816903114318848, "step": 11560 }, { "epoch": 1.9934527911784976, "grad_norm": 33.55830383300781, "learning_rate": 6.109817323812706e-08, "logits/chosen": -1.641147255897522, "logits/rejected": -1.599494218826294, "logps/chosen": -205.94955444335938, "logps/rejected": -271.0900573730469, "loss": 0.5525, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.518385410308838, "rewards/margins": 0.635517954826355, "rewards/rejected": -2.1539034843444824, "step": 11570 }, { "epoch": 1.9951757408683668, "grad_norm": 27.222890853881836, "learning_rate": 6.091355916721064e-08, "logits/chosen": -1.7054399251937866, "logits/rejected": -1.6661192178726196, "logps/chosen": -194.77606201171875, "logps/rejected": -263.6825866699219, "loss": 0.5325, "rewards/accuracies": 0.71875, "rewards/chosen": -1.385178565979004, "rewards/margins": 0.6949024796485901, "rewards/rejected": -2.0800812244415283, "step": 11580 }, { "epoch": 1.9968986905582358, "grad_norm": 28.632381439208984, "learning_rate": 6.072910220047159e-08, "logits/chosen": -1.5894039869308472, "logits/rejected": -1.5394232273101807, "logps/chosen": -193.7826385498047, "logps/rejected": -244.916748046875, "loss": 0.5385, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.342811942100525, "rewards/margins": 0.5961362719535828, "rewards/rejected": -1.938948392868042, "step": 11590 }, { "epoch": 1.9986216402481047, "grad_norm": 28.379253387451172, "learning_rate": 6.054480307931678e-08, "logits/chosen": -1.657523512840271, "logits/rejected": -1.622125267982483, "logps/chosen": -178.28390502929688, "logps/rejected": -239.18045043945312, "loss": 0.5556, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2783455848693848, "rewards/margins": 0.5756335258483887, "rewards/rejected": -1.8539791107177734, "step": 11600 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -1.7579905986785889, "eval_logits/rejected": -1.7355717420578003, "eval_logps/chosen": -185.22940063476562, "eval_logps/rejected": -220.3213653564453, "eval_loss": 0.6294985413551331, "eval_rewards/accuracies": 0.6361523866653442, "eval_rewards/chosen": -1.2651748657226562, "eval_rewards/margins": 0.3062376379966736, "eval_rewards/rejected": -1.571412444114685, "eval_runtime": 384.4299, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 11600 }, { "epoch": 2.0003445899379737, "grad_norm": 24.472761154174805, "learning_rate": 6.036066254451881e-08, "logits/chosen": -1.646522879600525, "logits/rejected": -1.6024043560028076, "logps/chosen": -181.7415313720703, "logps/rejected": -251.5871124267578, "loss": 0.5436, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3067373037338257, "rewards/margins": 0.6961520314216614, "rewards/rejected": -2.0028891563415527, "step": 11610 }, { "epoch": 2.0020675396278427, "grad_norm": 29.493276596069336, "learning_rate": 6.017668133621275e-08, "logits/chosen": -1.649038314819336, "logits/rejected": -1.606300950050354, "logps/chosen": -189.8376007080078, "logps/rejected": -259.1862487792969, "loss": 0.5138, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3303192853927612, "rewards/margins": 0.6951020956039429, "rewards/rejected": -2.025421142578125, "step": 11620 }, { "epoch": 2.003790489317712, "grad_norm": 20.440237045288086, "learning_rate": 5.999286019389342e-08, "logits/chosen": -1.6871811151504517, "logits/rejected": -1.6313413381576538, "logps/chosen": -183.3996124267578, "logps/rejected": -255.48953247070312, "loss": 0.4822, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2532579898834229, "rewards/margins": 0.7730938196182251, "rewards/rejected": -2.0263514518737793, "step": 11630 }, { "epoch": 2.005513439007581, "grad_norm": 22.95535659790039, "learning_rate": 5.980919985641202e-08, "logits/chosen": -1.697274923324585, "logits/rejected": -1.6537383794784546, "logps/chosen": -180.3485870361328, "logps/rejected": -251.100830078125, "loss": 0.4927, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2791554927825928, "rewards/margins": 0.7304551601409912, "rewards/rejected": -2.009610652923584, "step": 11640 }, { "epoch": 2.00723638869745, "grad_norm": 26.56332778930664, "learning_rate": 5.962570106197364e-08, "logits/chosen": -1.5988438129425049, "logits/rejected": -1.5431615114212036, "logps/chosen": -177.05831909179688, "logps/rejected": -246.145751953125, "loss": 0.5147, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2243263721466064, "rewards/margins": 0.725864052772522, "rewards/rejected": -1.9501903057098389, "step": 11650 }, { "epoch": 2.008959338387319, "grad_norm": 21.601728439331055, "learning_rate": 5.944236454813396e-08, "logits/chosen": -1.7152572870254517, "logits/rejected": -1.6425163745880127, "logps/chosen": -194.8033447265625, "logps/rejected": -266.7611389160156, "loss": 0.5118, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3984410762786865, "rewards/margins": 0.7663156390190125, "rewards/rejected": -2.1647565364837646, "step": 11660 }, { "epoch": 2.010682288077188, "grad_norm": 24.720033645629883, "learning_rate": 5.9259191051796375e-08, "logits/chosen": -1.6166601181030273, "logits/rejected": -1.5559755563735962, "logps/chosen": -198.079833984375, "logps/rejected": -292.06951904296875, "loss": 0.4613, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4261969327926636, "rewards/margins": 0.9553655385971069, "rewards/rejected": -2.3815624713897705, "step": 11670 }, { "epoch": 2.0124052377670574, "grad_norm": 31.20380973815918, "learning_rate": 5.907618130920919e-08, "logits/chosen": -1.5339603424072266, "logits/rejected": -1.4910167455673218, "logps/chosen": -205.57272338867188, "logps/rejected": -289.71307373046875, "loss": 0.5066, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4919240474700928, "rewards/margins": 0.8347158432006836, "rewards/rejected": -2.3266398906707764, "step": 11680 }, { "epoch": 2.0141281874569263, "grad_norm": 41.9987678527832, "learning_rate": 5.8893336055962254e-08, "logits/chosen": -1.6653470993041992, "logits/rejected": -1.61074697971344, "logps/chosen": -206.09677124023438, "logps/rejected": -269.8291931152344, "loss": 0.5325, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4608094692230225, "rewards/margins": 0.695176362991333, "rewards/rejected": -2.1559858322143555, "step": 11690 }, { "epoch": 2.0158511371467953, "grad_norm": 24.3171443939209, "learning_rate": 5.871065602698451e-08, "logits/chosen": -1.722753882408142, "logits/rejected": -1.6735206842422485, "logps/chosen": -190.55506896972656, "logps/rejected": -254.53768920898438, "loss": 0.5476, "rewards/accuracies": 0.75, "rewards/chosen": -1.337207555770874, "rewards/margins": 0.6379891633987427, "rewards/rejected": -1.9751968383789062, "step": 11700 }, { "epoch": 2.0175740868366643, "grad_norm": 39.6287727355957, "learning_rate": 5.852814195654068e-08, "logits/chosen": -1.6527093648910522, "logits/rejected": -1.6069351434707642, "logps/chosen": -199.5983123779297, "logps/rejected": -264.60858154296875, "loss": 0.5308, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4391547441482544, "rewards/margins": 0.6757218241691589, "rewards/rejected": -2.1148767471313477, "step": 11710 }, { "epoch": 2.0192970365265333, "grad_norm": 26.280908584594727, "learning_rate": 5.834579457822848e-08, "logits/chosen": -1.6911094188690186, "logits/rejected": -1.649491548538208, "logps/chosen": -200.91928100585938, "logps/rejected": -268.92919921875, "loss": 0.5565, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.473656415939331, "rewards/margins": 0.6882593035697937, "rewards/rejected": -2.1619155406951904, "step": 11720 }, { "epoch": 2.0210199862164027, "grad_norm": 20.197315216064453, "learning_rate": 5.81636146249756e-08, "logits/chosen": -1.6321799755096436, "logits/rejected": -1.5866422653198242, "logps/chosen": -190.90988159179688, "logps/rejected": -278.8709716796875, "loss": 0.4776, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3572702407836914, "rewards/margins": 0.8569647669792175, "rewards/rejected": -2.2142350673675537, "step": 11730 }, { "epoch": 2.0227429359062716, "grad_norm": 22.812088012695312, "learning_rate": 5.798160282903672e-08, "logits/chosen": -1.701157808303833, "logits/rejected": -1.6628563404083252, "logps/chosen": -185.42343139648438, "logps/rejected": -252.4224090576172, "loss": 0.5333, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3248282670974731, "rewards/margins": 0.6645230650901794, "rewards/rejected": -1.989351511001587, "step": 11740 }, { "epoch": 2.0244658855961406, "grad_norm": 21.9329891204834, "learning_rate": 5.779975992199075e-08, "logits/chosen": -1.6952497959136963, "logits/rejected": -1.661820411682129, "logps/chosen": -179.97988891601562, "logps/rejected": -272.5703125, "loss": 0.4591, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.3170697689056396, "rewards/margins": 0.8572857975959778, "rewards/rejected": -2.1743557453155518, "step": 11750 }, { "epoch": 2.0261888352860096, "grad_norm": 24.972156524658203, "learning_rate": 5.761808663473775e-08, "logits/chosen": -1.6464498043060303, "logits/rejected": -1.6037495136260986, "logps/chosen": -195.52725219726562, "logps/rejected": -272.06707763671875, "loss": 0.5244, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.407350778579712, "rewards/margins": 0.7783458828926086, "rewards/rejected": -2.185697078704834, "step": 11760 }, { "epoch": 2.0279117849758785, "grad_norm": 23.84781265258789, "learning_rate": 5.74365836974959e-08, "logits/chosen": -1.5857475996017456, "logits/rejected": -1.5295172929763794, "logps/chosen": -193.87759399414062, "logps/rejected": -268.18048095703125, "loss": 0.5116, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3709337711334229, "rewards/margins": 0.7918566465377808, "rewards/rejected": -2.162790536880493, "step": 11770 }, { "epoch": 2.029634734665748, "grad_norm": 35.31039047241211, "learning_rate": 5.7255251839798726e-08, "logits/chosen": -1.5392212867736816, "logits/rejected": -1.4999496936798096, "logps/chosen": -199.56455993652344, "logps/rejected": -269.37847900390625, "loss": 0.543, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4918761253356934, "rewards/margins": 0.6816176176071167, "rewards/rejected": -2.1734938621520996, "step": 11780 }, { "epoch": 2.031357684355617, "grad_norm": 31.171165466308594, "learning_rate": 5.7074091790492206e-08, "logits/chosen": -1.6171038150787354, "logits/rejected": -1.5785691738128662, "logps/chosen": -181.83547973632812, "logps/rejected": -295.6371765136719, "loss": 0.4036, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.3244999647140503, "rewards/margins": 1.0919300317764282, "rewards/rejected": -2.4164299964904785, "step": 11790 }, { "epoch": 2.033080634045486, "grad_norm": 21.47528648376465, "learning_rate": 5.6893104277731594e-08, "logits/chosen": -1.6073768138885498, "logits/rejected": -1.5689818859100342, "logps/chosen": -214.50344848632812, "logps/rejected": -277.74688720703125, "loss": 0.5375, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5738976001739502, "rewards/margins": 0.6701092720031738, "rewards/rejected": -2.244006633758545, "step": 11800 }, { "epoch": 2.034803583735355, "grad_norm": 32.091861724853516, "learning_rate": 5.6712290028978815e-08, "logits/chosen": -1.567014217376709, "logits/rejected": -1.528381109237671, "logps/chosen": -220.6244659423828, "logps/rejected": -298.44293212890625, "loss": 0.5138, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6461851596832275, "rewards/margins": 0.810315728187561, "rewards/rejected": -2.456500768661499, "step": 11810 }, { "epoch": 2.036526533425224, "grad_norm": 25.720264434814453, "learning_rate": 5.653164977099921e-08, "logits/chosen": -1.5711066722869873, "logits/rejected": -1.5318381786346436, "logps/chosen": -213.1315460205078, "logps/rejected": -294.34356689453125, "loss": 0.5246, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.632672905921936, "rewards/margins": 0.7945891618728638, "rewards/rejected": -2.4272620677948, "step": 11820 }, { "epoch": 2.0382494831150932, "grad_norm": 35.8724479675293, "learning_rate": 5.635118422985896e-08, "logits/chosen": -1.5579307079315186, "logits/rejected": -1.5190536975860596, "logps/chosen": -197.6388702392578, "logps/rejected": -288.42901611328125, "loss": 0.4808, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4660093784332275, "rewards/margins": 0.8728569149971008, "rewards/rejected": -2.3388664722442627, "step": 11830 }, { "epoch": 2.039972432804962, "grad_norm": 28.59819793701172, "learning_rate": 5.61708941309218e-08, "logits/chosen": -1.5615540742874146, "logits/rejected": -1.5118497610092163, "logps/chosen": -211.1688995361328, "logps/rejected": -299.4646301269531, "loss": 0.4795, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5894633531570435, "rewards/margins": 0.8861982226371765, "rewards/rejected": -2.4756617546081543, "step": 11840 }, { "epoch": 2.041695382494831, "grad_norm": 33.376678466796875, "learning_rate": 5.5990780198846435e-08, "logits/chosen": -1.4968514442443848, "logits/rejected": -1.4367352724075317, "logps/chosen": -224.6299285888672, "logps/rejected": -293.9331970214844, "loss": 0.5316, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6985681056976318, "rewards/margins": 0.7514779567718506, "rewards/rejected": -2.4500460624694824, "step": 11850 }, { "epoch": 2.0434183321847, "grad_norm": 37.74618911743164, "learning_rate": 5.581084315758351e-08, "logits/chosen": -1.6089210510253906, "logits/rejected": -1.580612301826477, "logps/chosen": -235.47769165039062, "logps/rejected": -290.8262939453125, "loss": 0.606, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.8060928583145142, "rewards/margins": 0.5267369747161865, "rewards/rejected": -2.332829236984253, "step": 11860 }, { "epoch": 2.045141281874569, "grad_norm": 27.94704246520996, "learning_rate": 5.563108373037243e-08, "logits/chosen": -1.544832468032837, "logits/rejected": -1.501721978187561, "logps/chosen": -203.64990234375, "logps/rejected": -291.85693359375, "loss": 0.5009, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5535991191864014, "rewards/margins": 0.8287463188171387, "rewards/rejected": -2.38234543800354, "step": 11870 }, { "epoch": 2.0468642315644385, "grad_norm": 46.3896598815918, "learning_rate": 5.545150263973897e-08, "logits/chosen": -1.568872332572937, "logits/rejected": -1.5166070461273193, "logps/chosen": -212.12838745117188, "logps/rejected": -299.6388244628906, "loss": 0.465, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5504481792449951, "rewards/margins": 0.8933519124984741, "rewards/rejected": -2.443800449371338, "step": 11880 }, { "epoch": 2.0485871812543075, "grad_norm": 28.92313575744629, "learning_rate": 5.527210060749201e-08, "logits/chosen": -1.5919010639190674, "logits/rejected": -1.5410182476043701, "logps/chosen": -220.18118286132812, "logps/rejected": -303.7860107421875, "loss": 0.495, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6413625478744507, "rewards/margins": 0.8713878393173218, "rewards/rejected": -2.5127501487731934, "step": 11890 }, { "epoch": 2.0503101309441765, "grad_norm": 34.28177261352539, "learning_rate": 5.509287835472067e-08, "logits/chosen": -1.4587892293930054, "logits/rejected": -1.4046921730041504, "logps/chosen": -217.0598602294922, "logps/rejected": -312.3211364746094, "loss": 0.4782, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6380027532577515, "rewards/margins": 0.9663599729537964, "rewards/rejected": -2.604362726211548, "step": 11900 }, { "epoch": 2.0520330806340454, "grad_norm": 32.117855072021484, "learning_rate": 5.4913836601791497e-08, "logits/chosen": -1.6434059143066406, "logits/rejected": -1.6171928644180298, "logps/chosen": -219.04653930664062, "logps/rejected": -309.12176513671875, "loss": 0.5203, "rewards/accuracies": 0.71875, "rewards/chosen": -1.670095443725586, "rewards/margins": 0.8413150906562805, "rewards/rejected": -2.511410713195801, "step": 11910 }, { "epoch": 2.0537560303239144, "grad_norm": 29.40235137939453, "learning_rate": 5.473497606834554e-08, "logits/chosen": -1.5855402946472168, "logits/rejected": -1.5456552505493164, "logps/chosen": -226.11276245117188, "logps/rejected": -300.3972473144531, "loss": 0.5177, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6782705783843994, "rewards/margins": 0.7569664716720581, "rewards/rejected": -2.435236692428589, "step": 11920 }, { "epoch": 2.055478980013784, "grad_norm": 23.78101348876953, "learning_rate": 5.4556297473295496e-08, "logits/chosen": -1.5875284671783447, "logits/rejected": -1.541799783706665, "logps/chosen": -206.77352905273438, "logps/rejected": -298.059326171875, "loss": 0.4563, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5398340225219727, "rewards/margins": 0.8825027346611023, "rewards/rejected": -2.4223365783691406, "step": 11930 }, { "epoch": 2.057201929703653, "grad_norm": 25.40555763244629, "learning_rate": 5.4377801534822676e-08, "logits/chosen": -1.5478023290634155, "logits/rejected": -1.4946033954620361, "logps/chosen": -205.4107208251953, "logps/rejected": -288.2338562011719, "loss": 0.4869, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5127077102661133, "rewards/margins": 0.8377426266670227, "rewards/rejected": -2.350450277328491, "step": 11940 }, { "epoch": 2.0589248793935218, "grad_norm": 36.173519134521484, "learning_rate": 5.419948897037436e-08, "logits/chosen": -1.655509352684021, "logits/rejected": -1.6048641204833984, "logps/chosen": -219.49755859375, "logps/rejected": -291.3073425292969, "loss": 0.5625, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6178226470947266, "rewards/margins": 0.7313393354415894, "rewards/rejected": -2.3491616249084473, "step": 11950 }, { "epoch": 2.0606478290833907, "grad_norm": 39.75484848022461, "learning_rate": 5.4021360496660614e-08, "logits/chosen": -1.5307501554489136, "logits/rejected": -1.4881916046142578, "logps/chosen": -209.718505859375, "logps/rejected": -300.6219787597656, "loss": 0.4911, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5740060806274414, "rewards/margins": 0.904276967048645, "rewards/rejected": -2.478283166885376, "step": 11960 }, { "epoch": 2.0623707787732597, "grad_norm": 23.72635841369629, "learning_rate": 5.3843416829651713e-08, "logits/chosen": -1.6101815700531006, "logits/rejected": -1.5637496709823608, "logps/chosen": -210.59402465820312, "logps/rejected": -288.64813232421875, "loss": 0.494, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5444914102554321, "rewards/margins": 0.8065498471260071, "rewards/rejected": -2.351041078567505, "step": 11970 }, { "epoch": 2.0640937284631287, "grad_norm": 26.937618255615234, "learning_rate": 5.3665658684574975e-08, "logits/chosen": -1.612941026687622, "logits/rejected": -1.5580946207046509, "logps/chosen": -199.7908172607422, "logps/rejected": -284.44891357421875, "loss": 0.4494, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4498227834701538, "rewards/margins": 0.8510910868644714, "rewards/rejected": -2.3009140491485596, "step": 11980 }, { "epoch": 2.065816678152998, "grad_norm": 28.87058448791504, "learning_rate": 5.348808677591222e-08, "logits/chosen": -1.6719602346420288, "logits/rejected": -1.6104381084442139, "logps/chosen": -204.49407958984375, "logps/rejected": -271.0589294433594, "loss": 0.5225, "rewards/accuracies": 0.8125, "rewards/chosen": -1.468535304069519, "rewards/margins": 0.7241191864013672, "rewards/rejected": -2.1926543712615967, "step": 11990 }, { "epoch": 2.067539627842867, "grad_norm": 27.164878845214844, "learning_rate": 5.331070181739654e-08, "logits/chosen": -1.5682309865951538, "logits/rejected": -1.5209156274795532, "logps/chosen": -198.87789916992188, "logps/rejected": -287.53662109375, "loss": 0.4901, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.482013463973999, "rewards/margins": 0.8468286395072937, "rewards/rejected": -2.3288419246673584, "step": 12000 }, { "epoch": 2.067539627842867, "eval_logits/chosen": -1.6928431987762451, "eval_logits/rejected": -1.6688482761383057, "eval_logps/chosen": -206.20091247558594, "eval_logps/rejected": -245.6420440673828, "eval_loss": 0.6303159594535828, "eval_rewards/accuracies": 0.6447490453720093, "eval_rewards/chosen": -1.4748902320861816, "eval_rewards/margins": 0.34972894191741943, "eval_rewards/rejected": -1.8246192932128906, "eval_runtime": 384.922, "eval_samples_per_second": 11.181, "eval_steps_per_second": 1.398, "step": 12000 }, { "epoch": 2.069262577532736, "grad_norm": 24.481056213378906, "learning_rate": 5.313350452200962e-08, "logits/chosen": -1.6024847030639648, "logits/rejected": -1.5504302978515625, "logps/chosen": -218.79531860351562, "logps/rejected": -306.15643310546875, "loss": 0.4969, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.638654112815857, "rewards/margins": 0.8658208847045898, "rewards/rejected": -2.5044751167297363, "step": 12010 }, { "epoch": 2.070985527222605, "grad_norm": 32.93242263793945, "learning_rate": 5.295649560197895e-08, "logits/chosen": -1.6048576831817627, "logits/rejected": -1.5665851831436157, "logps/chosen": -201.8533172607422, "logps/rejected": -278.32830810546875, "loss": 0.5261, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4912683963775635, "rewards/margins": 0.7643145322799683, "rewards/rejected": -2.255582809448242, "step": 12020 }, { "epoch": 2.072708476912474, "grad_norm": 30.427339553833008, "learning_rate": 5.27796757687748e-08, "logits/chosen": -1.71212899684906, "logits/rejected": -1.6749742031097412, "logps/chosen": -188.22976684570312, "logps/rejected": -254.19546508789062, "loss": 0.5499, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3920142650604248, "rewards/margins": 0.6375147700309753, "rewards/rejected": -2.029529094696045, "step": 12030 }, { "epoch": 2.0744314266023434, "grad_norm": 26.58029556274414, "learning_rate": 5.260304573310743e-08, "logits/chosen": -1.5901178121566772, "logits/rejected": -1.539693832397461, "logps/chosen": -204.1492156982422, "logps/rejected": -282.22393798828125, "loss": 0.5055, "rewards/accuracies": 0.78125, "rewards/chosen": -1.484749674797058, "rewards/margins": 0.826759934425354, "rewards/rejected": -2.311509609222412, "step": 12040 }, { "epoch": 2.0761543762922123, "grad_norm": 32.788917541503906, "learning_rate": 5.242660620492416e-08, "logits/chosen": -1.6245148181915283, "logits/rejected": -1.5556128025054932, "logps/chosen": -203.12457275390625, "logps/rejected": -294.1376953125, "loss": 0.4289, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.4886772632598877, "rewards/margins": 0.940477728843689, "rewards/rejected": -2.429154872894287, "step": 12050 }, { "epoch": 2.0778773259820813, "grad_norm": 28.661020278930664, "learning_rate": 5.2250357893406703e-08, "logits/chosen": -1.5994634628295898, "logits/rejected": -1.5617071390151978, "logps/chosen": -203.6561737060547, "logps/rejected": -275.8608703613281, "loss": 0.5248, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4781253337860107, "rewards/margins": 0.7392922639846802, "rewards/rejected": -2.2174174785614014, "step": 12060 }, { "epoch": 2.0796002756719503, "grad_norm": 27.835433959960938, "learning_rate": 5.2074301506968165e-08, "logits/chosen": -1.5991908311843872, "logits/rejected": -1.5303010940551758, "logps/chosen": -199.74252319335938, "logps/rejected": -295.2624206542969, "loss": 0.4498, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4509713649749756, "rewards/margins": 1.0048578977584839, "rewards/rejected": -2.45582914352417, "step": 12070 }, { "epoch": 2.0813232253618192, "grad_norm": 41.754215240478516, "learning_rate": 5.189843775325018e-08, "logits/chosen": -1.492857575416565, "logits/rejected": -1.4475312232971191, "logps/chosen": -198.62515258789062, "logps/rejected": -295.5875244140625, "loss": 0.4505, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4303429126739502, "rewards/margins": 0.9779750108718872, "rewards/rejected": -2.408318042755127, "step": 12080 }, { "epoch": 2.0830461750516887, "grad_norm": 39.57783508300781, "learning_rate": 5.172276733912009e-08, "logits/chosen": -1.5722321271896362, "logits/rejected": -1.5255016088485718, "logps/chosen": -215.2079620361328, "logps/rejected": -291.56878662109375, "loss": 0.5355, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5862327814102173, "rewards/margins": 0.7993678450584412, "rewards/rejected": -2.3856005668640137, "step": 12090 }, { "epoch": 2.0847691247415576, "grad_norm": 51.323238372802734, "learning_rate": 5.1547290970668243e-08, "logits/chosen": -1.5735671520233154, "logits/rejected": -1.5332248210906982, "logps/chosen": -205.8524169921875, "logps/rejected": -281.3876953125, "loss": 0.5143, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4781602621078491, "rewards/margins": 0.813062310218811, "rewards/rejected": -2.29122257232666, "step": 12100 }, { "epoch": 2.0864920744314266, "grad_norm": 31.750356674194336, "learning_rate": 5.13720093532049e-08, "logits/chosen": -1.6060386896133423, "logits/rejected": -1.557785153388977, "logps/chosen": -228.09848022460938, "logps/rejected": -296.2030029296875, "loss": 0.5508, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7111345529556274, "rewards/margins": 0.7136415243148804, "rewards/rejected": -2.424776077270508, "step": 12110 }, { "epoch": 2.0882150241212956, "grad_norm": 37.41475296020508, "learning_rate": 5.1196923191257654e-08, "logits/chosen": -1.5627740621566772, "logits/rejected": -1.516533613204956, "logps/chosen": -216.0414276123047, "logps/rejected": -286.8224792480469, "loss": 0.5469, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6007273197174072, "rewards/margins": 0.7471935749053955, "rewards/rejected": -2.3479208946228027, "step": 12120 }, { "epoch": 2.0899379738111645, "grad_norm": 20.90497589111328, "learning_rate": 5.102203318856847e-08, "logits/chosen": -1.5501728057861328, "logits/rejected": -1.491492509841919, "logps/chosen": -205.28103637695312, "logps/rejected": -298.6039123535156, "loss": 0.4356, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.5086464881896973, "rewards/margins": 0.9435533285140991, "rewards/rejected": -2.4521994590759277, "step": 12130 }, { "epoch": 2.091660923501034, "grad_norm": 40.72807312011719, "learning_rate": 5.084734004809079e-08, "logits/chosen": -1.5945508480072021, "logits/rejected": -1.5598722696304321, "logps/chosen": -214.08544921875, "logps/rejected": -284.52252197265625, "loss": 0.5566, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6211093664169312, "rewards/margins": 0.6779786944389343, "rewards/rejected": -2.2990882396698, "step": 12140 }, { "epoch": 2.093383873190903, "grad_norm": 85.24677276611328, "learning_rate": 5.0672844471986806e-08, "logits/chosen": -1.580863118171692, "logits/rejected": -1.538017988204956, "logps/chosen": -198.13323974609375, "logps/rejected": -282.21923828125, "loss": 0.5177, "rewards/accuracies": 0.71875, "rewards/chosen": -1.447094440460205, "rewards/margins": 0.8471297025680542, "rewards/rejected": -2.294224262237549, "step": 12150 }, { "epoch": 2.095106822880772, "grad_norm": 46.52803039550781, "learning_rate": 5.049854716162469e-08, "logits/chosen": -1.527610421180725, "logits/rejected": -1.4756158590316772, "logps/chosen": -205.2031707763672, "logps/rejected": -281.08294677734375, "loss": 0.5246, "rewards/accuracies": 0.75, "rewards/chosen": -1.4923702478408813, "rewards/margins": 0.7814701199531555, "rewards/rejected": -2.2738404273986816, "step": 12160 }, { "epoch": 2.096829772570641, "grad_norm": 25.297910690307617, "learning_rate": 5.032444881757575e-08, "logits/chosen": -1.5589890480041504, "logits/rejected": -1.5120322704315186, "logps/chosen": -207.8610076904297, "logps/rejected": -290.50433349609375, "loss": 0.5286, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5336462259292603, "rewards/margins": 0.8126686215400696, "rewards/rejected": -2.3463149070739746, "step": 12170 }, { "epoch": 2.09855272226051, "grad_norm": 24.670923233032227, "learning_rate": 5.015055013961129e-08, "logits/chosen": -1.5066965818405151, "logits/rejected": -1.46164870262146, "logps/chosen": -208.98617553710938, "logps/rejected": -316.40887451171875, "loss": 0.4464, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6038252115249634, "rewards/margins": 0.9800359010696411, "rewards/rejected": -2.5838611125946045, "step": 12180 }, { "epoch": 2.1002756719503792, "grad_norm": 27.564498901367188, "learning_rate": 4.9976851826700385e-08, "logits/chosen": -1.5501229763031006, "logits/rejected": -1.4985682964324951, "logps/chosen": -211.6484375, "logps/rejected": -279.1171875, "loss": 0.532, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5453133583068848, "rewards/margins": 0.7092097997665405, "rewards/rejected": -2.254523277282715, "step": 12190 }, { "epoch": 2.101998621640248, "grad_norm": 27.813444137573242, "learning_rate": 4.980335457700665e-08, "logits/chosen": -1.642297387123108, "logits/rejected": -1.5962289571762085, "logps/chosen": -208.8214111328125, "logps/rejected": -290.8011169433594, "loss": 0.48, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.522281289100647, "rewards/margins": 0.843533992767334, "rewards/rejected": -2.3658149242401123, "step": 12200 }, { "epoch": 2.103721571330117, "grad_norm": 30.052356719970703, "learning_rate": 4.963005908788547e-08, "logits/chosen": -1.615196943283081, "logits/rejected": -1.5772802829742432, "logps/chosen": -208.3212432861328, "logps/rejected": -278.3284606933594, "loss": 0.5273, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5461933612823486, "rewards/margins": 0.719495415687561, "rewards/rejected": -2.26568865776062, "step": 12210 }, { "epoch": 2.105444521019986, "grad_norm": 43.54231262207031, "learning_rate": 4.945696605588143e-08, "logits/chosen": -1.5061839818954468, "logits/rejected": -1.4668266773223877, "logps/chosen": -211.716552734375, "logps/rejected": -289.04095458984375, "loss": 0.5089, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5821346044540405, "rewards/margins": 0.7677549123764038, "rewards/rejected": -2.3498897552490234, "step": 12220 }, { "epoch": 2.107167470709855, "grad_norm": 56.09331512451172, "learning_rate": 4.928407617672519e-08, "logits/chosen": -1.5421305894851685, "logits/rejected": -1.5023183822631836, "logps/chosen": -217.19921875, "logps/rejected": -291.5745849609375, "loss": 0.556, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6528059244155884, "rewards/margins": 0.7489153742790222, "rewards/rejected": -2.401721477508545, "step": 12230 }, { "epoch": 2.1088904203997245, "grad_norm": 41.50090026855469, "learning_rate": 4.911139014533099e-08, "logits/chosen": -1.513869047164917, "logits/rejected": -1.4628936052322388, "logps/chosen": -205.1170196533203, "logps/rejected": -282.4010925292969, "loss": 0.5169, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5244477987289429, "rewards/margins": 0.7733960747718811, "rewards/rejected": -2.2978439331054688, "step": 12240 }, { "epoch": 2.1106133700895935, "grad_norm": 39.326995849609375, "learning_rate": 4.893890865579362e-08, "logits/chosen": -1.5844281911849976, "logits/rejected": -1.535041093826294, "logps/chosen": -204.37889099121094, "logps/rejected": -270.86981201171875, "loss": 0.5651, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5194447040557861, "rewards/margins": 0.6731320023536682, "rewards/rejected": -2.1925766468048096, "step": 12250 }, { "epoch": 2.1123363197794625, "grad_norm": 28.982343673706055, "learning_rate": 4.8766632401385856e-08, "logits/chosen": -1.5679948329925537, "logits/rejected": -1.5272144079208374, "logps/chosen": -193.6581573486328, "logps/rejected": -274.4389343261719, "loss": 0.5026, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.410930871963501, "rewards/margins": 0.7730732560157776, "rewards/rejected": -2.184004306793213, "step": 12260 }, { "epoch": 2.1140592694693314, "grad_norm": 28.518871307373047, "learning_rate": 4.859456207455539e-08, "logits/chosen": -1.6385746002197266, "logits/rejected": -1.5881214141845703, "logps/chosen": -211.4015350341797, "logps/rejected": -290.3092956542969, "loss": 0.5262, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5662710666656494, "rewards/margins": 0.7988941669464111, "rewards/rejected": -2.3651652336120605, "step": 12270 }, { "epoch": 2.1157822191592004, "grad_norm": 25.8980712890625, "learning_rate": 4.842269836692239e-08, "logits/chosen": -1.5845524072647095, "logits/rejected": -1.5420753955841064, "logps/chosen": -204.1547393798828, "logps/rejected": -280.9200744628906, "loss": 0.5048, "rewards/accuracies": 0.75, "rewards/chosen": -1.4760124683380127, "rewards/margins": 0.7984156608581543, "rewards/rejected": -2.274428129196167, "step": 12280 }, { "epoch": 2.11750516884907, "grad_norm": 32.773284912109375, "learning_rate": 4.8251041969276355e-08, "logits/chosen": -1.5275884866714478, "logits/rejected": -1.4831234216690063, "logps/chosen": -200.70103454589844, "logps/rejected": -266.9785461425781, "loss": 0.5278, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4691832065582275, "rewards/margins": 0.6694160103797913, "rewards/rejected": -2.138599157333374, "step": 12290 }, { "epoch": 2.1192281185389388, "grad_norm": 32.303794860839844, "learning_rate": 4.8079593571573654e-08, "logits/chosen": -1.6137571334838867, "logits/rejected": -1.5665721893310547, "logps/chosen": -201.3102569580078, "logps/rejected": -274.618896484375, "loss": 0.5023, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4663169384002686, "rewards/margins": 0.7571481466293335, "rewards/rejected": -2.2234652042388916, "step": 12300 }, { "epoch": 2.1209510682288077, "grad_norm": 23.450542449951172, "learning_rate": 4.7908353862934645e-08, "logits/chosen": -1.5943067073822021, "logits/rejected": -1.5404088497161865, "logps/chosen": -192.87353515625, "logps/rejected": -279.0466613769531, "loss": 0.4782, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3797800540924072, "rewards/margins": 0.8839829564094543, "rewards/rejected": -2.263762950897217, "step": 12310 }, { "epoch": 2.1226740179186767, "grad_norm": 22.52594757080078, "learning_rate": 4.773732353164069e-08, "logits/chosen": -1.5985201597213745, "logits/rejected": -1.5497382879257202, "logps/chosen": -187.78636169433594, "logps/rejected": -280.4857177734375, "loss": 0.4772, "rewards/accuracies": 0.75, "rewards/chosen": -1.3348400592803955, "rewards/margins": 0.9259307980537415, "rewards/rejected": -2.2607710361480713, "step": 12320 }, { "epoch": 2.1243969676085457, "grad_norm": 26.383817672729492, "learning_rate": 4.756650326513175e-08, "logits/chosen": -1.6028234958648682, "logits/rejected": -1.568605661392212, "logps/chosen": -202.93075561523438, "logps/rejected": -274.3039245605469, "loss": 0.5205, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4667823314666748, "rewards/margins": 0.7258719205856323, "rewards/rejected": -2.1926541328430176, "step": 12330 }, { "epoch": 2.126119917298415, "grad_norm": 26.518596649169922, "learning_rate": 4.739589375000345e-08, "logits/chosen": -1.621058702468872, "logits/rejected": -1.562735676765442, "logps/chosen": -202.68588256835938, "logps/rejected": -301.7408142089844, "loss": 0.4497, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4908440113067627, "rewards/margins": 0.9897588491439819, "rewards/rejected": -2.480602741241455, "step": 12340 }, { "epoch": 2.127842866988284, "grad_norm": 27.536848068237305, "learning_rate": 4.722549567200423e-08, "logits/chosen": -1.5056012868881226, "logits/rejected": -1.4400701522827148, "logps/chosen": -217.3277130126953, "logps/rejected": -291.8016052246094, "loss": 0.5163, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6065528392791748, "rewards/margins": 0.8035901784896851, "rewards/rejected": -2.4101428985595703, "step": 12350 }, { "epoch": 2.129565816678153, "grad_norm": 35.610374450683594, "learning_rate": 4.70553097160327e-08, "logits/chosen": -1.5891592502593994, "logits/rejected": -1.5435259342193604, "logps/chosen": -198.75253295898438, "logps/rejected": -294.31427001953125, "loss": 0.4536, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4582741260528564, "rewards/margins": 0.9680150747299194, "rewards/rejected": -2.4262890815734863, "step": 12360 }, { "epoch": 2.131288766368022, "grad_norm": 21.72165870666504, "learning_rate": 4.6885336566134905e-08, "logits/chosen": -1.5432701110839844, "logits/rejected": -1.4837857484817505, "logps/chosen": -210.87631225585938, "logps/rejected": -301.46527099609375, "loss": 0.4864, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5829124450683594, "rewards/margins": 0.9285858273506165, "rewards/rejected": -2.511498212814331, "step": 12370 }, { "epoch": 2.133011716057891, "grad_norm": 32.40623474121094, "learning_rate": 4.671557690550158e-08, "logits/chosen": -1.557513952255249, "logits/rejected": -1.524306058883667, "logps/chosen": -203.4970245361328, "logps/rejected": -304.2638244628906, "loss": 0.4518, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5309957265853882, "rewards/margins": 0.9441516995429993, "rewards/rejected": -2.4751474857330322, "step": 12380 }, { "epoch": 2.13473466574776, "grad_norm": 28.967927932739258, "learning_rate": 4.65460314164652e-08, "logits/chosen": -1.661972999572754, "logits/rejected": -1.6170930862426758, "logps/chosen": -209.74435424804688, "logps/rejected": -305.64923095703125, "loss": 0.4876, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5648424625396729, "rewards/margins": 0.9469350576400757, "rewards/rejected": -2.511777400970459, "step": 12390 }, { "epoch": 2.1364576154376294, "grad_norm": 26.420259475708008, "learning_rate": 4.637670078049759e-08, "logits/chosen": -1.5792243480682373, "logits/rejected": -1.5416433811187744, "logps/chosen": -224.37405395507812, "logps/rejected": -317.53863525390625, "loss": 0.4713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6922643184661865, "rewards/margins": 0.9153863191604614, "rewards/rejected": -2.6076505184173584, "step": 12400 }, { "epoch": 2.1364576154376294, "eval_logits/chosen": -1.6644082069396973, "eval_logits/rejected": -1.6396886110305786, "eval_logps/chosen": -221.01467895507812, "eval_logps/rejected": -263.3477783203125, "eval_loss": 0.6302575469017029, "eval_rewards/accuracies": 0.6470724940299988, "eval_rewards/chosen": -1.6230275630950928, "eval_rewards/margins": 0.37864890694618225, "eval_rewards/rejected": -2.001676559448242, "eval_runtime": 384.8844, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 12400 }, { "epoch": 2.1381805651274983, "grad_norm": 33.982757568359375, "learning_rate": 4.620758567820686e-08, "logits/chosen": -1.5089236497879028, "logits/rejected": -1.4480992555618286, "logps/chosen": -215.35238647460938, "logps/rejected": -295.104736328125, "loss": 0.4985, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6145118474960327, "rewards/margins": 0.8170326352119446, "rewards/rejected": -2.431544542312622, "step": 12410 }, { "epoch": 2.1399035148173673, "grad_norm": 62.16181564331055, "learning_rate": 4.60386867893348e-08, "logits/chosen": -1.3992679119110107, "logits/rejected": -1.353809118270874, "logps/chosen": -225.4574432373047, "logps/rejected": -309.0572814941406, "loss": 0.5194, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7003612518310547, "rewards/margins": 0.8596186637878418, "rewards/rejected": -2.5599799156188965, "step": 12420 }, { "epoch": 2.1416264645072363, "grad_norm": 39.4116325378418, "learning_rate": 4.5870004792754257e-08, "logits/chosen": -1.5263465642929077, "logits/rejected": -1.4820783138275146, "logps/chosen": -227.6816864013672, "logps/rejected": -318.7822265625, "loss": 0.5033, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7275323867797852, "rewards/margins": 0.9142645001411438, "rewards/rejected": -2.641796588897705, "step": 12430 }, { "epoch": 2.1433494141971057, "grad_norm": 30.580076217651367, "learning_rate": 4.570154036646625e-08, "logits/chosen": -1.4799082279205322, "logits/rejected": -1.4441767930984497, "logps/chosen": -215.04409790039062, "logps/rejected": -296.3481750488281, "loss": 0.5079, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.617315649986267, "rewards/margins": 0.8194793462753296, "rewards/rejected": -2.4367949962615967, "step": 12440 }, { "epoch": 2.1450723638869746, "grad_norm": 35.00650405883789, "learning_rate": 4.553329418759726e-08, "logits/chosen": -1.5489532947540283, "logits/rejected": -1.507712960243225, "logps/chosen": -209.34255981445312, "logps/rejected": -290.799560546875, "loss": 0.5195, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5875122547149658, "rewards/margins": 0.8105241656303406, "rewards/rejected": -2.398036479949951, "step": 12450 }, { "epoch": 2.1467953135768436, "grad_norm": 34.579010009765625, "learning_rate": 4.5365266932396526e-08, "logits/chosen": -1.6123673915863037, "logits/rejected": -1.5568170547485352, "logps/chosen": -220.16244506835938, "logps/rejected": -316.6996154785156, "loss": 0.484, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6562690734863281, "rewards/margins": 0.9824492335319519, "rewards/rejected": -2.6387181282043457, "step": 12460 }, { "epoch": 2.1485182632667126, "grad_norm": 36.69334411621094, "learning_rate": 4.519745927623344e-08, "logits/chosen": -1.618233323097229, "logits/rejected": -1.5750727653503418, "logps/chosen": -231.67236328125, "logps/rejected": -328.7120056152344, "loss": 0.4771, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7639925479888916, "rewards/margins": 0.9653264880180359, "rewards/rejected": -2.7293190956115723, "step": 12470 }, { "epoch": 2.1502412129565815, "grad_norm": 32.25230407714844, "learning_rate": 4.5029871893594695e-08, "logits/chosen": -1.6398899555206299, "logits/rejected": -1.5991530418395996, "logps/chosen": -223.4516143798828, "logps/rejected": -302.0787353515625, "loss": 0.5225, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6901556253433228, "rewards/margins": 0.7845726013183594, "rewards/rejected": -2.4747283458709717, "step": 12480 }, { "epoch": 2.1519641626464505, "grad_norm": 32.58366012573242, "learning_rate": 4.486250545808159e-08, "logits/chosen": -1.5485055446624756, "logits/rejected": -1.5052379369735718, "logps/chosen": -208.1356658935547, "logps/rejected": -298.0610656738281, "loss": 0.498, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.536603569984436, "rewards/margins": 0.9074599146842957, "rewards/rejected": -2.444063425064087, "step": 12490 }, { "epoch": 2.15368711233632, "grad_norm": 36.965362548828125, "learning_rate": 4.469536064240731e-08, "logits/chosen": -1.5669692754745483, "logits/rejected": -1.521559476852417, "logps/chosen": -222.0422821044922, "logps/rejected": -315.29248046875, "loss": 0.4904, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.686669945716858, "rewards/margins": 0.9324058294296265, "rewards/rejected": -2.6190757751464844, "step": 12500 }, { "epoch": 2.155410062026189, "grad_norm": 25.924415588378906, "learning_rate": 4.452843811839435e-08, "logits/chosen": -1.6050430536270142, "logits/rejected": -1.56070077419281, "logps/chosen": -210.5176239013672, "logps/rejected": -285.3177185058594, "loss": 0.5205, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5747045278549194, "rewards/margins": 0.7555389404296875, "rewards/rejected": -2.3302433490753174, "step": 12510 }, { "epoch": 2.157133011716058, "grad_norm": 30.48223114013672, "learning_rate": 4.436173855697174e-08, "logits/chosen": -1.5401922464370728, "logits/rejected": -1.496222734451294, "logps/chosen": -215.2317352294922, "logps/rejected": -298.5047302246094, "loss": 0.5345, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6176135540008545, "rewards/margins": 0.8186357617378235, "rewards/rejected": -2.436249256134033, "step": 12520 }, { "epoch": 2.158855961405927, "grad_norm": 38.309688568115234, "learning_rate": 4.4195262628172224e-08, "logits/chosen": -1.5570924282073975, "logits/rejected": -1.5224055051803589, "logps/chosen": -224.19107055664062, "logps/rejected": -312.5212707519531, "loss": 0.4784, "rewards/accuracies": 0.75, "rewards/chosen": -1.6979620456695557, "rewards/margins": 0.8683120608329773, "rewards/rejected": -2.5662739276885986, "step": 12530 }, { "epoch": 2.160578911095796, "grad_norm": 33.87547302246094, "learning_rate": 4.402901100112972e-08, "logits/chosen": -1.5645685195922852, "logits/rejected": -1.5179166793823242, "logps/chosen": -211.88320922851562, "logps/rejected": -289.5576171875, "loss": 0.5615, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5813262462615967, "rewards/margins": 0.77818363904953, "rewards/rejected": -2.3595099449157715, "step": 12540 }, { "epoch": 2.162301860785665, "grad_norm": 40.345947265625, "learning_rate": 4.386298434407666e-08, "logits/chosen": -1.6636282205581665, "logits/rejected": -1.629463791847229, "logps/chosen": -205.6763916015625, "logps/rejected": -274.09100341796875, "loss": 0.5371, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5461241006851196, "rewards/margins": 0.6756289601325989, "rewards/rejected": -2.2217533588409424, "step": 12550 }, { "epoch": 2.164024810475534, "grad_norm": 29.747833251953125, "learning_rate": 4.369718332434109e-08, "logits/chosen": -1.6467325687408447, "logits/rejected": -1.5891529321670532, "logps/chosen": -210.7559051513672, "logps/rejected": -276.63140869140625, "loss": 0.5436, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5353169441223145, "rewards/margins": 0.7327998280525208, "rewards/rejected": -2.2681167125701904, "step": 12560 }, { "epoch": 2.165747760165403, "grad_norm": 29.418228149414062, "learning_rate": 4.3531608608344274e-08, "logits/chosen": -1.5815799236297607, "logits/rejected": -1.524218201637268, "logps/chosen": -196.69119262695312, "logps/rejected": -272.20849609375, "loss": 0.4979, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4489413499832153, "rewards/margins": 0.7680349349975586, "rewards/rejected": -2.2169766426086426, "step": 12570 }, { "epoch": 2.167470709855272, "grad_norm": 42.948368072509766, "learning_rate": 4.3366260861597814e-08, "logits/chosen": -1.6030486822128296, "logits/rejected": -1.5377169847488403, "logps/chosen": -213.7222900390625, "logps/rejected": -295.1755065917969, "loss": 0.4841, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5307261943817139, "rewards/margins": 0.8973187208175659, "rewards/rejected": -2.4280447959899902, "step": 12580 }, { "epoch": 2.169193659545141, "grad_norm": 27.345455169677734, "learning_rate": 4.3201140748701e-08, "logits/chosen": -1.5837843418121338, "logits/rejected": -1.5203263759613037, "logps/chosen": -196.2156524658203, "logps/rejected": -289.7723693847656, "loss": 0.4401, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4074121713638306, "rewards/margins": 0.9618197679519653, "rewards/rejected": -2.369231700897217, "step": 12590 }, { "epoch": 2.1709166092350105, "grad_norm": 24.87310028076172, "learning_rate": 4.303624893333816e-08, "logits/chosen": -1.5408040285110474, "logits/rejected": -1.4929521083831787, "logps/chosen": -210.43441772460938, "logps/rejected": -285.0773620605469, "loss": 0.507, "rewards/accuracies": 0.75, "rewards/chosen": -1.5213069915771484, "rewards/margins": 0.7937949895858765, "rewards/rejected": -2.3151021003723145, "step": 12600 }, { "epoch": 2.1726395589248795, "grad_norm": 27.389739990234375, "learning_rate": 4.287158607827607e-08, "logits/chosen": -1.6728298664093018, "logits/rejected": -1.6453278064727783, "logps/chosen": -197.07711791992188, "logps/rejected": -278.75213623046875, "loss": 0.4727, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4301495552062988, "rewards/margins": 0.8101191520690918, "rewards/rejected": -2.2402689456939697, "step": 12610 }, { "epoch": 2.1743625086147484, "grad_norm": 34.71573257446289, "learning_rate": 4.270715284536124e-08, "logits/chosen": -1.7020797729492188, "logits/rejected": -1.6502811908721924, "logps/chosen": -224.19589233398438, "logps/rejected": -309.0333557128906, "loss": 0.5049, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.688309669494629, "rewards/margins": 0.8674631118774414, "rewards/rejected": -2.5557727813720703, "step": 12620 }, { "epoch": 2.1760854583046174, "grad_norm": 29.679231643676758, "learning_rate": 4.2542949895517066e-08, "logits/chosen": -1.629212737083435, "logits/rejected": -1.6008615493774414, "logps/chosen": -204.96914672851562, "logps/rejected": -289.08575439453125, "loss": 0.5328, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5458189249038696, "rewards/margins": 0.8251399993896484, "rewards/rejected": -2.3709590435028076, "step": 12630 }, { "epoch": 2.1778084079944864, "grad_norm": 25.9903621673584, "learning_rate": 4.2378977888741506e-08, "logits/chosen": -1.6058130264282227, "logits/rejected": -1.5600433349609375, "logps/chosen": -207.5745849609375, "logps/rejected": -292.06396484375, "loss": 0.473, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5348149538040161, "rewards/margins": 0.8506088256835938, "rewards/rejected": -2.3854236602783203, "step": 12640 }, { "epoch": 2.179531357684356, "grad_norm": 35.16606903076172, "learning_rate": 4.221523748410428e-08, "logits/chosen": -1.5772018432617188, "logits/rejected": -1.5374476909637451, "logps/chosen": -202.94577026367188, "logps/rejected": -267.78656005859375, "loss": 0.5463, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4989752769470215, "rewards/margins": 0.6631990671157837, "rewards/rejected": -2.1621744632720947, "step": 12650 }, { "epoch": 2.1812543073742248, "grad_norm": 29.193828582763672, "learning_rate": 4.2051729339744056e-08, "logits/chosen": -1.4867627620697021, "logits/rejected": -1.451336145401001, "logps/chosen": -214.2347412109375, "logps/rejected": -283.4859924316406, "loss": 0.5661, "rewards/accuracies": 0.6875, "rewards/chosen": -1.621567964553833, "rewards/margins": 0.6800750494003296, "rewards/rejected": -2.301642894744873, "step": 12660 }, { "epoch": 2.1829772570640937, "grad_norm": 41.727237701416016, "learning_rate": 4.1888454112866125e-08, "logits/chosen": -1.779306173324585, "logits/rejected": -1.7223072052001953, "logps/chosen": -202.23556518554688, "logps/rejected": -287.9600524902344, "loss": 0.515, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.490797758102417, "rewards/margins": 0.8758557438850403, "rewards/rejected": -2.3666536808013916, "step": 12670 }, { "epoch": 2.1847002067539627, "grad_norm": 23.595829010009766, "learning_rate": 4.172541245973943e-08, "logits/chosen": -1.544134497642517, "logits/rejected": -1.5028505325317383, "logps/chosen": -203.92660522460938, "logps/rejected": -282.19879150390625, "loss": 0.5063, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5235766172409058, "rewards/margins": 0.7782987356185913, "rewards/rejected": -2.301875591278076, "step": 12680 }, { "epoch": 2.1864231564438317, "grad_norm": 22.84954261779785, "learning_rate": 4.156260503569423e-08, "logits/chosen": -1.557756781578064, "logits/rejected": -1.5162785053253174, "logps/chosen": -210.11917114257812, "logps/rejected": -289.6104431152344, "loss": 0.5725, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5822913646697998, "rewards/margins": 0.7867759466171265, "rewards/rejected": -2.369067430496216, "step": 12690 }, { "epoch": 2.188146106133701, "grad_norm": 24.481889724731445, "learning_rate": 4.1400032495119183e-08, "logits/chosen": -1.5616981983184814, "logits/rejected": -1.511423110961914, "logps/chosen": -201.97335815429688, "logps/rejected": -289.2737121582031, "loss": 0.4704, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4522186517715454, "rewards/margins": 0.8924834132194519, "rewards/rejected": -2.3447022438049316, "step": 12700 }, { "epoch": 2.18986905582357, "grad_norm": 40.27986145019531, "learning_rate": 4.123769549145901e-08, "logits/chosen": -1.6570045948028564, "logits/rejected": -1.6221774816513062, "logps/chosen": -206.52645874023438, "logps/rejected": -290.4212951660156, "loss": 0.506, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5287845134735107, "rewards/margins": 0.7977313995361328, "rewards/rejected": -2.3265159130096436, "step": 12710 }, { "epoch": 2.191592005513439, "grad_norm": 23.55912208557129, "learning_rate": 4.10755946772116e-08, "logits/chosen": -1.670885682106018, "logits/rejected": -1.601375937461853, "logps/chosen": -192.55203247070312, "logps/rejected": -279.6814880371094, "loss": 0.4654, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3597242832183838, "rewards/margins": 0.9234593510627747, "rewards/rejected": -2.2831835746765137, "step": 12720 }, { "epoch": 2.193314955203308, "grad_norm": 37.51332092285156, "learning_rate": 4.0913730703925485e-08, "logits/chosen": -1.5183833837509155, "logits/rejected": -1.4677681922912598, "logps/chosen": -201.53402709960938, "logps/rejected": -281.51397705078125, "loss": 0.4951, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5104000568389893, "rewards/margins": 0.8004547953605652, "rewards/rejected": -2.31085467338562, "step": 12730 }, { "epoch": 2.195037904893177, "grad_norm": 23.36768341064453, "learning_rate": 4.075210422219732e-08, "logits/chosen": -1.5590684413909912, "logits/rejected": -1.5175398588180542, "logps/chosen": -221.9892578125, "logps/rejected": -299.72515869140625, "loss": 0.546, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6943871974945068, "rewards/margins": 0.7802594304084778, "rewards/rejected": -2.474646806716919, "step": 12740 }, { "epoch": 2.1967608545830464, "grad_norm": 27.73143196105957, "learning_rate": 4.059071588166921e-08, "logits/chosen": -1.5354615449905396, "logits/rejected": -1.4796323776245117, "logps/chosen": -196.08566284179688, "logps/rejected": -288.60552978515625, "loss": 0.4454, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4249557256698608, "rewards/margins": 0.914918065071106, "rewards/rejected": -2.3398735523223877, "step": 12750 }, { "epoch": 2.1984838042729153, "grad_norm": 26.64198112487793, "learning_rate": 4.042956633102597e-08, "logits/chosen": -1.5401995182037354, "logits/rejected": -1.5017931461334229, "logps/chosen": -212.65982055664062, "logps/rejected": -293.1643371582031, "loss": 0.5606, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6058902740478516, "rewards/margins": 0.7976308465003967, "rewards/rejected": -2.4035212993621826, "step": 12760 }, { "epoch": 2.2002067539627843, "grad_norm": 28.978290557861328, "learning_rate": 4.0268656217992615e-08, "logits/chosen": -1.6174194812774658, "logits/rejected": -1.5647356510162354, "logps/chosen": -213.73715209960938, "logps/rejected": -280.42755126953125, "loss": 0.544, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.557701587677002, "rewards/margins": 0.7231639623641968, "rewards/rejected": -2.280865430831909, "step": 12770 }, { "epoch": 2.2019297036526533, "grad_norm": 26.79999542236328, "learning_rate": 4.0107986189331875e-08, "logits/chosen": -1.6159842014312744, "logits/rejected": -1.5725295543670654, "logps/chosen": -208.47622680664062, "logps/rejected": -286.48138427734375, "loss": 0.5424, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5654840469360352, "rewards/margins": 0.7603294253349304, "rewards/rejected": -2.3258132934570312, "step": 12780 }, { "epoch": 2.2036526533425222, "grad_norm": 42.5976676940918, "learning_rate": 3.9947556890841464e-08, "logits/chosen": -1.5539737939834595, "logits/rejected": -1.507218599319458, "logps/chosen": -216.44656372070312, "logps/rejected": -306.5636291503906, "loss": 0.4856, "rewards/accuracies": 0.75, "rewards/chosen": -1.620617151260376, "rewards/margins": 0.9096146821975708, "rewards/rejected": -2.5302317142486572, "step": 12790 }, { "epoch": 2.205375603032391, "grad_norm": 27.104745864868164, "learning_rate": 3.978736896735141e-08, "logits/chosen": -1.5875989198684692, "logits/rejected": -1.5486294031143188, "logps/chosen": -215.0325927734375, "logps/rejected": -291.66351318359375, "loss": 0.5188, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5896577835083008, "rewards/margins": 0.7712696194648743, "rewards/rejected": -2.360927104949951, "step": 12800 }, { "epoch": 2.205375603032391, "eval_logits/chosen": -1.7011349201202393, "eval_logits/rejected": -1.6775678396224976, "eval_logps/chosen": -204.64537048339844, "eval_logps/rejected": -243.69786071777344, "eval_loss": 0.6304866075515747, "eval_rewards/accuracies": 0.6407992839813232, "eval_rewards/chosen": -1.4593344926834106, "eval_rewards/margins": 0.3458428680896759, "eval_rewards/rejected": -1.8051774501800537, "eval_runtime": 384.5524, "eval_samples_per_second": 11.192, "eval_steps_per_second": 1.399, "step": 12800 }, { "epoch": 2.2070985527222606, "grad_norm": 29.665178298950195, "learning_rate": 3.96274230627216e-08, "logits/chosen": -1.647985816001892, "logits/rejected": -1.6106626987457275, "logps/chosen": -204.36851501464844, "logps/rejected": -287.9172058105469, "loss": 0.5028, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5377624034881592, "rewards/margins": 0.7883270382881165, "rewards/rejected": -2.326089382171631, "step": 12810 }, { "epoch": 2.2088215024121296, "grad_norm": 31.872037887573242, "learning_rate": 3.9467719819839186e-08, "logits/chosen": -1.5224041938781738, "logits/rejected": -1.4853503704071045, "logps/chosen": -201.1040496826172, "logps/rejected": -281.5457458496094, "loss": 0.5131, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.480491042137146, "rewards/margins": 0.7895897626876831, "rewards/rejected": -2.270081043243408, "step": 12820 }, { "epoch": 2.2105444521019986, "grad_norm": 30.092105865478516, "learning_rate": 3.930825988061599e-08, "logits/chosen": -1.530009150505066, "logits/rejected": -1.4921272993087769, "logps/chosen": -212.3929443359375, "logps/rejected": -279.1653137207031, "loss": 0.5613, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6196510791778564, "rewards/margins": 0.6495185494422913, "rewards/rejected": -2.269169330596924, "step": 12830 }, { "epoch": 2.2122674017918675, "grad_norm": 24.13979148864746, "learning_rate": 3.914904388598577e-08, "logits/chosen": -1.614890456199646, "logits/rejected": -1.5737498998641968, "logps/chosen": -224.39254760742188, "logps/rejected": -313.24951171875, "loss": 0.4871, "rewards/accuracies": 0.75, "rewards/chosen": -1.6918249130249023, "rewards/margins": 0.8942005038261414, "rewards/rejected": -2.5860252380371094, "step": 12840 }, { "epoch": 2.213990351481737, "grad_norm": 26.58709716796875, "learning_rate": 3.899007247590191e-08, "logits/chosen": -1.6306241750717163, "logits/rejected": -1.582698106765747, "logps/chosen": -207.58724975585938, "logps/rejected": -283.46636962890625, "loss": 0.507, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5449132919311523, "rewards/margins": 0.7790514230728149, "rewards/rejected": -2.3239645957946777, "step": 12850 }, { "epoch": 2.215713301171606, "grad_norm": 30.544235229492188, "learning_rate": 3.883134628933465e-08, "logits/chosen": -1.5273045301437378, "logits/rejected": -1.4818195104599, "logps/chosen": -220.94357299804688, "logps/rejected": -302.7528991699219, "loss": 0.5044, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6472495794296265, "rewards/margins": 0.8625710606575012, "rewards/rejected": -2.5098206996917725, "step": 12860 }, { "epoch": 2.217436250861475, "grad_norm": 37.73915100097656, "learning_rate": 3.867286596426853e-08, "logits/chosen": -1.6090524196624756, "logits/rejected": -1.571989893913269, "logps/chosen": -208.7991180419922, "logps/rejected": -278.4769592285156, "loss": 0.5741, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5641460418701172, "rewards/margins": 0.7138841152191162, "rewards/rejected": -2.2780303955078125, "step": 12870 }, { "epoch": 2.219159200551344, "grad_norm": 36.86833572387695, "learning_rate": 3.851463213769996e-08, "logits/chosen": -1.590273141860962, "logits/rejected": -1.5490907430648804, "logps/chosen": -202.5039825439453, "logps/rejected": -280.5902404785156, "loss": 0.5116, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4931889772415161, "rewards/margins": 0.7568543553352356, "rewards/rejected": -2.2500433921813965, "step": 12880 }, { "epoch": 2.220882150241213, "grad_norm": 22.201927185058594, "learning_rate": 3.8356645445634575e-08, "logits/chosen": -1.6446106433868408, "logits/rejected": -1.6043132543563843, "logps/chosen": -202.05697631835938, "logps/rejected": -271.5600891113281, "loss": 0.5292, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.451763391494751, "rewards/margins": 0.7237731218338013, "rewards/rejected": -2.1755363941192627, "step": 12890 }, { "epoch": 2.222605099931082, "grad_norm": 31.349964141845703, "learning_rate": 3.8198906523084594e-08, "logits/chosen": -1.5713380575180054, "logits/rejected": -1.516449213027954, "logps/chosen": -213.3640594482422, "logps/rejected": -311.71624755859375, "loss": 0.441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5946190357208252, "rewards/margins": 0.9784560203552246, "rewards/rejected": -2.57307505607605, "step": 12900 }, { "epoch": 2.224328049620951, "grad_norm": 30.3024959564209, "learning_rate": 3.8041416004066364e-08, "logits/chosen": -1.671606421470642, "logits/rejected": -1.6348520517349243, "logps/chosen": -189.18295288085938, "logps/rejected": -270.96551513671875, "loss": 0.4925, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.385255217552185, "rewards/margins": 0.7706804275512695, "rewards/rejected": -2.155935764312744, "step": 12910 }, { "epoch": 2.22605099931082, "grad_norm": 22.79236602783203, "learning_rate": 3.7884174521597866e-08, "logits/chosen": -1.5699918270111084, "logits/rejected": -1.526435136795044, "logps/chosen": -205.57717895507812, "logps/rejected": -291.1944274902344, "loss": 0.5002, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.529378056526184, "rewards/margins": 0.8503366708755493, "rewards/rejected": -2.3797144889831543, "step": 12920 }, { "epoch": 2.227773949000689, "grad_norm": 26.476848602294922, "learning_rate": 3.77271827076961e-08, "logits/chosen": -1.6100099086761475, "logits/rejected": -1.5733484029769897, "logps/chosen": -203.7718963623047, "logps/rejected": -272.2909240722656, "loss": 0.5721, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5427305698394775, "rewards/margins": 0.6530159711837769, "rewards/rejected": -2.195746421813965, "step": 12930 }, { "epoch": 2.229496898690558, "grad_norm": 32.81683349609375, "learning_rate": 3.757044119337449e-08, "logits/chosen": -1.544937014579773, "logits/rejected": -1.4977126121520996, "logps/chosen": -205.29995727539062, "logps/rejected": -283.4554748535156, "loss": 0.5148, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5327036380767822, "rewards/margins": 0.8081310391426086, "rewards/rejected": -2.340834617614746, "step": 12940 }, { "epoch": 2.231219848380427, "grad_norm": 39.6876106262207, "learning_rate": 3.741395060864038e-08, "logits/chosen": -1.5567517280578613, "logits/rejected": -1.5120809078216553, "logps/chosen": -207.96401977539062, "logps/rejected": -302.92889404296875, "loss": 0.4579, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5375335216522217, "rewards/margins": 0.9502789378166199, "rewards/rejected": -2.4878125190734863, "step": 12950 }, { "epoch": 2.2329427980702965, "grad_norm": 27.241886138916016, "learning_rate": 3.7257711582492645e-08, "logits/chosen": -1.5266079902648926, "logits/rejected": -1.4739742279052734, "logps/chosen": -215.13552856445312, "logps/rejected": -283.886962890625, "loss": 0.5142, "rewards/accuracies": 0.75, "rewards/chosen": -1.5669468641281128, "rewards/margins": 0.741981565952301, "rewards/rejected": -2.3089284896850586, "step": 12960 }, { "epoch": 2.2346657477601655, "grad_norm": 26.52547264099121, "learning_rate": 3.7101724742918915e-08, "logits/chosen": -1.4905484914779663, "logits/rejected": -1.436008095741272, "logps/chosen": -198.09500122070312, "logps/rejected": -292.9523620605469, "loss": 0.4494, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4538332223892212, "rewards/margins": 0.989672064781189, "rewards/rejected": -2.4435055255889893, "step": 12970 }, { "epoch": 2.2363886974500344, "grad_norm": 32.57485580444336, "learning_rate": 3.694599071689329e-08, "logits/chosen": -1.5237276554107666, "logits/rejected": -1.4900181293487549, "logps/chosen": -203.85452270507812, "logps/rejected": -285.77557373046875, "loss": 0.4753, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5306895971298218, "rewards/margins": 0.7909470796585083, "rewards/rejected": -2.32163667678833, "step": 12980 }, { "epoch": 2.2381116471399034, "grad_norm": 25.511980056762695, "learning_rate": 3.679051013037361e-08, "logits/chosen": -1.625741720199585, "logits/rejected": -1.5625674724578857, "logps/chosen": -222.39999389648438, "logps/rejected": -308.61663818359375, "loss": 0.4847, "rewards/accuracies": 0.75, "rewards/chosen": -1.6366920471191406, "rewards/margins": 0.919256329536438, "rewards/rejected": -2.555948257446289, "step": 12990 }, { "epoch": 2.2398345968297724, "grad_norm": 35.11985397338867, "learning_rate": 3.663528360829915e-08, "logits/chosen": -1.5546085834503174, "logits/rejected": -1.5087900161743164, "logps/chosen": -215.45242309570312, "logps/rejected": -302.48028564453125, "loss": 0.474, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.620145559310913, "rewards/margins": 0.8866864442825317, "rewards/rejected": -2.5068321228027344, "step": 13000 }, { "epoch": 2.241557546519642, "grad_norm": 27.722003936767578, "learning_rate": 3.6480311774587877e-08, "logits/chosen": -1.5375518798828125, "logits/rejected": -1.4975337982177734, "logps/chosen": -218.1388702392578, "logps/rejected": -289.1412658691406, "loss": 0.5443, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6785621643066406, "rewards/margins": 0.7160916328430176, "rewards/rejected": -2.394653558731079, "step": 13010 }, { "epoch": 2.2432804962095108, "grad_norm": 36.15455627441406, "learning_rate": 3.6325595252134144e-08, "logits/chosen": -1.535723090171814, "logits/rejected": -1.4892303943634033, "logps/chosen": -203.90689086914062, "logps/rejected": -283.9646911621094, "loss": 0.5007, "rewards/accuracies": 0.75, "rewards/chosen": -1.5217381715774536, "rewards/margins": 0.818256676197052, "rewards/rejected": -2.3399949073791504, "step": 13020 }, { "epoch": 2.2450034458993797, "grad_norm": 30.270002365112305, "learning_rate": 3.617113466280612e-08, "logits/chosen": -1.6230676174163818, "logits/rejected": -1.5725562572479248, "logps/chosen": -211.6242218017578, "logps/rejected": -279.1073303222656, "loss": 0.5389, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5698533058166504, "rewards/margins": 0.6981257200241089, "rewards/rejected": -2.267979145050049, "step": 13030 }, { "epoch": 2.2467263955892487, "grad_norm": 36.30577087402344, "learning_rate": 3.601693062744322e-08, "logits/chosen": -1.6449174880981445, "logits/rejected": -1.5922715663909912, "logps/chosen": -201.2418670654297, "logps/rejected": -302.2711486816406, "loss": 0.4626, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4769412279129028, "rewards/margins": 0.9876639246940613, "rewards/rejected": -2.4646053314208984, "step": 13040 }, { "epoch": 2.2484493452791177, "grad_norm": 29.94109344482422, "learning_rate": 3.586298376585363e-08, "logits/chosen": -1.6053701639175415, "logits/rejected": -1.5708749294281006, "logps/chosen": -215.24267578125, "logps/rejected": -298.4056701660156, "loss": 0.5218, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.634469747543335, "rewards/margins": 0.810254693031311, "rewards/rejected": -2.4447245597839355, "step": 13050 }, { "epoch": 2.250172294968987, "grad_norm": 37.369834899902344, "learning_rate": 3.5709294696811985e-08, "logits/chosen": -1.6182520389556885, "logits/rejected": -1.577430248260498, "logps/chosen": -204.96432495117188, "logps/rejected": -295.6147155761719, "loss": 0.4989, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5216501951217651, "rewards/margins": 0.8875336647033691, "rewards/rejected": -2.4091837406158447, "step": 13060 }, { "epoch": 2.251895244658856, "grad_norm": 62.851322174072266, "learning_rate": 3.555586403805663e-08, "logits/chosen": -1.5494712591171265, "logits/rejected": -1.4969944953918457, "logps/chosen": -205.6003875732422, "logps/rejected": -283.4789123535156, "loss": 0.5259, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4812865257263184, "rewards/margins": 0.8190127611160278, "rewards/rejected": -2.300299644470215, "step": 13070 }, { "epoch": 2.253618194348725, "grad_norm": 24.81918716430664, "learning_rate": 3.540269240628726e-08, "logits/chosen": -1.5176604986190796, "logits/rejected": -1.4786919355392456, "logps/chosen": -208.3052978515625, "logps/rejected": -295.0064392089844, "loss": 0.486, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.576119065284729, "rewards/margins": 0.8568998575210571, "rewards/rejected": -2.433018684387207, "step": 13080 }, { "epoch": 2.255341144038594, "grad_norm": 29.975021362304688, "learning_rate": 3.52497804171625e-08, "logits/chosen": -1.6407638788223267, "logits/rejected": -1.5833606719970703, "logps/chosen": -214.4311065673828, "logps/rejected": -297.3269348144531, "loss": 0.4981, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6044986248016357, "rewards/margins": 0.8463436961174011, "rewards/rejected": -2.4508423805236816, "step": 13090 }, { "epoch": 2.257064093728463, "grad_norm": 23.319719314575195, "learning_rate": 3.509712868529738e-08, "logits/chosen": -1.682140588760376, "logits/rejected": -1.623839020729065, "logps/chosen": -207.14645385742188, "logps/rejected": -294.4388122558594, "loss": 0.4902, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5284478664398193, "rewards/margins": 0.9110603332519531, "rewards/rejected": -2.4395084381103516, "step": 13100 }, { "epoch": 2.2587870434183324, "grad_norm": 25.40921401977539, "learning_rate": 3.494473782426073e-08, "logits/chosen": -1.535962462425232, "logits/rejected": -1.4818295240402222, "logps/chosen": -212.125244140625, "logps/rejected": -296.21575927734375, "loss": 0.5267, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5828105211257935, "rewards/margins": 0.863786518573761, "rewards/rejected": -2.44659686088562, "step": 13110 }, { "epoch": 2.2605099931082013, "grad_norm": 26.571462631225586, "learning_rate": 3.479260844657297e-08, "logits/chosen": -1.6702839136123657, "logits/rejected": -1.635602593421936, "logps/chosen": -209.6977996826172, "logps/rejected": -287.2583312988281, "loss": 0.5404, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5520085096359253, "rewards/margins": 0.7901738882064819, "rewards/rejected": -2.3421826362609863, "step": 13120 }, { "epoch": 2.2622329427980703, "grad_norm": 26.704050064086914, "learning_rate": 3.46407411637034e-08, "logits/chosen": -1.6938931941986084, "logits/rejected": -1.6417863368988037, "logps/chosen": -199.1559600830078, "logps/rejected": -306.63714599609375, "loss": 0.4476, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4523357152938843, "rewards/margins": 1.0319113731384277, "rewards/rejected": -2.4842472076416016, "step": 13130 }, { "epoch": 2.2639558924879393, "grad_norm": 23.49358367919922, "learning_rate": 3.448913658606798e-08, "logits/chosen": -1.526761531829834, "logits/rejected": -1.4799644947052002, "logps/chosen": -202.72506713867188, "logps/rejected": -284.6557922363281, "loss": 0.4739, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.490138292312622, "rewards/margins": 0.8108283281326294, "rewards/rejected": -2.300966501235962, "step": 13140 }, { "epoch": 2.2656788421778082, "grad_norm": 30.544790267944336, "learning_rate": 3.43377953230266e-08, "logits/chosen": -1.5474731922149658, "logits/rejected": -1.4954991340637207, "logps/chosen": -201.0833282470703, "logps/rejected": -301.4603576660156, "loss": 0.4487, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4589382410049438, "rewards/margins": 1.0210484266281128, "rewards/rejected": -2.4799866676330566, "step": 13150 }, { "epoch": 2.2674017918676777, "grad_norm": 34.979034423828125, "learning_rate": 3.418671798288093e-08, "logits/chosen": -1.5324780941009521, "logits/rejected": -1.4817928075790405, "logps/chosen": -228.0107421875, "logps/rejected": -306.8648681640625, "loss": 0.511, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7148380279541016, "rewards/margins": 0.8141125440597534, "rewards/rejected": -2.5289502143859863, "step": 13160 }, { "epoch": 2.2691247415575466, "grad_norm": 35.73521423339844, "learning_rate": 3.403590517287175e-08, "logits/chosen": -1.5787086486816406, "logits/rejected": -1.5398588180541992, "logps/chosen": -215.3478546142578, "logps/rejected": -292.5067443847656, "loss": 0.4972, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6153271198272705, "rewards/margins": 0.7852949500083923, "rewards/rejected": -2.4006218910217285, "step": 13170 }, { "epoch": 2.2708476912474156, "grad_norm": 39.83544921875, "learning_rate": 3.388535749917653e-08, "logits/chosen": -1.5618702173233032, "logits/rejected": -1.5051721334457397, "logps/chosen": -207.89285278320312, "logps/rejected": -304.93505859375, "loss": 0.436, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.5455347299575806, "rewards/margins": 0.9910475015640259, "rewards/rejected": -2.5365822315216064, "step": 13180 }, { "epoch": 2.2725706409372846, "grad_norm": 36.779273986816406, "learning_rate": 3.373507556690718e-08, "logits/chosen": -1.5543924570083618, "logits/rejected": -1.5112448930740356, "logps/chosen": -221.789306640625, "logps/rejected": -290.9019775390625, "loss": 0.5493, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6608854532241821, "rewards/margins": 0.7263485193252563, "rewards/rejected": -2.3872339725494385, "step": 13190 }, { "epoch": 2.2742935906271535, "grad_norm": 32.65153121948242, "learning_rate": 3.358505998010743e-08, "logits/chosen": -1.5592164993286133, "logits/rejected": -1.5155131816864014, "logps/chosen": -212.46090698242188, "logps/rejected": -281.14141845703125, "loss": 0.5395, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5759353637695312, "rewards/margins": 0.7066248655319214, "rewards/rejected": -2.282560110092163, "step": 13200 }, { "epoch": 2.2742935906271535, "eval_logits/chosen": -1.6833596229553223, "eval_logits/rejected": -1.6590664386749268, "eval_logps/chosen": -212.43768310546875, "eval_logps/rejected": -253.689208984375, "eval_loss": 0.6315019726753235, "eval_rewards/accuracies": 0.6428903341293335, "eval_rewards/chosen": -1.5372580289840698, "eval_rewards/margins": 0.36783286929130554, "eval_rewards/rejected": -1.9050910472869873, "eval_runtime": 384.8229, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 13200 }, { "epoch": 2.2760165403170225, "grad_norm": 31.827877044677734, "learning_rate": 3.343531134175046e-08, "logits/chosen": -1.6485798358917236, "logits/rejected": -1.6175024509429932, "logps/chosen": -204.543701171875, "logps/rejected": -279.55572509765625, "loss": 0.5524, "rewards/accuracies": 0.75, "rewards/chosen": -1.5241453647613525, "rewards/margins": 0.7401344180107117, "rewards/rejected": -2.264279842376709, "step": 13210 }, { "epoch": 2.277739490006892, "grad_norm": 24.60109519958496, "learning_rate": 3.3285830253736405e-08, "logits/chosen": -1.5928890705108643, "logits/rejected": -1.5579743385314941, "logps/chosen": -206.6677703857422, "logps/rejected": -278.94769287109375, "loss": 0.5106, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5167338848114014, "rewards/margins": 0.712703287601471, "rewards/rejected": -2.2294368743896484, "step": 13220 }, { "epoch": 2.279462439696761, "grad_norm": 31.586509704589844, "learning_rate": 3.313661731689013e-08, "logits/chosen": -1.5826867818832397, "logits/rejected": -1.5373554229736328, "logps/chosen": -194.94052124023438, "logps/rejected": -287.5800476074219, "loss": 0.4702, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.440435528755188, "rewards/margins": 0.9033792614936829, "rewards/rejected": -2.3438143730163574, "step": 13230 }, { "epoch": 2.28118538938663, "grad_norm": 27.885540008544922, "learning_rate": 3.298767313095865e-08, "logits/chosen": -1.5880454778671265, "logits/rejected": -1.5458592176437378, "logps/chosen": -219.48892211914062, "logps/rejected": -303.6644287109375, "loss": 0.4621, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6815032958984375, "rewards/margins": 0.8260301351547241, "rewards/rejected": -2.507533311843872, "step": 13240 }, { "epoch": 2.282908339076499, "grad_norm": 29.93614387512207, "learning_rate": 3.283899829460873e-08, "logits/chosen": -1.5705511569976807, "logits/rejected": -1.52997624874115, "logps/chosen": -207.859375, "logps/rejected": -308.9605712890625, "loss": 0.477, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.561885952949524, "rewards/margins": 0.9680153131484985, "rewards/rejected": -2.5299010276794434, "step": 13250 }, { "epoch": 2.2846312887663682, "grad_norm": 37.0043830871582, "learning_rate": 3.269059340542448e-08, "logits/chosen": -1.6618578433990479, "logits/rejected": -1.616093635559082, "logps/chosen": -216.92294311523438, "logps/rejected": -316.55963134765625, "loss": 0.4876, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6888090372085571, "rewards/margins": 0.9507328867912292, "rewards/rejected": -2.6395421028137207, "step": 13260 }, { "epoch": 2.286354238456237, "grad_norm": 27.53391456604004, "learning_rate": 3.2542459059905127e-08, "logits/chosen": -1.5043879747390747, "logits/rejected": -1.465038537979126, "logps/chosen": -221.95388793945312, "logps/rejected": -315.71649169921875, "loss": 0.4788, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6740529537200928, "rewards/margins": 0.9358172416687012, "rewards/rejected": -2.609870195388794, "step": 13270 }, { "epoch": 2.288077188146106, "grad_norm": 31.898828506469727, "learning_rate": 3.239459585346228e-08, "logits/chosen": -1.5886051654815674, "logits/rejected": -1.5323841571807861, "logps/chosen": -212.6425018310547, "logps/rejected": -292.26690673828125, "loss": 0.5453, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.589463233947754, "rewards/margins": 0.8199914693832397, "rewards/rejected": -2.409454584121704, "step": 13280 }, { "epoch": 2.289800137835975, "grad_norm": 41.60102844238281, "learning_rate": 3.224700438041789e-08, "logits/chosen": -1.5710105895996094, "logits/rejected": -1.522861361503601, "logps/chosen": -207.9844970703125, "logps/rejected": -298.4026794433594, "loss": 0.4795, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5416791439056396, "rewards/margins": 0.9125336408615112, "rewards/rejected": -2.4542126655578613, "step": 13290 }, { "epoch": 2.291523087525844, "grad_norm": 48.986732482910156, "learning_rate": 3.209968523400165e-08, "logits/chosen": -1.5890921354293823, "logits/rejected": -1.5477979183197021, "logps/chosen": -223.21560668945312, "logps/rejected": -297.00799560546875, "loss": 0.5494, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7203028202056885, "rewards/margins": 0.734029233455658, "rewards/rejected": -2.454331874847412, "step": 13300 }, { "epoch": 2.293246037215713, "grad_norm": 40.81411361694336, "learning_rate": 3.195263900634863e-08, "logits/chosen": -1.5560508966445923, "logits/rejected": -1.5148818492889404, "logps/chosen": -222.29904174804688, "logps/rejected": -302.53082275390625, "loss": 0.5463, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6516530513763428, "rewards/margins": 0.8341676592826843, "rewards/rejected": -2.485820770263672, "step": 13310 }, { "epoch": 2.2949689869055825, "grad_norm": 54.13987350463867, "learning_rate": 3.180586628849692e-08, "logits/chosen": -1.644132375717163, "logits/rejected": -1.5857830047607422, "logps/chosen": -218.32791137695312, "logps/rejected": -278.406982421875, "loss": 0.5784, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6245272159576416, "rewards/margins": 0.6808105111122131, "rewards/rejected": -2.305337429046631, "step": 13320 }, { "epoch": 2.2966919365954515, "grad_norm": 37.29862594604492, "learning_rate": 3.165936767038534e-08, "logits/chosen": -1.5331171751022339, "logits/rejected": -1.4829689264297485, "logps/chosen": -198.60708618164062, "logps/rejected": -291.91204833984375, "loss": 0.4975, "rewards/accuracies": 0.75, "rewards/chosen": -1.4252647161483765, "rewards/margins": 0.9944926500320435, "rewards/rejected": -2.41975736618042, "step": 13330 }, { "epoch": 2.2984148862853204, "grad_norm": 36.688751220703125, "learning_rate": 3.151314374085097e-08, "logits/chosen": -1.6717342138290405, "logits/rejected": -1.6321557760238647, "logps/chosen": -210.4832000732422, "logps/rejected": -285.9844970703125, "loss": 0.5545, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5588440895080566, "rewards/margins": 0.7591500878334045, "rewards/rejected": -2.3179941177368164, "step": 13340 }, { "epoch": 2.3001378359751894, "grad_norm": 36.74166488647461, "learning_rate": 3.136719508762674e-08, "logits/chosen": -1.6364288330078125, "logits/rejected": -1.571699857711792, "logps/chosen": -200.09927368164062, "logps/rejected": -281.8629150390625, "loss": 0.4931, "rewards/accuracies": 0.75, "rewards/chosen": -1.4316269159317017, "rewards/margins": 0.8701766133308411, "rewards/rejected": -2.3018035888671875, "step": 13350 }, { "epoch": 2.301860785665059, "grad_norm": 27.395763397216797, "learning_rate": 3.1221522297339177e-08, "logits/chosen": -1.6450027227401733, "logits/rejected": -1.6021854877471924, "logps/chosen": -205.87466430664062, "logps/rejected": -290.9104919433594, "loss": 0.4869, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4858473539352417, "rewards/margins": 0.8877309560775757, "rewards/rejected": -2.3735783100128174, "step": 13360 }, { "epoch": 2.3035837353549278, "grad_norm": 17.683246612548828, "learning_rate": 3.1076125955506015e-08, "logits/chosen": -1.5695432424545288, "logits/rejected": -1.5173529386520386, "logps/chosen": -202.6400604248047, "logps/rejected": -292.83282470703125, "loss": 0.514, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4840691089630127, "rewards/margins": 0.9159029722213745, "rewards/rejected": -2.3999722003936768, "step": 13370 }, { "epoch": 2.3053066850447967, "grad_norm": 39.179649353027344, "learning_rate": 3.0931006646533866e-08, "logits/chosen": -1.5399872064590454, "logits/rejected": -1.4884560108184814, "logps/chosen": -212.501953125, "logps/rejected": -280.26556396484375, "loss": 0.5347, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5561041831970215, "rewards/margins": 0.7119619846343994, "rewards/rejected": -2.26806640625, "step": 13380 }, { "epoch": 2.3070296347346657, "grad_norm": 32.08159255981445, "learning_rate": 3.078616495371574e-08, "logits/chosen": -1.523012399673462, "logits/rejected": -1.4631705284118652, "logps/chosen": -196.67645263671875, "logps/rejected": -275.2851257324219, "loss": 0.5165, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3953605890274048, "rewards/margins": 0.8383074998855591, "rewards/rejected": -2.2336678504943848, "step": 13390 }, { "epoch": 2.3087525844245347, "grad_norm": 19.68545913696289, "learning_rate": 3.064160145922884e-08, "logits/chosen": -1.6055904626846313, "logits/rejected": -1.551365613937378, "logps/chosen": -186.95303344726562, "logps/rejected": -281.7250061035156, "loss": 0.4451, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3198156356811523, "rewards/margins": 0.9908710718154907, "rewards/rejected": -2.3106865882873535, "step": 13400 }, { "epoch": 2.3104755341144037, "grad_norm": 17.97285270690918, "learning_rate": 3.0497316744132215e-08, "logits/chosen": -1.6288455724716187, "logits/rejected": -1.5841569900512695, "logps/chosen": -210.47897338867188, "logps/rejected": -289.45245361328125, "loss": 0.5404, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5649375915527344, "rewards/margins": 0.8170774579048157, "rewards/rejected": -2.3820149898529053, "step": 13410 }, { "epoch": 2.312198483804273, "grad_norm": 33.42015838623047, "learning_rate": 3.035331138836431e-08, "logits/chosen": -1.6192560195922852, "logits/rejected": -1.5792990922927856, "logps/chosen": -205.99609375, "logps/rejected": -300.41070556640625, "loss": 0.4713, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5238395929336548, "rewards/margins": 0.9164296984672546, "rewards/rejected": -2.4402692317962646, "step": 13420 }, { "epoch": 2.313921433494142, "grad_norm": 33.72605514526367, "learning_rate": 3.020958597074081e-08, "logits/chosen": -1.6985355615615845, "logits/rejected": -1.6434152126312256, "logps/chosen": -207.11843872070312, "logps/rejected": -298.39898681640625, "loss": 0.4918, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.515989065170288, "rewards/margins": 0.8947912454605103, "rewards/rejected": -2.4107799530029297, "step": 13430 }, { "epoch": 2.315644383184011, "grad_norm": 27.87947654724121, "learning_rate": 3.006614106895211e-08, "logits/chosen": -1.551805853843689, "logits/rejected": -1.5149122476577759, "logps/chosen": -189.2252197265625, "logps/rejected": -256.5485534667969, "loss": 0.5483, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3701688051223755, "rewards/margins": 0.6807070970535278, "rewards/rejected": -2.0508759021759033, "step": 13440 }, { "epoch": 2.31736733287388, "grad_norm": 28.670364379882812, "learning_rate": 2.992297725956121e-08, "logits/chosen": -1.5825433731079102, "logits/rejected": -1.537448763847351, "logps/chosen": -195.5888214111328, "logps/rejected": -270.58636474609375, "loss": 0.5146, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4518344402313232, "rewards/margins": 0.7301121950149536, "rewards/rejected": -2.1819469928741455, "step": 13450 }, { "epoch": 2.3190902825637494, "grad_norm": 34.71762466430664, "learning_rate": 2.978009511800116e-08, "logits/chosen": -1.6074050664901733, "logits/rejected": -1.5511646270751953, "logps/chosen": -187.51683044433594, "logps/rejected": -280.5148010253906, "loss": 0.4486, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.334885597229004, "rewards/margins": 0.9614629745483398, "rewards/rejected": -2.2963485717773438, "step": 13460 }, { "epoch": 2.3208132322536184, "grad_norm": 30.473466873168945, "learning_rate": 2.9637495218572972e-08, "logits/chosen": -1.4833455085754395, "logits/rejected": -1.4295669794082642, "logps/chosen": -211.89492797851562, "logps/rejected": -290.8565368652344, "loss": 0.5145, "rewards/accuracies": 0.75, "rewards/chosen": -1.5683119297027588, "rewards/margins": 0.8340962529182434, "rewards/rejected": -2.4024083614349365, "step": 13470 }, { "epoch": 2.3225361819434873, "grad_norm": 33.9401969909668, "learning_rate": 2.9495178134443254e-08, "logits/chosen": -1.6398298740386963, "logits/rejected": -1.5832942724227905, "logps/chosen": -198.39564514160156, "logps/rejected": -279.4765930175781, "loss": 0.4923, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.455733299255371, "rewards/margins": 0.8250287175178528, "rewards/rejected": -2.280762195587158, "step": 13480 }, { "epoch": 2.3242591316333563, "grad_norm": 24.341140747070312, "learning_rate": 2.9353144437641662e-08, "logits/chosen": -1.6076714992523193, "logits/rejected": -1.5592044591903687, "logps/chosen": -218.0586395263672, "logps/rejected": -297.05633544921875, "loss": 0.515, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.641355276107788, "rewards/margins": 0.79271399974823, "rewards/rejected": -2.4340693950653076, "step": 13490 }, { "epoch": 2.3259820813232253, "grad_norm": 31.230804443359375, "learning_rate": 2.9211394699058987e-08, "logits/chosen": -1.6254823207855225, "logits/rejected": -1.5697517395019531, "logps/chosen": -204.39346313476562, "logps/rejected": -297.74896240234375, "loss": 0.4478, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4872136116027832, "rewards/margins": 0.9720531702041626, "rewards/rejected": -2.4592669010162354, "step": 13500 }, { "epoch": 2.3277050310130942, "grad_norm": 19.279212951660156, "learning_rate": 2.9069929488444678e-08, "logits/chosen": -1.5557644367218018, "logits/rejected": -1.5264142751693726, "logps/chosen": -196.2863311767578, "logps/rejected": -276.21881103515625, "loss": 0.5231, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.469422459602356, "rewards/margins": 0.7577401399612427, "rewards/rejected": -2.2271625995635986, "step": 13510 }, { "epoch": 2.3294279807029636, "grad_norm": 26.38505744934082, "learning_rate": 2.8928749374404448e-08, "logits/chosen": -1.4847887754440308, "logits/rejected": -1.440119981765747, "logps/chosen": -201.7764434814453, "logps/rejected": -298.28509521484375, "loss": 0.4588, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5093433856964111, "rewards/margins": 0.9552731513977051, "rewards/rejected": -2.464616298675537, "step": 13520 }, { "epoch": 2.3311509303928326, "grad_norm": 37.105743408203125, "learning_rate": 2.8787854924398123e-08, "logits/chosen": -1.5994256734848022, "logits/rejected": -1.552588939666748, "logps/chosen": -210.1918182373047, "logps/rejected": -270.26116943359375, "loss": 0.5475, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.552570104598999, "rewards/margins": 0.6298242807388306, "rewards/rejected": -2.182394504547119, "step": 13530 }, { "epoch": 2.3328738800827016, "grad_norm": 53.39751434326172, "learning_rate": 2.8647246704737382e-08, "logits/chosen": -1.51864755153656, "logits/rejected": -1.4642963409423828, "logps/chosen": -211.7984619140625, "logps/rejected": -294.93060302734375, "loss": 0.492, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5604420900344849, "rewards/margins": 0.885561466217041, "rewards/rejected": -2.4460034370422363, "step": 13540 }, { "epoch": 2.3345968297725705, "grad_norm": 31.251556396484375, "learning_rate": 2.8506925280583417e-08, "logits/chosen": -1.5718047618865967, "logits/rejected": -1.5317022800445557, "logps/chosen": -213.5127716064453, "logps/rejected": -282.591796875, "loss": 0.5498, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6243717670440674, "rewards/margins": 0.6968979835510254, "rewards/rejected": -2.321269989013672, "step": 13550 }, { "epoch": 2.3363197794624395, "grad_norm": 41.41878890991211, "learning_rate": 2.8366891215944598e-08, "logits/chosen": -1.6352542638778687, "logits/rejected": -1.6042063236236572, "logps/chosen": -200.40451049804688, "logps/rejected": -274.9106750488281, "loss": 0.5705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.486896276473999, "rewards/margins": 0.7334551811218262, "rewards/rejected": -2.220351457595825, "step": 13560 }, { "epoch": 2.338042729152309, "grad_norm": 29.753414154052734, "learning_rate": 2.8227145073674385e-08, "logits/chosen": -1.5187408924102783, "logits/rejected": -1.4685065746307373, "logps/chosen": -203.2227783203125, "logps/rejected": -302.477294921875, "loss": 0.4416, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.52272367477417, "rewards/margins": 0.9911115765571594, "rewards/rejected": -2.5138351917266846, "step": 13570 }, { "epoch": 2.339765678842178, "grad_norm": 24.401880264282227, "learning_rate": 2.8087687415468896e-08, "logits/chosen": -1.5765650272369385, "logits/rejected": -1.5453118085861206, "logps/chosen": -192.10269165039062, "logps/rejected": -274.36383056640625, "loss": 0.4839, "rewards/accuracies": 0.75, "rewards/chosen": -1.4052332639694214, "rewards/margins": 0.8032985925674438, "rewards/rejected": -2.2085318565368652, "step": 13580 }, { "epoch": 2.341488628532047, "grad_norm": 28.540246963500977, "learning_rate": 2.7948518801864697e-08, "logits/chosen": -1.5397393703460693, "logits/rejected": -1.5157146453857422, "logps/chosen": -213.9230194091797, "logps/rejected": -296.255859375, "loss": 0.5176, "rewards/accuracies": 0.75, "rewards/chosen": -1.6143159866333008, "rewards/margins": 0.8101215362548828, "rewards/rejected": -2.4244375228881836, "step": 13590 }, { "epoch": 2.343211578221916, "grad_norm": 47.43010330200195, "learning_rate": 2.780963979223663e-08, "logits/chosen": -1.5442464351654053, "logits/rejected": -1.4999010562896729, "logps/chosen": -220.0675506591797, "logps/rejected": -299.0097351074219, "loss": 0.5059, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6141239404678345, "rewards/margins": 0.8025999069213867, "rewards/rejected": -2.4167237281799316, "step": 13600 }, { "epoch": 2.343211578221916, "eval_logits/chosen": -1.7051271200180054, "eval_logits/rejected": -1.6811729669570923, "eval_logps/chosen": -206.69924926757812, "eval_logps/rejected": -246.98841857910156, "eval_loss": 0.631769061088562, "eval_rewards/accuracies": 0.643122673034668, "eval_rewards/chosen": -1.479873538017273, "eval_rewards/margins": 0.35820940136909485, "eval_rewards/rejected": -1.8380827903747559, "eval_runtime": 384.7838, "eval_samples_per_second": 11.186, "eval_steps_per_second": 1.398, "step": 13600 }, { "epoch": 2.344934527911785, "grad_norm": 39.63872528076172, "learning_rate": 2.7671050944795494e-08, "logits/chosen": -1.7334047555923462, "logits/rejected": -1.6934597492218018, "logps/chosen": -201.57763671875, "logps/rejected": -271.024658203125, "loss": 0.5537, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4877710342407227, "rewards/margins": 0.6896355152130127, "rewards/rejected": -2.1774067878723145, "step": 13610 }, { "epoch": 2.346657477601654, "grad_norm": 21.51317024230957, "learning_rate": 2.753275281658578e-08, "logits/chosen": -1.6060326099395752, "logits/rejected": -1.5415525436401367, "logps/chosen": -202.09640502929688, "logps/rejected": -286.78485107421875, "loss": 0.4627, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4595098495483398, "rewards/margins": 0.891812801361084, "rewards/rejected": -2.351322650909424, "step": 13620 }, { "epoch": 2.348380427291523, "grad_norm": 26.84193992614746, "learning_rate": 2.7394745963483414e-08, "logits/chosen": -1.5497287511825562, "logits/rejected": -1.484168291091919, "logps/chosen": -208.8119659423828, "logps/rejected": -305.40679931640625, "loss": 0.4444, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5409910678863525, "rewards/margins": 1.0113693475723267, "rewards/rejected": -2.5523600578308105, "step": 13630 }, { "epoch": 2.350103376981392, "grad_norm": 26.73301887512207, "learning_rate": 2.725703094019368e-08, "logits/chosen": -1.5552833080291748, "logits/rejected": -1.510765790939331, "logps/chosen": -205.0259246826172, "logps/rejected": -292.72039794921875, "loss": 0.4904, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5017683506011963, "rewards/margins": 0.88092440366745, "rewards/rejected": -2.382692813873291, "step": 13640 }, { "epoch": 2.351826326671261, "grad_norm": 31.266645431518555, "learning_rate": 2.7119608300248842e-08, "logits/chosen": -1.6274423599243164, "logits/rejected": -1.5821449756622314, "logps/chosen": -216.30654907226562, "logps/rejected": -301.15802001953125, "loss": 0.5018, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6229690313339233, "rewards/margins": 0.8716036677360535, "rewards/rejected": -2.494572639465332, "step": 13650 }, { "epoch": 2.35354927636113, "grad_norm": 35.00014877319336, "learning_rate": 2.698247859600591e-08, "logits/chosen": -1.5113948583602905, "logits/rejected": -1.4610074758529663, "logps/chosen": -200.38381958007812, "logps/rejected": -281.7772521972656, "loss": 0.4975, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4734694957733154, "rewards/margins": 0.8053814172744751, "rewards/rejected": -2.278850793838501, "step": 13660 }, { "epoch": 2.3552722260509995, "grad_norm": 33.406166076660156, "learning_rate": 2.6845642378644463e-08, "logits/chosen": -1.6255537271499634, "logits/rejected": -1.5784156322479248, "logps/chosen": -210.26931762695312, "logps/rejected": -292.46734619140625, "loss": 0.501, "rewards/accuracies": 0.75, "rewards/chosen": -1.5554783344268799, "rewards/margins": 0.8226876258850098, "rewards/rejected": -2.3781659603118896, "step": 13670 }, { "epoch": 2.3569951757408685, "grad_norm": 29.371341705322266, "learning_rate": 2.6709100198164513e-08, "logits/chosen": -1.6458193063735962, "logits/rejected": -1.5945569276809692, "logps/chosen": -196.690185546875, "logps/rejected": -265.1627502441406, "loss": 0.5321, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4204047918319702, "rewards/margins": 0.7230075597763062, "rewards/rejected": -2.1434123516082764, "step": 13680 }, { "epoch": 2.3587181254307374, "grad_norm": 30.44771385192871, "learning_rate": 2.657285260338421e-08, "logits/chosen": -1.6045669317245483, "logits/rejected": -1.548130989074707, "logps/chosen": -203.29916381835938, "logps/rejected": -295.7070617675781, "loss": 0.5, "rewards/accuracies": 0.75, "rewards/chosen": -1.4948320388793945, "rewards/margins": 0.9141039848327637, "rewards/rejected": -2.408936023712158, "step": 13690 }, { "epoch": 2.3604410751206064, "grad_norm": 38.34294128417969, "learning_rate": 2.643690014193758e-08, "logits/chosen": -1.602710485458374, "logits/rejected": -1.5532397031784058, "logps/chosen": -212.09945678710938, "logps/rejected": -274.312744140625, "loss": 0.5696, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.530841588973999, "rewards/margins": 0.6810584664344788, "rewards/rejected": -2.211899995803833, "step": 13700 }, { "epoch": 2.3621640248104754, "grad_norm": 45.795570373535156, "learning_rate": 2.6301243360272394e-08, "logits/chosen": -1.566232442855835, "logits/rejected": -1.5085102319717407, "logps/chosen": -197.57620239257812, "logps/rejected": -284.8357849121094, "loss": 0.4789, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.440800428390503, "rewards/margins": 0.868538498878479, "rewards/rejected": -2.3093388080596924, "step": 13710 }, { "epoch": 2.3638869745003444, "grad_norm": 40.264739990234375, "learning_rate": 2.6165882803648055e-08, "logits/chosen": -1.5383836030960083, "logits/rejected": -1.4786090850830078, "logps/chosen": -197.35031127929688, "logps/rejected": -275.8687438964844, "loss": 0.5075, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4295049905776978, "rewards/margins": 0.8047353029251099, "rewards/rejected": -2.2342402935028076, "step": 13720 }, { "epoch": 2.3656099241902138, "grad_norm": 52.54778289794922, "learning_rate": 2.60308190161332e-08, "logits/chosen": -1.693058967590332, "logits/rejected": -1.6434071063995361, "logps/chosen": -199.19119262695312, "logps/rejected": -299.73699951171875, "loss": 0.4603, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4446357488632202, "rewards/margins": 0.9695494771003723, "rewards/rejected": -2.4141855239868164, "step": 13730 }, { "epoch": 2.3673328738800827, "grad_norm": 20.35848045349121, "learning_rate": 2.5896052540603706e-08, "logits/chosen": -1.6675994396209717, "logits/rejected": -1.6155967712402344, "logps/chosen": -201.94395446777344, "logps/rejected": -291.0851745605469, "loss": 0.4777, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4425544738769531, "rewards/margins": 0.9335956573486328, "rewards/rejected": -2.376150369644165, "step": 13740 }, { "epoch": 2.3690558235699517, "grad_norm": 39.45704650878906, "learning_rate": 2.576158391874047e-08, "logits/chosen": -1.6140620708465576, "logits/rejected": -1.5652718544006348, "logps/chosen": -216.77197265625, "logps/rejected": -310.7403564453125, "loss": 0.4879, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.60541570186615, "rewards/margins": 0.9550965428352356, "rewards/rejected": -2.5605123043060303, "step": 13750 }, { "epoch": 2.3707787732598207, "grad_norm": 49.456642150878906, "learning_rate": 2.562741369102711e-08, "logits/chosen": -1.6827857494354248, "logits/rejected": -1.6329879760742188, "logps/chosen": -201.86753845214844, "logps/rejected": -278.7059326171875, "loss": 0.5225, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4746357202529907, "rewards/margins": 0.8057149648666382, "rewards/rejected": -2.280350685119629, "step": 13760 }, { "epoch": 2.37250172294969, "grad_norm": 43.56124496459961, "learning_rate": 2.549354239674786e-08, "logits/chosen": -1.6783527135849, "logits/rejected": -1.6421291828155518, "logps/chosen": -206.2716064453125, "logps/rejected": -290.7262268066406, "loss": 0.5038, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.555018424987793, "rewards/margins": 0.8067394495010376, "rewards/rejected": -2.361757755279541, "step": 13770 }, { "epoch": 2.374224672639559, "grad_norm": 22.799421310424805, "learning_rate": 2.5359970573985524e-08, "logits/chosen": -1.7318445444107056, "logits/rejected": -1.6665557622909546, "logps/chosen": -214.9160919189453, "logps/rejected": -292.8385009765625, "loss": 0.4964, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5610339641571045, "rewards/margins": 0.8489457964897156, "rewards/rejected": -2.409980058670044, "step": 13780 }, { "epoch": 2.375947622329428, "grad_norm": 23.744430541992188, "learning_rate": 2.522669875961919e-08, "logits/chosen": -1.620179533958435, "logits/rejected": -1.5787187814712524, "logps/chosen": -200.7430877685547, "logps/rejected": -281.3035583496094, "loss": 0.5127, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4429785013198853, "rewards/margins": 0.8367716073989868, "rewards/rejected": -2.279750108718872, "step": 13790 }, { "epoch": 2.377670572019297, "grad_norm": 50.123878479003906, "learning_rate": 2.509372748932195e-08, "logits/chosen": -1.6756843328475952, "logits/rejected": -1.6160857677459717, "logps/chosen": -209.7655487060547, "logps/rejected": -296.96343994140625, "loss": 0.4796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5245649814605713, "rewards/margins": 0.9218786358833313, "rewards/rejected": -2.446443796157837, "step": 13800 }, { "epoch": 2.379393521709166, "grad_norm": 33.61479187011719, "learning_rate": 2.4961057297559064e-08, "logits/chosen": -1.5745052099227905, "logits/rejected": -1.534232497215271, "logps/chosen": -195.0514373779297, "logps/rejected": -282.4466247558594, "loss": 0.5002, "rewards/accuracies": 0.75, "rewards/chosen": -1.428135633468628, "rewards/margins": 0.8580183982849121, "rewards/rejected": -2.286154270172119, "step": 13810 }, { "epoch": 2.381116471399035, "grad_norm": 39.281761169433594, "learning_rate": 2.4828688717585567e-08, "logits/chosen": -1.6656768321990967, "logits/rejected": -1.6113700866699219, "logps/chosen": -215.00863647460938, "logps/rejected": -290.1063537597656, "loss": 0.5251, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5446885824203491, "rewards/margins": 0.8189195394515991, "rewards/rejected": -2.3636081218719482, "step": 13820 }, { "epoch": 2.3828394210889043, "grad_norm": 26.61870574951172, "learning_rate": 2.4696622281444158e-08, "logits/chosen": -1.6838382482528687, "logits/rejected": -1.6528728008270264, "logps/chosen": -200.13461303710938, "logps/rejected": -268.8764953613281, "loss": 0.516, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4510828256607056, "rewards/margins": 0.7129031419754028, "rewards/rejected": -2.1639862060546875, "step": 13830 }, { "epoch": 2.3845623707787733, "grad_norm": 35.59844207763672, "learning_rate": 2.4564858519963195e-08, "logits/chosen": -1.660980463027954, "logits/rejected": -1.6128597259521484, "logps/chosen": -200.73452758789062, "logps/rejected": -267.2124328613281, "loss": 0.5397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4402029514312744, "rewards/margins": 0.7149636745452881, "rewards/rejected": -2.1551668643951416, "step": 13840 }, { "epoch": 2.3862853204686423, "grad_norm": 35.738765716552734, "learning_rate": 2.443339796275432e-08, "logits/chosen": -1.565151572227478, "logits/rejected": -1.5192712545394897, "logps/chosen": -204.0678253173828, "logps/rejected": -276.42156982421875, "loss": 0.5454, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4998000860214233, "rewards/margins": 0.7670835256576538, "rewards/rejected": -2.266883611679077, "step": 13850 }, { "epoch": 2.3880082701585112, "grad_norm": 25.851102828979492, "learning_rate": 2.4302241138210633e-08, "logits/chosen": -1.5774033069610596, "logits/rejected": -1.5349401235580444, "logps/chosen": -203.85476684570312, "logps/rejected": -286.1795654296875, "loss": 0.4733, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5228554010391235, "rewards/margins": 0.8183981776237488, "rewards/rejected": -2.3412535190582275, "step": 13860 }, { "epoch": 2.3897312198483807, "grad_norm": 26.764244079589844, "learning_rate": 2.417138857350428e-08, "logits/chosen": -1.6568161249160767, "logits/rejected": -1.613242745399475, "logps/chosen": -210.5281219482422, "logps/rejected": -302.59844970703125, "loss": 0.4892, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5569124221801758, "rewards/margins": 0.9423718452453613, "rewards/rejected": -2.499284267425537, "step": 13870 }, { "epoch": 2.3914541695382496, "grad_norm": 39.235816955566406, "learning_rate": 2.404084079458457e-08, "logits/chosen": -1.5871946811676025, "logits/rejected": -1.5298856496810913, "logps/chosen": -214.61550903320312, "logps/rejected": -282.5399475097656, "loss": 0.5843, "rewards/accuracies": 0.71875, "rewards/chosen": -1.601596474647522, "rewards/margins": 0.7003291845321655, "rewards/rejected": -2.3019254207611084, "step": 13880 }, { "epoch": 2.3931771192281186, "grad_norm": 24.032691955566406, "learning_rate": 2.3910598326175635e-08, "logits/chosen": -1.6235158443450928, "logits/rejected": -1.5832910537719727, "logps/chosen": -204.44700622558594, "logps/rejected": -277.77716064453125, "loss": 0.4993, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4882326126098633, "rewards/margins": 0.7519177198410034, "rewards/rejected": -2.2401506900787354, "step": 13890 }, { "epoch": 2.3949000689179876, "grad_norm": 23.762805938720703, "learning_rate": 2.3780661691774585e-08, "logits/chosen": -1.5637362003326416, "logits/rejected": -1.5174527168273926, "logps/chosen": -195.5287628173828, "logps/rejected": -269.56658935546875, "loss": 0.5077, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.420799732208252, "rewards/margins": 0.7775405049324036, "rewards/rejected": -2.1983401775360107, "step": 13900 }, { "epoch": 2.3966230186078565, "grad_norm": 37.73569107055664, "learning_rate": 2.3651031413649127e-08, "logits/chosen": -1.5958714485168457, "logits/rejected": -1.552513837814331, "logps/chosen": -190.7194061279297, "logps/rejected": -259.73394775390625, "loss": 0.5196, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.340254783630371, "rewards/margins": 0.7284151315689087, "rewards/rejected": -2.0686697959899902, "step": 13910 }, { "epoch": 2.3983459682977255, "grad_norm": 25.303503036499023, "learning_rate": 2.3521708012835696e-08, "logits/chosen": -1.6298589706420898, "logits/rejected": -1.5667641162872314, "logps/chosen": -205.85693359375, "logps/rejected": -285.18780517578125, "loss": 0.4789, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.456417441368103, "rewards/margins": 0.8772147297859192, "rewards/rejected": -2.333632469177246, "step": 13920 }, { "epoch": 2.400068917987595, "grad_norm": 28.09389305114746, "learning_rate": 2.3392692009137193e-08, "logits/chosen": -1.6002569198608398, "logits/rejected": -1.5511232614517212, "logps/chosen": -190.23448181152344, "logps/rejected": -257.18218994140625, "loss": 0.55, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3869836330413818, "rewards/margins": 0.6741336584091187, "rewards/rejected": -2.061117649078369, "step": 13930 }, { "epoch": 2.401791867677464, "grad_norm": 31.90937042236328, "learning_rate": 2.3263983921120987e-08, "logits/chosen": -1.5503696203231812, "logits/rejected": -1.5013597011566162, "logps/chosen": -186.52639770507812, "logps/rejected": -286.6660461425781, "loss": 0.4748, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3336960077285767, "rewards/margins": 0.9724694490432739, "rewards/rejected": -2.3061652183532715, "step": 13940 }, { "epoch": 2.403514817367333, "grad_norm": 45.16349792480469, "learning_rate": 2.3135584266116837e-08, "logits/chosen": -1.643180251121521, "logits/rejected": -1.5967227220535278, "logps/chosen": -201.06625366210938, "logps/rejected": -273.56024169921875, "loss": 0.5508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4699921607971191, "rewards/margins": 0.7421053647994995, "rewards/rejected": -2.212097644805908, "step": 13950 }, { "epoch": 2.405237767057202, "grad_norm": 33.21641540527344, "learning_rate": 2.3007493560214787e-08, "logits/chosen": -1.4977080821990967, "logits/rejected": -1.477624773979187, "logps/chosen": -202.22499084472656, "logps/rejected": -258.01995849609375, "loss": 0.5666, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4879155158996582, "rewards/margins": 0.5633623003959656, "rewards/rejected": -2.0512776374816895, "step": 13960 }, { "epoch": 2.406960716747071, "grad_norm": 18.644105911254883, "learning_rate": 2.2879712318263056e-08, "logits/chosen": -1.6170930862426758, "logits/rejected": -1.5662033557891846, "logps/chosen": -198.0435791015625, "logps/rejected": -275.56024169921875, "loss": 0.5217, "rewards/accuracies": 0.71875, "rewards/chosen": -1.395129680633545, "rewards/margins": 0.8305740356445312, "rewards/rejected": -2.225703477859497, "step": 13970 }, { "epoch": 2.40868366643694, "grad_norm": 27.033361434936523, "learning_rate": 2.2752241053865973e-08, "logits/chosen": -1.58198082447052, "logits/rejected": -1.5406694412231445, "logps/chosen": -191.66148376464844, "logps/rejected": -282.66217041015625, "loss": 0.4813, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.3707060813903809, "rewards/margins": 0.9057755470275879, "rewards/rejected": -2.2764816284179688, "step": 13980 }, { "epoch": 2.410406616126809, "grad_norm": 48.245643615722656, "learning_rate": 2.2625080279382024e-08, "logits/chosen": -1.6190292835235596, "logits/rejected": -1.5692462921142578, "logps/chosen": -197.7570343017578, "logps/rejected": -270.0393371582031, "loss": 0.5232, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4068973064422607, "rewards/margins": 0.7630517482757568, "rewards/rejected": -2.1699490547180176, "step": 13990 }, { "epoch": 2.412129565816678, "grad_norm": 32.1939582824707, "learning_rate": 2.249823050592169e-08, "logits/chosen": -1.5385420322418213, "logits/rejected": -1.4891750812530518, "logps/chosen": -198.2667236328125, "logps/rejected": -287.6027526855469, "loss": 0.4543, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4719661474227905, "rewards/margins": 0.8670805096626282, "rewards/rejected": -2.3390464782714844, "step": 14000 }, { "epoch": 2.412129565816678, "eval_logits/chosen": -1.7365682125091553, "eval_logits/rejected": -1.7133723497390747, "eval_logps/chosen": -195.87925720214844, "eval_logps/rejected": -234.26934814453125, "eval_loss": 0.6318486332893372, "eval_rewards/accuracies": 0.6459107995033264, "eval_rewards/chosen": -1.3716734647750854, "eval_rewards/margins": 0.33921846747398376, "eval_rewards/rejected": -1.710891842842102, "eval_runtime": 384.7361, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 14000 }, { "epoch": 2.413852515506547, "grad_norm": 30.94611358642578, "learning_rate": 2.2371692243345354e-08, "logits/chosen": -1.574171781539917, "logits/rejected": -1.5344856977462769, "logps/chosen": -197.61508178710938, "logps/rejected": -269.8860778808594, "loss": 0.5565, "rewards/accuracies": 0.71875, "rewards/chosen": -1.42230224609375, "rewards/margins": 0.7458277940750122, "rewards/rejected": -2.1681301593780518, "step": 14010 }, { "epoch": 2.415575465196416, "grad_norm": 33.24098205566406, "learning_rate": 2.2245466000261394e-08, "logits/chosen": -1.5797890424728394, "logits/rejected": -1.5521284341812134, "logps/chosen": -204.57333374023438, "logps/rejected": -268.26678466796875, "loss": 0.5693, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5004533529281616, "rewards/margins": 0.6720166802406311, "rewards/rejected": -2.1724698543548584, "step": 14020 }, { "epoch": 2.4172984148862855, "grad_norm": 30.121356964111328, "learning_rate": 2.211955228402399e-08, "logits/chosen": -1.5755398273468018, "logits/rejected": -1.528684377670288, "logps/chosen": -204.1021270751953, "logps/rejected": -272.84130859375, "loss": 0.5418, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5018281936645508, "rewards/margins": 0.7337289452552795, "rewards/rejected": -2.2355570793151855, "step": 14030 }, { "epoch": 2.4190213645761545, "grad_norm": 42.259281158447266, "learning_rate": 2.1993951600731154e-08, "logits/chosen": -1.569271445274353, "logits/rejected": -1.4973461627960205, "logps/chosen": -198.60647583007812, "logps/rejected": -280.525634765625, "loss": 0.4824, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4193156957626343, "rewards/margins": 0.8868446350097656, "rewards/rejected": -2.3061604499816895, "step": 14040 }, { "epoch": 2.4207443142660234, "grad_norm": 24.36745262145996, "learning_rate": 2.186866445522273e-08, "logits/chosen": -1.6486291885375977, "logits/rejected": -1.5946848392486572, "logps/chosen": -185.94216918945312, "logps/rejected": -250.9385223388672, "loss": 0.5323, "rewards/accuracies": 0.75, "rewards/chosen": -1.309471845626831, "rewards/margins": 0.694402813911438, "rewards/rejected": -2.0038745403289795, "step": 14050 }, { "epoch": 2.4224672639558924, "grad_norm": 21.555404663085938, "learning_rate": 2.1743691351078332e-08, "logits/chosen": -1.6609976291656494, "logits/rejected": -1.6012214422225952, "logps/chosen": -188.33303833007812, "logps/rejected": -285.00128173828125, "loss": 0.4531, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3309261798858643, "rewards/margins": 1.0017203092575073, "rewards/rejected": -2.332646369934082, "step": 14060 }, { "epoch": 2.4241902136457614, "grad_norm": 33.299034118652344, "learning_rate": 2.161903279061529e-08, "logits/chosen": -1.6014869213104248, "logits/rejected": -1.549005389213562, "logps/chosen": -203.32826232910156, "logps/rejected": -292.0400390625, "loss": 0.4812, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4871909618377686, "rewards/margins": 0.8758770823478699, "rewards/rejected": -2.363068103790283, "step": 14070 }, { "epoch": 2.425913163335631, "grad_norm": 25.26913070678711, "learning_rate": 2.14946892748866e-08, "logits/chosen": -1.5732542276382446, "logits/rejected": -1.5227702856063843, "logps/chosen": -215.99417114257812, "logps/rejected": -286.9879455566406, "loss": 0.5608, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.573691725730896, "rewards/margins": 0.7653611898422241, "rewards/rejected": -2.339052677154541, "step": 14080 }, { "epoch": 2.4276361130254998, "grad_norm": 28.92695426940918, "learning_rate": 2.1370661303679084e-08, "logits/chosen": -1.5760879516601562, "logits/rejected": -1.5211198329925537, "logps/chosen": -203.30577087402344, "logps/rejected": -268.3055419921875, "loss": 0.5498, "rewards/accuracies": 0.71875, "rewards/chosen": -1.474204182624817, "rewards/margins": 0.6897687911987305, "rewards/rejected": -2.163972854614258, "step": 14090 }, { "epoch": 2.4293590627153687, "grad_norm": 22.257734298706055, "learning_rate": 2.1246949375511214e-08, "logits/chosen": -1.6646171808242798, "logits/rejected": -1.6052793264389038, "logps/chosen": -197.08328247070312, "logps/rejected": -284.4249572753906, "loss": 0.4527, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.385813593864441, "rewards/margins": 0.9123753309249878, "rewards/rejected": -2.2981889247894287, "step": 14100 }, { "epoch": 2.4310820124052377, "grad_norm": 50.557247161865234, "learning_rate": 2.1123553987631126e-08, "logits/chosen": -1.6229591369628906, "logits/rejected": -1.5879002809524536, "logps/chosen": -197.52047729492188, "logps/rejected": -273.00579833984375, "loss": 0.533, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4846248626708984, "rewards/margins": 0.7303065061569214, "rewards/rejected": -2.2149314880371094, "step": 14110 }, { "epoch": 2.4328049620951067, "grad_norm": 41.365875244140625, "learning_rate": 2.1000475636014635e-08, "logits/chosen": -1.6093180179595947, "logits/rejected": -1.5649349689483643, "logps/chosen": -205.98617553710938, "logps/rejected": -277.5599365234375, "loss": 0.5421, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5368276834487915, "rewards/margins": 0.7141960859298706, "rewards/rejected": -2.251023769378662, "step": 14120 }, { "epoch": 2.4345279117849756, "grad_norm": 37.92250442504883, "learning_rate": 2.0877714815363366e-08, "logits/chosen": -1.6542012691497803, "logits/rejected": -1.6128876209259033, "logps/chosen": -189.68319702148438, "logps/rejected": -254.00405883789062, "loss": 0.5245, "rewards/accuracies": 0.75, "rewards/chosen": -1.3541688919067383, "rewards/margins": 0.6648411750793457, "rewards/rejected": -2.019010066986084, "step": 14130 }, { "epoch": 2.436250861474845, "grad_norm": 23.785160064697266, "learning_rate": 2.0755272019102542e-08, "logits/chosen": -1.706735372543335, "logits/rejected": -1.6625268459320068, "logps/chosen": -206.888916015625, "logps/rejected": -287.9200134277344, "loss": 0.5175, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5433664321899414, "rewards/margins": 0.8240596055984497, "rewards/rejected": -2.3674259185791016, "step": 14140 }, { "epoch": 2.437973811164714, "grad_norm": 37.91211700439453, "learning_rate": 2.063314773937921e-08, "logits/chosen": -1.6717586517333984, "logits/rejected": -1.6323515176773071, "logps/chosen": -199.97064208984375, "logps/rejected": -282.82415771484375, "loss": 0.5092, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4601263999938965, "rewards/margins": 0.8106845617294312, "rewards/rejected": -2.270810842514038, "step": 14150 }, { "epoch": 2.439696760854583, "grad_norm": 27.181774139404297, "learning_rate": 2.051134246706008e-08, "logits/chosen": -1.6129558086395264, "logits/rejected": -1.571523666381836, "logps/chosen": -197.92718505859375, "logps/rejected": -272.5576171875, "loss": 0.5285, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4462512731552124, "rewards/margins": 0.7152296900749207, "rewards/rejected": -2.1614811420440674, "step": 14160 }, { "epoch": 2.441419710544452, "grad_norm": 35.80717086791992, "learning_rate": 2.0389856691729734e-08, "logits/chosen": -1.541624665260315, "logits/rejected": -1.4962389469146729, "logps/chosen": -206.609130859375, "logps/rejected": -269.45013427734375, "loss": 0.5747, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.510289192199707, "rewards/margins": 0.6496440172195435, "rewards/rejected": -2.159933090209961, "step": 14170 }, { "epoch": 2.4431426602343214, "grad_norm": 41.601417541503906, "learning_rate": 2.026869090168849e-08, "logits/chosen": -1.609832525253296, "logits/rejected": -1.5575616359710693, "logps/chosen": -209.0862274169922, "logps/rejected": -269.556884765625, "loss": 0.554, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5163252353668213, "rewards/margins": 0.664095938205719, "rewards/rejected": -2.1804213523864746, "step": 14180 }, { "epoch": 2.4448656099241903, "grad_norm": 33.28386306762695, "learning_rate": 2.0147845583950552e-08, "logits/chosen": -1.6766802072525024, "logits/rejected": -1.6355340480804443, "logps/chosen": -206.7571258544922, "logps/rejected": -266.9535217285156, "loss": 0.5501, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4788180589675903, "rewards/margins": 0.6512070894241333, "rewards/rejected": -2.1300249099731445, "step": 14190 }, { "epoch": 2.4465885596140593, "grad_norm": 32.834320068359375, "learning_rate": 2.0027321224242067e-08, "logits/chosen": -1.518226146697998, "logits/rejected": -1.4756324291229248, "logps/chosen": -183.6091766357422, "logps/rejected": -269.2956237792969, "loss": 0.4708, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.29989492893219, "rewards/margins": 0.8477428555488586, "rewards/rejected": -2.1476378440856934, "step": 14200 }, { "epoch": 2.4483115093039283, "grad_norm": 33.95294952392578, "learning_rate": 1.9907118306999017e-08, "logits/chosen": -1.6634937524795532, "logits/rejected": -1.6183143854141235, "logps/chosen": -197.2618865966797, "logps/rejected": -266.377197265625, "loss": 0.5297, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4434157609939575, "rewards/margins": 0.6969861388206482, "rewards/rejected": -2.140401840209961, "step": 14210 }, { "epoch": 2.4500344589937972, "grad_norm": 28.349445343017578, "learning_rate": 1.9787237315365424e-08, "logits/chosen": -1.7196086645126343, "logits/rejected": -1.6710901260375977, "logps/chosen": -197.17166137695312, "logps/rejected": -271.1381530761719, "loss": 0.509, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3899104595184326, "rewards/margins": 0.7698068618774414, "rewards/rejected": -2.159717321395874, "step": 14220 }, { "epoch": 2.451757408683666, "grad_norm": 27.328739166259766, "learning_rate": 1.9667678731191373e-08, "logits/chosen": -1.555780053138733, "logits/rejected": -1.4922558069229126, "logps/chosen": -192.60348510742188, "logps/rejected": -280.90203857421875, "loss": 0.4816, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3687331676483154, "rewards/margins": 0.9207097887992859, "rewards/rejected": -2.289443016052246, "step": 14230 }, { "epoch": 2.4534803583735356, "grad_norm": 28.51763343811035, "learning_rate": 1.9548443035031125e-08, "logits/chosen": -1.5582568645477295, "logits/rejected": -1.5182602405548096, "logps/chosen": -197.07421875, "logps/rejected": -287.04376220703125, "loss": 0.5135, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4305438995361328, "rewards/margins": 0.8921090364456177, "rewards/rejected": -2.322652816772461, "step": 14240 }, { "epoch": 2.4552033080634046, "grad_norm": 34.41429901123047, "learning_rate": 1.942953070614094e-08, "logits/chosen": -1.557747483253479, "logits/rejected": -1.5177257061004639, "logps/chosen": -198.07131958007812, "logps/rejected": -260.2146911621094, "loss": 0.539, "rewards/accuracies": 0.71875, "rewards/chosen": -1.43120539188385, "rewards/margins": 0.6406923532485962, "rewards/rejected": -2.0718979835510254, "step": 14250 }, { "epoch": 2.4569262577532736, "grad_norm": 48.78285217285156, "learning_rate": 1.93109422224775e-08, "logits/chosen": -1.6656968593597412, "logits/rejected": -1.6043860912322998, "logps/chosen": -202.08387756347656, "logps/rejected": -270.35601806640625, "loss": 0.542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4505724906921387, "rewards/margins": 0.7518132925033569, "rewards/rejected": -2.202385902404785, "step": 14260 }, { "epoch": 2.4586492074431425, "grad_norm": 28.05662727355957, "learning_rate": 1.9192678060695812e-08, "logits/chosen": -1.6255466938018799, "logits/rejected": -1.5749638080596924, "logps/chosen": -191.5539093017578, "logps/rejected": -275.77142333984375, "loss": 0.4911, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3733799457550049, "rewards/margins": 0.8732690811157227, "rewards/rejected": -2.2466492652893066, "step": 14270 }, { "epoch": 2.460372157133012, "grad_norm": 26.489620208740234, "learning_rate": 1.9074738696147196e-08, "logits/chosen": -1.522174596786499, "logits/rejected": -1.4888569116592407, "logps/chosen": -200.908203125, "logps/rejected": -266.4534606933594, "loss": 0.5756, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4906336069107056, "rewards/margins": 0.6441228985786438, "rewards/rejected": -2.134756565093994, "step": 14280 }, { "epoch": 2.462095106822881, "grad_norm": 22.64809799194336, "learning_rate": 1.8957124602877618e-08, "logits/chosen": -1.5703630447387695, "logits/rejected": -1.5144102573394775, "logps/chosen": -200.39471435546875, "logps/rejected": -273.7787780761719, "loss": 0.5235, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4013175964355469, "rewards/margins": 0.8190463185310364, "rewards/rejected": -2.2203638553619385, "step": 14290 }, { "epoch": 2.46381805651275, "grad_norm": 26.21112060546875, "learning_rate": 1.8839836253625496e-08, "logits/chosen": -1.7248750925064087, "logits/rejected": -1.6809425354003906, "logps/chosen": -180.10690307617188, "logps/rejected": -273.7902526855469, "loss": 0.4575, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.2610968351364136, "rewards/margins": 0.910190761089325, "rewards/rejected": -2.171287775039673, "step": 14300 }, { "epoch": 2.465541006202619, "grad_norm": 26.50727081298828, "learning_rate": 1.872287411982011e-08, "logits/chosen": -1.5628302097320557, "logits/rejected": -1.5138075351715088, "logps/chosen": -195.84312438964844, "logps/rejected": -272.01519775390625, "loss": 0.5121, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4113115072250366, "rewards/margins": 0.7861210107803345, "rewards/rejected": -2.197432518005371, "step": 14310 }, { "epoch": 2.467263955892488, "grad_norm": 22.044160842895508, "learning_rate": 1.860623867157941e-08, "logits/chosen": -1.605048418045044, "logits/rejected": -1.5548312664031982, "logps/chosen": -176.4993896484375, "logps/rejected": -260.81231689453125, "loss": 0.4771, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.209468126296997, "rewards/margins": 0.8460773229598999, "rewards/rejected": -2.0555453300476074, "step": 14320 }, { "epoch": 2.468986905582357, "grad_norm": 28.703174591064453, "learning_rate": 1.8489930377708372e-08, "logits/chosen": -1.7636277675628662, "logits/rejected": -1.6995710134506226, "logps/chosen": -198.1187744140625, "logps/rejected": -292.19293212890625, "loss": 0.466, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.4311959743499756, "rewards/margins": 0.9489741325378418, "rewards/rejected": -2.3801698684692383, "step": 14330 }, { "epoch": 2.470709855272226, "grad_norm": 27.461868286132812, "learning_rate": 1.8373949705696934e-08, "logits/chosen": -1.606766700744629, "logits/rejected": -1.5721355676651, "logps/chosen": -199.48684692382812, "logps/rejected": -288.28839111328125, "loss": 0.5039, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4460034370422363, "rewards/margins": 0.8741511106491089, "rewards/rejected": -2.3201546669006348, "step": 14340 }, { "epoch": 2.472432804962095, "grad_norm": 29.6517276763916, "learning_rate": 1.8258297121718204e-08, "logits/chosen": -1.6252644062042236, "logits/rejected": -1.5800553560256958, "logps/chosen": -200.2357940673828, "logps/rejected": -272.8708190917969, "loss": 0.5294, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4642345905303955, "rewards/margins": 0.7204502820968628, "rewards/rejected": -2.1846847534179688, "step": 14350 }, { "epoch": 2.474155754651964, "grad_norm": 40.31829833984375, "learning_rate": 1.81429730906266e-08, "logits/chosen": -1.6263774633407593, "logits/rejected": -1.5871617794036865, "logps/chosen": -203.97201538085938, "logps/rejected": -262.5541076660156, "loss": 0.5591, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4734426736831665, "rewards/margins": 0.6249311566352844, "rewards/rejected": -2.0983736515045166, "step": 14360 }, { "epoch": 2.475878704341833, "grad_norm": 34.94589614868164, "learning_rate": 1.8027978075955953e-08, "logits/chosen": -1.614682912826538, "logits/rejected": -1.5609450340270996, "logps/chosen": -202.69973754882812, "logps/rejected": -275.20806884765625, "loss": 0.5116, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4726990461349487, "rewards/margins": 0.7752552628517151, "rewards/rejected": -2.2479541301727295, "step": 14370 }, { "epoch": 2.4776016540317025, "grad_norm": 29.386205673217773, "learning_rate": 1.7913312539917624e-08, "logits/chosen": -1.7429168224334717, "logits/rejected": -1.6956888437271118, "logps/chosen": -190.35641479492188, "logps/rejected": -274.8843994140625, "loss": 0.4653, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.383814811706543, "rewards/margins": 0.8452625274658203, "rewards/rejected": -2.2290773391723633, "step": 14380 }, { "epoch": 2.4793246037215715, "grad_norm": 28.56850814819336, "learning_rate": 1.7798976943398623e-08, "logits/chosen": -1.6107604503631592, "logits/rejected": -1.5593178272247314, "logps/chosen": -189.3840789794922, "logps/rejected": -281.2845458984375, "loss": 0.4584, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3737777471542358, "rewards/margins": 0.9282294511795044, "rewards/rejected": -2.3020071983337402, "step": 14390 }, { "epoch": 2.4810475534114405, "grad_norm": 37.50496292114258, "learning_rate": 1.7684971745959887e-08, "logits/chosen": -1.651484489440918, "logits/rejected": -1.596335768699646, "logps/chosen": -200.45687866210938, "logps/rejected": -281.0902099609375, "loss": 0.5121, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4362685680389404, "rewards/margins": 0.8774330019950867, "rewards/rejected": -2.313701629638672, "step": 14400 }, { "epoch": 2.4810475534114405, "eval_logits/chosen": -1.7252347469329834, "eval_logits/rejected": -1.7016128301620483, "eval_logps/chosen": -200.76998901367188, "eval_logps/rejected": -240.53890991210938, "eval_loss": 0.6308444738388062, "eval_rewards/accuracies": 0.6447490453720093, "eval_rewards/chosen": -1.4205809831619263, "eval_rewards/margins": 0.3530069589614868, "eval_rewards/rejected": -1.7735878229141235, "eval_runtime": 384.6003, "eval_samples_per_second": 11.191, "eval_steps_per_second": 1.399, "step": 14400 }, { "epoch": 2.4827705031013094, "grad_norm": 36.72150421142578, "learning_rate": 1.7571297405834328e-08, "logits/chosen": -1.6439129114151, "logits/rejected": -1.5948227643966675, "logps/chosen": -195.40127563476562, "logps/rejected": -278.58148193359375, "loss": 0.5121, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4424898624420166, "rewards/margins": 0.8148199915885925, "rewards/rejected": -2.257310152053833, "step": 14410 }, { "epoch": 2.4844934527911784, "grad_norm": 23.456363677978516, "learning_rate": 1.7457954379924967e-08, "logits/chosen": -1.6720870733261108, "logits/rejected": -1.6245301961898804, "logps/chosen": -199.8477020263672, "logps/rejected": -280.53765869140625, "loss": 0.5153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4815994501113892, "rewards/margins": 0.8005410432815552, "rewards/rejected": -2.2821404933929443, "step": 14420 }, { "epoch": 2.4862164024810474, "grad_norm": 53.96633529663086, "learning_rate": 1.7344943123803126e-08, "logits/chosen": -1.6086504459381104, "logits/rejected": -1.5626440048217773, "logps/chosen": -196.92874145507812, "logps/rejected": -284.5268249511719, "loss": 0.5059, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.446078896522522, "rewards/margins": 0.8915165066719055, "rewards/rejected": -2.3375954627990723, "step": 14430 }, { "epoch": 2.4879393521709168, "grad_norm": 35.11296844482422, "learning_rate": 1.7232264091706682e-08, "logits/chosen": -1.5991556644439697, "logits/rejected": -1.5401017665863037, "logps/chosen": -193.55262756347656, "logps/rejected": -288.7968444824219, "loss": 0.4585, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3911709785461426, "rewards/margins": 0.9518829584121704, "rewards/rejected": -2.3430540561676025, "step": 14440 }, { "epoch": 2.4896623018607857, "grad_norm": 27.75967025756836, "learning_rate": 1.7119917736538115e-08, "logits/chosen": -1.5744739770889282, "logits/rejected": -1.5309337377548218, "logps/chosen": -212.39810180664062, "logps/rejected": -281.50128173828125, "loss": 0.5227, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5675699710845947, "rewards/margins": 0.7253061532974243, "rewards/rejected": -2.2928760051727295, "step": 14450 }, { "epoch": 2.4913852515506547, "grad_norm": 26.898860931396484, "learning_rate": 1.700790450986276e-08, "logits/chosen": -1.5727896690368652, "logits/rejected": -1.531781554222107, "logps/chosen": -201.531005859375, "logps/rejected": -279.4584045410156, "loss": 0.5034, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4766613245010376, "rewards/margins": 0.7838097810745239, "rewards/rejected": -2.2604711055755615, "step": 14460 }, { "epoch": 2.4931082012405237, "grad_norm": 33.584259033203125, "learning_rate": 1.6896224861907004e-08, "logits/chosen": -1.7253456115722656, "logits/rejected": -1.6681970357894897, "logps/chosen": -214.53030395507812, "logps/rejected": -292.70928955078125, "loss": 0.4774, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5555566549301147, "rewards/margins": 0.8685323596000671, "rewards/rejected": -2.424088716506958, "step": 14470 }, { "epoch": 2.4948311509303926, "grad_norm": 28.306344985961914, "learning_rate": 1.6784879241556395e-08, "logits/chosen": -1.617222785949707, "logits/rejected": -1.5918428897857666, "logps/chosen": -213.22500610351562, "logps/rejected": -294.64129638671875, "loss": 0.5175, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5985944271087646, "rewards/margins": 0.7967004776000977, "rewards/rejected": -2.3952949047088623, "step": 14480 }, { "epoch": 2.496554100620262, "grad_norm": 27.062841415405273, "learning_rate": 1.667386809635387e-08, "logits/chosen": -1.5785030126571655, "logits/rejected": -1.539458990097046, "logps/chosen": -203.4307861328125, "logps/rejected": -288.9940490722656, "loss": 0.4953, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.52878737449646, "rewards/margins": 0.8457359075546265, "rewards/rejected": -2.374523401260376, "step": 14490 }, { "epoch": 2.498277050310131, "grad_norm": 20.653133392333984, "learning_rate": 1.6563191872498062e-08, "logits/chosen": -1.5909979343414307, "logits/rejected": -1.5225660800933838, "logps/chosen": -201.03378295898438, "logps/rejected": -291.93817138671875, "loss": 0.4718, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4409816265106201, "rewards/margins": 0.9384520649909973, "rewards/rejected": -2.3794338703155518, "step": 14500 }, { "epoch": 2.5, "grad_norm": 40.51236343383789, "learning_rate": 1.6452851014841374e-08, "logits/chosen": -1.6336681842803955, "logits/rejected": -1.5917491912841797, "logps/chosen": -217.73507690429688, "logps/rejected": -279.9249572753906, "loss": 0.5713, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6216713190078735, "rewards/margins": 0.6607409119606018, "rewards/rejected": -2.28241229057312, "step": 14510 }, { "epoch": 2.501722949689869, "grad_norm": 35.526451110839844, "learning_rate": 1.634284596688823e-08, "logits/chosen": -1.609086036682129, "logits/rejected": -1.5602099895477295, "logps/chosen": -214.7878875732422, "logps/rejected": -291.4499816894531, "loss": 0.5584, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5922327041625977, "rewards/margins": 0.7702957987785339, "rewards/rejected": -2.3625285625457764, "step": 14520 }, { "epoch": 2.503445899379738, "grad_norm": 33.11463928222656, "learning_rate": 1.623317717079328e-08, "logits/chosen": -1.6266065835952759, "logits/rejected": -1.5786244869232178, "logps/chosen": -216.61978149414062, "logps/rejected": -298.9439697265625, "loss": 0.5044, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5895724296569824, "rewards/margins": 0.8475859761238098, "rewards/rejected": -2.4371583461761475, "step": 14530 }, { "epoch": 2.505168849069607, "grad_norm": 27.04874610900879, "learning_rate": 1.6123845067359676e-08, "logits/chosen": -1.606018304824829, "logits/rejected": -1.5490694046020508, "logps/chosen": -201.17922973632812, "logps/rejected": -293.52813720703125, "loss": 0.4742, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4720717668533325, "rewards/margins": 0.9397362470626831, "rewards/rejected": -2.4118080139160156, "step": 14540 }, { "epoch": 2.5068917987594763, "grad_norm": 22.950489044189453, "learning_rate": 1.6014850096037304e-08, "logits/chosen": -1.5970135927200317, "logits/rejected": -1.5471218824386597, "logps/chosen": -195.12013244628906, "logps/rejected": -274.5163879394531, "loss": 0.5116, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3837738037109375, "rewards/margins": 0.829186737537384, "rewards/rejected": -2.2129604816436768, "step": 14550 }, { "epoch": 2.5086147484493453, "grad_norm": 27.70438575744629, "learning_rate": 1.5906192694920883e-08, "logits/chosen": -1.592337965965271, "logits/rejected": -1.5386199951171875, "logps/chosen": -208.83316040039062, "logps/rejected": -304.3836975097656, "loss": 0.4948, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5584619045257568, "rewards/margins": 0.9380912780761719, "rewards/rejected": -2.496553421020508, "step": 14560 }, { "epoch": 2.5103376981392143, "grad_norm": 33.47995376586914, "learning_rate": 1.5797873300748355e-08, "logits/chosen": -1.5123052597045898, "logits/rejected": -1.4835093021392822, "logps/chosen": -198.01016235351562, "logps/rejected": -279.16314697265625, "loss": 0.5221, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4571415185928345, "rewards/margins": 0.7968717813491821, "rewards/rejected": -2.2540130615234375, "step": 14570 }, { "epoch": 2.5120606478290832, "grad_norm": 33.63603973388672, "learning_rate": 1.5689892348899103e-08, "logits/chosen": -1.6348133087158203, "logits/rejected": -1.5937086343765259, "logps/chosen": -198.91029357910156, "logps/rejected": -273.56414794921875, "loss": 0.5254, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4940311908721924, "rewards/margins": 0.7310426831245422, "rewards/rejected": -2.22507381439209, "step": 14580 }, { "epoch": 2.5137835975189526, "grad_norm": 34.1225700378418, "learning_rate": 1.5582250273392107e-08, "logits/chosen": -1.577149748802185, "logits/rejected": -1.5392792224884033, "logps/chosen": -192.0406951904297, "logps/rejected": -273.19183349609375, "loss": 0.5107, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3957537412643433, "rewards/margins": 0.8084688186645508, "rewards/rejected": -2.2042224407196045, "step": 14590 }, { "epoch": 2.5155065472088216, "grad_norm": 39.473167419433594, "learning_rate": 1.547494750688435e-08, "logits/chosen": -1.582103967666626, "logits/rejected": -1.5162458419799805, "logps/chosen": -204.1102294921875, "logps/rejected": -292.2998046875, "loss": 0.4573, "rewards/accuracies": 0.78125, "rewards/chosen": -1.493896722793579, "rewards/margins": 0.9231041073799133, "rewards/rejected": -2.4170007705688477, "step": 14600 }, { "epoch": 2.5172294968986906, "grad_norm": 39.95032501220703, "learning_rate": 1.5367984480668884e-08, "logits/chosen": -1.578777551651001, "logits/rejected": -1.5251179933547974, "logps/chosen": -201.05909729003906, "logps/rejected": -271.13177490234375, "loss": 0.503, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4469878673553467, "rewards/margins": 0.7682374715805054, "rewards/rejected": -2.2152252197265625, "step": 14610 }, { "epoch": 2.5189524465885595, "grad_norm": 33.64727020263672, "learning_rate": 1.526136162467333e-08, "logits/chosen": -1.5359634160995483, "logits/rejected": -1.5078798532485962, "logps/chosen": -215.8882598876953, "logps/rejected": -297.89947509765625, "loss": 0.5483, "rewards/accuracies": 0.75, "rewards/chosen": -1.6469218730926514, "rewards/margins": 0.8006730079650879, "rewards/rejected": -2.4475948810577393, "step": 14620 }, { "epoch": 2.5206753962784285, "grad_norm": 27.750146865844727, "learning_rate": 1.5155079367457925e-08, "logits/chosen": -1.5164506435394287, "logits/rejected": -1.474473237991333, "logps/chosen": -200.38941955566406, "logps/rejected": -277.80364990234375, "loss": 0.5291, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4730513095855713, "rewards/margins": 0.7772534489631653, "rewards/rejected": -2.250304698944092, "step": 14630 }, { "epoch": 2.5223983459682975, "grad_norm": 21.659421920776367, "learning_rate": 1.5049138136213968e-08, "logits/chosen": -1.5548603534698486, "logits/rejected": -1.51322340965271, "logps/chosen": -197.6011199951172, "logps/rejected": -283.56121826171875, "loss": 0.5249, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4577686786651611, "rewards/margins": 0.8481302261352539, "rewards/rejected": -2.305898666381836, "step": 14640 }, { "epoch": 2.524121295658167, "grad_norm": 70.42359161376953, "learning_rate": 1.4943538356762065e-08, "logits/chosen": -1.6177898645401, "logits/rejected": -1.5802927017211914, "logps/chosen": -221.6212921142578, "logps/rejected": -277.2975158691406, "loss": 0.6121, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.673858404159546, "rewards/margins": 0.602361798286438, "rewards/rejected": -2.2762205600738525, "step": 14650 }, { "epoch": 2.525844245348036, "grad_norm": 38.56734848022461, "learning_rate": 1.4838280453550234e-08, "logits/chosen": -1.59617018699646, "logits/rejected": -1.5251657962799072, "logps/chosen": -195.82003784179688, "logps/rejected": -288.07806396484375, "loss": 0.4355, "rewards/accuracies": 0.84375, "rewards/chosen": -1.428985357284546, "rewards/margins": 0.9305770993232727, "rewards/rejected": -2.359562635421753, "step": 14660 }, { "epoch": 2.527567195037905, "grad_norm": 27.20901107788086, "learning_rate": 1.4733364849652518e-08, "logits/chosen": -1.523749589920044, "logits/rejected": -1.4803264141082764, "logps/chosen": -189.8150634765625, "logps/rejected": -276.70794677734375, "loss": 0.4607, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3930755853652954, "rewards/margins": 0.8602371215820312, "rewards/rejected": -2.253312826156616, "step": 14670 }, { "epoch": 2.529290144727774, "grad_norm": 36.2165412902832, "learning_rate": 1.4628791966767095e-08, "logits/chosen": -1.6154565811157227, "logits/rejected": -1.5678001642227173, "logps/chosen": -195.2696990966797, "logps/rejected": -271.886474609375, "loss": 0.5179, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4701576232910156, "rewards/margins": 0.7252773642539978, "rewards/rejected": -2.195435047149658, "step": 14680 }, { "epoch": 2.531013094417643, "grad_norm": 42.185359954833984, "learning_rate": 1.4524562225214532e-08, "logits/chosen": -1.594913125038147, "logits/rejected": -1.5524381399154663, "logps/chosen": -210.17886352539062, "logps/rejected": -305.15618896484375, "loss": 0.4961, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5879902839660645, "rewards/margins": 0.9257308840751648, "rewards/rejected": -2.513720989227295, "step": 14690 }, { "epoch": 2.532736044107512, "grad_norm": 31.354543685913086, "learning_rate": 1.4420676043936198e-08, "logits/chosen": -1.6699739694595337, "logits/rejected": -1.6193714141845703, "logps/chosen": -223.212890625, "logps/rejected": -327.3179931640625, "loss": 0.4993, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6847846508026123, "rewards/margins": 1.0284368991851807, "rewards/rejected": -2.713221311569214, "step": 14700 }, { "epoch": 2.534458993797381, "grad_norm": 32.18533706665039, "learning_rate": 1.4317133840492612e-08, "logits/chosen": -1.6072320938110352, "logits/rejected": -1.5672547817230225, "logps/chosen": -199.07467651367188, "logps/rejected": -282.31182861328125, "loss": 0.5002, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4560476541519165, "rewards/margins": 0.8347952961921692, "rewards/rejected": -2.2908430099487305, "step": 14710 }, { "epoch": 2.53618194348725, "grad_norm": 41.4571533203125, "learning_rate": 1.4213936031061691e-08, "logits/chosen": -1.5518696308135986, "logits/rejected": -1.4998043775558472, "logps/chosen": -218.3801727294922, "logps/rejected": -297.88433837890625, "loss": 0.5296, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6070804595947266, "rewards/margins": 0.8339778184890747, "rewards/rejected": -2.44105863571167, "step": 14720 }, { "epoch": 2.537904893177119, "grad_norm": 36.418697357177734, "learning_rate": 1.411108303043701e-08, "logits/chosen": -1.6525733470916748, "logits/rejected": -1.598491907119751, "logps/chosen": -200.5663604736328, "logps/rejected": -287.1551513671875, "loss": 0.5137, "rewards/accuracies": 0.75, "rewards/chosen": -1.4741586446762085, "rewards/margins": 0.8913132548332214, "rewards/rejected": -2.3654720783233643, "step": 14730 }, { "epoch": 2.539627842866988, "grad_norm": 30.349742889404297, "learning_rate": 1.4008575252026334e-08, "logits/chosen": -1.5415337085723877, "logits/rejected": -1.5147979259490967, "logps/chosen": -219.85714721679688, "logps/rejected": -309.5752868652344, "loss": 0.4906, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6472421884536743, "rewards/margins": 0.897518515586853, "rewards/rejected": -2.5447609424591064, "step": 14740 }, { "epoch": 2.5413507925568575, "grad_norm": 41.62550735473633, "learning_rate": 1.3906413107849757e-08, "logits/chosen": -1.5927969217300415, "logits/rejected": -1.5406432151794434, "logps/chosen": -207.52273559570312, "logps/rejected": -286.8734436035156, "loss": 0.495, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.547933578491211, "rewards/margins": 0.8301254510879517, "rewards/rejected": -2.378058910369873, "step": 14750 }, { "epoch": 2.5430737422467264, "grad_norm": 28.98420524597168, "learning_rate": 1.3804597008538177e-08, "logits/chosen": -1.623771071434021, "logits/rejected": -1.577034831047058, "logps/chosen": -202.60971069335938, "logps/rejected": -285.4917907714844, "loss": 0.4974, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.497370958328247, "rewards/margins": 0.8208389282226562, "rewards/rejected": -2.3182098865509033, "step": 14760 }, { "epoch": 2.5447966919365954, "grad_norm": 25.282695770263672, "learning_rate": 1.3703127363331556e-08, "logits/chosen": -1.6262471675872803, "logits/rejected": -1.5848948955535889, "logps/chosen": -212.24789428710938, "logps/rejected": -291.30816650390625, "loss": 0.5426, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5611674785614014, "rewards/margins": 0.7933932542800903, "rewards/rejected": -2.354560375213623, "step": 14770 }, { "epoch": 2.5465196416264644, "grad_norm": 40.03992462158203, "learning_rate": 1.3602004580077375e-08, "logits/chosen": -1.558164119720459, "logits/rejected": -1.5256003141403198, "logps/chosen": -205.63125610351562, "logps/rejected": -281.8812561035156, "loss": 0.5609, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.563673973083496, "rewards/margins": 0.7622488141059875, "rewards/rejected": -2.325922727584839, "step": 14780 }, { "epoch": 2.548242591316334, "grad_norm": 27.992399215698242, "learning_rate": 1.3501229065228892e-08, "logits/chosen": -1.64004647731781, "logits/rejected": -1.5912551879882812, "logps/chosen": -222.78921508789062, "logps/rejected": -298.9998779296875, "loss": 0.5625, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6930469274520874, "rewards/margins": 0.7830427289009094, "rewards/rejected": -2.4760897159576416, "step": 14790 }, { "epoch": 2.5499655410062028, "grad_norm": 43.494083404541016, "learning_rate": 1.3400801223843539e-08, "logits/chosen": -1.5810552835464478, "logits/rejected": -1.5414950847625732, "logps/chosen": -213.85580444335938, "logps/rejected": -310.27008056640625, "loss": 0.4847, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5871661901474, "rewards/margins": 0.9443961977958679, "rewards/rejected": -2.531562328338623, "step": 14800 }, { "epoch": 2.5499655410062028, "eval_logits/chosen": -1.7152843475341797, "eval_logits/rejected": -1.6912128925323486, "eval_logps/chosen": -206.8795623779297, "eval_logps/rejected": -248.1588592529297, "eval_loss": 0.6304101347923279, "eval_rewards/accuracies": 0.6442843675613403, "eval_rewards/chosen": -1.4816765785217285, "eval_rewards/margins": 0.3681107759475708, "eval_rewards/rejected": -1.8497872352600098, "eval_runtime": 384.7869, "eval_samples_per_second": 11.185, "eval_steps_per_second": 1.398, "step": 14800 }, { "epoch": 2.5516884906960717, "grad_norm": 24.059415817260742, "learning_rate": 1.3300721459581355e-08, "logits/chosen": -1.619269609451294, "logits/rejected": -1.5576200485229492, "logps/chosen": -222.851806640625, "logps/rejected": -297.45196533203125, "loss": 0.5204, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6409496068954468, "rewards/margins": 0.8452986478805542, "rewards/rejected": -2.48624849319458, "step": 14810 }, { "epoch": 2.5534114403859407, "grad_norm": 43.30677032470703, "learning_rate": 1.3200990174703308e-08, "logits/chosen": -1.784624695777893, "logits/rejected": -1.7225615978240967, "logps/chosen": -204.2576904296875, "logps/rejected": -300.54107666015625, "loss": 0.4428, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5080598592758179, "rewards/margins": 0.9750812649726868, "rewards/rejected": -2.4831411838531494, "step": 14820 }, { "epoch": 2.5551343900758097, "grad_norm": 27.601030349731445, "learning_rate": 1.3101607770069667e-08, "logits/chosen": -1.5990216732025146, "logits/rejected": -1.5478788614273071, "logps/chosen": -207.07339477539062, "logps/rejected": -295.07421875, "loss": 0.5039, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5224343538284302, "rewards/margins": 0.9161651730537415, "rewards/rejected": -2.4385993480682373, "step": 14830 }, { "epoch": 2.5568573397656786, "grad_norm": 26.436817169189453, "learning_rate": 1.3002574645138375e-08, "logits/chosen": -1.6385233402252197, "logits/rejected": -1.5847870111465454, "logps/chosen": -213.31503295898438, "logps/rejected": -308.9871520996094, "loss": 0.4656, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5726587772369385, "rewards/margins": 0.952702522277832, "rewards/rejected": -2.5253615379333496, "step": 14840 }, { "epoch": 2.558580289455548, "grad_norm": 22.54994773864746, "learning_rate": 1.2903891197963568e-08, "logits/chosen": -1.581339955329895, "logits/rejected": -1.5261919498443604, "logps/chosen": -217.6318817138672, "logps/rejected": -309.2292175292969, "loss": 0.4894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.643764853477478, "rewards/margins": 0.9066557884216309, "rewards/rejected": -2.5504205226898193, "step": 14850 }, { "epoch": 2.560303239145417, "grad_norm": 30.38620948791504, "learning_rate": 1.2805557825193857e-08, "logits/chosen": -1.571519374847412, "logits/rejected": -1.5293896198272705, "logps/chosen": -199.9291534423828, "logps/rejected": -292.7502746582031, "loss": 0.5276, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4561383724212646, "rewards/margins": 0.9495272636413574, "rewards/rejected": -2.405665636062622, "step": 14860 }, { "epoch": 2.562026188835286, "grad_norm": 26.666837692260742, "learning_rate": 1.2707574922070708e-08, "logits/chosen": -1.6525710821151733, "logits/rejected": -1.6034488677978516, "logps/chosen": -205.05966186523438, "logps/rejected": -281.1788330078125, "loss": 0.5579, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4710729122161865, "rewards/margins": 0.812432587146759, "rewards/rejected": -2.283505439758301, "step": 14870 }, { "epoch": 2.563749138525155, "grad_norm": 34.56159591674805, "learning_rate": 1.2609942882426938e-08, "logits/chosen": -1.5512102842330933, "logits/rejected": -1.516695499420166, "logps/chosen": -198.02145385742188, "logps/rejected": -282.66790771484375, "loss": 0.5085, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.447977900505066, "rewards/margins": 0.8416982889175415, "rewards/rejected": -2.2896761894226074, "step": 14880 }, { "epoch": 2.5654720882150244, "grad_norm": 39.69472122192383, "learning_rate": 1.2512662098685144e-08, "logits/chosen": -1.5551344156265259, "logits/rejected": -1.5232659578323364, "logps/chosen": -211.43896484375, "logps/rejected": -283.1412048339844, "loss": 0.5285, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5428087711334229, "rewards/margins": 0.7138066291809082, "rewards/rejected": -2.256615400314331, "step": 14890 }, { "epoch": 2.5671950379048933, "grad_norm": 25.781578063964844, "learning_rate": 1.2415732961856006e-08, "logits/chosen": -1.4778220653533936, "logits/rejected": -1.4218331575393677, "logps/chosen": -199.333740234375, "logps/rejected": -285.76885986328125, "loss": 0.5085, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4639042615890503, "rewards/margins": 0.8805838823318481, "rewards/rejected": -2.3444881439208984, "step": 14900 }, { "epoch": 2.5689179875947623, "grad_norm": 28.18239974975586, "learning_rate": 1.2319155861536867e-08, "logits/chosen": -1.6057764291763306, "logits/rejected": -1.5677419900894165, "logps/chosen": -195.48504638671875, "logps/rejected": -280.6958923339844, "loss": 0.4928, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4490611553192139, "rewards/margins": 0.8212985992431641, "rewards/rejected": -2.270359754562378, "step": 14910 }, { "epoch": 2.5706409372846313, "grad_norm": 18.990493774414062, "learning_rate": 1.222293118591008e-08, "logits/chosen": -1.5674563646316528, "logits/rejected": -1.5234514474868774, "logps/chosen": -208.8411102294922, "logps/rejected": -310.14605712890625, "loss": 0.4895, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.560349702835083, "rewards/margins": 0.9858556985855103, "rewards/rejected": -2.5462050437927246, "step": 14920 }, { "epoch": 2.5723638869745002, "grad_norm": 36.361183166503906, "learning_rate": 1.2127059321741417e-08, "logits/chosen": -1.717429757118225, "logits/rejected": -1.6656608581542969, "logps/chosen": -193.1423797607422, "logps/rejected": -294.0716247558594, "loss": 0.4604, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4025070667266846, "rewards/margins": 1.0081617832183838, "rewards/rejected": -2.4106688499450684, "step": 14930 }, { "epoch": 2.574086836664369, "grad_norm": 44.2594108581543, "learning_rate": 1.203154065437857e-08, "logits/chosen": -1.6347858905792236, "logits/rejected": -1.581321120262146, "logps/chosen": -197.11672973632812, "logps/rejected": -276.7389221191406, "loss": 0.5117, "rewards/accuracies": 0.75, "rewards/chosen": -1.444643259048462, "rewards/margins": 0.825234055519104, "rewards/rejected": -2.2698774337768555, "step": 14940 }, { "epoch": 2.575809786354238, "grad_norm": 39.82154846191406, "learning_rate": 1.1936375567749612e-08, "logits/chosen": -1.7075817584991455, "logits/rejected": -1.654722809791565, "logps/chosen": -212.85348510742188, "logps/rejected": -278.1226501464844, "loss": 0.5634, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5699443817138672, "rewards/margins": 0.7073904275894165, "rewards/rejected": -2.277334690093994, "step": 14950 }, { "epoch": 2.5775327360441076, "grad_norm": 34.315711975097656, "learning_rate": 1.1841564444361496e-08, "logits/chosen": -1.5428823232650757, "logits/rejected": -1.4934934377670288, "logps/chosen": -212.505859375, "logps/rejected": -296.00543212890625, "loss": 0.5128, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5668630599975586, "rewards/margins": 0.8439539670944214, "rewards/rejected": -2.4108169078826904, "step": 14960 }, { "epoch": 2.5792556857339766, "grad_norm": 37.772743225097656, "learning_rate": 1.1747107665298273e-08, "logits/chosen": -1.623822569847107, "logits/rejected": -1.5672670602798462, "logps/chosen": -201.41970825195312, "logps/rejected": -292.71148681640625, "loss": 0.5005, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4750289916992188, "rewards/margins": 0.9137970805168152, "rewards/rejected": -2.3888261318206787, "step": 14970 }, { "epoch": 2.5809786354238455, "grad_norm": 32.4010124206543, "learning_rate": 1.1653005610219913e-08, "logits/chosen": -1.6380878686904907, "logits/rejected": -1.56973135471344, "logps/chosen": -209.4724884033203, "logps/rejected": -311.81256103515625, "loss": 0.4614, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5535662174224854, "rewards/margins": 1.0467643737792969, "rewards/rejected": -2.6003308296203613, "step": 14980 }, { "epoch": 2.582701585113715, "grad_norm": 23.00949478149414, "learning_rate": 1.155925865736055e-08, "logits/chosen": -1.6744308471679688, "logits/rejected": -1.6327850818634033, "logps/chosen": -198.4084930419922, "logps/rejected": -300.02435302734375, "loss": 0.4787, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4701337814331055, "rewards/margins": 1.0028178691864014, "rewards/rejected": -2.472951650619507, "step": 14990 }, { "epoch": 2.584424534803584, "grad_norm": 44.53309631347656, "learning_rate": 1.146586718352699e-08, "logits/chosen": -1.6657359600067139, "logits/rejected": -1.624629020690918, "logps/chosen": -212.0545196533203, "logps/rejected": -299.84539794921875, "loss": 0.4727, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.559799075126648, "rewards/margins": 0.9001957774162292, "rewards/rejected": -2.4599947929382324, "step": 15000 }, { "epoch": 2.586147484493453, "grad_norm": 41.07697677612305, "learning_rate": 1.1372831564097286e-08, "logits/chosen": -1.6813380718231201, "logits/rejected": -1.6410796642303467, "logps/chosen": -212.0048828125, "logps/rejected": -279.15179443359375, "loss": 0.6014, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5703686475753784, "rewards/margins": 0.6888116598129272, "rewards/rejected": -2.2591805458068848, "step": 15010 }, { "epoch": 2.587870434183322, "grad_norm": 35.199371337890625, "learning_rate": 1.1280152173019075e-08, "logits/chosen": -1.5767467021942139, "logits/rejected": -1.5493143796920776, "logps/chosen": -198.18643188476562, "logps/rejected": -283.5013427734375, "loss": 0.5113, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4780842065811157, "rewards/margins": 0.82579505443573, "rewards/rejected": -2.3038792610168457, "step": 15020 }, { "epoch": 2.589593383873191, "grad_norm": 32.93767166137695, "learning_rate": 1.118782938280829e-08, "logits/chosen": -1.5512254238128662, "logits/rejected": -1.5099573135375977, "logps/chosen": -210.15042114257812, "logps/rejected": -285.9954833984375, "loss": 0.5788, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5646612644195557, "rewards/margins": 0.7441153526306152, "rewards/rejected": -2.308776378631592, "step": 15030 }, { "epoch": 2.59131633356306, "grad_norm": 38.56374740600586, "learning_rate": 1.1095863564547436e-08, "logits/chosen": -1.6254005432128906, "logits/rejected": -1.5820324420928955, "logps/chosen": -201.81747436523438, "logps/rejected": -277.28643798828125, "loss": 0.519, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4783657789230347, "rewards/margins": 0.7753399610519409, "rewards/rejected": -2.2537059783935547, "step": 15040 }, { "epoch": 2.5930392832529288, "grad_norm": 28.777212142944336, "learning_rate": 1.1004255087884273e-08, "logits/chosen": -1.6261613368988037, "logits/rejected": -1.5694060325622559, "logps/chosen": -204.78195190429688, "logps/rejected": -285.9472961425781, "loss": 0.4937, "rewards/accuracies": 0.75, "rewards/chosen": -1.5048892498016357, "rewards/margins": 0.864482045173645, "rewards/rejected": -2.369370937347412, "step": 15050 }, { "epoch": 2.594762232942798, "grad_norm": 36.000404357910156, "learning_rate": 1.0913004321030195e-08, "logits/chosen": -1.6201896667480469, "logits/rejected": -1.5766347646713257, "logps/chosen": -193.09442138671875, "logps/rejected": -274.0902404785156, "loss": 0.5425, "rewards/accuracies": 0.71875, "rewards/chosen": -1.388741135597229, "rewards/margins": 0.808066189289093, "rewards/rejected": -2.196807384490967, "step": 15060 }, { "epoch": 2.596485182632667, "grad_norm": 32.70420455932617, "learning_rate": 1.0822111630758901e-08, "logits/chosen": -1.6917825937271118, "logits/rejected": -1.6308343410491943, "logps/chosen": -199.3158416748047, "logps/rejected": -266.4201354980469, "loss": 0.5367, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4221495389938354, "rewards/margins": 0.727179229259491, "rewards/rejected": -2.1493287086486816, "step": 15070 }, { "epoch": 2.598208132322536, "grad_norm": 41.88467788696289, "learning_rate": 1.0731577382404744e-08, "logits/chosen": -1.6529849767684937, "logits/rejected": -1.5970804691314697, "logps/chosen": -196.10508728027344, "logps/rejected": -292.7115478515625, "loss": 0.4646, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4301972389221191, "rewards/margins": 0.9582144021987915, "rewards/rejected": -2.388411283493042, "step": 15080 }, { "epoch": 2.599931082012405, "grad_norm": 29.691938400268555, "learning_rate": 1.0641401939861417e-08, "logits/chosen": -1.6426990032196045, "logits/rejected": -1.5919253826141357, "logps/chosen": -199.49057006835938, "logps/rejected": -275.2064514160156, "loss": 0.5308, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4478983879089355, "rewards/margins": 0.7710817456245422, "rewards/rejected": -2.218980073928833, "step": 15090 }, { "epoch": 2.6016540317022745, "grad_norm": 38.88888931274414, "learning_rate": 1.0551585665580465e-08, "logits/chosen": -1.5898182392120361, "logits/rejected": -1.5544463396072388, "logps/chosen": -201.89639282226562, "logps/rejected": -270.3229064941406, "loss": 0.5411, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.509132981300354, "rewards/margins": 0.6869701743125916, "rewards/rejected": -2.19610333442688, "step": 15100 }, { "epoch": 2.6033769813921435, "grad_norm": 39.20042037963867, "learning_rate": 1.0462128920569635e-08, "logits/chosen": -1.6190464496612549, "logits/rejected": -1.5810520648956299, "logps/chosen": -207.54110717773438, "logps/rejected": -276.8843994140625, "loss": 0.5685, "rewards/accuracies": 0.65625, "rewards/chosen": -1.545078158378601, "rewards/margins": 0.7065862417221069, "rewards/rejected": -2.251664638519287, "step": 15110 }, { "epoch": 2.6050999310820124, "grad_norm": 34.681983947753906, "learning_rate": 1.0373032064391729e-08, "logits/chosen": -1.6022884845733643, "logits/rejected": -1.5622366666793823, "logps/chosen": -213.93154907226562, "logps/rejected": -290.47772216796875, "loss": 0.5159, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5572923421859741, "rewards/margins": 0.7834555506706238, "rewards/rejected": -2.340747833251953, "step": 15120 }, { "epoch": 2.6068228807718814, "grad_norm": 30.58574867248535, "learning_rate": 1.0284295455162995e-08, "logits/chosen": -1.5416325330734253, "logits/rejected": -1.4799580574035645, "logps/chosen": -194.72889709472656, "logps/rejected": -278.31378173828125, "loss": 0.4966, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4176585674285889, "rewards/margins": 0.8688579797744751, "rewards/rejected": -2.2865166664123535, "step": 15130 }, { "epoch": 2.6085458304617504, "grad_norm": 23.715627670288086, "learning_rate": 1.0195919449551637e-08, "logits/chosen": -1.6180346012115479, "logits/rejected": -1.56999933719635, "logps/chosen": -209.47787475585938, "logps/rejected": -301.15570068359375, "loss": 0.4746, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5380821228027344, "rewards/margins": 0.9100778698921204, "rewards/rejected": -2.44815993309021, "step": 15140 }, { "epoch": 2.6102687801516193, "grad_norm": 44.19576644897461, "learning_rate": 1.0107904402776468e-08, "logits/chosen": -1.7632591724395752, "logits/rejected": -1.707419991493225, "logps/chosen": -197.52490234375, "logps/rejected": -273.18719482421875, "loss": 0.5293, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4399224519729614, "rewards/margins": 0.7631253004074097, "rewards/rejected": -2.203047752380371, "step": 15150 }, { "epoch": 2.6119917298414888, "grad_norm": 18.194103240966797, "learning_rate": 1.002025066860549e-08, "logits/chosen": -1.5769050121307373, "logits/rejected": -1.540554165840149, "logps/chosen": -198.5594482421875, "logps/rejected": -292.1970520019531, "loss": 0.4724, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.442894697189331, "rewards/margins": 0.927615761756897, "rewards/rejected": -2.3705103397369385, "step": 15160 }, { "epoch": 2.6137146795313577, "grad_norm": 38.291282653808594, "learning_rate": 9.932958599354457e-09, "logits/chosen": -1.5755432844161987, "logits/rejected": -1.5298999547958374, "logps/chosen": -190.64517211914062, "logps/rejected": -277.8343200683594, "loss": 0.5209, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.376599669456482, "rewards/margins": 0.863747239112854, "rewards/rejected": -2.240346670150757, "step": 15170 }, { "epoch": 2.6154376292212267, "grad_norm": 23.653581619262695, "learning_rate": 9.846028545885376e-09, "logits/chosen": -1.6484168767929077, "logits/rejected": -1.611707091331482, "logps/chosen": -209.3505401611328, "logps/rejected": -297.0075988769531, "loss": 0.5176, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5349688529968262, "rewards/margins": 0.8758178949356079, "rewards/rejected": -2.4107866287231445, "step": 15180 }, { "epoch": 2.6171605789110957, "grad_norm": 30.477632522583008, "learning_rate": 9.75946085760524e-09, "logits/chosen": -1.5608577728271484, "logits/rejected": -1.5307698249816895, "logps/chosen": -197.48294067382812, "logps/rejected": -272.7490234375, "loss": 0.5087, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4481613636016846, "rewards/margins": 0.755672037601471, "rewards/rejected": -2.2038331031799316, "step": 15190 }, { "epoch": 2.618883528600965, "grad_norm": 18.196041107177734, "learning_rate": 9.673255882464504e-09, "logits/chosen": -1.6353098154067993, "logits/rejected": -1.5825080871582031, "logps/chosen": -202.96937561035156, "logps/rejected": -289.22760009765625, "loss": 0.4701, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4741370677947998, "rewards/margins": 0.8842340707778931, "rewards/rejected": -2.3583710193634033, "step": 15200 }, { "epoch": 2.618883528600965, "eval_logits/chosen": -1.732445240020752, "eval_logits/rejected": -1.7090314626693726, "eval_logps/chosen": -200.1665496826172, "eval_logps/rejected": -239.77322387695312, "eval_loss": 0.6306227445602417, "eval_rewards/accuracies": 0.6445167064666748, "eval_rewards/chosen": -1.4145464897155762, "eval_rewards/margins": 0.3513844311237335, "eval_rewards/rejected": -1.7659310102462769, "eval_runtime": 384.213, "eval_samples_per_second": 11.202, "eval_steps_per_second": 1.4, "step": 15200 }, { "epoch": 2.620606478290834, "grad_norm": 34.808109283447266, "learning_rate": 9.587413966955737e-09, "logits/chosen": -1.5300031900405884, "logits/rejected": -1.4728825092315674, "logps/chosen": -209.06027221679688, "logps/rejected": -287.71697998046875, "loss": 0.5237, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.533287763595581, "rewards/margins": 0.8262912034988403, "rewards/rejected": -2.359578847885132, "step": 15210 }, { "epoch": 2.622329427980703, "grad_norm": 39.4126091003418, "learning_rate": 9.501935456112254e-09, "logits/chosen": -1.5830267667770386, "logits/rejected": -1.5235631465911865, "logps/chosen": -188.13392639160156, "logps/rejected": -269.7197570800781, "loss": 0.4734, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3591792583465576, "rewards/margins": 0.8418970108032227, "rewards/rejected": -2.2010762691497803, "step": 15220 }, { "epoch": 2.624052377670572, "grad_norm": 22.56211280822754, "learning_rate": 9.416820693506677e-09, "logits/chosen": -1.5887609720230103, "logits/rejected": -1.5454912185668945, "logps/chosen": -204.38143920898438, "logps/rejected": -287.50439453125, "loss": 0.5086, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.494003176689148, "rewards/margins": 0.8419147729873657, "rewards/rejected": -2.3359179496765137, "step": 15230 }, { "epoch": 2.625775327360441, "grad_norm": 29.985095977783203, "learning_rate": 9.332070021249595e-09, "logits/chosen": -1.578778862953186, "logits/rejected": -1.524937629699707, "logps/chosen": -205.76651000976562, "logps/rejected": -284.15264892578125, "loss": 0.4948, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.467347502708435, "rewards/margins": 0.8353103399276733, "rewards/rejected": -2.3026576042175293, "step": 15240 }, { "epoch": 2.62749827705031, "grad_norm": 37.65681076049805, "learning_rate": 9.247683779988113e-09, "logits/chosen": -1.5963375568389893, "logits/rejected": -1.5480496883392334, "logps/chosen": -192.90602111816406, "logps/rejected": -280.87103271484375, "loss": 0.5067, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4011046886444092, "rewards/margins": 0.8682317733764648, "rewards/rejected": -2.269336223602295, "step": 15250 }, { "epoch": 2.6292212267401793, "grad_norm": 32.64960479736328, "learning_rate": 9.163662308904608e-09, "logits/chosen": -1.5931379795074463, "logits/rejected": -1.5530445575714111, "logps/chosen": -202.58602905273438, "logps/rejected": -269.53875732421875, "loss": 0.5357, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4842182397842407, "rewards/margins": 0.7100173830986023, "rewards/rejected": -2.194235324859619, "step": 15260 }, { "epoch": 2.6309441764300483, "grad_norm": 24.36440086364746, "learning_rate": 9.080005945715307e-09, "logits/chosen": -1.6525249481201172, "logits/rejected": -1.5804433822631836, "logps/chosen": -206.2389678955078, "logps/rejected": -297.2176208496094, "loss": 0.4859, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.512003779411316, "rewards/margins": 0.9533550143241882, "rewards/rejected": -2.4653584957122803, "step": 15270 }, { "epoch": 2.6326671261199173, "grad_norm": 28.877296447753906, "learning_rate": 8.996715026668867e-09, "logits/chosen": -1.7158002853393555, "logits/rejected": -1.671775460243225, "logps/chosen": -196.52822875976562, "logps/rejected": -288.4090576171875, "loss": 0.4447, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4435256719589233, "rewards/margins": 0.8979686498641968, "rewards/rejected": -2.34149432182312, "step": 15280 }, { "epoch": 2.6343900758097862, "grad_norm": 24.51552963256836, "learning_rate": 8.913789886545064e-09, "logits/chosen": -1.6201568841934204, "logits/rejected": -1.5507919788360596, "logps/chosen": -204.2157440185547, "logps/rejected": -295.75091552734375, "loss": 0.4992, "rewards/accuracies": 0.78125, "rewards/chosen": -1.494683027267456, "rewards/margins": 0.9509680867195129, "rewards/rejected": -2.445650815963745, "step": 15290 }, { "epoch": 2.6361130254996556, "grad_norm": 22.843257904052734, "learning_rate": 8.831230858653538e-09, "logits/chosen": -1.5143969058990479, "logits/rejected": -1.4569345712661743, "logps/chosen": -199.69515991210938, "logps/rejected": -293.4846496582031, "loss": 0.508, "rewards/accuracies": 0.75, "rewards/chosen": -1.477146029472351, "rewards/margins": 0.9601956605911255, "rewards/rejected": -2.4373416900634766, "step": 15300 }, { "epoch": 2.6378359751895246, "grad_norm": 26.822872161865234, "learning_rate": 8.749038274832343e-09, "logits/chosen": -1.6880228519439697, "logits/rejected": -1.6334871053695679, "logps/chosen": -199.8614501953125, "logps/rejected": -290.93609619140625, "loss": 0.4692, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4571337699890137, "rewards/margins": 0.9172709584236145, "rewards/rejected": -2.3744044303894043, "step": 15310 }, { "epoch": 2.6395589248793936, "grad_norm": 40.18165588378906, "learning_rate": 8.667212465446617e-09, "logits/chosen": -1.5984302759170532, "logits/rejected": -1.5583341121673584, "logps/chosen": -202.61941528320312, "logps/rejected": -285.7087707519531, "loss": 0.5183, "rewards/accuracies": 0.75, "rewards/chosen": -1.5141230821609497, "rewards/margins": 0.8147637248039246, "rewards/rejected": -2.3288865089416504, "step": 15320 }, { "epoch": 2.6412818745692626, "grad_norm": 39.94595718383789, "learning_rate": 8.585753759387292e-09, "logits/chosen": -1.600642442703247, "logits/rejected": -1.5455397367477417, "logps/chosen": -206.8169403076172, "logps/rejected": -294.69873046875, "loss": 0.4851, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5080393552780151, "rewards/margins": 0.8920080065727234, "rewards/rejected": -2.4000473022460938, "step": 15330 }, { "epoch": 2.6430048242591315, "grad_norm": 28.69721031188965, "learning_rate": 8.504662484069824e-09, "logits/chosen": -1.5988765954971313, "logits/rejected": -1.5588778257369995, "logps/chosen": -204.5358428955078, "logps/rejected": -289.7530822753906, "loss": 0.5067, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.49817955493927, "rewards/margins": 0.8466202616691589, "rewards/rejected": -2.344799757003784, "step": 15340 }, { "epoch": 2.6447277739490005, "grad_norm": 40.40675354003906, "learning_rate": 8.423938965432708e-09, "logits/chosen": -1.4938302040100098, "logits/rejected": -1.4552674293518066, "logps/chosen": -207.461669921875, "logps/rejected": -299.3456115722656, "loss": 0.4925, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5299715995788574, "rewards/margins": 0.9237580299377441, "rewards/rejected": -2.4537301063537598, "step": 15350 }, { "epoch": 2.64645072363887, "grad_norm": 30.506824493408203, "learning_rate": 8.343583527936382e-09, "logits/chosen": -1.6167596578598022, "logits/rejected": -1.5800806283950806, "logps/chosen": -202.3173065185547, "logps/rejected": -288.4965515136719, "loss": 0.5202, "rewards/accuracies": 0.75, "rewards/chosen": -1.5074043273925781, "rewards/margins": 0.8156595230102539, "rewards/rejected": -2.323063850402832, "step": 15360 }, { "epoch": 2.648173673328739, "grad_norm": 36.25282287597656, "learning_rate": 8.263596494561765e-09, "logits/chosen": -1.646561861038208, "logits/rejected": -1.5939476490020752, "logps/chosen": -211.3754119873047, "logps/rejected": -285.7420654296875, "loss": 0.528, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5526502132415771, "rewards/margins": 0.7893895506858826, "rewards/rejected": -2.3420395851135254, "step": 15370 }, { "epoch": 2.649896623018608, "grad_norm": 31.11897850036621, "learning_rate": 8.183978186809026e-09, "logits/chosen": -1.6422828435897827, "logits/rejected": -1.59429132938385, "logps/chosen": -207.92440795898438, "logps/rejected": -287.43621826171875, "loss": 0.5049, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5371867418289185, "rewards/margins": 0.8130367398262024, "rewards/rejected": -2.3502230644226074, "step": 15380 }, { "epoch": 2.651619572708477, "grad_norm": 23.215652465820312, "learning_rate": 8.104728924696237e-09, "logits/chosen": -1.6959645748138428, "logits/rejected": -1.651354432106018, "logps/chosen": -203.30502319335938, "logps/rejected": -292.2448425292969, "loss": 0.5014, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4707846641540527, "rewards/margins": 0.8943023681640625, "rewards/rejected": -2.3650870323181152, "step": 15390 }, { "epoch": 2.6533425223983462, "grad_norm": 22.313430786132812, "learning_rate": 8.02584902675818e-09, "logits/chosen": -1.638462781906128, "logits/rejected": -1.5840169191360474, "logps/chosen": -217.51998901367188, "logps/rejected": -277.68402099609375, "loss": 0.5732, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5877636671066284, "rewards/margins": 0.6705672740936279, "rewards/rejected": -2.258330821990967, "step": 15400 }, { "epoch": 2.655065472088215, "grad_norm": 25.50736427307129, "learning_rate": 7.947338810045035e-09, "logits/chosen": -1.6300718784332275, "logits/rejected": -1.5662131309509277, "logps/chosen": -216.51229858398438, "logps/rejected": -276.90521240234375, "loss": 0.5589, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5693070888519287, "rewards/margins": 0.6847835779190063, "rewards/rejected": -2.2540907859802246, "step": 15410 }, { "epoch": 2.656788421778084, "grad_norm": 41.6089973449707, "learning_rate": 7.869198590120962e-09, "logits/chosen": -1.5842618942260742, "logits/rejected": -1.5435030460357666, "logps/chosen": -202.87576293945312, "logps/rejected": -299.6231384277344, "loss": 0.4889, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.504333257675171, "rewards/margins": 0.955193042755127, "rewards/rejected": -2.459526538848877, "step": 15420 }, { "epoch": 2.658511371467953, "grad_norm": 21.34117889404297, "learning_rate": 7.791428681063084e-09, "logits/chosen": -1.738438367843628, "logits/rejected": -1.6839052438735962, "logps/chosen": -210.18295288085938, "logps/rejected": -294.5263977050781, "loss": 0.4797, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5176045894622803, "rewards/margins": 0.8913189172744751, "rewards/rejected": -2.408923625946045, "step": 15430 }, { "epoch": 2.660234321157822, "grad_norm": 29.64019203186035, "learning_rate": 7.714029395460054e-09, "logits/chosen": -1.7610969543457031, "logits/rejected": -1.7188001871109009, "logps/chosen": -202.62686157226562, "logps/rejected": -272.075927734375, "loss": 0.5267, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4838390350341797, "rewards/margins": 0.7032070755958557, "rewards/rejected": -2.1870460510253906, "step": 15440 }, { "epoch": 2.661957270847691, "grad_norm": 30.35210609436035, "learning_rate": 7.637001044410784e-09, "logits/chosen": -1.4572802782058716, "logits/rejected": -1.4177887439727783, "logps/chosen": -201.9217529296875, "logps/rejected": -273.5601806640625, "loss": 0.5322, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4936710596084595, "rewards/margins": 0.7113657593727112, "rewards/rejected": -2.2050366401672363, "step": 15450 }, { "epoch": 2.66368022053756, "grad_norm": 32.20455551147461, "learning_rate": 7.560343937523361e-09, "logits/chosen": -1.6862075328826904, "logits/rejected": -1.6477339267730713, "logps/chosen": -198.38482666015625, "logps/rejected": -276.956298828125, "loss": 0.5051, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3834189176559448, "rewards/margins": 0.8093355298042297, "rewards/rejected": -2.1927542686462402, "step": 15460 }, { "epoch": 2.6654031702274295, "grad_norm": 33.88570022583008, "learning_rate": 7.484058382913583e-09, "logits/chosen": -1.6792227029800415, "logits/rejected": -1.6291191577911377, "logps/chosen": -215.4248046875, "logps/rejected": -297.33941650390625, "loss": 0.521, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5585341453552246, "rewards/margins": 0.8606500625610352, "rewards/rejected": -2.419184446334839, "step": 15470 }, { "epoch": 2.6671261199172984, "grad_norm": 22.154661178588867, "learning_rate": 7.40814468720391e-09, "logits/chosen": -1.6908563375473022, "logits/rejected": -1.6321758031845093, "logps/chosen": -191.12051391601562, "logps/rejected": -273.941650390625, "loss": 0.5031, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3590161800384521, "rewards/margins": 0.8672925233840942, "rewards/rejected": -2.226309061050415, "step": 15480 }, { "epoch": 2.6688490696071674, "grad_norm": 26.001617431640625, "learning_rate": 7.332603155522066e-09, "logits/chosen": -1.644089937210083, "logits/rejected": -1.6148462295532227, "logps/chosen": -208.5677947998047, "logps/rejected": -270.153564453125, "loss": 0.5564, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5410025119781494, "rewards/margins": 0.6423856616020203, "rewards/rejected": -2.1833882331848145, "step": 15490 }, { "epoch": 2.670572019297037, "grad_norm": 45.97538757324219, "learning_rate": 7.257434091500014e-09, "logits/chosen": -1.621949553489685, "logits/rejected": -1.594242811203003, "logps/chosen": -222.2292938232422, "logps/rejected": -283.860107421875, "loss": 0.5922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6816177368164062, "rewards/margins": 0.6131669282913208, "rewards/rejected": -2.2947845458984375, "step": 15500 }, { "epoch": 2.6722949689869058, "grad_norm": 30.593849182128906, "learning_rate": 7.182637797272506e-09, "logits/chosen": -1.5719969272613525, "logits/rejected": -1.5192309617996216, "logps/chosen": -206.0696258544922, "logps/rejected": -281.87786865234375, "loss": 0.5382, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.477163314819336, "rewards/margins": 0.8039218187332153, "rewards/rejected": -2.2810850143432617, "step": 15510 }, { "epoch": 2.6740179186767747, "grad_norm": 32.63137435913086, "learning_rate": 7.108214573476035e-09, "logits/chosen": -1.500911831855774, "logits/rejected": -1.4573032855987549, "logps/chosen": -202.20135498046875, "logps/rejected": -273.4117126464844, "loss": 0.5347, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4793106317520142, "rewards/margins": 0.7624852657318115, "rewards/rejected": -2.2417960166931152, "step": 15520 }, { "epoch": 2.6757408683666437, "grad_norm": 25.706695556640625, "learning_rate": 7.0341647192475704e-09, "logits/chosen": -1.5359480381011963, "logits/rejected": -1.4948341846466064, "logps/chosen": -190.4417266845703, "logps/rejected": -271.4991455078125, "loss": 0.494, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3646535873413086, "rewards/margins": 0.8184051513671875, "rewards/rejected": -2.183058261871338, "step": 15530 }, { "epoch": 2.6774638180565127, "grad_norm": 31.82769775390625, "learning_rate": 6.960488532223374e-09, "logits/chosen": -1.5960495471954346, "logits/rejected": -1.548031210899353, "logps/chosen": -205.06552124023438, "logps/rejected": -281.83062744140625, "loss": 0.5418, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.527539849281311, "rewards/margins": 0.7699499726295471, "rewards/rejected": -2.297489643096924, "step": 15540 }, { "epoch": 2.6791867677463816, "grad_norm": 38.12778091430664, "learning_rate": 6.887186308537763e-09, "logits/chosen": -1.6880691051483154, "logits/rejected": -1.6379327774047852, "logps/chosen": -213.2513885498047, "logps/rejected": -289.7648010253906, "loss": 0.5181, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.579298734664917, "rewards/margins": 0.7917166948318481, "rewards/rejected": -2.3710153102874756, "step": 15550 }, { "epoch": 2.6809097174362506, "grad_norm": 32.01270294189453, "learning_rate": 6.814258342821932e-09, "logits/chosen": -1.6256210803985596, "logits/rejected": -1.5979753732681274, "logps/chosen": -202.05966186523438, "logps/rejected": -276.04132080078125, "loss": 0.5253, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5108765363693237, "rewards/margins": 0.7135692834854126, "rewards/rejected": -2.2244458198547363, "step": 15560 }, { "epoch": 2.68263266712612, "grad_norm": 29.39885711669922, "learning_rate": 6.741704928202807e-09, "logits/chosen": -1.6568377017974854, "logits/rejected": -1.6141636371612549, "logps/chosen": -205.1513214111328, "logps/rejected": -288.5816955566406, "loss": 0.5194, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5274347066879272, "rewards/margins": 0.8483465313911438, "rewards/rejected": -2.3757810592651367, "step": 15570 }, { "epoch": 2.684355616815989, "grad_norm": 30.44341278076172, "learning_rate": 6.669526356301869e-09, "logits/chosen": -1.6794891357421875, "logits/rejected": -1.6423813104629517, "logps/chosen": -204.3780975341797, "logps/rejected": -287.107421875, "loss": 0.5083, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4868040084838867, "rewards/margins": 0.8067952394485474, "rewards/rejected": -2.2935991287231445, "step": 15580 }, { "epoch": 2.686078566505858, "grad_norm": 40.704795837402344, "learning_rate": 6.597722917233894e-09, "logits/chosen": -1.6327779293060303, "logits/rejected": -1.5902550220489502, "logps/chosen": -189.83932495117188, "logps/rejected": -263.6131286621094, "loss": 0.5163, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3530352115631104, "rewards/margins": 0.7421896457672119, "rewards/rejected": -2.0952248573303223, "step": 15590 }, { "epoch": 2.687801516195727, "grad_norm": 29.427024841308594, "learning_rate": 6.526294899605878e-09, "logits/chosen": -1.6281979084014893, "logits/rejected": -1.5792288780212402, "logps/chosen": -198.7489776611328, "logps/rejected": -291.39019775390625, "loss": 0.5011, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4688153266906738, "rewards/margins": 0.9297361373901367, "rewards/rejected": -2.3985512256622314, "step": 15600 }, { "epoch": 2.687801516195727, "eval_logits/chosen": -1.7368842363357544, "eval_logits/rejected": -1.7134876251220703, "eval_logps/chosen": -199.51185607910156, "eval_logps/rejected": -238.93487548828125, "eval_loss": 0.6304026246070862, "eval_rewards/accuracies": 0.6433550119400024, "eval_rewards/chosen": -1.4079997539520264, "eval_rewards/margins": 0.3495476245880127, "eval_rewards/rejected": -1.7575472593307495, "eval_runtime": 384.5525, "eval_samples_per_second": 11.192, "eval_steps_per_second": 1.399, "step": 15600 }, { "epoch": 2.6895244658855963, "grad_norm": 31.768056869506836, "learning_rate": 6.455242590515842e-09, "logits/chosen": -1.7051315307617188, "logits/rejected": -1.6663964986801147, "logps/chosen": -202.0169677734375, "logps/rejected": -275.9730529785156, "loss": 0.5375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4596657752990723, "rewards/margins": 0.7587411403656006, "rewards/rejected": -2.218406915664673, "step": 15610 }, { "epoch": 2.6912474155754653, "grad_norm": 40.55280303955078, "learning_rate": 6.384566275551717e-09, "logits/chosen": -1.5791552066802979, "logits/rejected": -1.5466220378875732, "logps/chosen": -186.74722290039062, "logps/rejected": -279.75335693359375, "loss": 0.476, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3374412059783936, "rewards/margins": 0.9051253199577332, "rewards/rejected": -2.2425665855407715, "step": 15620 }, { "epoch": 2.6929703652653343, "grad_norm": 40.39755630493164, "learning_rate": 6.314266238790089e-09, "logits/chosen": -1.620631217956543, "logits/rejected": -1.5486552715301514, "logps/chosen": -205.1244354248047, "logps/rejected": -295.75579833984375, "loss": 0.4499, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.499033808708191, "rewards/margins": 0.9564018249511719, "rewards/rejected": -2.4554355144500732, "step": 15630 }, { "epoch": 2.6946933149552033, "grad_norm": 33.87063980102539, "learning_rate": 6.244342762795207e-09, "logits/chosen": -1.6153934001922607, "logits/rejected": -1.566318392753601, "logps/chosen": -202.0092315673828, "logps/rejected": -296.7959289550781, "loss": 0.465, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4862453937530518, "rewards/margins": 0.9477277994155884, "rewards/rejected": -2.433973550796509, "step": 15640 }, { "epoch": 2.6964162646450722, "grad_norm": 49.840118408203125, "learning_rate": 6.1747961286177205e-09, "logits/chosen": -1.6278855800628662, "logits/rejected": -1.5861130952835083, "logps/chosen": -197.191650390625, "logps/rejected": -273.5580139160156, "loss": 0.5397, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4384510517120361, "rewards/margins": 0.7653406858444214, "rewards/rejected": -2.203791856765747, "step": 15650 }, { "epoch": 2.698139214334941, "grad_norm": 28.98862075805664, "learning_rate": 6.105626615793602e-09, "logits/chosen": -1.682408094406128, "logits/rejected": -1.6390424966812134, "logps/chosen": -197.5651397705078, "logps/rejected": -286.9451904296875, "loss": 0.4832, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4492084980010986, "rewards/margins": 0.8780032396316528, "rewards/rejected": -2.327211856842041, "step": 15660 }, { "epoch": 2.6998621640248106, "grad_norm": 26.868104934692383, "learning_rate": 6.036834502343058e-09, "logits/chosen": -1.5527050495147705, "logits/rejected": -1.4929711818695068, "logps/chosen": -196.7747039794922, "logps/rejected": -273.2555236816406, "loss": 0.4775, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4153461456298828, "rewards/margins": 0.8030064702033997, "rewards/rejected": -2.218352794647217, "step": 15670 }, { "epoch": 2.7015851137146796, "grad_norm": 27.163881301879883, "learning_rate": 5.968420064769342e-09, "logits/chosen": -1.5822670459747314, "logits/rejected": -1.5396705865859985, "logps/chosen": -219.1418914794922, "logps/rejected": -301.70843505859375, "loss": 0.5029, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6476762294769287, "rewards/margins": 0.8261844515800476, "rewards/rejected": -2.473860263824463, "step": 15680 }, { "epoch": 2.7033080634045485, "grad_norm": 22.595386505126953, "learning_rate": 5.9003835780576774e-09, "logits/chosen": -1.6135778427124023, "logits/rejected": -1.5688902139663696, "logps/chosen": -195.43197631835938, "logps/rejected": -283.2395324707031, "loss": 0.4875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4298336505889893, "rewards/margins": 0.8611891865730286, "rewards/rejected": -2.291022777557373, "step": 15690 }, { "epoch": 2.7050310130944175, "grad_norm": 24.350574493408203, "learning_rate": 5.832725315674147e-09, "logits/chosen": -1.6457306146621704, "logits/rejected": -1.5960192680358887, "logps/chosen": -206.0160675048828, "logps/rejected": -292.96295166015625, "loss": 0.4965, "rewards/accuracies": 0.75, "rewards/chosen": -1.539738416671753, "rewards/margins": 0.8633926510810852, "rewards/rejected": -2.4031310081481934, "step": 15700 }, { "epoch": 2.706753962784287, "grad_norm": 31.922224044799805, "learning_rate": 5.76544554956463e-09, "logits/chosen": -1.6166181564331055, "logits/rejected": -1.5642515420913696, "logps/chosen": -209.6160430908203, "logps/rejected": -300.5673828125, "loss": 0.4717, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5211459398269653, "rewards/margins": 0.9228206872940063, "rewards/rejected": -2.4439666271209717, "step": 15710 }, { "epoch": 2.708476912474156, "grad_norm": 30.285572052001953, "learning_rate": 5.698544550153661e-09, "logits/chosen": -1.64878249168396, "logits/rejected": -1.613804817199707, "logps/chosen": -204.9235076904297, "logps/rejected": -276.12042236328125, "loss": 0.5199, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5211926698684692, "rewards/margins": 0.71763014793396, "rewards/rejected": -2.2388229370117188, "step": 15720 }, { "epoch": 2.710199862164025, "grad_norm": 23.428640365600586, "learning_rate": 5.632022586343333e-09, "logits/chosen": -1.718743085861206, "logits/rejected": -1.6677840948104858, "logps/chosen": -201.1768798828125, "logps/rejected": -291.433349609375, "loss": 0.488, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.487426519393921, "rewards/margins": 0.8930130004882812, "rewards/rejected": -2.380439281463623, "step": 15730 }, { "epoch": 2.711922811853894, "grad_norm": 29.324687957763672, "learning_rate": 5.565879925512252e-09, "logits/chosen": -1.6265957355499268, "logits/rejected": -1.5733954906463623, "logps/chosen": -205.82687377929688, "logps/rejected": -276.8540954589844, "loss": 0.5602, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.527649164199829, "rewards/margins": 0.7465609312057495, "rewards/rejected": -2.274210214614868, "step": 15740 }, { "epoch": 2.713645761543763, "grad_norm": 40.81529998779297, "learning_rate": 5.50011683351449e-09, "logits/chosen": -1.624692678451538, "logits/rejected": -1.571045160293579, "logps/chosen": -222.0780487060547, "logps/rejected": -313.7108459472656, "loss": 0.4624, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6946592330932617, "rewards/margins": 0.8892682194709778, "rewards/rejected": -2.583927631378174, "step": 15750 }, { "epoch": 2.7153687112336318, "grad_norm": 24.664043426513672, "learning_rate": 5.434733574678418e-09, "logits/chosen": -1.5452042818069458, "logits/rejected": -1.5038487911224365, "logps/chosen": -198.94723510742188, "logps/rejected": -272.55206298828125, "loss": 0.5557, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4844261407852173, "rewards/margins": 0.7378342747688293, "rewards/rejected": -2.2222602367401123, "step": 15760 }, { "epoch": 2.717091660923501, "grad_norm": 23.349504470825195, "learning_rate": 5.369730411805762e-09, "logits/chosen": -1.6020376682281494, "logits/rejected": -1.5571599006652832, "logps/chosen": -187.35842895507812, "logps/rejected": -284.2367248535156, "loss": 0.4332, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.3557246923446655, "rewards/margins": 0.9550348520278931, "rewards/rejected": -2.3107595443725586, "step": 15770 }, { "epoch": 2.71881461061337, "grad_norm": 29.679035186767578, "learning_rate": 5.3051076061704445e-09, "logits/chosen": -1.743186593055725, "logits/rejected": -1.7102177143096924, "logps/chosen": -215.1242218017578, "logps/rejected": -272.3497619628906, "loss": 0.5729, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.607500672340393, "rewards/margins": 0.5912182331085205, "rewards/rejected": -2.1987192630767822, "step": 15780 }, { "epoch": 2.720537560303239, "grad_norm": 24.34191131591797, "learning_rate": 5.240865417517604e-09, "logits/chosen": -1.5436227321624756, "logits/rejected": -1.5051971673965454, "logps/chosen": -208.3878173828125, "logps/rejected": -282.0325622558594, "loss": 0.5088, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5434716939926147, "rewards/margins": 0.7497387528419495, "rewards/rejected": -2.293210506439209, "step": 15790 }, { "epoch": 2.722260509993108, "grad_norm": 25.835796356201172, "learning_rate": 5.177004104062521e-09, "logits/chosen": -1.6853927373886108, "logits/rejected": -1.6165730953216553, "logps/chosen": -198.42984008789062, "logps/rejected": -282.55657958984375, "loss": 0.4685, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4018248319625854, "rewards/margins": 0.9172322154045105, "rewards/rejected": -2.319056987762451, "step": 15800 }, { "epoch": 2.7239834596829775, "grad_norm": 47.76166534423828, "learning_rate": 5.113523922489571e-09, "logits/chosen": -1.678775429725647, "logits/rejected": -1.6477625370025635, "logps/chosen": -203.407958984375, "logps/rejected": -277.275390625, "loss": 0.5445, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5462872982025146, "rewards/margins": 0.7036650776863098, "rewards/rejected": -2.249952554702759, "step": 15810 }, { "epoch": 2.7257064093728465, "grad_norm": 33.46269607543945, "learning_rate": 5.0504251279512415e-09, "logits/chosen": -1.5308297872543335, "logits/rejected": -1.4846632480621338, "logps/chosen": -209.490234375, "logps/rejected": -292.7735595703125, "loss": 0.5237, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5785003900527954, "rewards/margins": 0.8312740325927734, "rewards/rejected": -2.4097743034362793, "step": 15820 }, { "epoch": 2.7274293590627154, "grad_norm": 31.780555725097656, "learning_rate": 4.987707974067046e-09, "logits/chosen": -1.6617063283920288, "logits/rejected": -1.6254870891571045, "logps/chosen": -199.11273193359375, "logps/rejected": -274.83306884765625, "loss": 0.5586, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4930975437164307, "rewards/margins": 0.7451966404914856, "rewards/rejected": -2.2382943630218506, "step": 15830 }, { "epoch": 2.7291523087525844, "grad_norm": 38.7813606262207, "learning_rate": 4.9253727129224934e-09, "logits/chosen": -1.6716371774673462, "logits/rejected": -1.6329853534698486, "logps/chosen": -222.65463256835938, "logps/rejected": -313.23272705078125, "loss": 0.5259, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6936982870101929, "rewards/margins": 0.882973849773407, "rewards/rejected": -2.576672077178955, "step": 15840 }, { "epoch": 2.7308752584424534, "grad_norm": 36.57883071899414, "learning_rate": 4.863419595068197e-09, "logits/chosen": -1.6573702096939087, "logits/rejected": -1.609955072402954, "logps/chosen": -194.7985076904297, "logps/rejected": -280.5198669433594, "loss": 0.5139, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4152289628982544, "rewards/margins": 0.8435710668563843, "rewards/rejected": -2.2588000297546387, "step": 15850 }, { "epoch": 2.7325982081323223, "grad_norm": 27.437578201293945, "learning_rate": 4.801848869518721e-09, "logits/chosen": -1.6103976964950562, "logits/rejected": -1.5662734508514404, "logps/chosen": -205.65383911132812, "logps/rejected": -264.7209167480469, "loss": 0.5863, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5253267288208008, "rewards/margins": 0.6179733276367188, "rewards/rejected": -2.1433000564575195, "step": 15860 }, { "epoch": 2.7343211578221913, "grad_norm": 23.60274887084961, "learning_rate": 4.740660783751638e-09, "logits/chosen": -1.628843903541565, "logits/rejected": -1.5704540014266968, "logps/chosen": -211.80722045898438, "logps/rejected": -304.2124938964844, "loss": 0.4875, "rewards/accuracies": 0.75, "rewards/chosen": -1.565833330154419, "rewards/margins": 0.9499748945236206, "rewards/rejected": -2.515808343887329, "step": 15870 }, { "epoch": 2.7360441075120607, "grad_norm": 31.625076293945312, "learning_rate": 4.679855583706571e-09, "logits/chosen": -1.5856298208236694, "logits/rejected": -1.5474889278411865, "logps/chosen": -198.1217041015625, "logps/rejected": -290.2977294921875, "loss": 0.4763, "rewards/accuracies": 0.75, "rewards/chosen": -1.462418794631958, "rewards/margins": 0.9268255233764648, "rewards/rejected": -2.3892440795898438, "step": 15880 }, { "epoch": 2.7377670572019297, "grad_norm": 30.186508178710938, "learning_rate": 4.619433513784166e-09, "logits/chosen": -1.6528924703598022, "logits/rejected": -1.5979883670806885, "logps/chosen": -201.7820281982422, "logps/rejected": -274.7981262207031, "loss": 0.5298, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4912642240524292, "rewards/margins": 0.7730330228805542, "rewards/rejected": -2.2642972469329834, "step": 15890 }, { "epoch": 2.7394900068917987, "grad_norm": 33.469242095947266, "learning_rate": 4.559394816845075e-09, "logits/chosen": -1.6455352306365967, "logits/rejected": -1.5722758769989014, "logps/chosen": -216.37216186523438, "logps/rejected": -298.2059020996094, "loss": 0.4937, "rewards/accuracies": 0.75, "rewards/chosen": -1.5656859874725342, "rewards/margins": 0.8968521356582642, "rewards/rejected": -2.462538003921509, "step": 15900 }, { "epoch": 2.741212956581668, "grad_norm": 30.317359924316406, "learning_rate": 4.499739734209074e-09, "logits/chosen": -1.5669691562652588, "logits/rejected": -1.5114362239837646, "logps/chosen": -192.9110870361328, "logps/rejected": -276.04083251953125, "loss": 0.5079, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3991148471832275, "rewards/margins": 0.8640725016593933, "rewards/rejected": -2.2631874084472656, "step": 15910 }, { "epoch": 2.742935906271537, "grad_norm": 35.19529342651367, "learning_rate": 4.440468505653982e-09, "logits/chosen": -1.5765380859375, "logits/rejected": -1.5384092330932617, "logps/chosen": -215.87570190429688, "logps/rejected": -299.07562255859375, "loss": 0.5069, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6332439184188843, "rewards/margins": 0.8158270120620728, "rewards/rejected": -2.449071168899536, "step": 15920 }, { "epoch": 2.744658855961406, "grad_norm": 30.254179000854492, "learning_rate": 4.381581369414822e-09, "logits/chosen": -1.510341763496399, "logits/rejected": -1.4571070671081543, "logps/chosen": -192.68174743652344, "logps/rejected": -274.46771240234375, "loss": 0.4713, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3748412132263184, "rewards/margins": 0.8734277486801147, "rewards/rejected": -2.2482690811157227, "step": 15930 }, { "epoch": 2.746381805651275, "grad_norm": 22.98348617553711, "learning_rate": 4.323078562182702e-09, "logits/chosen": -1.60586678981781, "logits/rejected": -1.5448791980743408, "logps/chosen": -201.58407592773438, "logps/rejected": -300.69329833984375, "loss": 0.4591, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4241517782211304, "rewards/margins": 0.9938325881958008, "rewards/rejected": -2.4179844856262207, "step": 15940 }, { "epoch": 2.748104755341144, "grad_norm": 37.307708740234375, "learning_rate": 4.2649603191040715e-09, "logits/chosen": -1.6714118719100952, "logits/rejected": -1.6232936382293701, "logps/chosen": -197.21530151367188, "logps/rejected": -277.82293701171875, "loss": 0.4869, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3933839797973633, "rewards/margins": 0.8573741912841797, "rewards/rejected": -2.250758171081543, "step": 15950 }, { "epoch": 2.749827705031013, "grad_norm": 18.794313430786133, "learning_rate": 4.207226873779557e-09, "logits/chosen": -1.6430130004882812, "logits/rejected": -1.5901249647140503, "logps/chosen": -202.3380126953125, "logps/rejected": -289.830810546875, "loss": 0.4925, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4588900804519653, "rewards/margins": 0.9070623517036438, "rewards/rejected": -2.365952253341675, "step": 15960 }, { "epoch": 2.751550654720882, "grad_norm": 35.651344299316406, "learning_rate": 4.149878458263179e-09, "logits/chosen": -1.62399423122406, "logits/rejected": -1.5761115550994873, "logps/chosen": -201.8713836669922, "logps/rejected": -293.91705322265625, "loss": 0.4755, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4667004346847534, "rewards/margins": 0.9146144986152649, "rewards/rejected": -2.381314754486084, "step": 15970 }, { "epoch": 2.7532736044107513, "grad_norm": 34.21991729736328, "learning_rate": 4.092915303061372e-09, "logits/chosen": -1.59518563747406, "logits/rejected": -1.5532639026641846, "logps/chosen": -209.7283477783203, "logps/rejected": -278.43890380859375, "loss": 0.5247, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5316201448440552, "rewards/margins": 0.6966302990913391, "rewards/rejected": -2.22825026512146, "step": 15980 }, { "epoch": 2.7549965541006203, "grad_norm": 30.082977294921875, "learning_rate": 4.0363376371320366e-09, "logits/chosen": -1.7484419345855713, "logits/rejected": -1.7310978174209595, "logps/chosen": -204.43405151367188, "logps/rejected": -275.6298828125, "loss": 0.5516, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4974677562713623, "rewards/margins": 0.7014755606651306, "rewards/rejected": -2.1989433765411377, "step": 15990 }, { "epoch": 2.7567195037904892, "grad_norm": 29.51515769958496, "learning_rate": 3.98014568788364e-09, "logits/chosen": -1.5735108852386475, "logits/rejected": -1.5224225521087646, "logps/chosen": -201.62033081054688, "logps/rejected": -282.30157470703125, "loss": 0.4936, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4778904914855957, "rewards/margins": 0.8260573148727417, "rewards/rejected": -2.303947925567627, "step": 16000 }, { "epoch": 2.7567195037904892, "eval_logits/chosen": -1.724768042564392, "eval_logits/rejected": -1.7010468244552612, "eval_logps/chosen": -203.6143341064453, "eval_logps/rejected": -244.05953979492188, "eval_loss": 0.6303529143333435, "eval_rewards/accuracies": 0.6435873508453369, "eval_rewards/chosen": -1.4490246772766113, "eval_rewards/margins": 0.3597696125507355, "eval_rewards/rejected": -1.8087942600250244, "eval_runtime": 384.5273, "eval_samples_per_second": 11.193, "eval_steps_per_second": 1.399, "step": 16000 }, { "epoch": 2.758442453480358, "grad_norm": 46.15574645996094, "learning_rate": 3.924339681174293e-09, "logits/chosen": -1.6742016077041626, "logits/rejected": -1.640960454940796, "logps/chosen": -217.3235626220703, "logps/rejected": -288.28643798828125, "loss": 0.5747, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6208339929580688, "rewards/margins": 0.7245651483535767, "rewards/rejected": -2.3453993797302246, "step": 16010 }, { "epoch": 2.7601654031702276, "grad_norm": 20.357431411743164, "learning_rate": 3.868919841310858e-09, "logits/chosen": -1.7137123346328735, "logits/rejected": -1.6709020137786865, "logps/chosen": -208.59683227539062, "logps/rejected": -290.96697998046875, "loss": 0.5329, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5549236536026, "rewards/margins": 0.8437626957893372, "rewards/rejected": -2.398686170578003, "step": 16020 }, { "epoch": 2.7618883528600966, "grad_norm": 32.056190490722656, "learning_rate": 3.81388639104806e-09, "logits/chosen": -1.7042852640151978, "logits/rejected": -1.659950852394104, "logps/chosen": -208.49032592773438, "logps/rejected": -288.7596740722656, "loss": 0.5151, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5546530485153198, "rewards/margins": 0.8066908717155457, "rewards/rejected": -2.3613438606262207, "step": 16030 }, { "epoch": 2.7636113025499656, "grad_norm": 45.362579345703125, "learning_rate": 3.759239551587512e-09, "logits/chosen": -1.6354458332061768, "logits/rejected": -1.5884088277816772, "logps/chosen": -208.31723022460938, "logps/rejected": -302.8401794433594, "loss": 0.4802, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5453264713287354, "rewards/margins": 0.9585615396499634, "rewards/rejected": -2.50388765335083, "step": 16040 }, { "epoch": 2.7653342522398345, "grad_norm": 23.472545623779297, "learning_rate": 3.7049795425769027e-09, "logits/chosen": -1.6082700490951538, "logits/rejected": -1.5660271644592285, "logps/chosen": -194.77102661132812, "logps/rejected": -287.49371337890625, "loss": 0.4379, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.4037247896194458, "rewards/margins": 0.9330763816833496, "rewards/rejected": -2.336801052093506, "step": 16050 }, { "epoch": 2.7670572019297035, "grad_norm": 27.753414154052734, "learning_rate": 3.6511065821091314e-09, "logits/chosen": -1.6502840518951416, "logits/rejected": -1.6107053756713867, "logps/chosen": -196.86233520507812, "logps/rejected": -278.1816101074219, "loss": 0.497, "rewards/accuracies": 0.75, "rewards/chosen": -1.4114711284637451, "rewards/margins": 0.819719135761261, "rewards/rejected": -2.2311902046203613, "step": 16060 }, { "epoch": 2.7687801516195725, "grad_norm": 27.338544845581055, "learning_rate": 3.597620886721342e-09, "logits/chosen": -1.5682957172393799, "logits/rejected": -1.5182629823684692, "logps/chosen": -199.6961212158203, "logps/rejected": -282.4446105957031, "loss": 0.478, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4262382984161377, "rewards/margins": 0.8562216758728027, "rewards/rejected": -2.2824597358703613, "step": 16070 }, { "epoch": 2.770503101309442, "grad_norm": 29.13926124572754, "learning_rate": 3.5445226713941457e-09, "logits/chosen": -1.6415565013885498, "logits/rejected": -1.5768063068389893, "logps/chosen": -211.66091918945312, "logps/rejected": -297.61773681640625, "loss": 0.4922, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5455818176269531, "rewards/margins": 0.914897084236145, "rewards/rejected": -2.4604787826538086, "step": 16080 }, { "epoch": 2.772226050999311, "grad_norm": 42.47835922241211, "learning_rate": 3.491812149550688e-09, "logits/chosen": -1.6251806020736694, "logits/rejected": -1.57179856300354, "logps/chosen": -195.59005737304688, "logps/rejected": -282.0909118652344, "loss": 0.4861, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4269095659255981, "rewards/margins": 0.8857747316360474, "rewards/rejected": -2.3126845359802246, "step": 16090 }, { "epoch": 2.77394900068918, "grad_norm": 32.899444580078125, "learning_rate": 3.4394895330558284e-09, "logits/chosen": -1.6690833568572998, "logits/rejected": -1.6237341165542603, "logps/chosen": -193.7056427001953, "logps/rejected": -303.7026672363281, "loss": 0.4389, "rewards/accuracies": 0.75, "rewards/chosen": -1.414886236190796, "rewards/margins": 1.076838731765747, "rewards/rejected": -2.491725206375122, "step": 16100 }, { "epoch": 2.775671950379049, "grad_norm": 36.35084915161133, "learning_rate": 3.3875550322152503e-09, "logits/chosen": -1.5023618936538696, "logits/rejected": -1.4406607151031494, "logps/chosen": -200.91744995117188, "logps/rejected": -295.1684265136719, "loss": 0.4929, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4381941556930542, "rewards/margins": 0.9750955700874329, "rewards/rejected": -2.4132895469665527, "step": 16110 }, { "epoch": 2.777394900068918, "grad_norm": 37.906803131103516, "learning_rate": 3.3360088557746856e-09, "logits/chosen": -1.5955053567886353, "logits/rejected": -1.5656285285949707, "logps/chosen": -195.3086700439453, "logps/rejected": -264.56658935546875, "loss": 0.548, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4471538066864014, "rewards/margins": 0.6703465580940247, "rewards/rejected": -2.1175003051757812, "step": 16120 }, { "epoch": 2.779117849758787, "grad_norm": 30.308712005615234, "learning_rate": 3.2848512109190375e-09, "logits/chosen": -1.6084339618682861, "logits/rejected": -1.5634233951568604, "logps/chosen": -203.5264892578125, "logps/rejected": -282.7039794921875, "loss": 0.5127, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5111953020095825, "rewards/margins": 0.7994351983070374, "rewards/rejected": -2.3106303215026855, "step": 16130 }, { "epoch": 2.780840799448656, "grad_norm": 28.674461364746094, "learning_rate": 3.2340823032715125e-09, "logits/chosen": -1.722328782081604, "logits/rejected": -1.672581672668457, "logps/chosen": -196.27162170410156, "logps/rejected": -284.0904235839844, "loss": 0.4844, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.435058355331421, "rewards/margins": 0.8832355737686157, "rewards/rejected": -2.318293809890747, "step": 16140 }, { "epoch": 2.782563749138525, "grad_norm": 30.645694732666016, "learning_rate": 3.1837023368928017e-09, "logits/chosen": -1.6766061782836914, "logits/rejected": -1.6359221935272217, "logps/chosen": -211.22470092773438, "logps/rejected": -286.9867858886719, "loss": 0.525, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5677381753921509, "rewards/margins": 0.7787638902664185, "rewards/rejected": -2.3465020656585693, "step": 16150 }, { "epoch": 2.784286698828394, "grad_norm": 31.85881996154785, "learning_rate": 3.133711514280357e-09, "logits/chosen": -1.685119867324829, "logits/rejected": -1.6261520385742188, "logps/chosen": -196.38050842285156, "logps/rejected": -295.7599792480469, "loss": 0.4314, "rewards/accuracies": 0.84375, "rewards/chosen": -1.42711341381073, "rewards/margins": 0.9946566820144653, "rewards/rejected": -2.4217700958251953, "step": 16160 }, { "epoch": 2.786009648518263, "grad_norm": 23.079660415649414, "learning_rate": 3.084110036367449e-09, "logits/chosen": -1.5471277236938477, "logits/rejected": -1.499712586402893, "logps/chosen": -210.12258911132812, "logps/rejected": -288.97479248046875, "loss": 0.5042, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5617667436599731, "rewards/margins": 0.7905761003494263, "rewards/rejected": -2.3523428440093994, "step": 16170 }, { "epoch": 2.7877325982081325, "grad_norm": 40.84262466430664, "learning_rate": 3.034898102522454e-09, "logits/chosen": -1.5986557006835938, "logits/rejected": -1.5298030376434326, "logps/chosen": -221.5828399658203, "logps/rejected": -308.65576171875, "loss": 0.5076, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.627313256263733, "rewards/margins": 0.9360467195510864, "rewards/rejected": -2.5633597373962402, "step": 16180 }, { "epoch": 2.7894555478980014, "grad_norm": 30.877662658691406, "learning_rate": 2.9860759105479582e-09, "logits/chosen": -1.6073898077011108, "logits/rejected": -1.5686115026474, "logps/chosen": -209.8683624267578, "logps/rejected": -286.5576171875, "loss": 0.5461, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5662927627563477, "rewards/margins": 0.7508216500282288, "rewards/rejected": -2.3171145915985107, "step": 16190 }, { "epoch": 2.7911784975878704, "grad_norm": 30.29778480529785, "learning_rate": 2.9376436566800667e-09, "logits/chosen": -1.558695673942566, "logits/rejected": -1.4981310367584229, "logps/chosen": -203.7272491455078, "logps/rejected": -279.0574645996094, "loss": 0.5288, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4882779121398926, "rewards/margins": 0.8002364039421082, "rewards/rejected": -2.2885146141052246, "step": 16200 }, { "epoch": 2.7929014472777394, "grad_norm": 42.685707092285156, "learning_rate": 2.8896015355875492e-09, "logits/chosen": -1.5141369104385376, "logits/rejected": -1.4735002517700195, "logps/chosen": -201.10067749023438, "logps/rejected": -282.42083740234375, "loss": 0.508, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.491695523262024, "rewards/margins": 0.7959169149398804, "rewards/rejected": -2.2876124382019043, "step": 16210 }, { "epoch": 2.794624396967609, "grad_norm": 23.677894592285156, "learning_rate": 2.841949740371086e-09, "logits/chosen": -1.5899922847747803, "logits/rejected": -1.5405527353286743, "logps/chosen": -198.48683166503906, "logps/rejected": -298.21026611328125, "loss": 0.481, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4205026626586914, "rewards/margins": 1.0091335773468018, "rewards/rejected": -2.4296364784240723, "step": 16220 }, { "epoch": 2.7963473466574778, "grad_norm": 20.0400447845459, "learning_rate": 2.7946884625624556e-09, "logits/chosen": -1.6403640508651733, "logits/rejected": -1.589577078819275, "logps/chosen": -198.72225952148438, "logps/rejected": -291.4017639160156, "loss": 0.4779, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4717034101486206, "rewards/margins": 0.9280723333358765, "rewards/rejected": -2.399775981903076, "step": 16230 }, { "epoch": 2.7980702963473467, "grad_norm": 33.879493713378906, "learning_rate": 2.747817892123816e-09, "logits/chosen": -1.5945513248443604, "logits/rejected": -1.5502314567565918, "logps/chosen": -212.589599609375, "logps/rejected": -299.84051513671875, "loss": 0.5071, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6102514266967773, "rewards/margins": 0.8573668599128723, "rewards/rejected": -2.467618227005005, "step": 16240 }, { "epoch": 2.7997932460372157, "grad_norm": 27.390350341796875, "learning_rate": 2.7013382174468914e-09, "logits/chosen": -1.6416511535644531, "logits/rejected": -1.5986303091049194, "logps/chosen": -207.4696502685547, "logps/rejected": -279.26654052734375, "loss": 0.5334, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4999459981918335, "rewards/margins": 0.7439717054367065, "rewards/rejected": -2.243917942047119, "step": 16250 }, { "epoch": 2.8015161957270847, "grad_norm": 33.779144287109375, "learning_rate": 2.6552496253522518e-09, "logits/chosen": -1.6039321422576904, "logits/rejected": -1.5535815954208374, "logps/chosen": -209.19546508789062, "logps/rejected": -307.2878112792969, "loss": 0.4992, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5651133060455322, "rewards/margins": 0.9539181590080261, "rewards/rejected": -2.519031047821045, "step": 16260 }, { "epoch": 2.8032391454169536, "grad_norm": 25.58933448791504, "learning_rate": 2.609552301088558e-09, "logits/chosen": -1.6411972045898438, "logits/rejected": -1.598671555519104, "logps/chosen": -211.4327392578125, "logps/rejected": -287.0559997558594, "loss": 0.5453, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5608909130096436, "rewards/margins": 0.775378942489624, "rewards/rejected": -2.3362698554992676, "step": 16270 }, { "epoch": 2.804962095106823, "grad_norm": 37.81081771850586, "learning_rate": 2.5642464283317733e-09, "logits/chosen": -1.7280786037445068, "logits/rejected": -1.6881927251815796, "logps/chosen": -210.5127716064453, "logps/rejected": -286.2266540527344, "loss": 0.525, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5488097667694092, "rewards/margins": 0.7880204916000366, "rewards/rejected": -2.3368306159973145, "step": 16280 }, { "epoch": 2.806685044796692, "grad_norm": 21.164901733398438, "learning_rate": 2.5193321891844866e-09, "logits/chosen": -1.6742864847183228, "logits/rejected": -1.6348425149917603, "logps/chosen": -201.8052215576172, "logps/rejected": -290.950927734375, "loss": 0.5032, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4995896816253662, "rewards/margins": 0.8645893931388855, "rewards/rejected": -2.3641791343688965, "step": 16290 }, { "epoch": 2.808407994486561, "grad_norm": 30.592893600463867, "learning_rate": 2.4748097641751787e-09, "logits/chosen": -1.6843475103378296, "logits/rejected": -1.6307868957519531, "logps/chosen": -211.84707641601562, "logps/rejected": -290.2081604003906, "loss": 0.5295, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5801098346710205, "rewards/margins": 0.807979941368103, "rewards/rejected": -2.388089656829834, "step": 16300 }, { "epoch": 2.81013094417643, "grad_norm": 35.39067077636719, "learning_rate": 2.4306793322574014e-09, "logits/chosen": -1.577487826347351, "logits/rejected": -1.5398495197296143, "logps/chosen": -210.31735229492188, "logps/rejected": -289.4979248046875, "loss": 0.5332, "rewards/accuracies": 0.75, "rewards/chosen": -1.5722347497940063, "rewards/margins": 0.7788803577423096, "rewards/rejected": -2.3511152267456055, "step": 16310 }, { "epoch": 2.8118538938662994, "grad_norm": 37.08136749267578, "learning_rate": 2.3869410708091787e-09, "logits/chosen": -1.6066745519638062, "logits/rejected": -1.5592342615127563, "logps/chosen": -207.6752471923828, "logps/rejected": -298.2518310546875, "loss": 0.4863, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5351449251174927, "rewards/margins": 0.8967401385307312, "rewards/rejected": -2.431885242462158, "step": 16320 }, { "epoch": 2.8135768435561683, "grad_norm": 20.299909591674805, "learning_rate": 2.3435951556322386e-09, "logits/chosen": -1.6628811359405518, "logits/rejected": -1.6167129278182983, "logps/chosen": -200.5544891357422, "logps/rejected": -271.65625, "loss": 0.5158, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4268579483032227, "rewards/margins": 0.7481969594955444, "rewards/rejected": -2.1750550270080566, "step": 16330 }, { "epoch": 2.8152997932460373, "grad_norm": 30.356172561645508, "learning_rate": 2.3006417609513053e-09, "logits/chosen": -1.531884789466858, "logits/rejected": -1.496141791343689, "logps/chosen": -196.16905212402344, "logps/rejected": -286.60443115234375, "loss": 0.4647, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4173381328582764, "rewards/margins": 0.9166313409805298, "rewards/rejected": -2.3339695930480957, "step": 16340 }, { "epoch": 2.8170227429359063, "grad_norm": 27.82748794555664, "learning_rate": 2.258081059413397e-09, "logits/chosen": -1.7319214344024658, "logits/rejected": -1.6799014806747437, "logps/chosen": -209.30575561523438, "logps/rejected": -284.62188720703125, "loss": 0.5076, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5352132320404053, "rewards/margins": 0.7685121297836304, "rewards/rejected": -2.303725242614746, "step": 16350 }, { "epoch": 2.8187456926257752, "grad_norm": 39.15815353393555, "learning_rate": 2.2159132220871623e-09, "logits/chosen": -1.6009477376937866, "logits/rejected": -1.5514745712280273, "logps/chosen": -199.82054138183594, "logps/rejected": -291.30218505859375, "loss": 0.4756, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4737600088119507, "rewards/margins": 0.9305243492126465, "rewards/rejected": -2.404284715652466, "step": 16360 }, { "epoch": 2.820468642315644, "grad_norm": 38.97136306762695, "learning_rate": 2.174138418462135e-09, "logits/chosen": -1.5708199739456177, "logits/rejected": -1.5296893119812012, "logps/chosen": -204.8894805908203, "logps/rejected": -271.0523986816406, "loss": 0.5688, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5094388723373413, "rewards/margins": 0.6989701986312866, "rewards/rejected": -2.208408832550049, "step": 16370 }, { "epoch": 2.822191592005513, "grad_norm": 43.11518478393555, "learning_rate": 2.132756816448111e-09, "logits/chosen": -1.6394847631454468, "logits/rejected": -1.6021696329116821, "logps/chosen": -198.41256713867188, "logps/rejected": -283.98065185546875, "loss": 0.4834, "rewards/accuracies": 0.78125, "rewards/chosen": -1.441158413887024, "rewards/margins": 0.8487105369567871, "rewards/rejected": -2.2898690700531006, "step": 16380 }, { "epoch": 2.8239145416953826, "grad_norm": 24.530704498291016, "learning_rate": 2.0917685823744426e-09, "logits/chosen": -1.5554282665252686, "logits/rejected": -1.5096482038497925, "logps/chosen": -193.83892822265625, "logps/rejected": -281.29248046875, "loss": 0.5268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4140729904174805, "rewards/margins": 0.876665472984314, "rewards/rejected": -2.290738582611084, "step": 16390 }, { "epoch": 2.8256374913852516, "grad_norm": 34.35622787475586, "learning_rate": 2.0511738809894097e-09, "logits/chosen": -1.5805435180664062, "logits/rejected": -1.538383960723877, "logps/chosen": -198.9628448486328, "logps/rejected": -282.85791015625, "loss": 0.4952, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4447994232177734, "rewards/margins": 0.8364402651786804, "rewards/rejected": -2.2812397480010986, "step": 16400 }, { "epoch": 2.8256374913852516, "eval_logits/chosen": -1.7279173135757446, "eval_logits/rejected": -1.704264760017395, "eval_logps/chosen": -203.5388946533203, "eval_logps/rejected": -243.77938842773438, "eval_loss": 0.6312301158905029, "eval_rewards/accuracies": 0.6438196897506714, "eval_rewards/chosen": -1.4482700824737549, "eval_rewards/margins": 0.3577224314212799, "eval_rewards/rejected": -1.8059924840927124, "eval_runtime": 384.8549, "eval_samples_per_second": 11.183, "eval_steps_per_second": 1.398, "step": 16400 }, { "epoch": 2.8273604410751205, "grad_norm": 32.33806228637695, "learning_rate": 2.0109728754594713e-09, "logits/chosen": -1.6441633701324463, "logits/rejected": -1.5825875997543335, "logps/chosen": -215.2042999267578, "logps/rejected": -294.2441711425781, "loss": 0.4817, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5997352600097656, "rewards/margins": 0.8535000085830688, "rewards/rejected": -2.453235149383545, "step": 16410 }, { "epoch": 2.82908339076499, "grad_norm": 36.8965950012207, "learning_rate": 1.9711657273686844e-09, "logits/chosen": -1.5145912170410156, "logits/rejected": -1.4785223007202148, "logps/chosen": -207.9575653076172, "logps/rejected": -281.3797912597656, "loss": 0.5481, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5369865894317627, "rewards/margins": 0.7095264792442322, "rewards/rejected": -2.2465128898620605, "step": 16420 }, { "epoch": 2.830806340454859, "grad_norm": 47.45038604736328, "learning_rate": 1.93175259671805e-09, "logits/chosen": -1.5235130786895752, "logits/rejected": -1.4861727952957153, "logps/chosen": -211.5547332763672, "logps/rejected": -287.1858825683594, "loss": 0.5383, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.575508713722229, "rewards/margins": 0.7593789100646973, "rewards/rejected": -2.3348875045776367, "step": 16430 }, { "epoch": 2.832529290144728, "grad_norm": 23.88429832458496, "learning_rate": 1.8927336419248596e-09, "logits/chosen": -1.6044429540634155, "logits/rejected": -1.5553555488586426, "logps/chosen": -198.04505920410156, "logps/rejected": -294.0986633300781, "loss": 0.4818, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.468567132949829, "rewards/margins": 0.954384982585907, "rewards/rejected": -2.422952175140381, "step": 16440 }, { "epoch": 2.834252239834597, "grad_norm": 33.20879364013672, "learning_rate": 1.8541090198220144e-09, "logits/chosen": -1.6844584941864014, "logits/rejected": -1.629431962966919, "logps/chosen": -202.9315185546875, "logps/rejected": -305.4527282714844, "loss": 0.4223, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4656628370285034, "rewards/margins": 1.037574052810669, "rewards/rejected": -2.503237009048462, "step": 16450 }, { "epoch": 2.835975189524466, "grad_norm": 29.70263671875, "learning_rate": 1.8158788856574624e-09, "logits/chosen": -1.4988144636154175, "logits/rejected": -1.461374282836914, "logps/chosen": -189.18653869628906, "logps/rejected": -283.64410400390625, "loss": 0.4796, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3982244729995728, "rewards/margins": 0.9302972555160522, "rewards/rejected": -2.328521966934204, "step": 16460 }, { "epoch": 2.837698139214335, "grad_norm": 39.5974235534668, "learning_rate": 1.7780433930935312e-09, "logits/chosen": -1.60671865940094, "logits/rejected": -1.5581773519515991, "logps/chosen": -194.26234436035156, "logps/rejected": -291.94342041015625, "loss": 0.4924, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4329856634140015, "rewards/margins": 0.9630458950996399, "rewards/rejected": -2.396031618118286, "step": 16470 }, { "epoch": 2.8394210889042037, "grad_norm": 30.109529495239258, "learning_rate": 1.74060269420635e-09, "logits/chosen": -1.5139915943145752, "logits/rejected": -1.465388536453247, "logps/chosen": -195.1603546142578, "logps/rejected": -292.16363525390625, "loss": 0.4479, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4241392612457275, "rewards/margins": 0.9566676020622253, "rewards/rejected": -2.3808071613311768, "step": 16480 }, { "epoch": 2.841144038594073, "grad_norm": 30.47223663330078, "learning_rate": 1.7035569394851955e-09, "logits/chosen": -1.6082112789154053, "logits/rejected": -1.570446252822876, "logps/chosen": -199.833984375, "logps/rejected": -267.7076721191406, "loss": 0.5319, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.477171540260315, "rewards/margins": 0.6900812387466431, "rewards/rejected": -2.167252779006958, "step": 16490 }, { "epoch": 2.842866988283942, "grad_norm": 21.7894287109375, "learning_rate": 1.6669062778318698e-09, "logits/chosen": -1.649877905845642, "logits/rejected": -1.5828702449798584, "logps/chosen": -198.849365234375, "logps/rejected": -262.10211181640625, "loss": 0.5217, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4084839820861816, "rewards/margins": 0.7134264707565308, "rewards/rejected": -2.121910572052002, "step": 16500 }, { "epoch": 2.844589937973811, "grad_norm": 36.085540771484375, "learning_rate": 1.6306508565602228e-09, "logits/chosen": -1.6468229293823242, "logits/rejected": -1.607865571975708, "logps/chosen": -201.9567108154297, "logps/rejected": -278.9889221191406, "loss": 0.5053, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4504163265228271, "rewards/margins": 0.7838245034217834, "rewards/rejected": -2.234240770339966, "step": 16510 }, { "epoch": 2.84631288766368, "grad_norm": 32.19239807128906, "learning_rate": 1.5947908213953753e-09, "logits/chosen": -1.7317917346954346, "logits/rejected": -1.6700223684310913, "logps/chosen": -210.5098114013672, "logps/rejected": -311.977783203125, "loss": 0.4399, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.544738531112671, "rewards/margins": 1.038854956626892, "rewards/rejected": -2.5835938453674316, "step": 16520 }, { "epoch": 2.8480358373535495, "grad_norm": 43.740455627441406, "learning_rate": 1.5593263164732972e-09, "logits/chosen": -1.563025712966919, "logits/rejected": -1.519561767578125, "logps/chosen": -209.86190795898438, "logps/rejected": -267.63372802734375, "loss": 0.5561, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5369549989700317, "rewards/margins": 0.6146339178085327, "rewards/rejected": -2.1515889167785645, "step": 16530 }, { "epoch": 2.8497587870434185, "grad_norm": 29.081884384155273, "learning_rate": 1.5242574843401524e-09, "logits/chosen": -1.608612298965454, "logits/rejected": -1.5656510591506958, "logps/chosen": -207.4720001220703, "logps/rejected": -281.773193359375, "loss": 0.5244, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5441038608551025, "rewards/margins": 0.7779989838600159, "rewards/rejected": -2.3221025466918945, "step": 16540 }, { "epoch": 2.8514817367332874, "grad_norm": 33.823883056640625, "learning_rate": 1.489584465951721e-09, "logits/chosen": -1.5540907382965088, "logits/rejected": -1.5084948539733887, "logps/chosen": -210.51278686523438, "logps/rejected": -310.1302185058594, "loss": 0.4586, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5683314800262451, "rewards/margins": 0.985478401184082, "rewards/rejected": -2.553809642791748, "step": 16550 }, { "epoch": 2.8532046864231564, "grad_norm": 31.734004974365234, "learning_rate": 1.455307400672845e-09, "logits/chosen": -1.6425979137420654, "logits/rejected": -1.5943950414657593, "logps/chosen": -197.49183654785156, "logps/rejected": -283.3627624511719, "loss": 0.489, "rewards/accuracies": 0.75, "rewards/chosen": -1.4330264329910278, "rewards/margins": 0.8302183151245117, "rewards/rejected": -2.263244152069092, "step": 16560 }, { "epoch": 2.8549276361130254, "grad_norm": 26.054500579833984, "learning_rate": 1.421426426276895e-09, "logits/chosen": -1.691537618637085, "logits/rejected": -1.6329768896102905, "logps/chosen": -201.1520233154297, "logps/rejected": -289.4585876464844, "loss": 0.4781, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4781467914581299, "rewards/margins": 0.9025301933288574, "rewards/rejected": -2.380676746368408, "step": 16570 }, { "epoch": 2.8566505858028943, "grad_norm": 35.0569953918457, "learning_rate": 1.3879416789451815e-09, "logits/chosen": -1.5840961933135986, "logits/rejected": -1.5487277507781982, "logps/chosen": -207.152099609375, "logps/rejected": -277.3089904785156, "loss": 0.5871, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5626401901245117, "rewards/margins": 0.6830005049705505, "rewards/rejected": -2.245640754699707, "step": 16580 }, { "epoch": 2.8583735354927637, "grad_norm": 52.74144744873047, "learning_rate": 1.3548532932663891e-09, "logits/chosen": -1.5398337841033936, "logits/rejected": -1.4927947521209717, "logps/chosen": -215.7863006591797, "logps/rejected": -278.1468200683594, "loss": 0.5743, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6190001964569092, "rewards/margins": 0.6568753123283386, "rewards/rejected": -2.2758755683898926, "step": 16590 }, { "epoch": 2.8600964851826327, "grad_norm": 35.544647216796875, "learning_rate": 1.3221614022361105e-09, "logits/chosen": -1.610530138015747, "logits/rejected": -1.5752967596054077, "logps/chosen": -213.9752197265625, "logps/rejected": -285.7201232910156, "loss": 0.536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5679616928100586, "rewards/margins": 0.7312275767326355, "rewards/rejected": -2.2991890907287598, "step": 16600 }, { "epoch": 2.8618194348725017, "grad_norm": 32.12127685546875, "learning_rate": 1.289866137256257e-09, "logits/chosen": -1.5730324983596802, "logits/rejected": -1.5143897533416748, "logps/chosen": -218.294189453125, "logps/rejected": -306.3438720703125, "loss": 0.5074, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6378536224365234, "rewards/margins": 0.8943823575973511, "rewards/rejected": -2.532235622406006, "step": 16610 }, { "epoch": 2.8635423845623706, "grad_norm": 24.77232551574707, "learning_rate": 1.2579676281345042e-09, "logits/chosen": -1.5598160028457642, "logits/rejected": -1.5061019659042358, "logps/chosen": -201.98837280273438, "logps/rejected": -283.95306396484375, "loss": 0.5158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.448168396949768, "rewards/margins": 0.8431085348129272, "rewards/rejected": -2.291276693344116, "step": 16620 }, { "epoch": 2.86526533425224, "grad_norm": 62.111820220947266, "learning_rate": 1.2264660030838592e-09, "logits/chosen": -1.550568699836731, "logits/rejected": -1.4934922456741333, "logps/chosen": -213.20541381835938, "logps/rejected": -285.7199401855469, "loss": 0.535, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5055577754974365, "rewards/margins": 0.8336159586906433, "rewards/rejected": -2.3391735553741455, "step": 16630 }, { "epoch": 2.866988283942109, "grad_norm": 46.97129821777344, "learning_rate": 1.195361388722038e-09, "logits/chosen": -1.6127160787582397, "logits/rejected": -1.5536266565322876, "logps/chosen": -221.4212188720703, "logps/rejected": -303.21820068359375, "loss": 0.5219, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6521766185760498, "rewards/margins": 0.8641535043716431, "rewards/rejected": -2.5163302421569824, "step": 16640 }, { "epoch": 2.868711233631978, "grad_norm": 33.028594970703125, "learning_rate": 1.1646539100710562e-09, "logits/chosen": -1.5643160343170166, "logits/rejected": -1.5265575647354126, "logps/chosen": -190.27337646484375, "logps/rejected": -276.12451171875, "loss": 0.477, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3563764095306396, "rewards/margins": 0.8850062489509583, "rewards/rejected": -2.2413830757141113, "step": 16650 }, { "epoch": 2.870434183321847, "grad_norm": 30.924272537231445, "learning_rate": 1.1343436905566495e-09, "logits/chosen": -1.6248385906219482, "logits/rejected": -1.5855129957199097, "logps/chosen": -205.64205932617188, "logps/rejected": -295.14398193359375, "loss": 0.4859, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5187439918518066, "rewards/margins": 0.8884479403495789, "rewards/rejected": -2.407191753387451, "step": 16660 }, { "epoch": 2.872157133011716, "grad_norm": 28.749929428100586, "learning_rate": 1.1044308520078316e-09, "logits/chosen": -1.5990875959396362, "logits/rejected": -1.5556060075759888, "logps/chosen": -194.87875366210938, "logps/rejected": -295.0609130859375, "loss": 0.4493, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.434263825416565, "rewards/margins": 0.9827602505683899, "rewards/rejected": -2.4170238971710205, "step": 16670 }, { "epoch": 2.873880082701585, "grad_norm": 49.628292083740234, "learning_rate": 1.0749155146563493e-09, "logits/chosen": -1.5153428316116333, "logits/rejected": -1.4865665435791016, "logps/chosen": -206.2247314453125, "logps/rejected": -273.4560852050781, "loss": 0.5717, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5732797384262085, "rewards/margins": 0.673078179359436, "rewards/rejected": -2.2463576793670654, "step": 16680 }, { "epoch": 2.8756030323914543, "grad_norm": 31.62681007385254, "learning_rate": 1.0457977971362831e-09, "logits/chosen": -1.652151107788086, "logits/rejected": -1.6131757497787476, "logps/chosen": -202.0923614501953, "logps/rejected": -270.86669921875, "loss": 0.5374, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4801733493804932, "rewards/margins": 0.692054033279419, "rewards/rejected": -2.172227382659912, "step": 16690 }, { "epoch": 2.8773259820813233, "grad_norm": 44.19800567626953, "learning_rate": 1.0170778164834581e-09, "logits/chosen": -1.728794813156128, "logits/rejected": -1.6931133270263672, "logps/chosen": -209.34469604492188, "logps/rejected": -271.6888427734375, "loss": 0.5925, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5546461343765259, "rewards/margins": 0.6288726329803467, "rewards/rejected": -2.183518648147583, "step": 16700 }, { "epoch": 2.8790489317711923, "grad_norm": 34.03745651245117, "learning_rate": 9.887556881350901e-10, "logits/chosen": -1.6251108646392822, "logits/rejected": -1.5769708156585693, "logps/chosen": -209.3572998046875, "logps/rejected": -299.29962158203125, "loss": 0.5208, "rewards/accuracies": 0.75, "rewards/chosen": -1.5848195552825928, "rewards/margins": 0.8981760740280151, "rewards/rejected": -2.4829957485198975, "step": 16710 }, { "epoch": 2.8807718814610612, "grad_norm": 41.1098747253418, "learning_rate": 9.608315259292288e-10, "logits/chosen": -1.5905859470367432, "logits/rejected": -1.5348126888275146, "logps/chosen": -210.2429656982422, "logps/rejected": -285.15679931640625, "loss": 0.5151, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5592763423919678, "rewards/margins": 0.8024843335151672, "rewards/rejected": -2.3617606163024902, "step": 16720 }, { "epoch": 2.8824948311509306, "grad_norm": 38.210693359375, "learning_rate": 9.333054421043484e-10, "logits/chosen": -1.576797604560852, "logits/rejected": -1.5253446102142334, "logps/chosen": -192.76980590820312, "logps/rejected": -279.8339538574219, "loss": 0.5093, "rewards/accuracies": 0.75, "rewards/chosen": -1.4076998233795166, "rewards/margins": 0.8691729307174683, "rewards/rejected": -2.2768726348876953, "step": 16730 }, { "epoch": 2.8842177808407996, "grad_norm": 37.603309631347656, "learning_rate": 9.06177547298892e-10, "logits/chosen": -1.5681672096252441, "logits/rejected": -1.5280646085739136, "logps/chosen": -204.23959350585938, "logps/rejected": -271.69366455078125, "loss": 0.5554, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4879558086395264, "rewards/margins": 0.6997446417808533, "rewards/rejected": -2.1877007484436035, "step": 16740 }, { "epoch": 2.8859407305306686, "grad_norm": 31.856191635131836, "learning_rate": 8.794479505508268e-10, "logits/chosen": -1.6667163372039795, "logits/rejected": -1.6266586780548096, "logps/chosen": -197.18392944335938, "logps/rejected": -294.75921630859375, "loss": 0.4606, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4334996938705444, "rewards/margins": 0.9352189898490906, "rewards/rejected": -2.3687186241149902, "step": 16750 }, { "epoch": 2.8876636802205375, "grad_norm": 26.064311981201172, "learning_rate": 8.531167592971566e-10, "logits/chosen": -1.5521303415298462, "logits/rejected": -1.509796380996704, "logps/chosen": -211.45907592773438, "logps/rejected": -292.41741943359375, "loss": 0.5033, "rewards/accuracies": 0.75, "rewards/chosen": -1.5620230436325073, "rewards/margins": 0.8341752886772156, "rewards/rejected": -2.396198272705078, "step": 16760 }, { "epoch": 2.8893866299104065, "grad_norm": 27.30735969543457, "learning_rate": 8.271840793735884e-10, "logits/chosen": -1.6859509944915771, "logits/rejected": -1.6200199127197266, "logps/chosen": -213.5120391845703, "logps/rejected": -292.6845703125, "loss": 0.5537, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5698108673095703, "rewards/margins": 0.8569631576538086, "rewards/rejected": -2.4267737865448, "step": 16770 }, { "epoch": 2.8911095796002755, "grad_norm": 41.18815231323242, "learning_rate": 8.016500150140215e-10, "logits/chosen": -1.6033258438110352, "logits/rejected": -1.5633761882781982, "logps/chosen": -202.39730834960938, "logps/rejected": -287.2729797363281, "loss": 0.5114, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4809529781341553, "rewards/margins": 0.8144389986991882, "rewards/rejected": -2.2953920364379883, "step": 16780 }, { "epoch": 2.892832529290145, "grad_norm": 27.463346481323242, "learning_rate": 7.765146688501589e-10, "logits/chosen": -1.6069574356079102, "logits/rejected": -1.5416642427444458, "logps/chosen": -205.8412628173828, "logps/rejected": -273.67486572265625, "loss": 0.5541, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4828940629959106, "rewards/margins": 0.7489983439445496, "rewards/rejected": -2.2318923473358154, "step": 16790 }, { "epoch": 2.894555478980014, "grad_norm": 33.9183235168457, "learning_rate": 7.51778141911108e-10, "logits/chosen": -1.6445128917694092, "logits/rejected": -1.6061605215072632, "logps/chosen": -202.0955810546875, "logps/rejected": -287.78717041015625, "loss": 0.5024, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.530413269996643, "rewards/margins": 0.8305818438529968, "rewards/rejected": -2.360995292663574, "step": 16800 }, { "epoch": 2.894555478980014, "eval_logits/chosen": -1.7274123430252075, "eval_logits/rejected": -1.703658938407898, "eval_logps/chosen": -203.63075256347656, "eval_logps/rejected": -244.1201171875, "eval_loss": 0.6304458379745483, "eval_rewards/accuracies": 0.6428903341293335, "eval_rewards/chosen": -1.4491883516311646, "eval_rewards/margins": 0.3602113127708435, "eval_rewards/rejected": -1.8093998432159424, "eval_runtime": 384.5599, "eval_samples_per_second": 11.192, "eval_steps_per_second": 1.399, "step": 16800 }, { "epoch": 2.896278428669883, "grad_norm": 29.492122650146484, "learning_rate": 7.274405336229361e-10, "logits/chosen": -1.5904494524002075, "logits/rejected": -1.5278266668319702, "logps/chosen": -187.41946411132812, "logps/rejected": -273.6427917480469, "loss": 0.4848, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3402408361434937, "rewards/margins": 0.9011996388435364, "rewards/rejected": -2.241440534591675, "step": 16810 }, { "epoch": 2.898001378359752, "grad_norm": 30.319656372070312, "learning_rate": 7.035019418083376e-10, "logits/chosen": -1.65801203250885, "logits/rejected": -1.610608696937561, "logps/chosen": -205.1194305419922, "logps/rejected": -271.00213623046875, "loss": 0.5268, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.499645471572876, "rewards/margins": 0.689041018486023, "rewards/rejected": -2.1886868476867676, "step": 16820 }, { "epoch": 2.899724328049621, "grad_norm": 33.45829772949219, "learning_rate": 6.799624626861456e-10, "logits/chosen": -1.7170978784561157, "logits/rejected": -1.6595051288604736, "logps/chosen": -221.271728515625, "logps/rejected": -323.4475402832031, "loss": 0.4892, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6707754135131836, "rewards/margins": 1.010047197341919, "rewards/rejected": -2.6808226108551025, "step": 16830 }, { "epoch": 2.90144727773949, "grad_norm": 43.579429626464844, "learning_rate": 6.568221908710314e-10, "logits/chosen": -1.548855185508728, "logits/rejected": -1.5003758668899536, "logps/chosen": -199.71316528320312, "logps/rejected": -278.1864013671875, "loss": 0.4935, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4677910804748535, "rewards/margins": 0.8026076555252075, "rewards/rejected": -2.2703986167907715, "step": 16840 }, { "epoch": 2.903170227429359, "grad_norm": 41.76044845581055, "learning_rate": 6.340812193730949e-10, "logits/chosen": -1.6095333099365234, "logits/rejected": -1.5596811771392822, "logps/chosen": -218.263427734375, "logps/rejected": -290.07415771484375, "loss": 0.4987, "rewards/accuracies": 0.75, "rewards/chosen": -1.6301006078720093, "rewards/margins": 0.7804665565490723, "rewards/rejected": -2.410567283630371, "step": 16850 }, { "epoch": 2.904893177119228, "grad_norm": 27.137157440185547, "learning_rate": 6.117396395974749e-10, "logits/chosen": -1.6121982336044312, "logits/rejected": -1.5542309284210205, "logps/chosen": -211.4744873046875, "logps/rejected": -285.07366943359375, "loss": 0.5225, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5618984699249268, "rewards/margins": 0.7801223993301392, "rewards/rejected": -2.3420207500457764, "step": 16860 }, { "epoch": 2.906616126809097, "grad_norm": 52.22926330566406, "learning_rate": 5.897975413439837e-10, "logits/chosen": -1.6715095043182373, "logits/rejected": -1.6253564357757568, "logps/chosen": -204.73321533203125, "logps/rejected": -276.33856201171875, "loss": 0.5613, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5167036056518555, "rewards/margins": 0.738450288772583, "rewards/rejected": -2.2551538944244385, "step": 16870 }, { "epoch": 2.908339076498966, "grad_norm": 36.053855895996094, "learning_rate": 5.682550128067731e-10, "logits/chosen": -1.6771320104599, "logits/rejected": -1.628893494606018, "logps/chosen": -200.6133575439453, "logps/rejected": -296.2360534667969, "loss": 0.4839, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5162712335586548, "rewards/margins": 0.9246581196784973, "rewards/rejected": -2.440929412841797, "step": 16880 }, { "epoch": 2.910062026188835, "grad_norm": 39.545101165771484, "learning_rate": 5.471121405739687e-10, "logits/chosen": -1.5920666456222534, "logits/rejected": -1.5557562112808228, "logps/chosen": -213.32894897460938, "logps/rejected": -285.2569274902344, "loss": 0.5177, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5840203762054443, "rewards/margins": 0.7276771664619446, "rewards/rejected": -2.311697483062744, "step": 16890 }, { "epoch": 2.9117849758787044, "grad_norm": 46.50544357299805, "learning_rate": 5.263690096273033e-10, "logits/chosen": -1.664858102798462, "logits/rejected": -1.6239697933197021, "logps/chosen": -207.4394989013672, "logps/rejected": -278.8927001953125, "loss": 0.5072, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5422718524932861, "rewards/margins": 0.7455980181694031, "rewards/rejected": -2.287869930267334, "step": 16900 }, { "epoch": 2.9135079255685734, "grad_norm": 24.831132888793945, "learning_rate": 5.060257033417725e-10, "logits/chosen": -1.6837489604949951, "logits/rejected": -1.640244722366333, "logps/chosen": -203.83706665039062, "logps/rejected": -283.9046325683594, "loss": 0.5361, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5224006175994873, "rewards/margins": 0.7805654406547546, "rewards/rejected": -2.3029661178588867, "step": 16910 }, { "epoch": 2.9152308752584424, "grad_norm": 26.15068817138672, "learning_rate": 4.860823034853468e-10, "logits/chosen": -1.5878162384033203, "logits/rejected": -1.5520457029342651, "logps/chosen": -203.78700256347656, "logps/rejected": -264.70123291015625, "loss": 0.5722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.490502953529358, "rewards/margins": 0.6168953776359558, "rewards/rejected": -2.107398271560669, "step": 16920 }, { "epoch": 2.9169538249483113, "grad_norm": 29.456026077270508, "learning_rate": 4.66538890218593e-10, "logits/chosen": -1.6750223636627197, "logits/rejected": -1.6385116577148438, "logps/chosen": -193.2615203857422, "logps/rejected": -263.14617919921875, "loss": 0.5374, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3699278831481934, "rewards/margins": 0.7440475225448608, "rewards/rejected": -2.1139755249023438, "step": 16930 }, { "epoch": 2.9186767746381808, "grad_norm": 25.665470123291016, "learning_rate": 4.4739554209437536e-10, "logits/chosen": -1.646510124206543, "logits/rejected": -1.607150673866272, "logps/chosen": -201.77035522460938, "logps/rejected": -275.34112548828125, "loss": 0.5214, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4379570484161377, "rewards/margins": 0.7657677531242371, "rewards/rejected": -2.2037246227264404, "step": 16940 }, { "epoch": 2.9203997243280497, "grad_norm": 29.399169921875, "learning_rate": 4.286523360575334e-10, "logits/chosen": -1.585037112236023, "logits/rejected": -1.55281662940979, "logps/chosen": -205.02139282226562, "logps/rejected": -292.2770690917969, "loss": 0.5452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5497676134109497, "rewards/margins": 0.8234025239944458, "rewards/rejected": -2.3731701374053955, "step": 16950 }, { "epoch": 2.9221226740179187, "grad_norm": 31.07048988342285, "learning_rate": 4.103093474445818e-10, "logits/chosen": -1.623678207397461, "logits/rejected": -1.5673491954803467, "logps/chosen": -203.4460906982422, "logps/rejected": -308.8279113769531, "loss": 0.4582, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.483617901802063, "rewards/margins": 1.052154541015625, "rewards/rejected": -2.5357725620269775, "step": 16960 }, { "epoch": 2.9238456237077877, "grad_norm": 33.71870422363281, "learning_rate": 3.9236664998338885e-10, "logits/chosen": -1.696584939956665, "logits/rejected": -1.6441404819488525, "logps/chosen": -213.5078582763672, "logps/rejected": -295.99932861328125, "loss": 0.5039, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.577401041984558, "rewards/margins": 0.8441799283027649, "rewards/rejected": -2.4215807914733887, "step": 16970 }, { "epoch": 2.9255685733976566, "grad_norm": 31.795190811157227, "learning_rate": 3.7482431579289873e-10, "logits/chosen": -1.687766432762146, "logits/rejected": -1.6378850936889648, "logps/chosen": -198.27952575683594, "logps/rejected": -275.29119873046875, "loss": 0.493, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4469237327575684, "rewards/margins": 0.7793303728103638, "rewards/rejected": -2.2262539863586426, "step": 16980 }, { "epoch": 2.9272915230875256, "grad_norm": 26.28533172607422, "learning_rate": 3.5768241538282064e-10, "logits/chosen": -1.7197927236557007, "logits/rejected": -1.6632732152938843, "logps/chosen": -184.9141387939453, "logps/rejected": -280.8218688964844, "loss": 0.4643, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.3191200494766235, "rewards/margins": 0.9638843536376953, "rewards/rejected": -2.2830045223236084, "step": 16990 }, { "epoch": 2.929014472777395, "grad_norm": 47.96595001220703, "learning_rate": 3.4094101765338446e-10, "logits/chosen": -1.6959985494613647, "logits/rejected": -1.6554844379425049, "logps/chosen": -206.0132293701172, "logps/rejected": -269.5046081542969, "loss": 0.5616, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5034517049789429, "rewards/margins": 0.6630902290344238, "rewards/rejected": -2.1665420532226562, "step": 17000 }, { "epoch": 2.930737422467264, "grad_norm": 26.475934982299805, "learning_rate": 3.24600189895019e-10, "logits/chosen": -1.6051123142242432, "logits/rejected": -1.5607998371124268, "logps/chosen": -209.7019805908203, "logps/rejected": -293.5786437988281, "loss": 0.5056, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5677287578582764, "rewards/margins": 0.8212777376174927, "rewards/rejected": -2.3890066146850586, "step": 17010 }, { "epoch": 2.932460372157133, "grad_norm": 39.69713592529297, "learning_rate": 3.086599977880855e-10, "logits/chosen": -1.5805712938308716, "logits/rejected": -1.5696442127227783, "logps/chosen": -215.4978790283203, "logps/rejected": -271.7910461425781, "loss": 0.6024, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.621180534362793, "rewards/margins": 0.5600312352180481, "rewards/rejected": -2.1812117099761963, "step": 17020 }, { "epoch": 2.934183321847002, "grad_norm": 35.4920539855957, "learning_rate": 2.931205054026775e-10, "logits/chosen": -1.6319687366485596, "logits/rejected": -1.586881160736084, "logps/chosen": -210.61434936523438, "logps/rejected": -277.21807861328125, "loss": 0.5437, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5412108898162842, "rewards/margins": 0.7014018893241882, "rewards/rejected": -2.242612838745117, "step": 17030 }, { "epoch": 2.9359062715368713, "grad_norm": 27.13518714904785, "learning_rate": 2.7798177519826605e-10, "logits/chosen": -1.6823952198028564, "logits/rejected": -1.6340644359588623, "logps/chosen": -220.3932342529297, "logps/rejected": -293.453369140625, "loss": 0.512, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6454026699066162, "rewards/margins": 0.7773019075393677, "rewards/rejected": -2.4227046966552734, "step": 17040 }, { "epoch": 2.9376292212267403, "grad_norm": 29.53461456298828, "learning_rate": 2.632438680235216e-10, "logits/chosen": -1.6405704021453857, "logits/rejected": -1.6034597158432007, "logps/chosen": -218.55917358398438, "logps/rejected": -279.91717529296875, "loss": 0.5688, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6312099695205688, "rewards/margins": 0.6329344511032104, "rewards/rejected": -2.2641444206237793, "step": 17050 }, { "epoch": 2.9393521709166093, "grad_norm": 40.60226821899414, "learning_rate": 2.4890684311603683e-10, "logits/chosen": -1.6730968952178955, "logits/rejected": -1.6203582286834717, "logps/chosen": -212.64511108398438, "logps/rejected": -283.8674011230469, "loss": 0.5634, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5824534893035889, "rewards/margins": 0.7069231271743774, "rewards/rejected": -2.289376735687256, "step": 17060 }, { "epoch": 2.9410751206064782, "grad_norm": 34.00909423828125, "learning_rate": 2.3497075810210433e-10, "logits/chosen": -1.6038997173309326, "logits/rejected": -1.5644137859344482, "logps/chosen": -219.3387908935547, "logps/rejected": -276.1908264160156, "loss": 0.6046, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6372060775756836, "rewards/margins": 0.6086788177490234, "rewards/rejected": -2.245885133743286, "step": 17070 }, { "epoch": 2.942798070296347, "grad_norm": 27.89004135131836, "learning_rate": 2.2143566899647248e-10, "logits/chosen": -1.5212467908859253, "logits/rejected": -1.4685356616973877, "logps/chosen": -210.71841430664062, "logps/rejected": -314.895751953125, "loss": 0.4377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5750494003295898, "rewards/margins": 1.0299382209777832, "rewards/rejected": -2.604987859725952, "step": 17080 }, { "epoch": 2.944521019986216, "grad_norm": 28.67696762084961, "learning_rate": 2.0830163020212344e-10, "logits/chosen": -1.6012256145477295, "logits/rejected": -1.553945779800415, "logps/chosen": -202.9490509033203, "logps/rejected": -297.46917724609375, "loss": 0.4804, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4996212720870972, "rewards/margins": 0.9452611207962036, "rewards/rejected": -2.44488263130188, "step": 17090 }, { "epoch": 2.9462439696760856, "grad_norm": 31.051746368408203, "learning_rate": 1.955686945100621e-10, "logits/chosen": -1.5455687046051025, "logits/rejected": -1.499463677406311, "logps/chosen": -213.981689453125, "logps/rejected": -288.32861328125, "loss": 0.5257, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5765442848205566, "rewards/margins": 0.7793576121330261, "rewards/rejected": -2.3559021949768066, "step": 17100 }, { "epoch": 2.9479669193659546, "grad_norm": 35.02011489868164, "learning_rate": 1.8323691309909407e-10, "logits/chosen": -1.5887330770492554, "logits/rejected": -1.5520597696304321, "logps/chosen": -220.20315551757812, "logps/rejected": -309.24505615234375, "loss": 0.49, "rewards/accuracies": 0.75, "rewards/chosen": -1.6530447006225586, "rewards/margins": 0.8980328440666199, "rewards/rejected": -2.551077365875244, "step": 17110 }, { "epoch": 2.9496898690558235, "grad_norm": 27.6148624420166, "learning_rate": 1.7130633553561479e-10, "logits/chosen": -1.6821790933609009, "logits/rejected": -1.627488136291504, "logps/chosen": -201.37051391601562, "logps/rejected": -285.5783996582031, "loss": 0.466, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.494118094444275, "rewards/margins": 0.8731333613395691, "rewards/rejected": -2.367251396179199, "step": 17120 }, { "epoch": 2.9514128187456925, "grad_norm": 47.40973663330078, "learning_rate": 1.597770097734541e-10, "logits/chosen": -1.5655953884124756, "logits/rejected": -1.5067824125289917, "logps/chosen": -211.8823699951172, "logps/rejected": -292.6919250488281, "loss": 0.5046, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5727508068084717, "rewards/margins": 0.8204353451728821, "rewards/rejected": -2.393186092376709, "step": 17130 }, { "epoch": 2.953135768435562, "grad_norm": 20.614322662353516, "learning_rate": 1.4864898215359857e-10, "logits/chosen": -1.5765509605407715, "logits/rejected": -1.5283482074737549, "logps/chosen": -192.6062469482422, "logps/rejected": -282.79718017578125, "loss": 0.4635, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3896197080612183, "rewards/margins": 0.9155749082565308, "rewards/rejected": -2.30519437789917, "step": 17140 }, { "epoch": 2.954858718125431, "grad_norm": 43.842124938964844, "learning_rate": 1.3792229740409166e-10, "logits/chosen": -1.6786420345306396, "logits/rejected": -1.6254281997680664, "logps/chosen": -216.43814086914062, "logps/rejected": -293.46466064453125, "loss": 0.5554, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6223779916763306, "rewards/margins": 0.7882981896400452, "rewards/rejected": -2.4106762409210205, "step": 17150 }, { "epoch": 2.9565816678153, "grad_norm": 26.545705795288086, "learning_rate": 1.2759699863980067e-10, "logits/chosen": -1.715623140335083, "logits/rejected": -1.6613988876342773, "logps/chosen": -190.52456665039062, "logps/rejected": -308.39959716796875, "loss": 0.4311, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.404829740524292, "rewards/margins": 1.162705659866333, "rewards/rejected": -2.567535400390625, "step": 17160 }, { "epoch": 2.958304617505169, "grad_norm": 26.82274627685547, "learning_rate": 1.1767312736228329e-10, "logits/chosen": -1.7410396337509155, "logits/rejected": -1.6930043697357178, "logps/chosen": -230.6710205078125, "logps/rejected": -296.1482849121094, "loss": 0.5957, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7154839038848877, "rewards/margins": 0.6694554090499878, "rewards/rejected": -2.3849387168884277, "step": 17170 }, { "epoch": 2.960027567195038, "grad_norm": 33.76654052734375, "learning_rate": 1.0815072345957688e-10, "logits/chosen": -1.6340773105621338, "logits/rejected": -1.5861009359359741, "logps/chosen": -210.44271850585938, "logps/rejected": -292.2772216796875, "loss": 0.5186, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5934721231460571, "rewards/margins": 0.8232981562614441, "rewards/rejected": -2.4167699813842773, "step": 17180 }, { "epoch": 2.9617505168849068, "grad_norm": 27.789745330810547, "learning_rate": 9.902982520605396e-11, "logits/chosen": -1.6218135356903076, "logits/rejected": -1.5870753526687622, "logps/chosen": -186.03189086914062, "logps/rejected": -263.25030517578125, "loss": 0.4961, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3469972610473633, "rewards/margins": 0.7608731985092163, "rewards/rejected": -2.10787034034729, "step": 17190 }, { "epoch": 2.963473466574776, "grad_norm": 34.324520111083984, "learning_rate": 9.031046926230024e-11, "logits/chosen": -1.629669189453125, "logits/rejected": -1.6002832651138306, "logps/chosen": -197.7524871826172, "logps/rejected": -279.38873291015625, "loss": 0.5054, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4329588413238525, "rewards/margins": 0.8157581090927124, "rewards/rejected": -2.2487170696258545, "step": 17200 }, { "epoch": 2.963473466574776, "eval_logits/chosen": -1.7261788845062256, "eval_logits/rejected": -1.7024034261703491, "eval_logps/chosen": -203.55084228515625, "eval_logps/rejected": -243.9775848388672, "eval_loss": 0.6303371787071228, "eval_rewards/accuracies": 0.6435873508453369, "eval_rewards/chosen": -1.4483894109725952, "eval_rewards/margins": 0.35958507657051086, "eval_rewards/rejected": -1.8079745769500732, "eval_runtime": 384.5784, "eval_samples_per_second": 11.191, "eval_steps_per_second": 1.399, "step": 17200 }, { "epoch": 2.965196416264645, "grad_norm": 35.75117111206055, "learning_rate": 8.199269067491466e-11, "logits/chosen": -1.587419033050537, "logits/rejected": -1.5489394664764404, "logps/chosen": -213.60415649414062, "logps/rejected": -301.45562744140625, "loss": 0.5117, "rewards/accuracies": 0.75, "rewards/chosen": -1.6136400699615479, "rewards/margins": 0.8584750294685364, "rewards/rejected": -2.4721150398254395, "step": 17210 }, { "epoch": 2.966919365954514, "grad_norm": 29.765380859375, "learning_rate": 7.407652287640953e-11, "logits/chosen": -1.5780322551727295, "logits/rejected": -1.5399539470672607, "logps/chosen": -216.28274536132812, "logps/rejected": -317.44146728515625, "loss": 0.4935, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6082651615142822, "rewards/margins": 0.9990278482437134, "rewards/rejected": -2.607292890548706, "step": 17220 }, { "epoch": 2.968642315644383, "grad_norm": 23.371280670166016, "learning_rate": 6.656199768505511e-11, "logits/chosen": -1.6521990299224854, "logits/rejected": -1.6265060901641846, "logps/chosen": -206.3870086669922, "logps/rejected": -290.2902526855469, "loss": 0.4938, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5403478145599365, "rewards/margins": 0.8099233508110046, "rewards/rejected": -2.350271224975586, "step": 17230 }, { "epoch": 2.9703652653342525, "grad_norm": 29.65461540222168, "learning_rate": 5.944914530475742e-11, "logits/chosen": -1.6170860528945923, "logits/rejected": -1.5837604999542236, "logps/chosen": -198.25338745117188, "logps/rejected": -274.77642822265625, "loss": 0.5417, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.443676233291626, "rewards/margins": 0.7559821009635925, "rewards/rejected": -2.199658155441284, "step": 17240 }, { "epoch": 2.9720882150241215, "grad_norm": 33.59632110595703, "learning_rate": 5.2737994324958403e-11, "logits/chosen": -1.5519667863845825, "logits/rejected": -1.512790560722351, "logps/chosen": -204.31234741210938, "logps/rejected": -287.47869873046875, "loss": 0.4811, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5204417705535889, "rewards/margins": 0.8285139799118042, "rewards/rejected": -2.3489556312561035, "step": 17250 }, { "epoch": 2.9738111647139904, "grad_norm": 27.488880157470703, "learning_rate": 4.642857172045822e-11, "logits/chosen": -1.5949409008026123, "logits/rejected": -1.544858694076538, "logps/chosen": -201.71656799316406, "logps/rejected": -292.2054748535156, "loss": 0.482, "rewards/accuracies": 0.78125, "rewards/chosen": -1.46438729763031, "rewards/margins": 0.9317871332168579, "rewards/rejected": -2.396174430847168, "step": 17260 }, { "epoch": 2.9755341144038594, "grad_norm": 39.49324417114258, "learning_rate": 4.052090285138199e-11, "logits/chosen": -1.604092001914978, "logits/rejected": -1.560024380683899, "logps/chosen": -220.26065063476562, "logps/rejected": -285.1950988769531, "loss": 0.5878, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6459325551986694, "rewards/margins": 0.6614850163459778, "rewards/rejected": -2.307417631149292, "step": 17270 }, { "epoch": 2.9772570640937284, "grad_norm": 25.67490577697754, "learning_rate": 3.501501146304653e-11, "logits/chosen": -1.5917980670928955, "logits/rejected": -1.5360498428344727, "logps/chosen": -196.49559020996094, "logps/rejected": -287.30413818359375, "loss": 0.4694, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4197368621826172, "rewards/margins": 0.9354475140571594, "rewards/rejected": -2.355184316635132, "step": 17280 }, { "epoch": 2.9789800137835973, "grad_norm": 27.564085006713867, "learning_rate": 2.991091968582715e-11, "logits/chosen": -1.6404955387115479, "logits/rejected": -1.596842885017395, "logps/chosen": -212.32986450195312, "logps/rejected": -295.46478271484375, "loss": 0.4701, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5325970649719238, "rewards/margins": 0.8477786183357239, "rewards/rejected": -2.380375862121582, "step": 17290 }, { "epoch": 2.9807029634734663, "grad_norm": 30.048521041870117, "learning_rate": 2.5208648035146553e-11, "logits/chosen": -1.658026099205017, "logits/rejected": -1.6178169250488281, "logps/chosen": -203.4453125, "logps/rejected": -279.23419189453125, "loss": 0.519, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4819501638412476, "rewards/margins": 0.7884173393249512, "rewards/rejected": -2.2703678607940674, "step": 17300 }, { "epoch": 2.9824259131633357, "grad_norm": 36.83639907836914, "learning_rate": 2.0908215411330477e-11, "logits/chosen": -1.6300697326660156, "logits/rejected": -1.5847969055175781, "logps/chosen": -209.23526000976562, "logps/rejected": -297.26318359375, "loss": 0.4792, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5416957139968872, "rewards/margins": 0.9089337587356567, "rewards/rejected": -2.450629472732544, "step": 17310 }, { "epoch": 2.9841488628532047, "grad_norm": 34.16779708862305, "learning_rate": 1.7009639099541118e-11, "logits/chosen": -1.6238996982574463, "logits/rejected": -1.5776101350784302, "logps/chosen": -212.78903198242188, "logps/rejected": -294.0577087402344, "loss": 0.513, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.587726354598999, "rewards/margins": 0.8322674036026001, "rewards/rejected": -2.4199938774108887, "step": 17320 }, { "epoch": 2.9858718125430737, "grad_norm": 33.81337356567383, "learning_rate": 1.35129347697438e-11, "logits/chosen": -1.5871931314468384, "logits/rejected": -1.5417367219924927, "logps/chosen": -206.4938507080078, "logps/rejected": -279.7281188964844, "loss": 0.5444, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.512385606765747, "rewards/margins": 0.7700924277305603, "rewards/rejected": -2.282477855682373, "step": 17330 }, { "epoch": 2.987594762232943, "grad_norm": 22.616756439208984, "learning_rate": 1.0418116476584859e-11, "logits/chosen": -1.687488317489624, "logits/rejected": -1.6277202367782593, "logps/chosen": -185.19866943359375, "logps/rejected": -278.3511047363281, "loss": 0.4499, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3271843194961548, "rewards/margins": 0.938696563243866, "rewards/rejected": -2.265880823135376, "step": 17340 }, { "epoch": 2.989317711922812, "grad_norm": 33.05019760131836, "learning_rate": 7.725196659413847e-12, "logits/chosen": -1.6483237743377686, "logits/rejected": -1.5888398885726929, "logps/chosen": -191.623779296875, "logps/rejected": -276.997802734375, "loss": 0.488, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3805105686187744, "rewards/margins": 0.8659530878067017, "rewards/rejected": -2.2464635372161865, "step": 17350 }, { "epoch": 2.991040661612681, "grad_norm": 39.97116470336914, "learning_rate": 5.4341861421391965e-12, "logits/chosen": -1.7078800201416016, "logits/rejected": -1.6725330352783203, "logps/chosen": -197.40243530273438, "logps/rejected": -285.66595458984375, "loss": 0.4817, "rewards/accuracies": 0.75, "rewards/chosen": -1.4299650192260742, "rewards/margins": 0.8771953582763672, "rewards/rejected": -2.3071603775024414, "step": 17360 }, { "epoch": 2.99276361130255, "grad_norm": 40.23675537109375, "learning_rate": 3.5450941332726415e-12, "logits/chosen": -1.6248849630355835, "logits/rejected": -1.5938169956207275, "logps/chosen": -205.1375732421875, "logps/rejected": -274.900146484375, "loss": 0.5501, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5296798944473267, "rewards/margins": 0.6712183356285095, "rewards/rejected": -2.2008984088897705, "step": 17370 }, { "epoch": 2.994486560992419, "grad_norm": 36.696414947509766, "learning_rate": 2.0579282258292862e-12, "logits/chosen": -1.6086667776107788, "logits/rejected": -1.573327660560608, "logps/chosen": -213.1057586669922, "logps/rejected": -290.3735046386719, "loss": 0.5333, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.601100206375122, "rewards/margins": 0.7504446506500244, "rewards/rejected": -2.3515448570251465, "step": 17380 }, { "epoch": 2.996209510682288, "grad_norm": 56.76747512817383, "learning_rate": 9.726943973387137e-13, "logits/chosen": -1.6698760986328125, "logits/rejected": -1.626619577407837, "logps/chosen": -204.91049194335938, "logps/rejected": -282.6231689453125, "loss": 0.5274, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5097030401229858, "rewards/margins": 0.782251238822937, "rewards/rejected": -2.2919540405273438, "step": 17390 }, { "epoch": 2.997932460372157, "grad_norm": 28.173402786254883, "learning_rate": 2.8939700977836934e-13, "logits/chosen": -1.6111886501312256, "logits/rejected": -1.5667483806610107, "logps/chosen": -220.0554656982422, "logps/rejected": -298.34857177734375, "loss": 0.5284, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6483074426651, "rewards/margins": 0.8398798704147339, "rewards/rejected": -2.488187313079834, "step": 17400 }, { "epoch": 2.9996554100620263, "grad_norm": 41.602970123291016, "learning_rate": 8.038809595767305e-15, "logits/chosen": -1.5945218801498413, "logits/rejected": -1.548925518989563, "logps/chosen": -201.5911865234375, "logps/rejected": -293.29644775390625, "loss": 0.4551, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.475780725479126, "rewards/margins": 0.907615065574646, "rewards/rejected": -2.3833956718444824, "step": 17410 }, { "epoch": 3.0, "step": 17412, "total_flos": 0.0, "train_loss": 0.5724969553024556, "train_runtime": 86264.9537, "train_samples_per_second": 3.229, "train_steps_per_second": 0.202 } ], "logging_steps": 10, "max_steps": 17412, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }